# Data Cleaning
---

In [67]:
# import libraries
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

In [68]:
# load extracted data
bbox_df = pd.read_pickle('../data/interim/bbox_df.pkl')
data_entry_df = pd.read_pickle('../data/interim/data_entry_df.pkl')

In [69]:
# removed unnesscary columns and reformated column names
bbox_df = bbox_df.drop(["Unnamed: 6", "Unnamed: 7", "Unnamed: 8"], axis=1) # removed NaN columns

# reformate columns (lowercase, no spaces)
bbox_df.columns = bbox_df.columns.tolist()

# renamed bbox coordinate columns
bbox_df.columns = ["image_index", "finding_label","x_min", "y_min", "x_max", "y_max"]
bbox_df.head()

Unnamed: 0,image_index,finding_label,x_min,y_min,x_max,y_max
0,00013118_008.png,Atelectasis,225.084746,547.019217,86.779661,79.186441
1,00014716_007.png,Atelectasis,686.101695,131.543498,185.491525,313.491525
2,00029817_009.png,Atelectasis,221.830508,317.053115,155.118644,216.949153
3,00014687_001.png,Atelectasis,726.237288,494.95142,141.016949,55.322034
4,00017877_001.png,Atelectasis,660.067797,569.780787,200.677966,78.101695


In [70]:
# checking basic characteristic of the dataset (na values, null values, duplicated values, and checked data type of each column)

print(f'Checking for NA values: \n {bbox_df.isna().sum()}')
print(f'Checking for Null values: \n {bbox_df.isnull().sum()}')
print(f'Checking for duplicated values: \n {bbox_df.duplicated().sum()}')
print(f'Checking data type of each column: \n {bbox_df.info()}')

Checking for NA values: 
 image_index      0
finding_label    0
x_min            0
y_min            0
x_max            0
y_max            0
dtype: int64
Checking for Null values: 
 image_index      0
finding_label    0
x_min            0
y_min            0
x_max            0
y_max            0
dtype: int64
Checking for duplicated values: 
 0
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 984 entries, 0 to 983
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   image_index    984 non-null    object 
 1   finding_label  984 non-null    object 
 2   x_min          984 non-null    float64
 3   y_min          984 non-null    float64
 4   x_max          984 non-null    float64
 5   y_max          984 non-null    float64
dtypes: float64(4), object(2)
memory usage: 46.3+ KB
Checking data type of each column: 
 None


In [71]:
# removed unnamed column
data_entry_df = data_entry_df.drop(columns=["Unnamed: 11"],axis=1)

# rename columns 
data_entry_df.columns = data_entry_df.columns.tolist() 
data_entry_df.columns = ["image_index", "finding_labels", "follow_up_number", "patient_id", "patient_age", "patient_gender", "view_position", "original_img_width", "original_img_height", "img_pixel_spacing_x", "img_pixel_spacing_y"]
data_entry_df.head()

Unnamed: 0,image_index,finding_labels,follow_up_number,patient_id,patient_age,patient_gender,view_position,original_img_width,original_img_height,img_pixel_spacing_x,img_pixel_spacing_y
0,00000001_000.png,Cardiomegaly,0,1,58,M,PA,2682,2749,0.143,0.143
1,00000001_001.png,Cardiomegaly|Emphysema,1,1,58,M,PA,2894,2729,0.143,0.143
2,00000001_002.png,Cardiomegaly|Effusion,2,1,58,M,PA,2500,2048,0.168,0.168
3,00000002_000.png,No Finding,0,2,81,M,PA,2500,2048,0.171,0.171
4,00000003_000.png,Hernia,0,3,81,F,PA,2582,2991,0.143,0.143


In [72]:
# checking basic characteristic of the dataset (na values, null values, duplicated values, and checked data type of each column)

print(f'Checking for NA values: \n {data_entry_df.isna().sum()}')
print(f'Checking for Null values: \n {data_entry_df.isnull().sum()}')
print(f'Checking for duplicated values: \n {data_entry_df.duplicated().sum()}')
print(f'Checking data type of each column: \n {data_entry_df.info()}')

Checking for NA values: 
 image_index            0
finding_labels         0
follow_up_number       0
patient_id             0
patient_age            0
patient_gender         0
view_position          0
original_img_width     0
original_img_height    0
img_pixel_spacing_x    0
img_pixel_spacing_y    0
dtype: int64
Checking for Null values: 
 image_index            0
finding_labels         0
follow_up_number       0
patient_id             0
patient_age            0
patient_gender         0
view_position          0
original_img_width     0
original_img_height    0
img_pixel_spacing_x    0
img_pixel_spacing_y    0
dtype: int64
Checking for duplicated values: 
 0
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 112120 entries, 0 to 112119
Data columns (total 11 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   image_index          112120 non-null  object 
 1   finding_labels       112120 non-null  object 
 2   follow_up_numb

In [73]:
# checked and removed age outliers (patients with ages over 120)
over_120 = data_entry_df[data_entry_df["patient_age"]>120]
total_num_patients = data_entry_df["patient_age"].shape[0]
print(f'Number of patients over 120 years old: {len(over_120)} out of {total_num_patients}')

for index in over_120.index:
    data_entry_df.drop(index=[index], inplace=True)

# double check after removing outliers
print(f'Number of patients over 120 years old after removal: {len(data_entry_df[data_entry_df["patient_age"]>120])} out of {total_num_patients}')

Number of patients over 120 years old: 16 out of 112120
Number of patients over 120 years old after removal: 0 out of 112120


In [74]:
# Save to .pkl files
bbox_df.to_pickle('../data/interim/bbox_df.pkl')
data_entry_df.to_pickle('../data/interim/data_entry_df.pkl')