# Data Cleaning
---

In [14]:
# import libraries
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

In [15]:
# load extracted data

data_entry_df = pd.read_pickle('../data/interim/data_entry_df.pkl')
train_val_list = pd.read_pickle('../data/interim/train_val_list.pkl')
test_list = pd.read_pickle('../data/interim/test_list.pkl')

In [16]:
# removed unnamed column
data_entry_df = data_entry_df.drop(columns=["Unnamed: 11"],axis=1)

# rename columns 
data_entry_df.columns = data_entry_df.columns.tolist() 
data_entry_df.columns = ["image_index", "finding_labels", "follow_up_number", "patient_id", "patient_age", "patient_gender", "view_position", "original_img_width", "original_img_height", "img_pixel_spacing_x", "img_pixel_spacing_y"]
data_entry_df.head()

Unnamed: 0,image_index,finding_labels,follow_up_number,patient_id,patient_age,patient_gender,view_position,original_img_width,original_img_height,img_pixel_spacing_x,img_pixel_spacing_y
0,00000001_000.png,Cardiomegaly,0,1,58,M,PA,2682,2749,0.143,0.143
1,00000001_001.png,Cardiomegaly|Emphysema,1,1,58,M,PA,2894,2729,0.143,0.143
2,00000001_002.png,Cardiomegaly|Effusion,2,1,58,M,PA,2500,2048,0.168,0.168
3,00000002_000.png,No Finding,0,2,81,M,PA,2500,2048,0.171,0.171
4,00000003_000.png,Hernia,0,3,81,F,PA,2582,2991,0.143,0.143


In [17]:
# checking basic characteristic of the dataset (na values, null values, duplicated values, and checked data type of each column)

print(f'Checking for NA values: \n {data_entry_df.isna().sum()}')
print(f'Checking for Null values: \n {data_entry_df.isnull().sum()}')
print(f'Checking for duplicated values: \n {data_entry_df.duplicated().sum()}')
print(f'Checking data type of each column: \n {data_entry_df.info()}')

Checking for NA values: 
 image_index            0
finding_labels         0
follow_up_number       0
patient_id             0
patient_age            0
patient_gender         0
view_position          0
original_img_width     0
original_img_height    0
img_pixel_spacing_x    0
img_pixel_spacing_y    0
dtype: int64
Checking for Null values: 
 image_index            0
finding_labels         0
follow_up_number       0
patient_id             0
patient_age            0
patient_gender         0
view_position          0
original_img_width     0
original_img_height    0
img_pixel_spacing_x    0
img_pixel_spacing_y    0
dtype: int64
Checking for duplicated values: 
 0
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 112120 entries, 0 to 112119
Data columns (total 11 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   image_index          112120 non-null  object 
 1   finding_labels       112120 non-null  object 
 2   follow_up_numb

In [18]:
# issue: missing column headers in both test_list and train_val_list datasets
# added a 'image_index' as the header for both datasets

shifted_train_val = train_val_list.shift(1)
shifted_train_val.columns = ['image_index']
shifted_train_val.iloc[0,0] = train_val_list.columns[0]
print(shifted_train_val.head())

shifted_test_list = test_list.shift(1)
shifted_test_list.columns = ['image_index']
shifted_test_list.iloc[0,0] = test_list.columns[0]
print(shifted_test_list.head())

        image_index
0  00000001_000.png
1  00000001_001.png
2  00000001_002.png
3  00000002_000.png
4  00000004_000.png
        image_index
0  00000003_000.png
1  00000003_001.png
2  00000003_002.png
3  00000003_003.png
4  00000003_004.png


In [19]:
# checked and removed age outliers (patients with ages over 120)
over_120 = data_entry_df[data_entry_df["patient_age"]>120]

#total_num_patients = data_entry_df["patient_age"].shape[0]
print(f'Number of patients over 120 years old: {len(over_120)} out of {data_entry_df["patient_age"].shape[0]}')

for index in over_120.index:
    data_entry_df.drop(index=[index], inplace=True)

# double check after removing outliers
print(f'Number of patients over 120 years old after removal: {len(data_entry_df[data_entry_df["patient_age"]>120])} out of {data_entry_df["patient_age"].shape[0]}')

Number of patients over 120 years old: 16 out of 112120
Number of patients over 120 years old after removal: 0 out of 112104


In [20]:
# the data_entry dataframe is structured per image. As a result, there are multiple x-ray images that belong to the same patient from follow-ups
# created a seperate patient-level dataset for EDA 

patient_data = data_entry_df.groupby(["patient_id", "follow_up_number"]).agg({
    "image_index": "first",
    "finding_labels": "first",
    "patient_age": "first",
    "patient_gender": "first",
    "view_position": "first"
})
patient_data.head(15)

Unnamed: 0_level_0,Unnamed: 1_level_0,image_index,finding_labels,patient_age,patient_gender,view_position
patient_id,follow_up_number,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,0,00000001_000.png,Cardiomegaly,58,M,PA
1,1,00000001_001.png,Cardiomegaly|Emphysema,58,M,PA
1,2,00000001_002.png,Cardiomegaly|Effusion,58,M,PA
2,0,00000002_000.png,No Finding,81,M,PA
3,0,00000003_000.png,Hernia,81,F,PA
3,1,00000003_001.png,Hernia,74,F,PA
3,2,00000003_002.png,Hernia,75,F,PA
3,3,00000003_003.png,Hernia|Infiltration,76,F,PA
3,4,00000003_004.png,Hernia,77,F,PA
3,5,00000003_005.png,Hernia,78,F,PA


In [21]:
# Save to .pkl files

data_entry_df.to_pickle('../data/interim/data_entry_df.pkl')
patient_data.to_pickle('../data/interim/patient_data.pkl')
shifted_train_val.to_pickle('../data/interim/train_val_list.pkl')
shifted_test_list.to_pickle('../data/interim/test_list.pkl')