# Data Preprocessing and Splitting
---

In [127]:
# import libraries
import pandas as pd
import os
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings("ignore")

import sys
sys.path.append('../scripts')
from create_labels import column_order, labels, create_labels

In [60]:
# load data

bbox_df = pd.read_pickle('../data/interim/bbox_df.pkl')
data_entry_df = pd.read_pickle('../data/interim/data_entry_df.pkl')
train_val_list = pd.read_pickle('../data/interim/train_val_list.pkl')
test_list = pd.read_pickle('../data/interim/test_list.pkl')
patient_data = pd.read_pickle('../data/interim/patient_data.pkl')

In [None]:
# show unique classes
labels_exploded = data_entry_df["finding_labels"].str.split('|').explode()
sorted(labels_exploded.unique())

['Atelectasis',
 'Cardiomegaly',
 'Consolidation',
 'Edema',
 'Effusion',
 'Emphysema',
 'Fibrosis',
 'Hernia',
 'Infiltration',
 'Mass',
 'No Finding',
 'Nodule',
 'Pleural_Thickening',
 'Pneumonia',
 'Pneumothorax']

## Create Labels

In [121]:
# inner joined data between data_entry_df and both test and training datasets
# created a new filtered dataset where it includes data from both datasets only where 'image_index' matches both datasets

filtered_test_list = data_entry_df.merge(test_list, on='image_index', how='inner')
filtered_train_val_list = data_entry_df.merge(train_val_list, on='image_index', how='inner')

display(filtered_test_list.head())
display(filtered_train_val_list.head())

Unnamed: 0,image_index,finding_labels,follow_up_number,patient_id,patient_age,patient_gender,view_position,original_img_width,original_img_height,img_pixel_spacing_x,img_pixel_spacing_y
0,00000003_000.png,Hernia,0,3,81,F,PA,2582,2991,0.143,0.143
1,00000003_001.png,Hernia,1,3,74,F,PA,2500,2048,0.168,0.168
2,00000003_002.png,Hernia,2,3,75,F,PA,2048,2500,0.168,0.168
3,00000003_003.png,Hernia|Infiltration,3,3,76,F,PA,2698,2991,0.143,0.143
4,00000003_004.png,Hernia,4,3,77,F,PA,2500,2048,0.168,0.168


Unnamed: 0,image_index,finding_labels,follow_up_number,patient_id,patient_age,patient_gender,view_position,original_img_width,original_img_height,img_pixel_spacing_x,img_pixel_spacing_y
0,00000001_000.png,Cardiomegaly,0,1,58,M,PA,2682,2749,0.143,0.143
1,00000001_001.png,Cardiomegaly|Emphysema,1,1,58,M,PA,2894,2729,0.143,0.143
2,00000001_002.png,Cardiomegaly|Effusion,2,1,58,M,PA,2500,2048,0.168,0.168
3,00000002_000.png,No Finding,0,2,81,M,PA,2500,2048,0.171,0.171
4,00000004_000.png,Mass|Nodule,0,4,82,M,AP,2500,2048,0.168,0.168


In [94]:
# encode target variable classes (train_val_list)
train_X = filtered_train_val_list.drop("finding_labels", axis=1)
train_y = filtered_train_val_list["finding_labels"]

train_val_labels = create_labels(train_X, train_y)
train_val_labels.head()

Unnamed: 0,image_index,Atelectasis,Cardiomegaly,Consolidation,Edema,Effusion,Emphysema,Fibrosis,Hernia,Infiltration,...,No Finding,Nodule,Pleural_Thickening,Pneumonia,Pneumothorax,follow_up_number,patient_id,patient_age,patient_gender,view_position
0,00000001_000.png,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,58,M,PA
1,00000001_001.png,0,1,0,0,0,1,0,0,0,...,0,0,0,0,0,1,1,58,M,PA
2,00000001_002.png,0,1,0,0,1,0,0,0,0,...,0,0,0,0,0,2,1,58,M,PA
3,00000002_000.png,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,2,81,M,PA
4,00000004_000.png,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,4,82,M,AP


In [123]:
# encode target variable classes (test_list)
test_X = filtered_test_list.drop("finding_labels", axis=1)
test_y = filtered_test_list["finding_labels"]

test_labels = create_labels(test_X,test_y)
test_labels.head()

Unnamed: 0,image_index,Atelectasis,Cardiomegaly,Consolidation,Edema,Effusion,Emphysema,Fibrosis,Hernia,Infiltration,...,No Finding,Nodule,Pleural_Thickening,Pneumonia,Pneumothorax,follow_up_number,patient_id,patient_age,patient_gender,view_position
0,00000003_000.png,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,3,81,F,PA
1,00000003_001.png,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,1,3,74,F,PA
2,00000003_002.png,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,2,3,75,F,PA
3,00000003_003.png,0,0,0,0,0,0,0,1,1,...,0,0,0,0,0,3,3,76,F,PA
4,00000003_004.png,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,4,3,77,F,PA


In [None]:
# performed train_test_split on the train_val_list dataset to obtain 80/20 split (80% - train, 20% validation)
train_val_X = train_val_labels.drop(labels, axis=1)
train_val_y = train_val_labels[labels]

X_train, X_val, y_train, y_val = train_test_split(train_val_X, train_val_y, test_size=0.2, random_state=42)

# combined the training sets and validation sets
train_labels = pd.concat([X_train, y_train],axis=1).reindex(columns=column_order)
val_labels = pd.concat([X_val, y_val],axis=1).reindex(columns=column_order)

display(train_labels.head())
display(val_labels.head())


Unnamed: 0,image_index,Atelectasis,Cardiomegaly,Consolidation,Edema,Effusion,Emphysema,Fibrosis,Hernia,Infiltration,...,No Finding,Nodule,Pleural_Thickening,Pneumonia,Pneumothorax,follow_up_number,patient_id,patient_age,patient_gender,view_position
5248,00001739_000.png,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,1739,57,F,PA
34668,00010741_016.png,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,16,10741,81,F,PA
14188,00004552_000.png,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,4552,56,M,PA
70256,00022090_000.png,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,22090,30,F,PA
69107,00021627_005.png,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,5,21627,42,M,AP


Unnamed: 0,image_index,Atelectasis,Cardiomegaly,Consolidation,Edema,Effusion,Emphysema,Fibrosis,Hernia,Infiltration,...,No Finding,Nodule,Pleural_Thickening,Pneumonia,Pneumothorax,follow_up_number,patient_id,patient_age,patient_gender,view_position
49657,00015355_007.png,0,0,0,1,0,0,0,0,1,...,0,0,0,0,0,7,15355,69,M,AP
43843,00013572_004.png,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,4,13572,61,F,AP
84853,00029465_002.png,1,0,0,0,1,0,0,0,0,...,0,0,0,0,0,2,29465,55,F,PA
28824,00009001_004.png,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,4,9001,48,F,PA
11597,00003616_000.png,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,3616,52,M,PA


In [126]:
# export files to the labels folder

train_labels.to_csv('../data/labels/train_labels.csv')
val_labels.to_csv('../data/labels/val_labels.csv')
test_labels.to_csv('../data/labels/test_labels.csv')

## Extract Raw Data

In [68]:
# [add folder numbers to the dataset]

"""

# extracted the targeted images
targeted_imgs = list(data_entry_df["image_index"])

# extract only the images folders paths from kaggle and store it in a list
folder_paths = []

for folder in os.listdir(path):

    if folder.startswith("images") == True:
        f_path = os.path.join(path, folder, "images")
        folder_paths.append(f_path)


# search folders based on targeted images
parent_folders = []

for img in targeted_imgs:
    found = False
    for folder in folder_paths:
        if img in os.listdir(folder):
            parent_folders.append(os.path.basename(os.path.dirname(folder)))
            found = True
            break  
    if not found:
        print(f"Not found: {img}")

# create a dataframe 
parent_folders_df = pd.DataFrame({
    "image_index": targeted_imgs,
    "folders": parent_folders
})



"""

'\n\n# extracted the targeted images\ntargeted_imgs = list(data_entry_df["image_index"])\n\n# extract only the images folders paths from kaggle and store it in a list\nfolder_paths = []\n\nfor folder in os.listdir(path):\n\n    if folder.startswith("images") == True:\n        f_path = os.path.join(path, folder, "images")\n        folder_paths.append(f_path)\n\n\n# search folders based on targeted images\nparent_folders = []\n\nfor img in targeted_imgs:\n    found = False\n    for folder in folder_paths:\n        if img in os.listdir(folder):\n            parent_folders.append(os.path.basename(os.path.dirname(folder)))\n            found = True\n            break  \n    if not found:\n        print(f"Not found: {img}")\n\n# create a dataframe \nparent_folders_df = pd.DataFrame({\n    "image_index": targeted_imgs,\n    "folders": parent_folders\n})\n\n\n\n'