# Data Preprocessing and Splitting
---

In [64]:
# import libraries
import os
from PIL import Image
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings("ignore")

from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()

import sys
sys.path.append('../scripts')
from data_utils import path, column_order, labels
from extract_imgs import find_img_folder, locate_imgs

In [65]:
# load the interim tables written to ../data/interim
# NOTE(review): read_pickle can execute code embedded in a file; these are
# assumed to be this project's own interim artifacts - only load trusted pickles.
data_entry_df, train_val_list, test_list, patient_data = (
    pd.read_pickle(pkl_file)
    for pkl_file in (
        '../data/interim/data_entry_df.pkl',
        '../data/interim/train_val_list.pkl',
        '../data/interim/test_list.pkl',
        '../data/interim/patient_data.pkl',
    )
)

In [66]:
# show unique classes, alphabetically
# (one row may hold several findings joined by '|', so split and flatten first)
labels_exploded = (
    data_entry_df["finding_labels"]
    .str.split('|')
    .explode()
)
sorted(labels_exploded.unique())

['Atelectasis',
 'Cardiomegaly',
 'Consolidation',
 'Edema',
 'Effusion',
 'Emphysema',
 'Fibrosis',
 'Hernia',
 'Infiltration',
 'Mass',
 'No Finding',
 'Nodule',
 'Pleural_Thickening',
 'Pneumonia',
 'Pneumothorax']

### Encode Features

In [67]:
# Inner-join the metadata table (data_entry_df) with each split list on 'image_index'.
# Each filtered table keeps only the rows whose 'image_index' appears in both inputs.

filtered_test_list = pd.merge(data_entry_df, test_list, on='image_index', how='inner')
filtered_train_val_list = pd.merge(data_entry_df, train_val_list, on='image_index', how='inner')

display(filtered_test_list.head())
display(filtered_train_val_list.head())

Unnamed: 0,image_index,finding_labels,follow_up_number,patient_id,patient_age,patient_gender,view_position,original_img_width,original_img_height,img_pixel_spacing_x,img_pixel_spacing_y
0,00000003_000.png,Hernia,0,3,81,F,PA,2582,2991,0.143,0.143
1,00000003_001.png,Hernia,1,3,74,F,PA,2500,2048,0.168,0.168
2,00000003_002.png,Hernia,2,3,75,F,PA,2048,2500,0.168,0.168
3,00000003_003.png,Hernia|Infiltration,3,3,76,F,PA,2698,2991,0.143,0.143
4,00000003_004.png,Hernia,4,3,77,F,PA,2500,2048,0.168,0.168


Unnamed: 0,image_index,finding_labels,follow_up_number,patient_id,patient_age,patient_gender,view_position,original_img_width,original_img_height,img_pixel_spacing_x,img_pixel_spacing_y
0,00000001_000.png,Cardiomegaly,0,1,58,M,PA,2682,2749,0.143,0.143
1,00000001_001.png,Cardiomegaly|Emphysema,1,1,58,M,PA,2894,2729,0.143,0.143
2,00000001_002.png,Cardiomegaly|Effusion,2,1,58,M,PA,2500,2048,0.168,0.168
3,00000002_000.png,No Finding,0,2,81,M,PA,2500,2048,0.171,0.171
4,00000004_000.png,Mass|Nodule,0,4,82,M,AP,2500,2048,0.168,0.168


In [68]:
# encode target variable classes (train_val_list)
# y = target variable ("finding_labels")
# X = features (every other column)
train_y = filtered_train_val_list["finding_labels"]
train_X = filtered_train_val_list.drop(columns=["finding_labels"])

def create_labels(X, y):
    """Attach multi-hot encoded finding labels to the feature table.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature rows. Must contain 'original_img_height',
        'img_pixel_spacing_x' and 'img_pixel_spacing_y', which are dropped.
    y : pandas.Series
        Finding labels, one string per row, with multiple findings
        separated by '|'. Must share its index with ``X``.

    Returns
    -------
    pandas.DataFrame
        ``X`` joined with one 0/1 column per finding, re-ordered to
        ``column_order`` (columns not listed there are discarded). Label
        columns from ``labels`` that never occur in ``y`` are filled with 0.
    """
    mlb = MultiLabelBinarizer()

    y_split = y.str.split('|')
    y_encoded = mlb.fit_transform(y_split)
    classes = mlb.classes_

    # Build the encoded frame on y's own index. With a fresh RangeIndex the
    # index-based join below would misalign or drop rows whenever X/y do not
    # carry a default 0..n-1 index (e.g. after filtering without a reset).
    encoded_df = pd.DataFrame(data=y_encoded, columns=classes, index=y.index)

    labeled_df = X.join(encoded_df, how="inner").drop(
        columns=['original_img_height', 'img_pixel_spacing_x', 'img_pixel_spacing_y']
    )
    labeled_df = labeled_df.reindex(columns=column_order)

    # reindex creates NaN columns for classes absent from this split; treat them as "not present"
    labeled_df[labels] = labeled_df[labels].fillna(0).astype(int)

    return labeled_df


# encode the train/val findings and preview the first rows
train_val_labels = create_labels(X=train_X, y=train_y)
train_val_labels.head(n=5)

Unnamed: 0,image_index,Atelectasis,Cardiomegaly,Consolidation,Edema,Effusion,Emphysema,Fibrosis,Hernia,Infiltration,...,No Finding,Nodule,Pleural_Thickening,Pneumonia,Pneumothorax,follow_up_number,patient_id,patient_age,patient_gender,view_position
0,00000001_000.png,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,58,M,PA
1,00000001_001.png,0,1,0,0,0,1,0,0,0,...,0,0,0,0,0,1,1,58,M,PA
2,00000001_002.png,0,1,0,0,1,0,0,0,0,...,0,0,0,0,0,2,1,58,M,PA
3,00000002_000.png,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,2,81,M,PA
4,00000004_000.png,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,4,82,M,AP


In [69]:
# encode target variable classes (test_list)
test_y = filtered_test_list["finding_labels"]
test_X = filtered_test_list.drop(columns=["finding_labels"])

test_labels = create_labels(X=test_X, y=test_y)
test_labels.head(n=5)

Unnamed: 0,image_index,Atelectasis,Cardiomegaly,Consolidation,Edema,Effusion,Emphysema,Fibrosis,Hernia,Infiltration,...,No Finding,Nodule,Pleural_Thickening,Pneumonia,Pneumothorax,follow_up_number,patient_id,patient_age,patient_gender,view_position
0,00000003_000.png,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,3,81,F,PA
1,00000003_001.png,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,1,3,74,F,PA
2,00000003_002.png,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,2,3,75,F,PA
3,00000003_003.png,0,0,0,0,0,0,0,1,1,...,0,0,0,0,0,3,3,76,F,PA
4,00000003_004.png,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,4,3,77,F,PA


In [70]:
# encode patient_gender and view_position columns (test_labels)
#
# The encoder is fitted on the raw train/val categories instead of on the test
# split itself, so a category always maps to the same integer in both splits.
# A category that occurs only in the test split raises a ValueError in
# transform() rather than silently shifting the codes.

for column in ("patient_gender", "view_position"):
    label_encoder.fit(filtered_train_val_list[column])
    test_labels[column] = label_encoder.transform(test_labels[column])

    print("Category mapping:")
    for i, category in enumerate(label_encoder.classes_):
        print(f"{category}: {i}")

test_labels[["patient_gender", "view_position"]].head()

Category mapping:
F: 0
M: 1
Category mapping:
AP: 0
PA: 1


Unnamed: 0,patient_gender,view_position
0,0,1
1,0,1
2,0,1
3,0,1
4,0,1


In [71]:
# encode patient_gender and view_position columns (train_val_labels)
# each column is fitted and transformed in turn, and its category -> code mapping is printed

for column in ("patient_gender", "view_position"):
    train_val_labels[column] = label_encoder.fit_transform(train_val_labels[column])
    print("Category mapping:")
    for i, category in enumerate(label_encoder.classes_):
        print(f"{category}: {i}")

train_val_labels[["patient_gender", "view_position"]].head()

Category mapping:
F: 0
M: 1
Category mapping:
AP: 0
PA: 1


Unnamed: 0,patient_gender,view_position
0,1,1
1,1,1
2,1,1
3,1,1
4,1,0


### Data Splitting
- Note: Data splits have been provided by the NIH (train_val_list.txt and test_list.txt); splitting the train_val set into training and validation sets is still necessary.


- Split "test_labels" and "train_val_labels" into two sets: image data and tabular data
- The two datasets will be used to train two different models, which will eventually be combined (multimodal learning)

In [72]:
# replace "image_index" column with file path to each image
# step 1: run find_img_folder (scripts/extract_imgs) on both label tables
# NOTE(review): presumably this adds the "src_folder" column used further down - confirm in extract_imgs
test_labels, train_val_labels = (
    find_img_folder(label_table) for label_table in (test_labels, train_val_labels)
)

def add_image_path(dataset):
    """Swap the "image_index" column of ``dataset`` for a leading "image_path" column.

    Each path is ``os.path.join(path, <folder>, "images", <image_index>)``,
    where <folder> is the row's value in the LAST column of ``dataset``.
    The frame is modified in place and nothing is returned.
    """
    # the folder name is read positionally from the last column
    folders = dataset.iloc[:, -1]

    paths = [
        os.path.join(path, folder, "images", img)
        for folder, img in zip(folders, dataset["image_index"])
    ]

    dataset.drop(columns="image_index", inplace=True)
    dataset.insert(0, "image_path", paths)
        

# step 2: build the "image_path" column on both label tables (modified in place)
for label_table in (test_labels, train_val_labels):
    add_image_path(label_table)

In [73]:
# 80-20 train/validation split of the train_val set, drawn over unique patient ids
# prevents data leakage - all images of one patient land in either the training set or the validation set, never both

patient_ids = train_val_labels["patient_id"]
train_ids, val_ids = train_test_split(patient_ids.unique(), test_size=0.2, random_state=42)

train_labels = train_val_labels.loc[patient_ids.isin(train_ids)].reset_index(drop=True)
val_labels = train_val_labels.loc[patient_ids.isin(val_ids)].reset_index(drop=True)

In [74]:
# split data into image and tabular sets

# columns removed to obtain the image (CNN) tables
non_image_cols = ["follow_up_number", "patient_id", "patient_age", "patient_gender", "view_position", "src_folder"]
# columns removed to obtain the tabular tables
non_tabular_cols = ["image_path", "patient_id", "src_folder"]

test_cnn, train_cnn, val_cnn = (
    split.drop(columns=non_image_cols) for split in (test_labels, train_labels, val_labels)
)
test_tab, train_tab, val_tab = (
    split.drop(columns=non_tabular_cols) for split in (test_labels, train_labels, val_labels)
)

In [75]:
# standardize patient_age and follow_up_number features
# the scaler is fitted on the training rows only, then applied unchanged to all three splits

target_cols = ["follow_up_number", "patient_age"]

scaler = StandardScaler().fit(train_tab[target_cols])

for tab in (train_tab, val_tab, test_tab):
    tab[target_cols] = scaler.transform(tab[target_cols])

In [76]:
# Split tabular datasets into X (everything but the label columns) and y (the label columns)
label_cols = list(labels)

test_tab_X, test_tab_y = test_tab.drop(columns=label_cols), test_tab[labels]

train_tab_X, train_tab_y = train_tab.drop(columns=label_cols), train_tab[labels]

val_tab_X, val_tab_y = val_tab.drop(columns=label_cols), val_tab[labels]

In [77]:
# Split CNN dataset into X (everything but the label columns) and y (the label columns)

test_cnn_y = test_cnn[labels]
test_cnn_X = test_cnn.drop(columns=list(labels))

train_cnn_y = train_cnn[labels]
train_cnn_X = train_cnn.drop(columns=list(labels))

val_cnn_y = val_cnn[labels]
val_cnn_X = val_cnn.drop(columns=list(labels))

In [78]:
# export files to the labels folder

# DataFrame.to_csv does not create missing parent directories, so make sure the
# target folder exists (otherwise a fresh checkout fails on the first write).
os.makedirs('../data/labels', exist_ok=True)

# file stem -> frame; each frame is written to ../data/labels/<stem>.csv
# (index is still written, exactly as before, so existing readers are unaffected)
exports = {
    'test_cnn_X': test_cnn_X,
    'test_cnn_y': test_cnn_y,
    'test_tab_X': test_tab_X,
    'test_tab_y': test_tab_y,
    'train_cnn_X': train_cnn_X,
    'train_cnn_y': train_cnn_y,
    'train_tab_X': train_tab_X,
    'train_tab_y': train_tab_y,
    'val_cnn_X': val_cnn_X,
    'val_cnn_y': val_cnn_y,
    'val_tab_X': val_tab_X,
    'val_tab_y': val_tab_y,
}

for file_stem, frame in exports.items():
    frame.to_csv(f'../data/labels/{file_stem}.csv')