# Data Preprocessing and Splitting
---

In [2]:
# import libraries
import os
from PIL import Image
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings("ignore")

from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()

import sys
sys.path.append('../scripts')
from data_utils import path, column_order, labels
from extract_imgs import find_img_folder, locate_imgs



In [11]:
# load data
# All paths are relative to the notebook's working directory.
# The two CSV files are read with their first column as the index (index_col=0).
# NOTE(review): data_entry_df and patient_data are not referenced again in the cells
# shown here. pd.read_pickle can execute arbitrary code while unpickling, so these
# .pkl files must come from a trusted source.

data_entry_df = pd.read_pickle('../data/interim/data_entry_df.pkl')
patient_data = pd.read_pickle('../data/interim/patient_data.pkl')
train_val_labels = pd.read_csv('../data/labels/train_val_labels.csv', index_col=0)
test_labels = pd.read_csv('../data/labels/test_labels.csv', index_col=0)

In [12]:
# encode patient_gender and view_position columns (test_label)
#
# The encoder is fitted on the train/validation labels and only *applied* to the
# test labels. This guarantees that both splits share a single category -> code
# mapping instead of relying on each split happening to contain the same set of
# categories. `transform` raises ValueError if the test split contains a category
# that never occurs in train/val, rather than silently giving it a code that means
# something else in the other split.
# This cell must run before train_val_labels is encoded (i.e. in notebook order).

for column in ("patient_gender", "view_position"):
    label_encoder.fit(train_val_labels[column])
    test_labels[column] = label_encoder.transform(test_labels[column])
    print("Category mapping:")
    for i, category in enumerate(label_encoder.classes_):
        print(f"{category}: {i}")

test_labels[["patient_gender", "view_position"]].head()

Category mapping:
F: 0
M: 1
Category mapping:
AP: 0
PA: 1


Unnamed: 0,patient_gender,view_position
0,0,1
1,0,1
2,0,1
3,0,1
4,0,1


In [None]:
# encode patient_gender and view_position columns (train_val_label)
# The shared encoder is re-fitted for every column, so the mapping printed after
# each fit describes the column that was just encoded.

for column in ("patient_gender", "view_position"):
    train_val_labels[column] = label_encoder.fit_transform(train_val_labels[column])
    print("Category mapping:")
    for i, category in enumerate(label_encoder.classes_):
        print(f"{category}: {i}")

train_val_labels[["patient_gender", "view_position"]].head()

Category mapping:
F: 0
M: 1
Category mapping:
AP: 0
PA: 1


Unnamed: 0,patient_gender,view_position
0,1,1
1,1,1
2,1,1
3,1,1
4,1,0


### Data Splitting
- Note: Data splits have been provided by the NIH (train_val_list.txt and test_list.txt); splitting the train_val set into training and validation sets is still necessary


- Split "test_labels" and "train_val_labels" into two sets: image data and tabular data
- The two datasets will be used to train two different models, which will eventually be combined (multimodal learning)

In [14]:
# replace "image_index" column with file path to each image
# find_img_folder is imported from extract_imgs (../scripts). Later cells drop a
# "src_folder" column, so it presumably adds that column here — TODO confirm
# against the script.
test_labels = find_img_folder(test_labels)
train_val_labels = find_img_folder(train_val_labels)

def add_image_path(dataset, base_dir=None):
    """Replace the "image_index" column with a leading "image_path" column, in place.

    Each path is built as ``<base_dir>/<src_folder>/images/<image_index>``.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must contain an "image_index" column (image file names) and a
        "src_folder" column (the folder each file lives in). Modified in place.
    base_dir : str, optional
        Root directory that contains the image folders. Defaults to the
        module-level ``path`` imported from ``data_utils``.

    Returns
    -------
    None

    Raises
    ------
    KeyError
        If "image_index" or "src_folder" is missing, e.g. when the function is
        called a second time on the same frame.
    """
    if base_dir is None:
        base_dir = path

    # Look the folder up by column name rather than by position
    # (`iloc[index, -1]`), so the result no longer depends on "src_folder"
    # happening to be the last column of the frame.
    paths = [
        os.path.join(base_dir, folder, "images", img)
        for folder, img in zip(dataset["src_folder"], dataset["image_index"])
    ]

    dataset.drop(columns="image_index", inplace=True)
    dataset.insert(0, "image_path", paths)
        

# Both frames are modified in place. Re-running this cell without reloading the
# data raises KeyError, because "image_index" has already been dropped.
add_image_path(test_labels)
add_image_path(train_val_labels)

In [15]:
# 80-20 split of the train_val set, drawn over unique patient ids rather than rows.
# This prevents data leakage: all images from one patient end up either in the
# training set or in the validation set, never in both.

patient_ids = train_val_labels["patient_id"].unique()
train_ids, val_ids = train_test_split(patient_ids, test_size=0.2, random_state=42)

in_train = train_val_labels["patient_id"].isin(train_ids)
in_val = train_val_labels["patient_id"].isin(val_ids)

train_labels = train_val_labels.loc[in_train].reset_index(drop=True)
val_labels = train_val_labels.loc[in_val].reset_index(drop=True)

## Testing (sampling only ~5k-10k images)

In [16]:
# (rows, columns) of the training split produced by the patient-level split
train_labels.shape

(69354, 22)

In [17]:
# (rows, columns) of the validation split produced by the patient-level split
val_labels.shape

(17157, 22)

In [18]:
# (rows, columns) of the NIH-provided test split
test_labels.shape

(25591, 22)

In [19]:
# Per-patient GroupBy objects for the train and validation splits.
# NOTE(review): neither object is used again in the cells shown here.
train_groups, val_groups = (
    train_labels.groupby(by='patient_id'),
    val_labels.groupby(by='patient_id'),
)


In [20]:
def patient_label_agg(df):
    """Collapse image-level rows to one row per patient.

    For every column named in the module-level ``labels``, the per-patient
    value is the maximum over that patient's rows. ``patient_id`` is returned
    as a regular column rather than as the index.
    """
    aggregations = dict.fromkeys(labels, 'max')
    per_patient = df.groupby('patient_id').agg(aggregations)
    return per_patient.reset_index()

# One row per patient for each split; used below as input to the stratified sampling.
train_patient_labels = patient_label_agg(train_labels)
val_patient_labels = patient_label_agg(val_labels)

In [21]:
from skmultilearn.model_selection import iterative_train_test_split
import numpy as np

# Define how many patients you want in your sampled subset
num_train_patients = 2000  # Adjust based on average images per patient to get ~5k-10k images
num_val_patients = 500

# iterative_train_test_split(X, y, test_size) accepts no seed argument, so NumPy's
# global RNG is seeded right before each call to make the sampled subset repeatable
# across kernel restarts.
# NOTE(review): this assumes the stratifier draws from NumPy's global RNG — confirm
# for the installed scikit-multilearn version.
sampling_seed = 42

# Prepare X and y for train
# X is the patient id as an (n_patients, 1) column; y is the per-patient label matrix.
X_train_patients = train_patient_labels['patient_id'].values.reshape(-1, 1)
y_train_labels = train_patient_labels[labels].values

# A sample size outside (0, n_patients) would produce a test_size outside (0, 1).
assert 0 < num_train_patients < len(X_train_patients), \
    "num_train_patients must be positive and smaller than the number of train patients"

# Sample subset of train patients with balanced labels using iterative stratification.
# The first two return values are the "train" side of the split, whose share is
# 1 - test_size = num_train_patients / n_patients; the remainder is discarded.
np.random.seed(sampling_seed)
X_train_sampled, y_train_sampled, _, _ = iterative_train_test_split(
    X_train_patients, y_train_labels, test_size = 1 - (num_train_patients / len(X_train_patients))
)

train_sampled_patients = X_train_sampled.flatten()

# Same for validation
X_val_patients = val_patient_labels['patient_id'].values.reshape(-1, 1)
y_val_labels = val_patient_labels[labels].values

assert 0 < num_val_patients < len(X_val_patients), \
    "num_val_patients must be positive and smaller than the number of validation patients"

np.random.seed(sampling_seed)
X_val_sampled, y_val_sampled, _, _ = iterative_train_test_split(
    X_val_patients, y_val_labels, test_size = 1 - (num_val_patients / len(X_val_patients))
)

val_sampled_patients = X_val_sampled.flatten()


In [22]:
# Keep every image row that belongs to one of the sampled patients.
train_keep = train_labels['patient_id'].isin(train_sampled_patients)
val_keep = val_labels['patient_id'].isin(val_sampled_patients)

train_sampled_df = train_labels.loc[train_keep].reset_index(drop=True)
val_sampled_df = val_labels.loc[val_keep].reset_index(drop=True)

In [23]:
# (rows, columns) of the sampled training subset
train_sampled_df.shape

(6678, 22)

In [24]:
# (rows, columns) of the sampled validation subset
val_sampled_df.shape

(1540, 22)

In [25]:
# (rows, columns) of the test split; no sampling is applied to it in the cells above
test_labels.shape

(25591, 22)

In [26]:
# Split each frame into an image view (*_cnn) and a tabular view (*_tab).
# Train and validation use the sampled subsets; test uses the full test split.
cnn_excluded_cols = ["follow_up_number", "patient_id", "patient_age", "patient_gender", "view_position", "src_folder"]
tab_excluded_cols = ["image_path", "patient_id", "src_folder"]

test_cnn = test_labels.drop(columns=cnn_excluded_cols)
test_tab = test_labels.drop(columns=tab_excluded_cols)

train_cnn = train_sampled_df.drop(columns=cnn_excluded_cols)
train_tab = train_sampled_df.drop(columns=tab_excluded_cols)

val_cnn = val_sampled_df.drop(columns=cnn_excluded_cols)
val_tab = val_sampled_df.drop(columns=tab_excluded_cols)

In [27]:
# Standardize the two numeric tabular features. The scaler is fitted on the
# training frame only and then applied unchanged to train, validation and test.
# Re-running this cell without rebuilding the *_tab frames scales them a second time.
target_cols = ["follow_up_number", "patient_age"]

scaler = StandardScaler().fit(train_tab[target_cols])

for frame in (train_tab, val_tab, test_tab):
    frame[target_cols] = scaler.transform(frame[target_cols])

In [46]:
# Split tabular datasets into X (everything except the label columns) and y (the label columns)
label_cols = list(labels)

test_tab_X, test_tab_y = test_tab.drop(columns=label_cols), test_tab[labels]

train_tab_X, train_tab_y = train_tab.drop(columns=label_cols), train_tab[labels]

val_tab_X, val_tab_y = val_tab.drop(columns=label_cols), val_tab[labels]

In [47]:
# Split CNN dataset into X (everything except the label columns) and y (the label columns)

label_cols = list(labels)

test_cnn_X, test_cnn_y = test_cnn.drop(columns=label_cols), test_cnn[labels]

train_cnn_X, train_cnn_y = train_cnn.drop(columns=label_cols), train_cnn[labels]

val_cnn_X, val_cnn_y = val_cnn.drop(columns=label_cols), val_cnn[labels]

In [None]:
# Write every X / y frame to the labels folder (destination path -> frame).
# NOTE(review): the export cell in the "checkpoint" section writes to these same
# paths, so the files hold whichever of the two cells ran last.
exports = {
    '../data/labels/test_cnn_X.csv': test_cnn_X,
    '../data/labels/test_cnn_y.csv': test_cnn_y,
    '../data/labels/test_tab_X.csv': test_tab_X,
    '../data/labels/test_tab_y.csv': test_tab_y,
    '../data/labels/train_cnn_X.csv': train_cnn_X,
    '../data/labels/train_cnn_y.csv': train_cnn_y,
    '../data/labels/train_tab_X.csv': train_tab_X,
    '../data/labels/train_tab_y.csv': train_tab_y,
    '../data/labels/val_cnn_X.csv': val_cnn_X,
    '../data/labels/val_cnn_y.csv': val_cnn_y,
    '../data/labels/val_tab_X.csv': val_tab_X,
    '../data/labels/val_tab_y.csv': val_tab_y,
}

for csv_path, frame in exports.items():
    frame.to_csv(csv_path)

## checkpoint

In [11]:
# split data into image and tabular sets
# NOTE(review): this uses the full train_labels / val_labels and rebinds the same
# *_cnn / *_tab names that the sampled-subset section assigns earlier in the
# notebook, so whichever cell ran last determines their contents.

cnn_excluded_cols = ["follow_up_number", "patient_id", "patient_age", "patient_gender", "view_position", "src_folder"]
tab_excluded_cols = ["image_path", "patient_id", "src_folder"]

test_cnn = test_labels.drop(columns=cnn_excluded_cols)
test_tab = test_labels.drop(columns=tab_excluded_cols)

train_cnn = train_labels.drop(columns=cnn_excluded_cols)
train_tab = train_labels.drop(columns=tab_excluded_cols)

val_cnn = val_labels.drop(columns=cnn_excluded_cols)
val_tab = val_labels.drop(columns=tab_excluded_cols)

In [12]:
# standardize patient_age and follow_up_number features
# The scaler is fitted on the training frame only and then applied unchanged to
# train, validation and test.

target_cols = ["follow_up_number", "patient_age"]

scaler = StandardScaler().fit(train_tab[target_cols])

for frame in (train_tab, val_tab, test_tab):
    frame[target_cols] = scaler.transform(frame[target_cols])

In [13]:
# Split tabular datasets into X (everything except the label columns) and y (the label columns)
label_cols = list(labels)

test_tab_X, test_tab_y = test_tab.drop(columns=label_cols), test_tab[labels]

train_tab_X, train_tab_y = train_tab.drop(columns=label_cols), train_tab[labels]

val_tab_X, val_tab_y = val_tab.drop(columns=label_cols), val_tab[labels]

In [14]:
# Split CNN dataset into X (everything except the label columns) and y (the label columns)

label_cols = list(labels)

test_cnn_X, test_cnn_y = test_cnn.drop(columns=label_cols), test_cnn[labels]

train_cnn_X, train_cnn_y = train_cnn.drop(columns=label_cols), train_cnn[labels]

val_cnn_X, val_cnn_y = val_cnn.drop(columns=label_cols), val_cnn[labels]

In [48]:
# export files to the labels folder (destination path -> frame)
# NOTE(review): the export cell in the sampled-subset section writes to these same
# paths, so the files hold whichever of the two cells ran last.

exports = {
    '../data/labels/test_cnn_X.csv': test_cnn_X,
    '../data/labels/test_cnn_y.csv': test_cnn_y,
    '../data/labels/test_tab_X.csv': test_tab_X,
    '../data/labels/test_tab_y.csv': test_tab_y,
    '../data/labels/train_cnn_X.csv': train_cnn_X,
    '../data/labels/train_cnn_y.csv': train_cnn_y,
    '../data/labels/train_tab_X.csv': train_tab_X,
    '../data/labels/train_tab_y.csv': train_tab_y,
    '../data/labels/val_cnn_X.csv': val_cnn_X,
    '../data/labels/val_cnn_y.csv': val_cnn_y,
    '../data/labels/val_tab_X.csv': val_tab_X,
    '../data/labels/val_tab_y.csv': val_tab_y,
}

for csv_path, frame in exports.items():
    frame.to_csv(csv_path)