# Create train/validation split CSVs from train.csv (test set: see TODO at the end)

In [19]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import cv2
import os
from pathlib import Path
from sklearn.model_selection import train_test_split


#### Data Preprocessing

In [20]:
# Locate and load the CheXpert training metadata; the path is resolved
# relative to the directory the notebook kernel is running in.
cwd_path = Path.cwd()
metadata_path = cwd_path.parent.joinpath("../image_data/CheXpert-v1.0-small/train.csv")
metadata = pd.read_csv(metadata_path)

# Drop rows whose 'Sex' is 'Unknown' (a single row, per the original author's note).
# .copy() makes the filtered frame independent of the frame returned by read_csv,
# so the column assignments in the following cells do not raise
# SettingWithCopyWarning.
metadata = metadata[metadata['Sex'] != 'Unknown'].copy()

In [21]:
# Label columns whose missing entries are replaced by -1.0.
# NOTE(review): if the raw labels already use -1.0 with its own meaning
# (e.g. "uncertain"), this fill makes the two cases indistinguishable --
# confirm against the dataset documentation.
columns_to_map = [
    "No Finding",
    "Enlarged Cardiomediastinum",
    "Cardiomegaly",
    "Lung Opacity",
    "Lung Lesion",
    "Edema",
    "Consolidation",
    "Pneumonia",
    "Atelectasis",
    "Pneumothorax",
    "Pleural Effusion",
    "Pleural Other",
    "Fracture",
    "Support Devices",
]

# Compute the filled label columns first, then write them back through .loc
# so the assignment targets `metadata` itself.
filled_labels = metadata[columns_to_map].fillna(-1.0)
metadata.loc[:, columns_to_map] = filled_labels

In [22]:
# Integer-encode the categorical columns into new '<column>_mapped' columns.
# pd.factorize numbers the distinct values in order of first appearance and
# encodes missing values as -1.
for source_col in ('Sex', 'Frontal/Lateral', 'AP/PA'):
    codes, uniques = pd.factorize(metadata[source_col])
    metadata[f'{source_col}_mapped'] = codes

In [23]:
# Per-column count of missing values in the preprocessed metadata.
# AP/PA is NULL when Frontal/Lateral is Lateral
metadata.isnull().sum()

Path                              0
Sex                               0
Age                               0
Frontal/Lateral                   0
AP/PA                         32387
No Finding                        0
Enlarged Cardiomediastinum        0
Cardiomegaly                      0
Lung Opacity                      0
Lung Lesion                       0
Edema                             0
Consolidation                     0
Pneumonia                         0
Atelectasis                       0
Pneumothorax                      0
Pleural Effusion                  0
Pleural Other                     0
Fracture                          0
Support Devices                   0
Sex_mapped                        0
Frontal/Lateral_mapped            0
AP/PA_mapped                      0
dtype: int64

In [27]:
# Step 1: 95/5 train-validation split over the rows of `metadata`.
# Rows are shuffled before splitting; the fixed random_state keeps the split
# reproducible across runs.
# NOTE(review): the split is row-wise. If several rows can belong to the same
# patient (TODO confirm from the 'Path' column), the same patient may end up in
# both train and validation; a group-aware splitter would avoid that.
split_options = dict(test_size=0.05, random_state=42, shuffle=True)
train_df, valid_df = train_test_split(metadata, **split_options)


In [28]:
# Keep only rows whose 'AP/PA' value is 'AP' or 'PA'; every other row
# (including rows where 'AP/PA' is missing) is dropped, and the index is
# renumbered from 0.
ap_pa_views = ['AP', 'PA']
train_df_ap_pa = train_df.loc[train_df['AP/PA'].isin(ap_pa_views)].reset_index(drop=True)
valid_df_ap_pa = valid_df.loc[valid_df['AP/PA'].isin(ap_pa_views)].reset_index(drop=True)

In [29]:
# Per-column count of missing values in the AP/PA-only training subset.
train_df_ap_pa.isnull().sum()

Path                          0
Sex                           0
Age                           0
Frontal/Lateral               0
AP/PA                         0
No Finding                    0
Enlarged Cardiomediastinum    0
Cardiomegaly                  0
Lung Opacity                  0
Lung Lesion                   0
Edema                         0
Consolidation                 0
Pneumonia                     0
Atelectasis                   0
Pneumothorax                  0
Pleural Effusion              0
Pleural Other                 0
Fracture                      0
Support Devices               0
Sex_mapped                    0
Frontal/Lateral_mapped        0
AP/PA_mapped                  0
dtype: int64

In [30]:
# Per-column count of missing values in the AP/PA-only validation subset.
valid_df_ap_pa.isnull().sum()

Path                          0
Sex                           0
Age                           0
Frontal/Lateral               0
AP/PA                         0
No Finding                    0
Enlarged Cardiomediastinum    0
Cardiomegaly                  0
Lung Opacity                  0
Lung Lesion                   0
Edema                         0
Consolidation                 0
Pneumonia                     0
Atelectasis                   0
Pneumothorax                  0
Pleural Effusion              0
Pleural Other                 0
Fracture                      0
Support Devices               0
Sex_mapped                    0
Frontal/Lateral_mapped        0
AP/PA_mapped                  0
dtype: int64

In [31]:
## Split and create .csv for the training of the segments

# Projection subsets (AP / PA) are taken from the re-indexed *_ap_pa frames.
train_df_ap = train_df_ap_pa.loc[train_df_ap_pa['AP/PA'].eq('AP')]
train_df_pa = train_df_ap_pa.loc[train_df_ap_pa['AP/PA'].eq('PA')]
valid_df_ap = valid_df_ap_pa.loc[valid_df_ap_pa['AP/PA'].eq('AP')]
valid_df_pa = valid_df_ap_pa.loc[valid_df_ap_pa['AP/PA'].eq('PA')]

# Frontal / lateral subsets are taken from the full train and validation
# frames, so they keep the original row index.
train_df_fr = train_df.loc[train_df['Frontal/Lateral'].eq('Frontal')]
train_df_lat = train_df.loc[train_df['Frontal/Lateral'].eq('Lateral')]
valid_df_fr = valid_df.loc[valid_df['Frontal/Lateral'].eq('Frontal')]
valid_df_lat = valid_df.loc[valid_df['Frontal/Lateral'].eq('Lateral')]

In [35]:
## Let's see if valid_df_lat and valid_df_pa are representative enough for our 5 labels of interest

# Distribution of the 'Cardiomegaly' label in the lateral validation subset.
valid_df_lat.loc[:, 'Cardiomegaly'].value_counts()

Cardiomegaly
-1.0    1272
 1.0     174
 0.0     138
Name: count, dtype: int64

In [36]:
# Distribution of the 'Consolidation' label in the lateral validation subset.
valid_df_lat.loc[:, 'Consolidation'].value_counts()

Consolidation
-1.0    1069
 0.0     418
 1.0      97
Name: count, dtype: int64

In [37]:
# Distribution of the 'Pleural Effusion' label in the lateral validation subset.
valid_df_lat.loc[:, 'Pleural Effusion'].value_counts()

Pleural Effusion
-1.0    629
 0.0    505
 1.0    450
Name: count, dtype: int64

In [38]:
# Distribution of the 'Edema' label in the lateral validation subset.
valid_df_lat.loc[:, 'Edema'].value_counts()

Edema
-1.0    1219
 0.0     248
 1.0     117
Name: count, dtype: int64

In [39]:
# Distribution of the 'Atelectasis' label in the lateral validation subset.
valid_df_lat.loc[:, 'Atelectasis'].value_counts()

Atelectasis
-1.0    1404
 1.0     166
 0.0      14
Name: count, dtype: int64

In [34]:
# Distribution of the 'Cardiomegaly' label in the PA validation subset.
valid_df_pa.loc[:, 'Cardiomegaly'].value_counts()

Cardiomegaly
-1.0    1142
 0.0     142
 1.0     140
Name: count, dtype: int64

In [40]:
# Distribution of the 'Consolidation' label in the PA validation subset.
valid_df_pa.loc[:, 'Consolidation'].value_counts()

Consolidation
-1.0    989
 0.0    355
 1.0     80
Name: count, dtype: int64

In [41]:
# Distribution of the 'Pleural Effusion' label in the PA validation subset.
valid_df_pa.loc[:, 'Pleural Effusion'].value_counts()

Pleural Effusion
-1.0    546
 0.0    452
 1.0    426
Name: count, dtype: int64

In [42]:
# Distribution of the 'Edema' label in the PA validation subset.
valid_df_pa.loc[:, 'Edema'].value_counts()

Edema
-1.0    1156
 0.0     195
 1.0      73
Name: count, dtype: int64

In [43]:
# Distribution of the 'Atelectasis' label in the PA validation subset.
valid_df_pa.loc[:, 'Atelectasis'].value_counts()

Atelectasis
-1.0    1263
 1.0     144
 0.0      17
Name: count, dtype: int64

In [44]:
# Destination folder for the split CSVs: <parent of the working dir>/data/splitted.
output_dir = os.path.join(cwd_path.parent, 'data', 'splitted')

# Create the folder (and any missing parents); a no-op when it already exists.
os.makedirs(output_dir, exist_ok=True)

# Output file name -> DataFrame, listed in the order the files are written.
csv_exports = {
    'train.csv': train_df,
    'valid.csv': valid_df,
    'ap_train.csv': train_df_ap,
    'pa_train.csv': train_df_pa,
    'ap_valid.csv': valid_df_ap,
    'pa_valid.csv': valid_df_pa,
    'fr_train.csv': train_df_fr,
    'lat_train.csv': train_df_lat,
    'fr_valid.csv': valid_df_fr,
    'lat_valid.csv': valid_df_lat,
}

# index=False keeps the DataFrame index out of the written files.
for file_name, frame in csv_exports.items():
    frame.to_csv(os.path.join(output_dir, file_name), index=False)

print("Datasets have been saved.")

Datasets have been saved.


In [15]:
#TODO Preprocess test set and add it 