# Create a test.csv from train.csv

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import cv2
import os
from pathlib import Path

In [3]:
# Locate the CheXpert metadata files relative to the current working directory
cwd_path = Path.cwd()
chexpert_dir = cwd_path.parent / "../image_data/CheXpert-v1.0-small"
train_metadata_path = chexpert_dir / "train.csv"
valid_metadata_path = chexpert_dir / "valid.csv"

# Read the training and validation metadata tables
train_metadata = pd.read_csv(train_metadata_path)
valid_metadata = pd.read_csv(valid_metadata_path)

# Preview the first rows of the training metadata
print(train_metadata.head())

                                                Path     Sex  Age  \
0  CheXpert-v1.0-small/train/patient00001/study1/...  Female   68   
1  CheXpert-v1.0-small/train/patient00002/study2/...  Female   87   
2  CheXpert-v1.0-small/train/patient00002/study1/...  Female   83   
3  CheXpert-v1.0-small/train/patient00002/study1/...  Female   83   
4  CheXpert-v1.0-small/train/patient00003/study1/...    Male   41   

  Frontal/Lateral AP/PA  No Finding  Enlarged Cardiomediastinum  Cardiomegaly  \
0         Frontal    AP         1.0                         NaN           NaN   
1         Frontal    AP         NaN                         NaN          -1.0   
2         Frontal    AP         NaN                         NaN           NaN   
3         Lateral   NaN         NaN                         NaN           NaN   
4         Frontal    AP         NaN                         NaN           NaN   

   Lung Opacity  Lung Lesion  Edema  Consolidation  Pneumonia  Atelectasis  \
0           NaN     

In [4]:
# Show the first five rows of the validation metadata
print(valid_metadata.head(5))

                                                Path     Sex  Age  \
0  CheXpert-v1.0-small/valid/patient64541/study1/...    Male   73   
1  CheXpert-v1.0-small/valid/patient64542/study1/...    Male   70   
2  CheXpert-v1.0-small/valid/patient64542/study1/...    Male   70   
3  CheXpert-v1.0-small/valid/patient64543/study1/...    Male   85   
4  CheXpert-v1.0-small/valid/patient64544/study1/...  Female   42   

  Frontal/Lateral AP/PA  No Finding  Enlarged Cardiomediastinum  Cardiomegaly  \
0         Frontal    AP         0.0                         1.0           1.0   
1         Frontal    PA         0.0                         0.0           0.0   
2         Lateral   NaN         0.0                         0.0           0.0   
3         Frontal    AP         0.0                         1.0           0.0   
4         Frontal    AP         1.0                         0.0           0.0   

   Lung Opacity  Lung Lesion  Edema  Consolidation  Pneumonia  Atelectasis  \
0           1.0     

In [5]:
### "Patient_ID" could be useful in the future (e.g. if we decide to remove rows with missing values), so add it as a column in the metadata

# Function to extract patient ID from the image path
def extract_patient_id(path: str, index: int = 2) -> str:
    """Return one '/'-separated component of an image path.

    Args:
        path: Image path that uses '/' as its separator, e.g.
            'CheXpert-v1.0-small/train/patient00001/study1/...'.
        index: Position of the patient directory among the path components.
            The default of 2 matches the 'Path' values in train.csv
            (<dataset>/<split>/<patient>/...); pass a different value if the
            path structure differs.

    Returns:
        The path component at ``index`` (e.g. 'patient00001').

    Raises:
        IndexError: If the path has too few components for ``index``.
    """
    return path.split('/')[index]

# Derive the patient ID of every training row from its image path
patient_ids = train_metadata['Path'].map(extract_patient_id)
train_metadata['Patient_ID'] = patient_ids

# Preview the frame with the new column
print(train_metadata.head())

                                                Path     Sex  Age  \
0  CheXpert-v1.0-small/train/patient00001/study1/...  Female   68   
1  CheXpert-v1.0-small/train/patient00002/study2/...  Female   87   
2  CheXpert-v1.0-small/train/patient00002/study1/...  Female   83   
3  CheXpert-v1.0-small/train/patient00002/study1/...  Female   83   
4  CheXpert-v1.0-small/train/patient00003/study1/...    Male   41   

  Frontal/Lateral AP/PA  No Finding  Enlarged Cardiomediastinum  Cardiomegaly  \
0         Frontal    AP         1.0                         NaN           NaN   
1         Frontal    AP         NaN                         NaN          -1.0   
2         Frontal    AP         NaN                         NaN           NaN   
3         Lateral   NaN         NaN                         NaN           NaN   
4         Frontal    AP         NaN                         NaN           NaN   

   Lung Opacity  Lung Lesion  Edema  Consolidation  Pneumonia  Atelectasis  \
0           NaN     

In [6]:
# Keep every training row except the single one whose 'Sex' is 'Unknown'
known_sex_mask = train_metadata['Sex'].ne('Unknown')
train_metadata = train_metadata.loc[known_sex_mask]

In [7]:
# Finding/label columns whose missing entries are recoded as -1.0
columns_to_map = [
    "No Finding",
    "Enlarged Cardiomediastinum",
    "Cardiomegaly",
    "Lung Opacity",
    "Lung Lesion",
    "Edema",
    "Consolidation",
    "Pneumonia",
    "Atelectasis",
    "Pneumothorax",
    "Pleural Effusion",
    "Pleural Other",
    "Fracture",
    "Support Devices",
]

# NOTE(review): the raw data already contains -1.0 in these columns (e.g. the
# 'Cardiomegaly' value of row 1 in the preview), so after this step an original
# -1.0 and a former NaN can no longer be told apart — confirm this is intended.
filled_labels = train_metadata[columns_to_map].fillna(-1.0)

# Write the filled columns back through .loc so the frame itself is updated
train_metadata.loc[:, columns_to_map] = filled_labels

In [27]:
# Add an integer-coded '<column>_mapped' companion for each categorical column.
# pd.factorize numbers categories in order of first appearance and uses -1 for
# missing values.
for column in ('Sex', 'Frontal/Lateral', 'AP/PA'):
    codes, uniques = pd.factorize(train_metadata[column])
    train_metadata[f'{column}_mapped'] = codes

In [31]:
# Encode the validation columns with the SAME integer codes as the training
# columns. Factorizing each frame on its own numbers categories by order of
# first appearance, so the two frames can disagree: in the previews the first
# training row is 'Female' while the first validation row is 'Male', which
# would give both of them code 0 in their respective 'Sex_mapped' columns.
for column in ('Sex', 'Frontal/Lateral', 'AP/PA'):
    # Categories in the order pd.factorize numbers them for the training data
    train_categories = pd.factorize(train_metadata[column])[1]
    # Missing values, and values never seen in the training column, get code -1
    # (the same sentinel pd.factorize uses for missing values)
    valid_codes = pd.Categorical(valid_metadata[column], categories=train_categories).codes
    # Categorical codes are a small integer dtype; widen to a regular int64 column
    valid_metadata[f'{column}_mapped'] = valid_codes.astype('int64')

In [28]:
# Number of missing values in each column of the training metadata
train_metadata.isnull().sum()

Path                              0
Sex                               0
Age                               0
Frontal/Lateral                   0
AP/PA                         32387
No Finding                        0
Enlarged Cardiomediastinum        0
Cardiomegaly                      0
Lung Opacity                      0
Lung Lesion                       0
Edema                             0
Consolidation                     0
Pneumonia                         0
Atelectasis                       0
Pneumothorax                      0
Pleural Effusion                  0
Pleural Other                     0
Fracture                          0
Support Devices                   0
Patient_ID                        0
Sex_mapped                        0
Frontal/Lateral_mapped            0
AP/PA_mapped                      0
dtype: int64

In [32]:
# Number of missing values in each column of the validation metadata
valid_metadata.isnull().sum()

Path                           0
Sex                            0
Age                            0
Frontal/Lateral                0
AP/PA                         32
No Finding                     0
Enlarged Cardiomediastinum     0
Cardiomegaly                   0
Lung Opacity                   0
Lung Lesion                    0
Edema                          0
Consolidation                  0
Pneumonia                      0
Atelectasis                    0
Pneumothorax                   0
Pleural Effusion               0
Pleural Other                  0
Fracture                       0
Support Devices                0
Sex_mapped                     0
Frontal/Lateral_mapped         0
AP/PA_mapped                   0
dtype: int64

In [33]:
# Keep only the rows whose 'AP/PA' value is 'AP' or 'PA' (rows where it is
# missing are dropped) and renumber the index from 0.
# Both frames are matched against the individual labels: 'AP/PA' is the column
# name, while the values seen in the data are 'AP' and 'PA', so filtering on
# the string 'AP/PA' would leave the validation subset empty.
ap_pa_views = ['AP', 'PA']
train_metadata_ap_pa = train_metadata[train_metadata['AP/PA'].isin(ap_pa_views)].reset_index(drop=True)
valid_metadata_ap_pa = valid_metadata[valid_metadata['AP/PA'].isin(ap_pa_views)].reset_index(drop=True)

In [34]:
# Number of missing values in each column of the AP/PA-filtered training rows
train_metadata_ap_pa.isnull().sum()

Path                          0
Sex                           0
Age                           0
Frontal/Lateral               0
AP/PA                         0
No Finding                    0
Enlarged Cardiomediastinum    0
Cardiomegaly                  0
Lung Opacity                  0
Lung Lesion                   0
Edema                         0
Consolidation                 0
Pneumonia                     0
Atelectasis                   0
Pneumothorax                  0
Pleural Effusion              0
Pleural Other                 0
Fracture                      0
Support Devices               0
Patient_ID                    0
Sex_mapped                    0
Frontal/Lateral_mapped        0
AP/PA_mapped                  0
dtype: int64

In [35]:
# Number of missing values in each column of the AP/PA-filtered validation rows
valid_metadata_ap_pa.isnull().sum()

Path                          0
Sex                           0
Age                           0
Frontal/Lateral               0
AP/PA                         0
No Finding                    0
Enlarged Cardiomediastinum    0
Cardiomegaly                  0
Lung Opacity                  0
Lung Lesion                   0
Edema                         0
Consolidation                 0
Pneumonia                     0
Atelectasis                   0
Pneumothorax                  0
Pleural Effusion              0
Pleural Other                 0
Fracture                      0
Support Devices               0
Sex_mapped                    0
Frontal/Lateral_mapped        0
AP/PA_mapped                  0
dtype: int64

In [38]:
# How many AP/PA-filtered training rows carry each 'AP/PA_mapped' code
ap_pa_codes = train_metadata_ap_pa['AP/PA_mapped']
ap_pa_codes.value_counts()

AP/PA_mapped
0    161590
1     29419
Name: count, dtype: int64

In [39]:
## Split the metadata by projection / view and write every subset to its own CSV

# AP vs. PA subsets, taken from the AP/PA-filtered frames
train_metadata_ap = train_metadata_ap_pa.loc[train_metadata_ap_pa['AP/PA'].eq('AP')]
train_metadata_pa = train_metadata_ap_pa.loc[train_metadata_ap_pa['AP/PA'].eq('PA')]
valid_metadata_ap = valid_metadata_ap_pa.loc[valid_metadata_ap_pa['AP/PA'].eq('AP')]
valid_metadata_pa = valid_metadata_ap_pa.loc[valid_metadata_ap_pa['AP/PA'].eq('PA')]

# Frontal vs. lateral subsets, taken from the full frames
train_metadata_fr = train_metadata.loc[train_metadata['Frontal/Lateral'].eq('Frontal')]
train_metadata_lat = train_metadata.loc[train_metadata['Frontal/Lateral'].eq('Lateral')]
valid_metadata_fr = valid_metadata.loc[valid_metadata['Frontal/Lateral'].eq('Frontal')]
valid_metadata_lat = valid_metadata.loc[valid_metadata['Frontal/Lateral'].eq('Lateral')]

# Output folder: <parent of the working directory>/data/splitted (created if absent)
output_dir = os.path.join(cwd_path.parent, 'data', 'splitted')
os.makedirs(output_dir, exist_ok=True)

# File name -> frame; the files are written in this order, without the index column
frames_by_filename = {
    'train.csv': train_metadata,
    'valid.csv': valid_metadata,
    'ap_train.csv': train_metadata_ap,
    'pa_train.csv': train_metadata_pa,
    'ap_valid.csv': valid_metadata_ap,
    'pa_valid.csv': valid_metadata_pa,
    'fr_train.csv': train_metadata_fr,
    'lat_train.csv': train_metadata_lat,
    'fr_valid.csv': valid_metadata_fr,
    'lat_valid.csv': valid_metadata_lat,
}
for filename, frame in frames_by_filename.items():
    frame.to_csv(os.path.join(output_dir, filename), index=False)

print("Datasets have been saved.")

Datasets have been saved.


In [40]:
# TODO: Preprocess the test set and add it