In [1]:
import matplotlib.pyplot as plt
from pathlib import Path
import seaborn as sns
import pandas as pd
import numpy as np
import cv2
import os

In [12]:
# Load metadata
# data_path = Path.cwd().parent.parent.joinpath("image_data/CheXpert-v1.0-small")
data_path = Path.cwd().parent.joinpath("data")
train_metadata_path = data_path.joinpath("train.csv")
valid_metadata_path = data_path.joinpath("valid.csv")
train_metadata = pd.read_csv(train_metadata_path)
valid_metadata = pd.read_csv(valid_metadata_path)


# Check the first few rows
print("Train metadata")
print(train_metadata.head())

print("Valid metadata")
print(valid_metadata.head())

Train metadata
                                                Path     Sex  Age  \
0  CheXpert-v1.0-small/train/patient00001/study1/...  Female   68   
1  CheXpert-v1.0-small/train/patient00002/study2/...  Female   87   
2  CheXpert-v1.0-small/train/patient00002/study1/...  Female   83   
3  CheXpert-v1.0-small/train/patient00003/study1/...    Male   41   
4  CheXpert-v1.0-small/train/patient00004/study1/...  Female   20   

  Frontal/Lateral AP/PA  No Finding  Enlarged Cardiomediastinum  Cardiomegaly  \
0         Frontal    AP         1.0                        -1.0          -1.0   
1         Frontal    AP        -1.0                        -1.0          -1.0   
2         Frontal    AP        -1.0                        -1.0          -1.0   
3         Frontal    AP        -1.0                        -1.0          -1.0   
4         Frontal    PA         1.0                         0.0          -1.0   

   Lung Opacity  Lung Lesion  ...  Atelectasis  Pneumothorax  \
0          -1.0    

In [13]:
print(train_metadata.columns)
print(valid_metadata.columns)

Index(['Path', 'Sex', 'Age', 'Frontal/Lateral', 'AP/PA', 'No Finding',
       'Enlarged Cardiomediastinum', 'Cardiomegaly', 'Lung Opacity',
       'Lung Lesion', 'Edema', 'Consolidation', 'Pneumonia', 'Atelectasis',
       'Pneumothorax', 'Pleural Effusion', 'Pleural Other', 'Fracture',
       'Support Devices', 'Patient_ID', 'Sex_mapped', 'Frontal/Lateral_mapped',
       'AP/PA_mapped'],
      dtype='object')
Index(['Path', 'Sex', 'Age', 'Frontal/Lateral', 'AP/PA', 'No Finding',
       'Enlarged Cardiomediastinum', 'Cardiomegaly', 'Lung Opacity',
       'Lung Lesion', 'Edema', 'Consolidation', 'Pneumonia', 'Atelectasis',
       'Pneumothorax', 'Pleural Effusion', 'Pleural Other', 'Fracture',
       'Support Devices'],
      dtype='object')


In [15]:
print(train_metadata['AP/PA'].value_counts())
print(valid_metadata['AP/PA'].value_counts())

AP/PA
AP    161590
PA     29419
Name: count, dtype: int64
AP/PA
AP    169
PA     33
Name: count, dtype: int64


In [18]:
train_metadata['AP/PA'] = train_metadata['AP/PA'].replace({'AP': int(0), 'PA': int(1)})
valid_metadata['AP/PA'] = valid_metadata['AP/PA'].replace({'AP': int(0), 'PA': int(1)})

In [19]:
print(train_metadata['AP/PA'].value_counts())
print(valid_metadata['AP/PA'].value_counts())

AP/PA
0    161590
1     29419
Name: count, dtype: int64
AP/PA
0.0    169
1.0     33
Name: count, dtype: int64


In [20]:
train_metadata.to_csv(data_path.joinpath("train_ap_pa_int.csv"), index=False)
valid_metadata.to_csv(data_path.joinpath("valid_ap_pa_int.csv"), index=False)

In [21]:
new_train_metadata_path = data_path.joinpath("train_ap_pa_int.csv")
new_valid_metadata_path = data_path.joinpath("valid_ap_pa_int.csv")

new_train_metadata = pd.read_csv(new_train_metadata_path)
new_valid_metadata = pd.read_csv(new_valid_metadata_path)

print(new_train_metadata['AP/PA'].value_counts())
print(new_valid_metadata['AP/PA'].value_counts())

AP/PA
0    161590
1     29419
Name: count, dtype: int64
AP/PA
0.0    169
1.0     33
Name: count, dtype: int64
