# Author: Kristian

In [61]:
import os 
import datetime
from pathlib import Path, PurePath, PosixPath
from dotenv import load_dotenv, find_dotenv

import tensorflow as tf
import tensorflow.keras as keras
import pandas as pd
import numpy as np
from skimage.transform import resize

import sys 
from pathlib import Path
from dotenv import load_dotenv, find_dotenv

# Make the repository root the working directory so that `src.*` imports and
# relative paths resolve no matter which sub-folder the notebook was started in.
REPO_NAME = "idp-radio-1"
basepath = Path(os.getcwd())
if basepath.name != REPO_NAME:
    # Prefer the nearest ancestor that is actually named like the repository.
    # If none matches (e.g. the clone was renamed), fall back to climbing three
    # levels, which assumes the notebook lives three folders below the root.
    repo_root = next(
        (parent for parent in basepath.parents if parent.name == REPO_NAME),
        basepath.parent.parent.parent,
    )
    os.chdir(repo_root)
# Load environment variables from the .env file located by find_dotenv().
load_dotenv(find_dotenv())

# Enable IPython's autoreload extension in mode 2, so imported modules (such as
# the `src` package imported below) are reloaded before code is executed and
# edits to them are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

from src.preprocessing.split.train_test_split import train_test_split
from sklearn.model_selection import GroupShuffleSplit, StratifiedKFold

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [62]:
# Load the CheXpert training table (image paths, patient metadata and labels).
# The dataset location comes from the environment (typically set via .env).
dataset_dir = os.environ.get('CHEXPERT_DEV_DATASET_DIRECTORY')
if dataset_dir is None:
    # Fail with an actionable message instead of the TypeError Path(None) raises.
    raise EnvironmentError(
        "CHEXPERT_DEV_DATASET_DIRECTORY is not set; "
        "define it in the environment or in the .env file."
    )
DATASET_FOLDER = Path(dataset_dir)

# The first CSV column is used as the index. Missing entries are replaced by
# 0.0 in every column.
# NOTE(review): this also fills non-label columns such as 'AP/PA' with 0.0;
# confirm that is intended.
data = pd.read_csv(DATASET_FOLDER / 'train.csv', index_col=[0]).fillna(0.0)

Unnamed: 0,Path,Sex,Age,Frontal/Lateral,AP/PA,No Finding,Enlarged Cardiomediastinum,Cardiomegaly,Lung Opacity,Lung Lesion,Edema,Consolidation,Pneumonia,Atelectasis,Pneumothorax,Pleural Effusion,Pleural Other,Fracture,Support Devices
0,CheXpert-v1.0-small/train/patient00001/study1/...,Female,68,Frontal,AP,1.0,,,,,,,,,0.0,,,,1.0
1,CheXpert-v1.0-small/train/patient00002/study2/...,Female,87,Frontal,AP,,,-1.0,1.0,,-1.0,-1.0,,-1.0,,-1.0,,1.0,
2,CheXpert-v1.0-small/train/patient00002/study1/...,Female,83,Frontal,AP,,,,1.0,,,-1.0,,,,,,1.0,
3,CheXpert-v1.0-small/train/patient00002/study1/...,Female,83,Lateral,,,,,1.0,,,-1.0,,,,,,1.0,
4,CheXpert-v1.0-small/train/patient00003/study1/...,Male,41,Frontal,AP,,,,,,1.0,,,,0.0,,,,


In [63]:
# Column positions 5..18 hold the label columns; in the table shown after
# loading these run from 'No Finding' through 'Support Devices'.
labels = list(data.columns[5:19])

In [64]:
# Split the table into train and test parts using the project's own
# train_test_split helper, requesting 20% of the data for the test part.
# NOTE(review): the recorded output of this cell is
# "Exception: The column patient_id does not exist", while later cells still
# use `train` and `test` -- these presumably come from an earlier successful
# run. Confirm the notebook survives Restart Kernel -> Run All.
train, test = train_test_split(data, test_size=0.2, labels=labels)

Exception: The column patient_id does not exist

In [65]:
# Fraction of rows that ended up in each split. Row counts (len) are used
# rather than DataFrame.size, which counts cells (rows x columns) and would
# skew the ratio if a split ever had a different number of columns than `data`.
total_rows = len(data)
train_size = len(train) / total_rows
test_size = len(test) / total_rows
print('Train split size: ', train_size)
print('Test split size: ', test_size)
print(train.shape)
print(test.shape)

Train split size:  0.8238119636223902
Test split size:  0.17618803637760982
(12863, 20)
(2751, 20)


In [66]:

def _label_distribution(frame: pd.DataFrame, label_names: list) -> dict:
    """Summarise how often each label value occurs in ``frame``.

    For every column in ``label_names`` the share of rows equal to 1.0
    (positive), -1.0 (uncertain) and 0.0 (negative) is computed as a percentage
    of all rows in ``frame`` and rounded to the nearest whole number.

    Args:
        frame: Table containing one column per entry in ``label_names``.
        label_names: Names of the label columns to summarise.

    Returns:
        A dict of equally long lists with the keys 'Pathology', 'Positive %',
        'Uncertain %' and 'Negative %', ready to be passed to ``pd.DataFrame``.
    """
    summary = {'Pathology': [], 'Positive %': [], 'Uncertain %': [], 'Negative %': []}
    n_rows = frame.shape[0]
    value_by_column = (('Positive %', 1.0), ('Uncertain %', -1.0), ('Negative %', 0.0))

    for label in label_names:
        summary['Pathology'].append(label)
        # Count each distinct value once per label; values that never occur
        # are simply absent from the result and count as 0.
        counts = frame[label].value_counts()
        for column, value in value_by_column:
            share = counts.get(value, 0) / n_rows * 100
            summary[column].append(round(share))
    return summary


# Same summary for both splits so their label distributions can be compared.
d = _label_distribution(train, labels)
d_val = _label_distribution(test, labels)

In [67]:
# Train summary, indexed by pathology name.
df = pd.DataFrame(d).set_index('Pathology')

# Test summary; 'Pathology' stays a regular column here.
df_val = pd.DataFrame(d_val)

# Show the train summary.
df

Unnamed: 0_level_0,Positive %,Uncertain %,Negative %
Pathology,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
No Finding,9.0,0.0,91.0
Enlarged Cardiomediastinum,5.0,6.0,89.0
Cardiomegaly,14.0,3.0,83.0
Lung Opacity,49.0,2.0,49.0
Lung Lesion,4.0,1.0,95.0
Edema,24.0,6.0,70.0
Consolidation,7.0,12.0,81.0
Pneumonia,3.0,9.0,89.0
Atelectasis,15.0,14.0,70.0
Pneumothorax,9.0,1.0,89.0


In [68]:
# Show the label distribution of the test split for comparison with the train split.
df_val

Unnamed: 0,Pathology,Positive %,Uncertain %,Negative %
0,No Finding,15.0,0.0,85.0
1,Enlarged Cardiomediastinum,5.0,6.0,89.0
2,Cardiomegaly,11.0,4.0,85.0
3,Lung Opacity,45.0,3.0,52.0
4,Lung Lesion,4.0,1.0,95.0
5,Edema,21.0,6.0,72.0
6,Consolidation,6.0,13.0,81.0
7,Pneumonia,3.0,9.0,87.0
8,Atelectasis,16.0,17.0,67.0
9,Pneumothorax,8.0,2.0,91.0


In [69]:
# Relative difference between the train and test label distributions:
# (train - test) / train, per pathology and per label value.
# The percentage arrays get their own names so the `train` DataFrame from the
# split is not overwritten and earlier cells stay re-runnable.
pct_columns = ["Positive %", "Uncertain %", "Negative %"]
train_pct = df[pct_columns].to_numpy()
val_pct = df_val[pct_columns].to_numpy()

# epsilon keeps the division defined where the train percentage is 0.
epsilon = 1e-8
relative_difference = pd.DataFrame(
    np.around((train_pct - val_pct) / (train_pct + epsilon), decimals=2),
    columns=pct_columns,
)

# Both frames list the pathologies in the same order, so the names from the
# test summary can be attached row by row.
relative_difference["Pathology"] = df_val["Pathology"]
relative_difference

Unnamed: 0,Positive %,Uncertain %,Negative %,Pathology
0,-0.67,0.0,0.07,No Finding
1,0.0,0.0,0.0,Enlarged Cardiomediastinum
2,0.21,-0.33,-0.02,Cardiomegaly
3,0.08,-0.5,-0.06,Lung Opacity
4,0.0,0.0,0.0,Lung Lesion
5,0.12,0.0,-0.03,Edema
6,0.14,-0.08,0.0,Consolidation
7,0.0,0.0,0.02,Pneumonia
8,-0.07,-0.21,0.04,Atelectasis
9,0.11,-1.0,-0.02,Pneumothorax
