# Exploratory Data Analysis

In [50]:
import warnings
warnings.filterwarnings('ignore')
%reload_ext autoreload
%autoreload 2
%matplotlib inline

from fastai import *
from fastai.vision import *
from torchvision.models import *
import yaml
import pandas as pd
import datetime
import wget

from sklearn.metrics import roc_auc_score

In [53]:
from fastai import *
from fastai.vision import *

In [None]:
path = download_data('http://download.cs.stanford.edu//deep//CheXpert-v1.0-small.zip')

# Data Acquisition & Data Source

### Data for this project is acquired from Stanford Machine Learning Group. Stanford ML group collected chest radiograph data from 2002 to 2017, in both patient and outpatient centers along with reports

In [54]:
train_df = pd.read_csv('F:/MRP/CheXpert-v1.0-small/CheXpert-v1.0-small/train.csv')
train_df.head()

Unnamed: 0,Path,Sex,Age,Frontal/Lateral,AP/PA,No Finding,Enlarged Cardiomediastinum,Cardiomegaly,Lung Opacity,Lung Lesion,Edema,Consolidation,Pneumonia,Atelectasis,Pneumothorax,Pleural Effusion,Pleural Other,Fracture,Support Devices
0,CheXpert-v1.0-small\train\patient00001\study1\...,Female,68,Frontal,AP,1.0,,,,,,,,,0.0,,,,1.0
1,CheXpert-v1.0-small\train\patient00002\study2\...,Female,87,Frontal,AP,,,-1.0,1.0,,-1.0,-1.0,,-1.0,,-1.0,,1.0,
2,CheXpert-v1.0-small\train\patient00002\study1\...,Female,83,Frontal,AP,,,,1.0,,,-1.0,,,,,,1.0,
3,CheXpert-v1.0-small\train\patient00002\study1\...,Female,83,Lateral,,,,,1.0,,,-1.0,,,,,,1.0,
4,CheXpert-v1.0-small\train\patient00003\study1\...,Male,41,Frontal,AP,,,,,,1.0,,,,0.0,,,,


# Data comprises of 223414 row and 19 columns or attributes

In [55]:
full_train_df.shape

(223414, 19)

In [56]:
full_train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 223414 entries, 0 to 223413
Data columns (total 19 columns):
Path                          223414 non-null object
Sex                           223414 non-null object
Age                           223414 non-null int64
Frontal/Lateral               223414 non-null object
AP/PA                         191027 non-null object
No Finding                    22381 non-null float64
Enlarged Cardiomediastinum    44839 non-null float64
Cardiomegaly                  46203 non-null float64
Lung Opacity                  117778 non-null float64
Lung Lesion                   11944 non-null float64
Edema                         85956 non-null float64
Consolidation                 70622 non-null float64
Pneumonia                     27608 non-null float64
Atelectasis                   68443 non-null float64
Pneumothorax                  78934 non-null float64
Pleural Effusion              133211 non-null float64
Pleural Other                 6492 non-n

In [57]:
train_df.describe()

Unnamed: 0,Age,No Finding,Enlarged Cardiomediastinum,Cardiomegaly,Lung Opacity,Lung Lesion,Edema,Consolidation,Pneumonia,Atelectasis,Pneumothorax,Pleural Effusion,Pleural Other,Fracture,Support Devices
count,223414.0,22381.0,44839.0,46203.0,117778.0,11944.0,85956.0,70622.0,27608.0,68443.0,78934.0,133211.0,6492.0,12194.0,123217.0
mean,60.430653,1.0,-0.035795,0.409346,0.848911,0.644508,0.456769,-0.183498,-0.461134,-0.005304,0.20654,0.559706,0.134011,0.688699,0.93268
std,17.820925,0.0,0.718442,0.769323,0.472571,0.691607,0.741785,0.75398,0.828249,0.990244,0.493529,0.648859,0.966183,0.565435,0.283377
min,0.0,1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
25%,49.0,1.0,-1.0,0.0,1.0,1.0,0.0,-1.0,-1.0,-1.0,0.0,0.0,-1.0,0.0,1.0
50%,62.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,-1.0,0.0,0.0,1.0,1.0,1.0,1.0
75%,74.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0
max,90.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [58]:
Chexpert_Targets_Features = ['Atelectasis', 'Cardiomegaly', 'Consolidation', 'Edema', 'Pleural Effusion']

In [59]:
u_one_features = ['Atelectasis', 'Edema']
u_zero_features = ['Cardiomegaly', 'Consolidation', 'Pleural Effusion']

### Add target features string

In [60]:
def feature_string(row):
    feature_list = []
    for feature in u_one_features:
        if row[feature] in [-1,1]:
            feature_list.append(feature)
            
    for feature in u_zero_features:
        if row[feature] == 1:
            feature_list.append(feature)
            
    return ';'.join(feature_list)

In [65]:
train_df['train_valid'] = False
valid_df['train_valid'] = True


### Add Patient and Study columns Dataframe

In [None]:
train_df['patient'] = train_df.Path.str.split('\',3,True)[2]
train_df  ['study'] = train_df.Path.str.split('\',4,True)[3]

valid_df['patient'] = valid_df.Path.str.split('\',3,True)[2]
valid_df  ['study'] = valid_df.Path.str.split('\',4,True)[3]

In [None]:
full_df = pd.concat([full_train_df, full_valid_df])
print(full_df)
full_df.shape

In [None]:
full_df['feature_string'] = full_df.apply(feature_string,axis = 1).fillna('')
#full_df.drop(full_df['feature_string'], axis=1, inplace=True)
print(full_df.shape)

In [63]:
full_train_df.info()
full_df.shape

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 223414 entries, 0 to 223413
Data columns (total 20 columns):
Path                          223414 non-null object
Sex                           223414 non-null object
Age                           223414 non-null int64
Frontal/Lateral               223414 non-null object
AP/PA                         191027 non-null object
No Finding                    22381 non-null float64
Enlarged Cardiomediastinum    44839 non-null float64
Cardiomegaly                  46203 non-null float64
Lung Opacity                  117778 non-null float64
Lung Lesion                   11944 non-null float64
Edema                         85956 non-null float64
Consolidation                 70622 non-null float64
Pneumonia                     27608 non-null float64
Atelectasis                   68443 non-null float64
Pneumothorax                  78934 non-null float64
Pleural Effusion              133211 non-null float64
Pleural Other                 6492 non-n

(223648, 23)

### Sample Data Preparation

# Create DataBlock 

In [36]:
from pathlib import Path
#data_path= ("F:/MRP/CheXpert-v1.0-small/CheXpert-v1.0-small")
data_path= Path('F:\MRP\CheXpert-v1.0-small')
print(data_path)
#fn_paths = [data_path/name for name in df['name']]; fn_paths[:2]
def get_src(df = full_df):
    return (ImageList
        .from_df(df, data_path, 'Path')
        .split_from_df('train_valid')
        .label_from_df('feature_string',label_delim=';'))
    #return (ImageList.from_csv(path=data_path,
                             #csv_name='df.csv',
                              #folder='train',
                              #suffix='.jpg'
                             #))                   
                                   
def get_data(size, src, bs=32):
    #tfms = get_transforms(do_flip=False)
    #data = (src.transform(tfms, size=size).databunch(bs=bs).normalize())
    return (src.transform(get_transforms(do_flip=False)).databunch(bs=bs).normalize(imagenet_stats))


F:\MRP\CheXpert-v1.0-small
