## Mount Drive and Download Dataset

In [None]:
# Mount Google Drive inside the Colab VM so files stored on Drive become
# reachable under /content/gdrive. The call prompts for an authorization code.
from google.colab import drive
drive.mount('/content/gdrive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/gdrive


In [None]:
# Extract the dataset archive stored on Drive into the current working directory.
# unzip flags: -u extracts only files that are new or newer than existing copies, -q is quiet.
# NOTE(review): the recorded run ended with "zipfile read error" - confirm the archive on Drive is complete.
!unzip -uq '/content/gdrive/My Drive/chexpertdataset.zip'

error:  zipfile read error


## Import Basic Packages

In [None]:
import cv2
import numpy as np
import pandas as pd
%matplotlib inline
from matplotlib import pyplot as plt

## Read and clean the DataFrame

In [None]:
# Load the label tables shipped with the dataset: `df` is read from train.csv
# (earlier named df_train) and `df_val` from valid.csv.
df, df_val = map(
    pd.read_csv,
    ('CheXpert-v1.0-small/train.csv', 'CheXpert-v1.0-small/valid.csv'),
)

FileNotFoundError: ignored

In [None]:
def clean_df(df):
  """Reduce a label table to the five target classes with binary labels.

  Keeps only 'Path' plus the five pathology columns, fills missing values
  with 0, and resolves the uncertain marker (-1) with a per-class policy:
  it becomes 1 for Atelectasis and Edema, and 0 for Cardiomegaly,
  Consolidation and Pleural Effusion.

  Returns a new DataFrame; the frame passed in is not modified.
  """
  # Value that replaces the uncertain marker (-1) for each class.
  # Insertion order also fixes the column order of the result.
  uncertain_policy = {
    'Atelectasis': 1,
    'Cardiomegaly': 0,
    'Consolidation': 0,
    'Edema': 1,
    'Pleural Effusion': 0,
  }

  # Selecting the columns and filling NaNs produces a fresh frame to edit.
  cleaned = df[['Path', *uncertain_policy]].fillna(0)

  for column, resolved in uncertain_policy.items():
    cleaned[column] = cleaned[column].replace(-1, resolved)

  return cleaned

In [None]:
# Inline version of the cleaning steps that clean_df also performs,
# applied to the training frame.

# Keep the image path and the five target classes; missing labels become 0.
df = df[
  ['Path', 'Atelectasis', 'Cardiomegaly', 'Consolidation', 'Edema', 'Pleural Effusion']
].fillna(0)

# Per-class policy for the uncertain marker (-1):
u_ones = ['Atelectasis', 'Edema']                                # uncertain -> 1
u_zeros = ['Cardiomegaly', 'Consolidation', 'Pleural Effusion']  # uncertain -> 0
df[u_ones] = df[u_ones].replace(to_replace=-1, value=1)
df[u_zeros] = df[u_zeros].replace(to_replace=-1, value=0)

In [None]:
# Run the training and validation label tables through the same cleaning routine.
df, df_val = clean_df(df), clean_df(df_val)

## Set a few constants and the class names to be used

In [None]:
# Images per batch, and the side length (in pixels) of the square images
# fed to the network.
BATCH_SIZE, IMAGE_SIZE = 32, 224

# Names of the label columns the model predicts, in output order.
CLASSES = [
  'Atelectasis', 'Cardiomegaly', 'Consolidation',
  'Edema', 'Pleural Effusion',
]

In [None]:
FRAC = 0.003 # Fraction of total data to be taken as sample
SHAPE = (320, 390, 3) # Common shape for featurewise centering & normalization

# Draw a small random subset of training images from which the featurewise
# mean/std are estimated. A fixed random_state keeps the sample, and therefore
# the statistics, reproducible across runs.
sample_paths = df['Path'].sample(frac=FRAC, random_state=42).to_numpy()

# The images do not all share one size, so they are kept in a plain list:
# np.array() over differently shaped arrays raises ValueError on NumPy >= 1.24.
# cv2.imread returns None for unreadable paths; those entries are skipped.
X_temp = [
  img for img in (cv2.imread(path, 1) for path in sample_paths)
  if img is not None
]

# Only images with the common SHAPE can be stacked into one 4-D float array.
X_sample = np.array([img.astype(float) for img in X_temp if img.shape == SHAPE])

if X_sample.size == 0:
  # Fail here with a clear message rather than later inside the generator fit.
  raise ValueError(
    'No sampled image has shape {}; increase FRAC or adjust SHAPE.'.format(SHAPE)
  )

## A custom preprocessing function to experiment with

We tried histogram equalization to mimic deeper contrasts in the images, but unfortunately it did not give the results we had expected, so we dropped it in the end.

In [None]:
def image_preprocess(img):
  """Histogram-equalize an image and return it as a 3-channel float64 array.

  The input is converted with cv2.convertScaleAbs, reduced to grayscale,
  equalized with cv2.equalizeHist, and expanded back to three channels.
  """
  # NOTE(review): the grayscale conversion assumes BGR channel order - confirm
  # against whatever loader supplies `img`.
  as_abs = cv2.convertScaleAbs(img)
  gray = cv2.cvtColor(as_abs, cv2.COLOR_BGR2GRAY)
  equalized = cv2.equalizeHist(gray)
  three_channel = cv2.cvtColor(equalized, cv2.COLOR_GRAY2RGB)
  return three_channel.astype(np.float64)

## Set up Image Data Generator for Data Augmentation

In [None]:
from keras.preprocessing.image import ImageDataGenerator as IDG

# Augmenting generator used for the training/validation split of the training frame.
# NOTE(review): Keras interprets rotation_range in degrees, so 0.1 is an
# almost imperceptible rotation - confirm this is the intended amount.
datagen = IDG(
    rescale=1./255,
    featurewise_center=True,
    featurewise_std_normalization=True,
    rotation_range=0.1,
    zoom_range = 0.1,
    width_shift_range=0.2,
    height_shift_range=0.2,
    horizontal_flip=True,
    validation_split = 0.1,
    fill_mode = 'nearest',
)

# Featurewise centering/normalization need dataset statistics; estimate them
# from the image sample.
datagen.fit(X_sample)

# Evaluation generator: no augmentation, but the SAME standardization as the
# training generator. With rescaling alone, the model would be evaluated on
# inputs distributed differently from the ones it was trained on.
test_datagen = IDG(
    rescale=1./255,
    featurewise_center=True,
    featurewise_std_normalization=True,
)

# Fitting on the same sample gives the same mean/std as `datagen`.
test_datagen.fit(X_sample)

In [None]:
def get_gen():
  """Create the training and validation iterators over the training frame.

  Both iterators are produced by the shared `datagen`, read image paths from
  the 'Path' column and the CLASSES columns as raw multi-label targets, and
  differ only in which subset of `datagen`'s validation split they serve.

  Returns:
    A (train_gen, val_gen) tuple.
  """
  # Options common to both subsets.
  # NOTE(review): the validation subset comes from the same augmenting
  # generator and is shuffled as well - confirm that this is intended.
  shared_options = dict(
      dataframe=df,
      x_col='Path',
      y_col=CLASSES,
      class_mode='raw',
      seed=42,
      shuffle=True,
      target_size=(IMAGE_SIZE, IMAGE_SIZE),
      batch_size=BATCH_SIZE,
  )

  train_gen = datagen.flow_from_dataframe(subset='training', **shared_options)
  val_gen = datagen.flow_from_dataframe(subset='validation', **shared_options)

  return train_gen, val_gen

## Build the Model

In [None]:
# Building on top of the base:
from keras.applications import DenseNet121
from keras.models import Sequential
from keras.layers import BatchNormalization, Conv2D, GlobalAveragePooling2D
from keras.layers.core import Flatten, Dense, Dropout

def build_model():
  """Assemble the classifier: a DenseNet121 base plus a small dense head.

  The base is loaded with ImageNet weights and without its top classifier,
  and every one of its layers is marked trainable. The head is global average
  pooling -> Dense(1024, relu) -> BatchNormalization -> Dropout(0.3) ->
  Dense(5, sigmoid).

  Returns:
    The uncompiled Sequential model.
  """
  # The convolutional base:
  backbone = DenseNet121(
      include_top=False,
      weights='imagenet',
      input_shape=(IMAGE_SIZE, IMAGE_SIZE, 3),
  )

  # Unfreeze every layer of the base.
  for base_layer in backbone.layers:
    base_layer.trainable = True

  # Base first, then the classification head; five sigmoid outputs.
  return Sequential([
      backbone,
      GlobalAveragePooling2D(),
      Dense(1024, activation='relu'),
      BatchNormalization(),
      Dropout(0.3),
      Dense(5, activation='sigmoid'),
  ])

In [None]:
from keras.metrics import AUC, categorical_accuracy as catacc
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.optimizers import Adam

# The metric is named explicitly so its log keys are always 'auc' / 'val_auc',
# which the early-stopping monitor below relies on.
auc = AUC(name='auc')
adam = Adam(learning_rate=0.0001) # 0.1x the Keras default of 0.001

# Stop when validation AUC has not improved for 2 epochs. The previously
# monitored categorical accuracy compares argmax positions, which is not
# meaningful for multi-label sigmoid outputs.
es = EarlyStopping(monitor='val_auc', mode='max', verbose=1, patience=2)

# Saves the model after every epoch, overwriting the same file
# (save_best_only is intentionally left off).
mc = ModelCheckpoint(
    filepath='densenet121-keras-2.h5', verbose=1 #, save_best_only=True
)

cb_list = [es, mc] # Will add clr later, as we'll have to tune its hyperparameters

model = build_model()
model.compile(
    loss='binary_crossentropy',  # independent binary decision per class
    optimizer=adam,
    # Per-label thresholded accuracy matches the multi-label setup.
    metrics=[auc, 'binary_accuracy']
)

In [None]:
# Print the layer-by-layer architecture and the parameter counts.
model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
densenet121 (Model)          (None, 7, 7, 1024)        7037504   
_________________________________________________________________
global_average_pooling2d_2 ( (None, 1024)              0         
_________________________________________________________________
dense_3 (Dense)              (None, 1024)              1049600   
_________________________________________________________________
batch_normalization_2 (Batch (None, 1024)              4096      
_________________________________________________________________
dropout_2 (Dropout)          (None, 1024)              0         
_________________________________________________________________
dense_4 (Dense)              (None, 5)                 5125      
Total params: 8,096,325
Trainable params: 8,010,629
Non-trainable params: 85,696
_______________________________________

In [None]:
# Create the training and validation iterators; each reports how many image files it found.
train_gen, val_gen = get_gen()

Found 201073 validated image filenames.
Found 22341 validated image filenames.


Set the number of training steps, validation steps and epochs, and let the training begin!

In [None]:
# Training constants:
# Number of whole batches per pass over each subset; the floor division
# leaves out a trailing partial batch.
TRAIN_STEPS, VAL_STEPS = train_gen.n // BATCH_SIZE, val_gen.n // BATCH_SIZE
N_EPOCHS = 5

In [None]:
# Display the number of training steps per epoch.
TRAIN_STEPS

6283

In [None]:
# Train from the generators for N_EPOCHS epochs, validating after each epoch
# and running the early-stopping and checkpoint callbacks from cb_list.
# NOTE(review): fit_generator is deprecated in newer Keras/TensorFlow releases
# in favour of fit - confirm against the installed version.
history = model.fit_generator(
    train_gen,
    steps_per_epoch=TRAIN_STEPS,
    epochs=N_EPOCHS,
    validation_data=val_gen,
    validation_steps=VAL_STEPS,
    callbacks = cb_list
)

Epoch 1/5

Epoch 00001: saving model to densenet121-keras-2.h5
Epoch 2/5

Epoch 00002: saving model to densenet121-keras-2.h5
Epoch 3/5

Epoch 00003: saving model to densenet121-keras-2.h5
Epoch 4/5

Training was cut off during epoch 4 because of Google Colab runtime limitations :(

## Create the test generator and plot the ROC AUCs for our model!

In [None]:
# Iterator over the held-out frame (df_val) used for the final evaluation.
# shuffle=False with batch_size=1 yields one image per step in a fixed order,
# so predictions can be compared row by row with the labels.
# NOTE(review): if any image file is missing, the iterator may hold fewer rows
# than df_val - confirm test_gen.n == len(df_val).
test_gen = test_datagen.flow_from_dataframe(
    dataframe = df_val,
    #directory = '/content/CheXpert-v1.0-small/valid',
    x_col = 'Path',
    y_col = CLASSES, #'classes',
    class_mode='raw',
    #validate_filenames = False,
    target_size=(IMAGE_SIZE, IMAGE_SIZE), 
    batch_size=1, 
    shuffle = False,
    #classes = columns,
)

In [None]:
# Ground-truth labels, one row per image and one column per entry of CLASSES.
y_labels = df_val[CLASSES].to_numpy()
# One prediction step per image (test_gen uses batch_size=1), so steps equals the image count.
# NOTE(review): assumes the rows of y_pred line up with the rows of df_val - confirm no files were dropped by test_gen.
y_pred = model.predict_generator(test_gen, steps=test_gen.n) 

In [None]:
from sklearn.metrics import roc_curve
from sklearn.metrics import auc

# One ROC curve per class on a single set of axes.
# Note: `auc` here is sklearn.metrics.auc, imported at the top of this cell.
plt.figure(1)
roc_axes = plt.gca()

# Dashed diagonal as the chance-level reference.
roc_axes.plot([0, 1], [0, 1], 'k--')

for class_idx, class_name in enumerate(CLASSES):
  fpr, tpr, thresholds = roc_curve(y_labels[:, class_idx], y_pred[:, class_idx])
  class_auc = auc(fpr, tpr)
  roc_axes.plot(fpr, tpr, label=(class_name + '(area = {})'.format(class_auc)))

roc_axes.set_xlabel('False positive rate')
roc_axes.set_ylabel('True positive rate')
roc_axes.set_title('ROC curve')
roc_axes.legend(loc='best')
plt.show()

![ROC curves of the DenseNet121 model for the five classes](https://i.ibb.co/jbXfhPs/densenet121-rescale.png)