In [3]:
!pip install -r ../requirements.txt

Collecting absl-py==0.9.0
  Using cached absl-py-0.9.0.tar.gz (104 kB)
Collecting appnope==0.1.0
  Using cached appnope-0.1.0-py2.py3-none-any.whl (4.0 kB)
Collecting arrow==0.15.6
  Using cached arrow-0.15.6-py2.py3-none-any.whl (47 kB)
Collecting attrs==19.3.0
  Using cached attrs-19.3.0-py2.py3-none-any.whl (39 kB)
Collecting awscli==1.18.46
  Using cached awscli-1.18.46-py2.py3-none-any.whl (3.0 MB)
Collecting binaryornot==0.4.4
  Using cached binaryornot-0.4.4-py2.py3-none-any.whl (9.0 kB)
Collecting bleach==3.1.4
  Using cached bleach-3.1.4-py2.py3-none-any.whl (151 kB)
Collecting botocore==1.15.46
  Using cached botocore-1.15.46-py2.py3-none-any.whl (6.1 MB)
Collecting cachetools==3.1.1
  Using cached cachetools-3.1.1-py2.py3-none-any.whl (11 kB)
Collecting certifi==2020.4.5.1
  Using cached certifi-2020.4.5.1-py2.py3-none-any.whl (157 kB)
Collecting click==7.1.2
  Using cached click-7.1.2-py2.py3-none-any.whl (82 kB)
Collecting cookiecutter==1.7.2
  Using cached cookiecutter-1.

In [4]:
import pandas as pd
import seaborn as sns
import numpy as np
from pathlib import Path
from matplotlib import pyplot as plt
import os
from collections import defaultdict
import ast
from sklearn.model_selection import KFold
import tensorflow as tf
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from keras.preprocessing import image

from keras import layers, Input
from keras.models import Model
from keras import optimizers
import keras
from keras import backend


%matplotlib inline

Using TensorFlow backend.


In [35]:
import sagemaker
from sagemaker import get_execution_role

# Open a SageMaker session for this notebook.
# NOTE(review): neither `sagemaker_session` nor `role` is referenced by the
# cells visible further down — confirm they are still needed.
sagemaker_session = sagemaker.Session()

# Execution role reported by the SageMaker SDK
# (presumably the IAM role attached to the notebook instance — TODO confirm).
role = get_execution_role()

No handlers could be found for logger "sagemaker"


In [195]:
# Name of this experiment. The training cell uses it as the sub-directory
# name for the TensorBoard logs and the saved checkpoints under models_dir.
model_type = 'basic_convnet'

#### Before defining the model, we define an F-beta metric to monitor during training, which we will use as a proxy for the average AUROC across the 11 labels

In [196]:
def fbeta(y_true, y_pred, beta=2):
    """F-beta score built from Keras backend ops, usable as a Keras metric.

    Predictions are clipped to [0, 1]; true positives, false positives and
    false negatives are obtained by clipping and rounding the relevant
    products/differences and summing them along axis 1. An F-beta score is
    formed for every entry of the remaining axis and the mean of those
    scores is returned.

    Adapted from
    https://machinelearningmastery.com/how-to-develop-a-convolutional-neural-network-to-classify-satellite-photos-of-the-amazon-rainforest/

    Args:
        y_true: tensor of ground-truth labels.
        y_pred: tensor of predictions with the same shape as y_true
            (assumed to be (batch, n_labels) — TODO confirm).
        beta: weight of recall relative to precision (default 2).

    Returns:
        Scalar tensor holding the mean F-beta score.
    """
    eps = backend.epsilon()
    squared_beta = beta ** 2

    def _rounded_sum(values):
        # clip to [0, 1], round to 0/1 and count along axis 1
        return backend.sum(backend.round(backend.clip(values, 0, 1)), axis=1)

    # clip predictions (in case the output layer is not bound to [0, 1])
    y_pred = backend.clip(y_pred, 0, 1)

    true_pos = _rounded_sum(y_true * y_pred)
    false_pos = _rounded_sum(y_pred - y_true)
    false_neg = _rounded_sum(y_true - y_pred)

    # epsilon keeps the divisions finite when a denominator would be zero
    precision = true_pos / (true_pos + false_pos + eps)
    recall = true_pos / (true_pos + false_neg + eps)

    weighted = (1 + squared_beta) * (precision * recall)
    return backend.mean(weighted / (squared_beta * precision + recall + eps))

In [197]:
def create_new_model(input_dim, output_dim, n_channels=1):
    """Build and compile a small convolutional network for multi-label output.

    Architecture: four Conv2D(3x3, relu, same padding) + MaxPooling2D stages
    with 32, 32, 64 and 128 filters, Dropout(0.25) after the second and the
    fourth stage, then Flatten -> Dense(512, relu) -> Dropout(0.5) and a
    sigmoid output layer with one unit per label.

    Args:
        input_dim: height and width of the (square) input images.
        output_dim: number of output units, i.e. number of labels.
        n_channels: number of image channels; the default of 1 matches the
            grayscale images used in this notebook.

    Returns:
        A compiled Keras Model using RMSprop, binary cross-entropy loss and
        the `fbeta` metric defined in this notebook.
    """
    input_tensor = Input(shape=(input_dim, input_dim, n_channels))

    # Block 1: two 32-filter convolutions, each followed by 2x2 pooling
    y = layers.Conv2D(32, (3, 3), padding='same', activation='relu')(input_tensor)
    y = layers.MaxPooling2D(2, strides=2)(y)
    y = layers.Conv2D(32, (3, 3), padding='same', activation='relu')(y)
    y = layers.MaxPooling2D(2, strides=2)(y)
    y = layers.Dropout(0.25)(y)

    # Block 2: wider convolutions (64 then 128 filters)
    y = layers.Conv2D(64, (3, 3), padding='same', activation='relu')(y)
    y = layers.MaxPooling2D(2, strides=2)(y)
    y = layers.Conv2D(128, (3, 3), padding='same', activation='relu')(y)
    y = layers.MaxPooling2D(2, strides=2)(y)
    y = layers.Dropout(0.25)(y)

    # Classifier head; sigmoid gives an independent probability per label
    y = layers.Flatten()(y)
    y = layers.Dense(512, activation='relu')(y)
    y = layers.Dropout(0.5)(y)
    output_tensor = layers.Dense(output_dim, activation='sigmoid')(y)

    model = Model(input_tensor, output_tensor)

    # Use the canonical class name: the lowercase `optimizers.rmsprop` alias
    # is not available in every Keras / tf.keras release.
    model.compile(optimizer=optimizers.RMSprop(lr=0.0001, decay=1e-6),
                  loss="binary_crossentropy", metrics=[fbeta])

    return model

    

In [198]:
# Root of the project checkout. Defaults to the original author's location;
# set the RANZCR_PROJECT_DIR environment variable to run the notebook elsewhere.
project_dir = Path(os.environ.get(
    'RANZCR_PROJECT_DIR',
    '/Users/Shrinikesh/Documents/personal-projects/kaggle/ranzcr_clip'))

# All other locations are derived from the root so they cannot drift apart.
raw_data_path = project_dir / 'data' / 'raw'
raw_image_data_path = raw_data_path / 'train'
models_dir = project_dir / 'models'
train_data_path = raw_data_path / 'train.csv'

In [199]:
# Read the training CSV (image ids plus label columns) into a DataFrame.
train_df = pd.read_csv(train_data_path)

In [200]:
# Size of the raw table as (rows, columns).
train_df.shape

(30083, 13)

#### We will drop PatientID for now as it is not included in test images. Perhaps we can incorporate the information later

#### Moreover, as we need the full filenames to use the flow_from_dataframe function for training, we will append the `.jpg` extension to every StudyInstanceUID

In [201]:
def append_ext(fn, ext=".jpg"):
    """Return `fn` with the file extension `ext` appended.

    The call is idempotent: a name that already ends with `ext` is returned
    unchanged, so re-running the cell that applies this function does not
    produce names such as "x.jpg.jpg".

    Args:
        fn: file name (string) without or with the extension.
        ext: extension to append, including the leading dot.

    Returns:
        The file name ending in `ext`.
    """
    if fn.endswith(ext):
        return fn
    return fn + ext

In [202]:
# Drop PatientID without failing when the cell is run a second time
# (a plain `del` raises KeyError once the column is gone).
train_df = train_df.drop(columns=['PatientID'], errors='ignore')

# Turn the ids into file names. Ids that already carry the suffix are kept
# as they are, so re-running this cell does not append ".jpg" twice.
study_uids = train_df['StudyInstanceUID']
train_df['StudyInstanceUID'] = study_uids.where(study_uids.str.endswith('.jpg'),
                                                study_uids.apply(append_ext))
train_df.head()

Unnamed: 0,StudyInstanceUID,ETT - Abnormal,ETT - Borderline,ETT - Normal,NGT - Abnormal,NGT - Borderline,NGT - Incompletely Imaged,NGT - Normal,CVC - Abnormal,CVC - Borderline,CVC - Normal,Swan Ganz Catheter Present
0,1.2.826.0.1.3680043.8.498.26697628953273228189...,0,0,0,0,0,0,1,0,0,0,0
1,1.2.826.0.1.3680043.8.498.46302891597398758759...,0,0,1,0,0,1,0,0,0,1,0
2,1.2.826.0.1.3680043.8.498.23819260719748494858...,0,0,0,0,0,0,0,0,1,0,0
3,1.2.826.0.1.3680043.8.498.68286643202323212801...,0,0,0,0,0,0,0,1,0,0,0
4,1.2.826.0.1.3680043.8.498.10050203009225938259...,0,0,0,0,0,0,0,0,0,1,0


In [203]:
# Every column except the image file-name column is a target label.
class_names = train_df.columns.tolist()
class_names.remove('StudyInstanceUID')

#### We will create a class to index mapping so that the model will work irrespective of the order of the columns

In [204]:
# label name -> position of that label in the encoded target vector
class_mapping = {label: position for position, label in enumerate(class_names)}

In [205]:
# Show the label -> index mapping for a visual check.
class_mapping

{'ETT - Abnormal': 0,
 'ETT - Borderline': 1,
 'ETT - Normal': 2,
 'NGT - Abnormal': 3,
 'NGT - Borderline': 4,
 'NGT - Incompletely Imaged': 5,
 'NGT - Normal': 6,
 'CVC - Abnormal': 7,
 'CVC - Borderline': 8,
 'CVC - Normal': 9,
 'Swan Ganz Catheter Present': 10}

### We will use K-Fold cross-validation for training (shuffled `KFold`; the splits are not stratified by label)

In [206]:
# create a function to one hot encode each example's
# labels as an array using the mapping

def one_hot_encode(example_labels_dict, mapping=class_mapping):
    """Encode one example's labels as a 0/1 vector.

    Args:
        example_labels_dict: dict of label name -> value; a truthy value
            marks the label as present.
        mapping: dict of label name -> index in the output vector
            (defaults to the notebook-level class_mapping).

    Returns:
        uint8 numpy array of length len(mapping) with ones at the indices
        of the labels that are present.
    """
    onehot = np.zeros(len(mapping), dtype='uint8')
    for label in example_labels_dict:
        if not example_labels_dict[label]:
            continue  # label absent for this example
        onehot[mapping[label]] = 1
    return onehot
            
    
    
    

In [207]:
# Label columns only; passed as `y` to kf.split in the training cell.
Y = train_df[class_names]

In [208]:
# Number of cross-validation folds.
n_splits = 3

# Rows are shuffled before splitting; the fixed random_state keeps the
# fold assignment the same from run to run.
kf = KFold(n_splits=n_splits, shuffle=True, random_state=7)

### Define the fraction of the overall data to use for cross-validation here (using all of the data for training might take too long)

In [209]:
# Fraction (between 0 and 1) of the rows of train_df used for cross-validation.
train_use_percent = 0.2

# Number of rows that fraction corresponds to, rounded up to a whole row.
n_samples = int(np.ceil(len(train_df) * train_use_percent))

In [210]:
# Show how many rows were selected.
n_samples

6017

#### We will use ImageDataGenerator to turn our images into batches of preprocessed training and validation images during each fold

In [211]:
# Only rescaling is configured (pixel values are multiplied by 1/255);
# no augmentation. Presumably the images are 8-bit, giving values in [0, 1] — TODO confirm.
idg = ImageDataGenerator(rescale=1./255)

#### We also need to save the best model during each fold, so we will also create a function here that builds a model file name for each fold

In [212]:
def get_model_name(k):
    """Return the checkpoint file name for fold `k`, e.g. 'model_1.h5'."""
    fold_label = str(k)
    return 'model_' + fold_label + '.h5'

### MAIN TRAINING LOOP

In [187]:
# Validation scores of the best checkpoint of each fold, in fold order.
VALIDATION_FBETA = []
VALIDATION_LOSS = []

logs_dir = models_dir / 'logs' / model_type
logs_dir.mkdir(parents=True, exist_ok=True)

save_dir = models_dir / model_type
save_dir.mkdir(parents=True, exist_ok=True)

input_dim = 256
# One sigmoid output per label column (11 for the current train.csv).
output_dim = len(class_names)
batch_size = 32
n_epochs = 30

# Maps fold number -> History object returned by model.fit for that fold.
history_log_dict = {}


def make_fold_generator(fold_df, shuffle):
    """Return a batch generator over the images listed in `fold_df`."""
    return idg.flow_from_dataframe(fold_df,
                                   directory=raw_image_data_path,
                                   x_col='StudyInstanceUID',
                                   y_col=class_names,
                                   target_size=(input_dim, input_dim),
                                   color_mode='grayscale',
                                   class_mode='raw',
                                   batch_size=batch_size,
                                   shuffle=shuffle,
                                   seed=42)


fold_splits = kf.split(np.zeros(n_samples), Y[:n_samples])

# Folds are numbered from 1. Taking the number from enumerate guarantees each
# fold gets its own checkpoint file and history entry (a manually maintained
# counter that is never incremented makes every fold overwrite model_1.h5).
for fold_var, (train_index, val_index) in enumerate(fold_splits, start=1):
    # get the data that will be used for training in this fold
    training_data = train_df.iloc[train_index]
    # get the data that will be used for validation in this fold
    validation_data = train_df.iloc[val_index]

    # now set up the generators to feed the data in batches to
    # the model during training
    train_data_generator = make_fold_generator(training_data, shuffle=True)
    # No shuffling for validation: every epoch (and the final evaluation)
    # is then scored on the same images in the same order.
    valid_data_generator = make_fold_generator(validation_data, shuffle=False)

    model = create_new_model(input_dim, output_dim)

    # NOTE(review): looks like a workaround so that the TensorBoard callback
    # works with this Keras/TensorFlow combination — confirm it is still needed.
    model._get_distribution_strategy = lambda: None

    model_filepath = str(save_dir / get_model_name(fold_var))

    # Separate TensorBoard run per fold so the curves do not overlap.
    fold_logs_dir = logs_dir / 'fold_{}'.format(fold_var)
    fold_logs_dir.mkdir(parents=True, exist_ok=True)

    # Create callbacks below
    callbacks_list = [
        keras.callbacks.ModelCheckpoint(
            filepath=model_filepath,
            monitor="val_fbeta",
            # fbeta is a score (higher is better). Without an explicit mode
            # Keras infers "min" for this metric name and would keep the
            # worst epoch instead of the best one.
            mode="max",
            save_best_only=True),
        keras.callbacks.TensorBoard(
            log_dir=str(fold_logs_dir))
    ]

    # Fitting the model (whole batches only; the remainder is dropped)
    step_size_train = train_data_generator.n // train_data_generator.batch_size
    step_size_val = valid_data_generator.n // valid_data_generator.batch_size

    # fit_generator is deprecated so we can use fit
    history = model.fit(x=train_data_generator,
                        steps_per_epoch=step_size_train,
                        validation_data=valid_data_generator,
                        validation_steps=step_size_val,
                        callbacks=callbacks_list,
                        epochs=n_epochs)

    history_log_dict[fold_var] = history

    # now we will just locally load the best model from this fold
    # and evaluate on the validation set
    model.load_weights(model_filepath)

    results = model.evaluate(valid_data_generator)
    results = dict(zip(model.metrics_names, results))

    VALIDATION_FBETA.append(results["fbeta"])
    VALIDATION_LOSS.append(results["loss"])
    
    
        
        
    
    

Found 2407 validated image filenames.
Found 602 validated image filenames.
Epoch 1/10
Epoch 2/10

KeyboardInterrupt: 