**Diabetic Retinopathy Detection**

This notebook builds a prediction kernel for diabetic retinopathy grading using transfer learning with a fine-tuned VGG-16 architecture.


In [None]:
#All Necessary Imports
import numpy as np
import os
import time
from keras.applications.vgg16 import VGG16
from keras.preprocessing import image
from keras.applications.vgg16 import preprocess_input
from keras.layers import Dense, Activation, Flatten
from keras.layers import merge, Input
from keras.models import Model
from keras.utils import np_utils
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split


In [None]:
# Show what is available under the input directory.
import os
print(os.listdir("../input"))

In [None]:

import os, sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import skimage.io
from skimage.transform import resize
from imgaug import augmenters as iaa
from tqdm import tqdm
import PIL
from PIL import Image, ImageOps
import cv2
from sklearn.utils import class_weight, shuffle
import keras.backend as K
import tensorflow as tf
from tensorflow import keras

from sklearn.metrics import f1_score, fbeta_score
from keras.utils import Sequence
from keras.utils import to_categorical
from sklearn.model_selection import train_test_split

WORKERS = 2
CHANNEL = 3  # presumably the number of colour channels per image — not referenced in the visible cells

import warnings
warnings.filterwarnings("ignore")  # silence all Python warnings for the rest of the notebook
NUM_CLASSES = 5  # number of label classes; used below when one-hot encoding the labels
SEED = 77
TRAIN_NUM = -1 # use 1000 when you just want to explore new idea, use -1 for full train

In [None]:
# Display the current working directory.
os.getcwd()

In [None]:
# Image width and height in pixels; used below when flattening and reshaping the image arrays.
img_cols, img_rows =224,224

In [None]:
# Training hyper-parameters.
# number of samples per training batch
batch_size = 32
# number of output classes
nb_classes = 5
# number of epochs to train
nb_epoch = 20

**Model Selection**

We load the base model, which is a VGG-16 network pretrained on ImageNet.
We then freeze all of its layers except the last four (the three convolutions and the pooling layer of the final block).

In [None]:
# Load the VGG-16 convolutional base with ImageNet weights.
# include_top=False leaves out the original dense classifier so a custom head can be attached.
base_model = VGG16(
    include_top=False,
    weights='imagenet',
    input_shape=(224, 224, 3),
)
base_model.summary()


In [None]:
# Freeze every layer of the base model except the last four.
for frozen in base_model.layers[:-4]:
    frozen.trainable = False

# Print each layer together with its trainable flag to verify the result.
for lyr in base_model.layers:
    print(lyr, lyr.trainable)

base_model.summary()

In [None]:
# Write a diagram of the base model (with tensor shapes and layer names) to a PNG file.
from keras.utils.vis_utils import plot_model
plot_model(base_model, to_file='base_model_plot.png', show_shapes=True, show_layer_names=True)

In [None]:
# Render the base model's layer graph inline as an SVG (laid out with Graphviz `dot`).
from IPython.display import SVG
from keras.utils.vis_utils import model_to_dot

SVG(model_to_dot(base_model).create(prog='dot', format='svg'))

Before we go about pre processing data and training our loaded model, we fix the following
* Batch Size of the data required for training
* nb_classes -> indicates the number of output classes
* nb_epoch -> indicates the number of training epochs (full passes over the training data)

**Fine Tune : VGG-16**

We move on to add customised layers on top of our pre-loaded model for purpose of fine-tuning.
The following layers were added :
* Flatten
* Dense Relu
* Dropout
* Dense Softmax

In [None]:
from keras import models
from keras import layers
from keras import optimizers

# Stack the VGG-16 convolutional base and a new classification head:
# flatten -> 1024-unit ReLU dense layer -> 50% dropout -> softmax over nb_classes outputs.
model = models.Sequential([
    base_model,
    Flatten(),
    layers.Dense(1024, activation='relu'),
    layers.Dropout(0.5),
    layers.Dense(nb_classes, activation='softmax', name ='output'),
])

# Print the model summary to check the number of trainable parameters.
model.summary()


In [None]:
# Write a diagram of the full fine-tuning model (with tensor shapes and layer names) to a PNG file.
from keras.utils.vis_utils import plot_model
plot_model(model, to_file='finetune_model_plot.png', show_shapes=True, show_layer_names=True)

In [None]:
# Render the fine-tuning model's layer graph inline as an SVG (laid out with Graphviz `dot`).
from IPython.display import SVG
from keras.utils.vis_utils import model_to_dot

SVG(model_to_dot(model).create(prog='dot', format='svg'))

In [None]:
# Display the entries of the input directory.
os.listdir('../input')

Loading the dataset

In [None]:
import pandas as pd
# Read the label table; later cells use its `image` and `level` columns.
trainLabels =  pd.read_csv("../input/dr1000/cropped_resized/cropped.csv")
# Show (rows, columns) of the label table.
trainLabels.shape



**Finding class distribution**

In [None]:
import seaborn as sns
# Bar chart of the number of images per severity level.
# The column is passed through the `x` keyword: newer seaborn releases no longer
# accept it as the first positional argument (that slot is now `data`).
sns.countplot(x="level", data=trainLabels)

In [None]:
from collections import Counter
# Count how many rows of the label table carry each severity level.
counter = Counter(trainLabels['level'])
print(counter)

The dataset is highly imbalanced with maximum data having severity 0, i.e., no Diabetic Retinopathy

In [None]:
# Display the current working directory.
import os
os.getcwd()

In [None]:
# Directory whose image files are loaded and matched to labels below.
trainDir = "../input/dr1000/cropped_resized/resized_data"


In [None]:
def load_ben_color(trainDir, sigmaX=10, img_size=224):
    """Load every image in ``trainDir`` as a blurred, normalised RGB array.

    Each file is read with OpenCV, converted from BGR to RGB, resized to
    ``img_size`` x ``img_size``, smoothed with a 5x5 Gaussian blur and scaled
    to floats in the range [0, 1].

    Args:
        trainDir: Directory containing the image files.
        sigmaX: Currently unused; kept so existing calls keep working. The
            blur below passes 0, so OpenCV derives sigma from the kernel size.
        img_size: Side length in pixels of the square output images. This
            replaces the previously undefined ``IMG_SIZE`` global.

    Returns:
        A list with one float array per file, in ``os.listdir`` order.

    Raises:
        IOError: If a file in ``trainDir`` cannot be decoded as an image.
    """
    data = []
    images = os.listdir(trainDir)
    print("Number of files in new_dataset is " + str(len(images)))
    for imagefilename in images:
        imagefullpath = os.path.join(trainDir, imagefilename)
        image = cv2.imread(imagefullpath)
        if image is None:
            # cv2.imread signals failure by returning None; fail with the file
            # name instead of an opaque error from cvtColor. Skipping the file
            # silently would shift the images out of step with their labels.
            raise IOError("Could not read image file: " + imagefullpath)
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        image = cv2.resize(image, (img_size, img_size))
        image = cv2.GaussianBlur(image, (5, 5), 0)
        # Scale the pixel values to [0, 1] as floats.
        data.append(np.array(image, dtype="float") / 255.0)
    return data

In [None]:
# Archive of preprocessed images; the following cell reads the image arrays back from this file.
PREPROCESSED_NPZ = '../input/1000preprocessed224/new_224_preprocessed1000samples_imgarr.npz'

# Run the expensive preprocessing (and the write) only when the archive is missing.
# Delete the archive to force the images to be preprocessed again.
if not os.path.exists(PREPROCESSED_NPZ):
    data = load_ben_color(trainDir)
    np.savez(PREPROCESSED_NPZ, *data)

In [None]:
# Read the preprocessed images back from the .npz archive, one array per stored entry.
container = np.load('../input/1000preprocessed224/new_224_preprocessed1000samples_imgarr.npz')
data = [container[name] for name in container.files]

In [None]:
# Look up the severity level of every image file in trainDir.
# A dict built once replaces a full scan of the label table per file. When an
# image id appears more than once the first row wins, as `.values[0]` did.
# NOTE(review): assumes os.listdir(trainDir) yields the files in the same order
# as when the image arrays were produced — confirm.
label_by_image = (
    trainLabels.drop_duplicates(subset="image")
    .set_index("image")["level"]
    .to_dict()
)
images = os.listdir(trainDir)
imagelabels = []
for imagefilename in images:
    image_id = imagefilename.replace(".jpeg", "")
    if image_id not in label_by_image:
        # Name the offending file instead of failing with a bare IndexError.
        raise KeyError("No label found in trainLabels for image file " + repr(imagefilename))
    imagelabels.append(label_by_image[image_id])

In [None]:
from sklearn.utils import shuffle

# Turn the image list and the label list into NumPy arrays.
data, imagelabels = np.asarray(data), np.asarray(imagelabels)

In [None]:
# Flatten every image into one row of img_cols*img_rows*3 values (the later SMOTE step is fed these 2-D rows).
data= data.reshape(data.shape[0], img_cols*img_rows*3)

In [None]:
# Inspect the shape of the flattened image array.
data.shape

In [None]:
# Shuffle images and labels together with a fixed random_state, then bundle them as [images, labels].
img_data,Label = shuffle(data,imagelabels, random_state=2)
train_data = [img_data,Label]

In [None]:
# Unpack the shuffled images as X and the shuffled labels as y.
(X, y) = (train_data[0],train_data[1])

In [None]:
from collections import Counter
# Count the samples per class in the shuffled labels.
counter = Counter(y)
print(counter)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 15% of the samples as a test set, stratified on the labels.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.15,
    stratify=y,
    random_state=4,
)

# Report the shape of each split.
for split in (X_train, y_train, X_test, y_test):
    print(split.shape)

Using SMOTE to tackle the class imbalance

In [None]:
from imblearn.over_sampling import SMOTE
# Oversample the training split only; the test split is left untouched.
# random_state pins SMOTE's synthetic-sample generation so the resampled
# training set is the same on every run.
X_resample, y_resampled = SMOTE(random_state=SEED).fit_resample(X_train, y_train)

In [None]:
# SMOTE was fed flat feature rows; restore the (samples, img_cols, img_rows, 3)
# image layout that the convolutional model takes as input, for both the
# resampled training set and the test set. Re-running this cell is harmless.
X_resample = X_resample.reshape(X_resample.shape[0], img_cols, img_rows, 3)
X_test = X_test.reshape(X_test.shape[0], img_cols, img_rows, 3)

In [None]:
from collections import Counter
# Count the samples per class after SMOTE oversampling.
counter = Counter(y_resampled)
print(counter)

In [None]:
from keras.utils import np_utils

# convert class vectors to binary class matrices
# The model ends in a softmax layer and is trained with categorical_crossentropy,
# so the training labels must be one-hot encoded as well as the test labels.
# The ndim guard keeps a re-run of this cell from encoding an already encoded matrix.
if y_resampled.ndim == 1:
    y_resampled = np_utils.to_categorical(y_resampled, NUM_CLASSES)
Y_test = np_utils.to_categorical(y_test, NUM_CLASSES)

In [None]:

# Track AUC under the name 'auc' in addition to accuracy: the callbacks and the
# checkpoint filename in this notebook refer to 'val_auc', which Keras only logs
# when a metric named 'auc' is compiled into the model.
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['acc', tf.keras.metrics.AUC(name='auc')])

In [None]:
#batch_size to train
batch_size = 32
# number of output classes
nb_classes = 5
# number of epochs to train
nb_epoch = 20

In [None]:
from sklearn.metrics import roc_auc_score

In [None]:
# Filename template handed to the ModelCheckpoint callback below; it contains {epoch} and {val_auc} format placeholders.
checkpoint_filepath = './my_best_model.epoch{epoch:02d}-val_auc{val_auc:.2f}.hdf5'

In [None]:

# Reduce the learning rate (never below 1e-6) once validation AUC has stopped improving for 3 epochs.
lr_reducer = tf.keras.callbacks.ReduceLROnPlateau(
    monitor="val_auc",
    mode='max',
    patience=3,
    min_lr=1e-6,
)

# Stop training once validation AUC has failed to improve by at least 0.0001 for 6 epochs.
early_stop = tf.keras.callbacks.EarlyStopping(
    monitor='val_auc',
    mode='max',
    min_delta=0.0001,
    patience=6,
)

In [None]:
# Save the whole model (not only its weights) whenever validation AUC reaches a new maximum.
model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_filepath,
    monitor='val_auc',
    mode='max',
    save_best_only=True,
    save_weights_only=False,
)

In [None]:
# Train on the resampled training set and validate on the held-out test split.
# Epoch count and batch size come from the hyper-parameter cell instead of
# being repeated here as literals, so there is a single place to tune them.
history = model.fit(
    X_resample,
    y_resampled,
    epochs=nb_epoch,
    batch_size=batch_size,
    validation_data=(X_test, Y_test),
    callbacks=[model_checkpoint_callback, lr_reducer, early_stop],
)


In [None]:
# Summarize the training history: one figure for accuracy, one for loss.
# Each figure is saved BEFORE plt.show(), because showing a figure releases it
# and a later plt.savefig() would write an empty image.
for history_key, label, filename in (
    ('acc', 'accuracy', 'model_accuracy.png'),
    ('loss', 'loss', 'model_loss.png'),
):
    fig, ax = plt.subplots()
    ax.plot(history.history[history_key])
    ax.plot(history.history['val_' + history_key])
    ax.set_title('model ' + label)
    ax.set_ylabel(label)
    ax.set_xlabel('epoch')
    ax.legend(['train', 'test'], loc='upper left')
    fig.savefig(filename)
    plt.show()

In [None]:
from keras.models import load_model
#best model in 128model67%,epoch9-67%,epoch-7=66%
#model_path = './my_best_model.epoch07-loss1.02.hdf5'
# Path of a previously saved model file under the input directory.
# NOTE(review): directory and file name differ from the checkpoint template used
# for training in this notebook, so this looks like a model from another run — confirm
# it matches the input size and preprocessing used here.
model_path ='../input/128model67/my_best_model.epoch09-loss1.02.hdf5'

In [None]:
# Replace the in-memory model with the one stored at model_path.
model = load_model(model_path)

In [None]:
# Evaluate the loaded model on the test split and print what model.evaluate returns.
score = model.evaluate(X_test, Y_test, verbose=0)
print(score)

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import cohen_kappa_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix

In [None]:
# Model output for every test image; argmax over axis 1 in the next cell picks the predicted class.
predict_x = model.predict(X_test) 
print(predict_x)

In [None]:
# Predicted class = index of the largest output per row.
PREDICTION = np.argmax(predict_x,axis=1)

In [None]:
# Recover the integer class labels from the one-hot test labels.
ACTUAL = np.argmax(Y_test, axis =1)
ACTUAL.shape

In [None]:
# accuracy: (tp + tn) / (p + n)
accuracy = accuracy_score(ACTUAL,PREDICTION)
print('Accuracy: %f' % accuracy)

# Precision, recall and F1 are macro-averaged: the unweighted mean of the
# per-class scores. With micro averaging on single-label multiclass data all
# three are numerically equal to the accuracy above, which hides how the
# minority classes perform.
# precision tp / (tp + fp)
precision = precision_score(ACTUAL, PREDICTION, average='macro')
print('Precision: %f' % precision)
# recall: tp / (tp + fn)
recall = recall_score(ACTUAL, PREDICTION, average='macro')
print('Recall: %f' % recall)
# f1: 2 tp / (2 tp + fp + fn)
f1 = f1_score(ACTUAL, PREDICTION, average='macro')
print('F1 score: %f' % f1)

In [None]:
# Confidence of each prediction: the largest value in its output row, as a percentage rounded to a whole number.
probability = [round(max(class_probs) * 100, 0) for class_probs in predict_x]
probability
        

In [None]:
# Start a results table with the true class of every test sample.
grade_df = pd.DataFrame(ACTUAL, columns=["actual"])
grade_df.head(10)

In [None]:
# Add the predicted class next to the true class.
grade_df["predicted"] = PREDICTION
grade_df.head(10)

In [None]:
# Flag each row with the string 'True' when prediction and ground truth agree, otherwise 'False'.
is_match = grade_df['actual'] == grade_df['predicted']
grade_df['match'] = is_match.map({True: 'True', False: 'False'})
grade_df

In [None]:
# Add the per-sample confidence percentages computed above.
grade_df["probabilty_score"] = probability
grade_df

In [None]:
# Write the results table (with row index and header) to a CSV file.
grade_df.to_csv('disease_grade.csv',index = True,header = True)

In [None]:
# Confusion matrix for actual and predicted values.
# labels=[0,1,2,3,4] fixes the row/column order and keeps all five classes even if one is absent.
matrix = confusion_matrix(ACTUAL,PREDICTION, labels=[0,1,2,3,4])
print('Confusion matrix :')
print(matrix)

In [None]:
# Wrap the confusion matrix in a DataFrame whose rows and columns carry the grade names.
grade_names = ['NO_DR','MILD','MODERATE','SEVERE', 'PROLIFERATIVE_DR']
cm_df = pd.DataFrame(matrix, index=grade_names, columns=grade_names)

In [None]:
#Plotting the confusion matrix
fig, ax = plt.subplots(figsize=(5, 4))
# fmt='d' prints the cell counts as plain integers; the default annotation
# format switches to scientific notation for counts of three or more digits.
sns.heatmap(cm_df, annot=True, fmt='d', ax=ax)
ax.set_title('Confusion Matrix')
ax.set_ylabel('Actual Values')
ax.set_xlabel('Predicted Values')
plt.show()
