In [72]:
# Install openpyxl for xlsx files since Pandas no longer supports them natively
!pip install openpyxl

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import random
import itertools
import cv2
import matplotlib.pyplot as plt
from tqdm import tqdm
import math

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.preprocessing.image import load_img,img_to_array

from keras.utils import np_utils
from keras.models import Model, Sequential, load_model
from keras.layers import Dense, Conv2D, MaxPool2D, Flatten, Reshape, Dropout
from keras.preprocessing import image
from keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.optimizers import Adam
from keras.callbacks import ModelCheckpoint, EarlyStopping

In [73]:
img_dir = '../input/ocular-disease-recognition-odir5k/preprocessed_images'

In [74]:
data = pd.read_csv('../input/ocular-disease-recognition-odir5k/full_df.csv')
data[:10]

In [75]:
data2 = data.iloc[:,7:15]
data2

In [76]:
data.describe()

In [None]:
from collections import Counter


In [77]:
#Selecting columns from ID to Right Diagnostic keywords

In [78]:
data_copy = data.iloc[:,0:7]
data_copy.head(5)

In [79]:
#Checking for word match 'cataract' in Left-Diagnostic Keywords

In [80]:
data_left = data_copy[data_copy['Left-Diagnostic Keywords'].str.match('cataract')]
data_left

In [81]:
#Checking for word match 'cataract' in Right-Diagnostic Keywords

In [82]:
data_right = data_copy[data_copy['Right-Diagnostic Keywords'].str.match('cataract')]
data_right

In the cataract cases, most patients have a cataract in one eye while the other eye is normal.

In [83]:
#Combine both data
data_cataract = data_right['Right-Fundus'].append(data_left['Left-Fundus'], ignore_index = True)
data_cataract

In [84]:
len(data_cataract)

Check a few samples.

In [85]:
data_cataract[141]

In [86]:
img = data_cataract[141]
image = cv2.imread(os.path.join(img_dir,img))
plt.imshow(image)
print(image.shape)
print(img)

In [87]:
# Show the first nine cataract images in a 3x3 grid.
plt.figure(figsize = (6,6))
for i in range(9):
    img = data_cataract[i]
    image = cv2.imread(os.path.join(img_dir,img))
    # OpenCV loads BGR; matplotlib expects RGB.
    rgb_image = cv2.cvtColor(image,cv2.COLOR_BGR2RGB)
    plt.subplot(3,3,i+1)
    plt.imshow(rgb_image)
    plt.xlabel('Filename:{}\nCataract'.format(img))
# The layout only needs to be computed once, after every subplot exists.
plt.tight_layout()

Make a dataframe for normal images

In [88]:
data_ln = data_copy[data_copy['Left-Diagnostic Keywords'].str.match('normal')]
data_ln

In [89]:
data_rn = data_copy[data_copy['Right-Diagnostic Keywords'].str.match('normal')]
data_rn

In [93]:
#combine both
data_normal = data_rn['Right-Fundus'].append(data_ln['Left-Fundus'],ignore_index = True)
data_normal

In [94]:
len(data_normal)

In [95]:
# Draw a fixed-size random subset of the normal filenames so the normal class
# does not dominate the cataract class.
# NOTE(review): 494 is presumably meant to match the number of cataract images;
# confirm against len(data_cataract).
# random_state makes the draw reproducible on Restart & Run All.
data_normal_rand = data_normal.sample(n=494, random_state=42)
data_normal_rand.head()

In [96]:
# Visualise few samples
plt.figure(figsize=(6,6))
for i in range(9):
    img = data_normal[i]
    image = cv2.imread(os.path.join(img_dir,img))
    rgb_image = cv2.cvtColor(image,cv2.COLOR_BGR2RGB)
    plt.subplot(3,3,i+1)
    plt.imshow(rgb_image)
    plt.xlabel('Filename: {}\n''Normal'.format(data_normal[i]))
    plt.tight_layout()

In [97]:
print(type(data_normal_rand))

Convert both Series of filenames (normal and cataract) to dataframes

In [98]:
cataract_df = pd.DataFrame(data_cataract, columns =['images'])
cataract_df

In [99]:
#add label column as 'Glaucoma'
cataract_df['label'] = 'cataract'
cataract_df

In [100]:
normal_df = pd.DataFrame(data_normal_rand,columns =['images'])
normal_df

In [101]:
#add label column as 'normal'
normal_df['label'] = 'normal'
normal_df

In [102]:
#combine both
df_combined = cataract_df.append(normal_df, ignore_index= True)
df_combined

Prior to feeding this organized set of cataract and normal images we need to randomize the rows within so that when we train we will train from a random pool of samples.

When we reset the index after shuffling, pandas by default adds a column holding the old index values. We don't want any new columns, so we drop the old index (`drop=True`) and let the new dataframe get a fresh one.

In [103]:
# Shuffle every row (frac=1) and renumber the index from 0. The fixed
# random_state keeps the shuffle, and so the train/test split taken from it,
# reproducible across kernel restarts.
df = df_combined.sample(frac=1, random_state=42).reset_index(drop=True)
df

In [104]:
# pull 80% of the combined dataset and reserve it for the training data
# the data generator will automatically create a validation set for us later
df_train = df.sample(frac=0.8,random_state=42)

# exclude the 80% that was already chosen, the remaining 20% will go into testing
# (this must happen BEFORE the indices are reset, since it relies on df's labels)
df_test = df.drop(df_train.index)

# reset_index returns a new frame, so the result has to be assigned;
# calling it without assignment leaves the frame unchanged.
df_train = df_train.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

# The two splits must partition the full dataset.
assert len(df_train) + len(df_test) == len(df)

print(len(df))
print(len(df_train))
print(len(df_test))

In [105]:
# Training pipeline: multiply pixel values by 1/255 and apply random rotation,
# flips, shear and brightness changes. validation_split reserves 20% of the
# dataframe later passed to flow_from_dataframe for the "validation" subset.
# Width/height shifts and zoom (0.2 each) are currently disabled.
train_datagen=tf.keras.preprocessing.image.ImageDataGenerator(
            rescale=1./255.,
            validation_split=0.20,
            rotation_range=90,
            horizontal_flip=True,
            vertical_flip=True,
            shear_range=0.2,
            brightness_range=[0.3,1]
            )

## for testing we don't want to do too much augmentation, we'll just scale it.
# Use the same tf.keras class as train_datagen rather than the standalone
# `keras` import, so both generators come from one implementation.
test_datagen=tf.keras.preprocessing.image.ImageDataGenerator(rescale=1./255.)

In [106]:
df_train['label'] = df_train['label'].astype(str)
df_test['label'] = df_test['label'].astype(str)

In [107]:
img_size =224

In [108]:
# Arguments shared by all three generators.
flow_kwargs = dict(
    directory=img_dir,
    x_col="images",
    y_col="label",
    batch_size=32,
    class_mode="categorical",
    target_size=(img_size, img_size),
)

# Training subset: the augmented portion of df_train.
train_generator = train_datagen.flow_from_dataframe(
    dataframe=df_train,
    subset="training",
    seed=42,
    shuffle=True,
    **flow_kwargs)

## validation set is created from the training set,
## we set it at 20% of the training data in the previous code
# Validation images must NOT be augmented, otherwise val_accuracy (which drives
# checkpointing and early stopping) is measured on distorted images. A
# rescale-only generator with the SAME validation_split selects exactly the
# rows that train_datagen's "training" subset excludes, because the split is a
# positional slice of the dataframe rather than a random draw.
valid_datagen = tf.keras.preprocessing.image.ImageDataGenerator(
    rescale=1./255.,
    validation_split=0.20,  # keep in sync with train_datagen
)
valid_generator = valid_datagen.flow_from_dataframe(
    dataframe=df_train,
    subset="validation",
    seed=42,
    shuffle=True,
    **flow_kwargs)

# Test set: rescale only, and keep file order so predictions line up with
# test_generator.filenames / test_generator.labels.
test_generator = test_datagen.flow_from_dataframe(
    dataframe=df_test,
    shuffle=False,
    **flow_kwargs)

In [109]:
train_image_data, train_labels = train_generator.next()
#train_image_data[0]
train_image_data.shape

In [110]:
train_labels[0]

In [111]:

#from imblearn.over_sampling import SMOTE
#x_train = pd.DataFrame(X_train)
#X_resample, y_resampled = SMOTE().fit_resample(X_train, y_train)

In [112]:
# get VGG16 base model (convolutional part only, ImageNet weights)
# The input shape reuses img_size so it always matches the generators' target_size.
vgg16 = keras.applications.vgg16.VGG16(input_shape=(img_size, img_size, 3),
                                       weights='imagenet',
                                       include_top=False)

# Freeze the pretrained layers so only the new head is trained.
for layer in vgg16.layers:
    layer.trainable = False

# add new dense layers at the top
x = keras.layers.Flatten()(vgg16.output)
x = keras.layers.Dense(1024, activation='relu')(x)
x = keras.layers.Dropout(0.5)(x)
x = keras.layers.Dense(128, activation='relu')(x)

## remember we are using 2 outputs only
predictions = keras.layers.Dense(2, activation='softmax')(x)

# define and compile model
model = keras.Model(inputs=vgg16.inputs, outputs=predictions)

# The generators yield one-hot labels (class_mode="categorical") and the head
# is a softmax over the classes, so categorical cross-entropy is the matching loss.
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

In [113]:
# Save the whole model whenever validation accuracy improves.
# save_freq='epoch' replaces the deprecated `period=1`; mode='max' states
# explicitly that higher val_accuracy is better.
checkpoint = ModelCheckpoint("cataract_vgg16.h5",
                             monitor='val_accuracy',
                             verbose=1,
                             save_best_only=True,
                             save_weights_only=False,
                             mode='max',
                             save_freq='epoch')

# Stop after 3 epochs without improvement and roll the in-memory model back to
# its best epoch, so later evaluation cells do not use the last (worse) weights.
early = EarlyStopping(monitor='val_accuracy',
                      min_delta=0,
                      patience=3,
                      verbose=1,
                      mode='max',
                      restore_best_weights=True)



In [114]:
# Training schedule: batch size and number of epochs.
batch_size = 32
n_epochs = 30

# Whole batches available per epoch (a partial final batch is not counted).
n_spe = train_generator.samples // batch_size
n_val_steps = valid_generator.samples // batch_size

print(n_spe, n_val_steps)

In [115]:
# Collect the training options in one place, then launch training.
fit_options = dict(
    steps_per_epoch=n_spe,
    validation_data=valid_generator,
    validation_steps=n_val_steps,
    epochs=n_epochs,
    shuffle=True,
    workers=5,
    use_multiprocessing=True,
    callbacks=[checkpoint, early],
)
# hist.history holds the per-epoch metrics that are plotted afterwards.
hist = model.fit(train_generator, **fit_options)

In [None]:
# Reload the best checkpoint written by the ModelCheckpoint callback.
# The original path literal contained a stray double quote, so the file could
# never be found; the path now matches where the checkpoint is saved.
# load_model is already imported in the first cell.
model = load_model("cataract_vgg16.h5")


In [116]:
# One figure per metric: training curve and validation curve over the epochs.
curve_specs = (
    ("accuracy", "val_accuracy", "Model Accuracy", "Accuracy",
     ["Accuracy","Validation Accuracy"]),
    ("loss", "val_loss", "Model Loss", "Loss",
     ["loss","Validation Loss"]),
)

for train_key, val_key, title, y_label, legend_entries in curve_specs:
    plt.plot(hist.history[train_key])
    plt.plot(hist.history[val_key])
    plt.title(title)
    plt.ylabel(y_label)
    plt.xlabel("Epoch")
    plt.legend(legend_entries)
    plt.show()

In [117]:
test_generator.reset()

In [118]:
pred = model.predict_generator(test_generator,verbose=1,steps=test_generator.samples/batch_size)


In [119]:
print(pred[0:10])
predicted_class_idx=np.argmax(pred,axis=1)

## print the same 10 rows

print(predicted_class_idx[0:25])

In [120]:
print(len(predicted_class_idx))

In [121]:
model.evaluate(test_generator,use_multiprocessing=True,workers=10)

In [122]:
valid_generator.class_indices.items()

In [123]:
valid_labels = dict((value,key) for key,value in valid_generator.class_indices.items())
pred_labels = [valid_labels[key] for key in predicted_class_idx]
pred_labels[1:25]

In [124]:
filenames = test_generator.filenames
prediction_df = pd.DataFrame({'Filename': filenames,'Prediction': pred_labels})
prediction_df.head()

In [125]:
prediction_df.iloc[35]

In [126]:
print(test_generator.filenames[35])
print(test_generator.labels[35])

In [127]:
# Filenames, integer labels and class mapping exposed by the test generator.
test_file_names = test_generator.filenames
test_labels = test_generator.labels
class_dict = test_generator.class_indices

print (class_dict) # have a look at the dictionary

# Reverse mapping: class index -> class name.
new_dict = {index: name for name, index in class_dict.items()}

print('  RESULT  PREDICT      TRUE CLASS       FILENAME ' ) # adjust spacing based on your class names

# One output line per test image: verdict, predicted class, true class, file.
for i, p in enumerate(pred):
    pred_index = np.argmax(p)              # position of the highest probability
    pred_class = new_dict[pred_index]      # predicted class name
    true_class = new_dict[test_labels[i]]  # true class name
    file = test_file_names[i]

    result = "Correct" if true_class == pred_class else "Wrong  "

    print(f' {result}   {pred_class}    {true_class}      {file}')

In [128]:
x_test, y_test = test_generator.next()

In [129]:
print(len(x_test))
print(len(y_test))


## compare this length to our prediction data and notice the difference.  

print(len(pred))

In [130]:
loss,accuracy = model.evaluate(x_test,y_test)
print("loss:",loss)
print("Accuracy:",accuracy)

In [131]:
test_image_data, test_labels = test_generator.next()

In [132]:
print(test_image_data.shape)
print(test_labels.shape)

In [133]:
z = 0
#test filename

test_file_names=test_generator.filenames[z]
print(test_file_names)

test_labels_example=test_generator.labels[z]
print(test_labels_example)

pred_labels[z]

In [134]:
test_labels[5]

In [135]:
test_class_idx=np.argmax(test_labels,axis=1)
#test_class_idx[4]
test_class_idx

In [136]:
# 3x6 grid of randomly chosen test images, each annotated with its true label,
# predicted label, verdict, filename and row number.
plt.figure(figsize=(12,6))
for i in range(18):
    sample = random.choice(range(test_generator.samples))
    img = test_generator.filenames[sample]

    # OpenCV loads BGR; convert to RGB for matplotlib.
    image = cv2.imread(os.path.join(img_dir, img))
    image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

    category = test_generator.labels[sample]
    pred_category = pred_labels[sample]

    # Class index 1 is shown as "Normal"; any other index as "Cataract".
    label = "Normal" if category == 1 else "Cataract"
    pred_label = "Normal" if pred_category == "normal" else "Cataract"
    result = "Correct" if label == pred_label else "Wrong"

    plt.subplot(3,6,i+1)
    plt.imshow(image_rgb, interpolation='nearest')
    caption = "Actual:{}\nPrediction:{}\nResult:{}\nF:{}\nRow:{}".format(
        label,
        pred_label,
        result,
        test_generator.filenames[sample],
        sample)
    plt.xlabel(caption)
plt.tight_layout()

In [138]:
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score
print('Classification Report')
target_names = ['Cataract', 'Normal']
print(classification_report(test_generator.classes, predicted_class_idx, target_names=target_names))

In [139]:
cm = confusion_matrix(test_generator.labels, predicted_class_idx)
print('Confusion Matrix')
cm

In [140]:
def plot_confusion_matrix(cm, classes,
                        normalize=False,
                        title='Confusion matrix',
                        cmap=plt.cm.Blues):
    """
    Print and plot a confusion matrix as an annotated heatmap.

    Parameters
    ----------
    cm : array-like, 2-D
        Confusion matrix with true classes on the rows and predicted classes
        on the columns.
    classes : sequence
        Tick labels, one per class, in the same order as the rows/columns.
    normalize : bool, default False
        If True, every row is divided by its sum before printing and plotting,
        so each cell shows a fraction of the true class.
    title : str
        Title of the plot.
    cmap : matplotlib colormap
        Colormap used for the heatmap.

    Returns
    -------
    None. The matrix is printed and drawn on the current matplotlib figure.
    """
    cm = np.asarray(cm)

    # Normalise BEFORE drawing so the heatmap, the printed matrix and the cell
    # annotations all show the same numbers.
    if normalize:
        row_totals = cm.sum(axis=1, keepdims=True)
        # Rows with no samples stay all-zero instead of dividing by zero.
        cm = np.divide(cm.astype('float'), row_totals,
                       out=np.zeros(cm.shape, dtype=float),
                       where=row_totals != 0)
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    # Integer counts are printed as-is; fractions are rounded to two decimals.
    cell_format = 'd' if np.issubdtype(cm.dtype, np.integer) else '.2f'

    # Use white text on dark cells and black text on light cells.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], cell_format),
            horizontalalignment="center",
            color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

In [142]:
cm_plot_labels = ['cataract','normal']

In [143]:
plot_confusion_matrix(cm=cm, classes=cm_plot_labels, title='Confusion Matrix')