In [6]:
# import required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import glob
import plotly.graph_objects as go
import cv2
from PIL import Image
from PIL import ImageFile
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import tensorflow as tf
from keras.preprocessing.image import ImageDataGenerator
from keras.utils.np_utils import to_categorical
from keras.applications.vgg16 import VGG16
from keras.models import Sequential
from keras.layers import Dense, Flatten , Dropout
from tensorflow.keras.optimizers import Adam
from keras.callbacks import EarlyStopping, ReduceLROnPlateau,ModelCheckpoint 
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import warnings
warnings.filterwarnings("ignore")
ImageFile.LOAD_TRUNCATED_IMAGES = True

### Data Analysis

In [None]:
root_dir = '../input/intel-mobileodt-cervical-cancer-screening'
train_dir = os.path.join(root_dir,'train', 'train')

type1_dir = os.path.join(train_dir, 'Type_1')
type2_dir = os.path.join(train_dir, 'Type_2')
type3_dir = os.path.join(train_dir, 'Type_3')

train_type1_files = glob.glob(type1_dir+'/*.jpg')
train_type2_files = glob.glob(type2_dir+'/*.jpg')
train_type3_files = glob.glob(type3_dir+'/*.jpg')

added_type1_files  =  glob.glob(os.path.join(root_dir, "additional_Type_1_v2", "Type_1")+'/*.jpg')
added_type2_files  =  glob.glob(os.path.join(root_dir, "additional_Type_2_v2", "Type_2")+'/*.jpg')
added_type3_files  =  glob.glob(os.path.join(root_dir, "additional_Type_3_v2", "Type_3")+'/*.jpg')


type1_files = train_type1_files + added_type1_files
type2_files = train_type2_files + added_type2_files
type3_files = train_type3_files + added_type3_files

print(f'''Type 1 files for training: {len(train_type1_files)} 
Type 2 files for training: {len(train_type2_files)}
Type 3 files for training: {len(train_type3_files)}''' )

print(f'''Added Type 1 files for training: {len(added_type1_files)} 
Added Type 2 files for training: {len(added_type2_files)}
Added Type 3 files for training: {len(added_type3_files)}''' )

print(f'''Type 1 files for training: {len(type1_files)} 
Type 2 files for training: {len(type2_files)}
Type 3 files for training: {len(type3_files)}''' )

In [None]:
# # create dataframe of file and labels
files = {'filepath': type1_files + type2_files + type3_files,
          'label': ['Type 1']* len(type1_files) + ['Type 2']* len(type2_files) + ['Type 3']* len(type3_files)}

files_df = pd.DataFrame(files).sample(frac=1, random_state= 1).reset_index(drop=True)
files_df

In [None]:
# display sample images of types
for label in ('Type 1', 'Type 2', 'Type 3'):
    filepaths = files_df[files_df['label']==label]['filepath'].values[:5]
    fig = plt.figure(figsize= (15, 6))
    for i, path in enumerate(filepaths):
        img = cv2.imread(path)
        img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)
        img = cv2.resize(img, (224, 224))
        fig.add_subplot(1, 5, i+1)
        plt.imshow(img)
        plt.subplots_adjust(hspace=0.5)
        plt.axis(False)
        plt.title(label)

### Data Processing

In [13]:
#  split the data into train  and validation set
train_df, eval_df = train_test_split(files_df, test_size= 0.2, stratify= files_df['label'], random_state= 1)
val_df, test_df = train_test_split(eval_df, test_size= 0.5, stratify= eval_df['label'], random_state= 1)
print(len(train_df), len(val_df), len(test_df))

6572 821 822


In [14]:
# loads images from dataframe
def load_images(dataframe):
    features = []
    filepaths = dataframe['filepath'].values
    labels = dataframe['label'].values
    
    for path in filepaths:
        img = cv2.imread(path)
        if img is None:
            print(dataframe[dataframe['filepath'] == path])
            dataframe.drop(dataframe[dataframe['filepath'] == path].index, inplace = True)
            continue
        resized_img = cv2.resize(img, (180, 180))
        features.append(np.array(resized_img))
    return np.array(features), np.array( dataframe['label'].values)

In [15]:
# initially we loaded from dataset but later we saved it into pickle and loaded from it 
# pickle contained all the transformed low res images, while the files contained all the high res images 
# train_features, train_labels = load_images(train_df)
# val_features, val_labels = load_images(val_df)
# test_features, test_labels = load_images(test_df)

In [16]:
import pickle
train_features, train_labels = None, None
with open('/kaggle/input/dataset-project/dataset/train.pickle', 'rb') as handle:
    train_features, train_labels = pickle.load(handle)

val_features, val_labels = None, None
with open('/kaggle/input/dataset-project/dataset/val.pickle', 'rb') as handle:
    val_features, val_labels = pickle.load(handle)

test_features, test_labels = None, None
with open('/kaggle/input/dataset-project/dataset/test.pickle', 'rb') as handle:
    test_features, test_labels = pickle.load(handle)    

In [17]:
# import pickle 
# with open('test.pickle', 'wb') as handle:
#     pickle.dump((test_features, test_labels), handle, protocol=pickle.HIGHEST_PROTOCOL)

In [18]:
# check lengths of training and evaluation  sets
len(train_features), len(train_labels), len(test_features), len(test_labels), len(test_features), len(test_labels) 

(6569, 6569, 822, 822, 822, 822)

In [19]:
# get image shape
InputShape = train_features[0].shape
print(InputShape)

(180, 180, 3)


In [20]:
# normalize the features
X_train = train_features/255
X_val  = val_features/255
X_test  = test_features/255

In [21]:
# encode the labels
le = LabelEncoder().fit(['Type 1', 'Type 2', 'Type 3'])
y_train = le.transform(train_labels)
y_val = le.transform(val_labels)
y_test = le.transform(test_labels)

In [22]:
# initialize image data generator for training and evaluation sets
import numpy as np 
import random
def add_noise(img):
    '''Add random noise to an image'''
    VARIABILITY = 50
    deviation = VARIABILITY*random.random()
    noise = np.random.normal(0, deviation, img.shape)
    img += noise
    np.clip(img, 0., 255.)
    return img

train_datagen = ImageDataGenerator(
                                rotation_range = 40,
                                zoom_range = 0.2,
                                width_shift_range=0.2,
                                height_shift_range=0.2,
                                shear_range=0.2,
                                horizontal_flip=True,
                                vertical_flip = True)

eval_datagen = ImageDataGenerator()

In [23]:
# apply data augmentation to features
BATCH_SIZE= 12
train_gen = train_datagen.flow(X_train, y_train, batch_size= BATCH_SIZE)
val_gen = eval_datagen.flow(X_val, y_val, batch_size= BATCH_SIZE)
test_gen = eval_datagen.flow(X_test, y_test, batch_size= BATCH_SIZE)

In [24]:
# show shape of each  batch
for data_batch, labels_batch in train_gen:
    print('data batch shape: {} \n labels batch shape: {}'.format(data_batch.shape, labels_batch.shape))
    break

data batch shape: (12, 180, 180, 3) 
 labels batch shape: (12,)


In [None]:
## Train set distribution 
labels = list(map(lambda x: int(x[-1]), train_labels))
plt.figure(figsize = (10, 10))
plt.bar(["Type 1", "Type 2", "Type  3"], [labels.count(1), labels.count(2), labels.count(3)])
# plt.title("")
# plt.xlabel()
plt.show()

### Model Training 

In [27]:
# initialize pretrained vgg model base
conv_base = VGG16(weights= 'imagenet', include_top= False, input_shape= (180, 180, 3))

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/vgg16/vgg16_weights_tf_dim_ordering_tf_kernels_notop.h5


In [28]:
# show trainable layers before freezing
print('This is the number of trainable weights '
'before freezing layers in the conv base:', len(conv_base.trainable_weights))

This is the number of trainable weights before freezing layers in the conv base: 26


In [29]:
# freeze few layers of pretrained model
for layer in conv_base.layers[:-5]:
    layer.trainable= False

In [30]:
# show trainable layers after freezing
print('This is the number of trainable weights '
'after freezing layers in the conv base:', len(conv_base.trainable_weights))

This is the number of trainable weights after freezing layers in the conv base: 6


In [31]:
# build model 
import keras
METRICS = [
      keras.metrics.BinaryAccuracy(name='accuracy'),
#       keras.metrics.Precision(name='precision'),
#       keras.metrics.Recall(name='recall'),
 
      keras.metrics.AUC(name='prc', curve='PR'), # precision-recall curve
]

model = Sequential([conv_base, 
                    Flatten(),
                    Dropout(0.5),
                    Dense(3, activation='softmax')])

In [32]:
from keras import backend as K
def precision(y_true, y_pred):
    """Precision metric.
    Only computes a batch-wise average of precision.
    Computes the precision, a metric for multi-label classification of
    how many selected items are relevant.
    """
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    return precision


def recall(y_true, y_pred):
    """Recall metric.
    Only computes a batch-wise average of recall.
    Computes the recall, a metric for multi-label classification of
    how many relevant items are selected.
    """
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    recall = true_positives / (possible_positives + K.epsilon())
    return recall

def f1_m(y_true, y_pred):
    precision = precision_m(y_true, y_pred)
    recall = recall_m(y_true, y_pred)
    return 2*((precision*recall)/(precision+recall+K.epsilon()))

In [33]:
# compile model
model.compile(optimizer= Adam(0.0001), loss= 'sparse_categorical_crossentropy', metrics= ['accuracy'])

In [34]:
# show model summary
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
vgg16 (Functional)           (None, 5, 5, 512)         14714688  
_________________________________________________________________
flatten (Flatten)            (None, 12800)             0         
_________________________________________________________________
dropout (Dropout)            (None, 12800)             0         
_________________________________________________________________
dense (Dense)                (None, 3)                 38403     
Total params: 14,753,091
Trainable params: 7,117,827
Non-trainable params: 7,635,264
_________________________________________________________________


In [35]:
# define training steps
TRAIN_STEPS = len(train_labels)//BATCH_SIZE
VAL_STEPS = len(val_labels)//BATCH_SIZE

In [36]:
model.load_weights("/kaggle/input/cervical-weights/cervicalModel.weights.hdf5")

In [42]:
# initialize callbacks
reduceLR = ReduceLROnPlateau(monitor='val_loss', patience=10, verbose= 1, mode='min', factor=  0.2, min_lr = 1e-5)

early_stopping = EarlyStopping(monitor='val_accuracy', patience = 10, verbose=1, mode='max', restore_best_weights= True)

checkpoint = ModelCheckpoint('cervicalModel_weights.hdf5', monitor='val_accuracy', verbose=1,save_best_only=True, mode= 'max')

In [None]:
# train model
history = model.fit(train_gen, steps_per_epoch= TRAIN_STEPS, validation_data=val_gen, validation_steps=VAL_STEPS, epochs= 100,
                   callbacks= [checkpoint, early_stopping])

Epoch 1/100

Epoch 00001: val_accuracy improved from -inf to 0.73284, saving model to cervicalModel_weights.hdf5
Epoch 2/100

Epoch 00002: val_accuracy did not improve from 0.73284
Epoch 3/100

Epoch 00003: val_accuracy did not improve from 0.73284
Epoch 4/100

Epoch 00004: val_accuracy did not improve from 0.73284
Epoch 5/100

Epoch 00005: val_accuracy did not improve from 0.73284
Epoch 6/100

Epoch 00006: val_accuracy did not improve from 0.73284
Epoch 7/100

Epoch 00007: val_accuracy did not improve from 0.73284
Epoch 8/100

Epoch 00010: val_accuracy did not improve from 0.73775
Epoch 11/100

Epoch 00011: val_accuracy improved from 0.73775 to 0.74877, saving model to cervicalModel_weights.hdf5
Epoch 12/100

Epoch 00012: val_accuracy did not improve from 0.74877
Epoch 13/100

Epoch 00013: val_accuracy did not improve from 0.74877


In [None]:
model.save_weights("cancer_screen_model_new.h5")

### Model Evaluation

In [41]:
model.load_weights("/kaggle/input/model-cervical/cervicalModel.weights.hdf5")
model.evaluate(test_gen)



[0.7035561800003052, 0.7360097169876099]

In [42]:
# model.load_weights("/kaggle/input/prev-final-model/cancer_screen_model.h5")
# model.save_weights("cancer_screen_model_new.h5")
model.load_weights("cancer_screen_model.h5")
# model.load_weights("cervicalModel_weights.hdf5")
# model.evaluate(test_gen)
prediction = list(map(lambda x: np.argmax(x), model.predict(test_gen)))
labels = list(map(lambda x: int(x[-1]) - 1, test_labels))

In [65]:
def evaluate_image(path, model):
    img = cv2.imread(path)
    resized_img = cv2.resize(img, (180, 180))
    resized_img = tf.convert_to_tensor(resized_img)
    return np.argmax(model.predict(tf.reshape(resized_img, (1, resized_img.shape[0], resized_img.shape[1], resized_img.shape[2])))[0]) + 1