# Importing Libraries

In [None]:
# import required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import glob
import plotly.graph_objects as go
import cv2
from PIL import Image
from PIL import ImageFile
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import tensorflow as tf
from keras.preprocessing.image import ImageDataGenerator
from keras.utils.np_utils import to_categorical
from keras.applications.vgg16 import VGG16
from keras.models import Sequential
from keras.layers import Dense, Flatten , Dropout
from tensorflow.keras.optimizers import Adam
from keras.callbacks import EarlyStopping, ReduceLROnPlateau,ModelCheckpoint 
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import warnings
warnings.filterwarnings("ignore")
ImageFile.LOAD_TRUNCATED_IMAGES = True

# Loading Dataset 

In [None]:
# Locate the training images for each cervix type.
# NOTE(review): the folder names below are taken as-is from the uploaded datasets
# (e.g. Type 1 images live under 'type2additional/Typ_1add') - confirm against ../input.
root_dir = '../input'
intel_dir = '../input/intel-mobileodt-cervical-cancer-screening'

type1_dir = os.path.join(root_dir, 'type2additional','Typ_1add')
type2_dir = os.path.join(root_dir, 'type2add','Type_2')
type3_dir = os.path.join(root_dir, 'type3addkjanjkanajnnkjhjj','Type_3_add')

# glob matching is case-sensitive on Linux, so only lower-case '.jpg' files are picked up
train_type1_files = glob.glob(type1_dir+'/*.jpg')
train_type2_files = glob.glob(type2_dir+'/*.jpg')
train_type3_files = glob.glob(type3_dir+'/*.jpg')

# Extra images from the Intel/MobileODT competition data.
# They are collected here but are not merged into the training lists below.
added_type1_files  =  glob.glob(os.path.join(intel_dir, "additional_Type_1_v2", "Type_1")+'/*.jpg')
added_type2_files  =  glob.glob(os.path.join(intel_dir, "additional_Type_2_v2", "Type_2")+'/*.jpg')
added_type3_files  =  glob.glob(os.path.join(intel_dir, "additional_Type_3_v2", "Type_3")+'/*.jpg')

type1_files = train_type1_files
type2_files = train_type2_files
type3_files = train_type3_files

# f-string so the counts are interpolated instead of printing the literal placeholders
print(f'''Type 1 files for training: {len(type1_files)} 
Type 2 files for training: {len(type2_files)}
Type 3 files for training: {len(type3_files)}''' )

Check each folder for bad files.

In [None]:
# Build a shuffled (seeded) frame of the Type 1 image paths and their label.
# Note: this rebinds `train_type1_files` from the glob list to a column dict.
train_type1_files = dict(
    filepath=type1_files,
    label=['Type 1'] * len(type1_files),
)

train_type1_df = (
    pd.DataFrame(train_type1_files)
    .sample(frac=1, random_state=1)
    .reset_index(drop=True)
)
train_type1_df.shape[0]

### Each directory was checked individually and corrupted files were repaired to avoid the error "premature end of JPEG file".

### Since the error still occurred, the files identified as corrupted were deleted manually.

In [None]:
# Folder scanned by the JPEG check loop that follows (same folder as type1_dir).
dir_path = r'../input/type2additional/Typ_1add'

In [None]:
def detect_and_fix(img_path, img_name):
    """Check a JPEG for a premature end and try to repair it by re-encoding.

    A complete JPEG ends with the end-of-image marker ``FF D9``. When the last
    two bytes differ, the image is decoded with OpenCV and written back to the
    same path.

    Args:
        img_path: Path of the image file to inspect (rewritten in place on repair).
        img_name: Name used in the progress messages.

    Returns:
        True if the file already ends with the marker or was re-encoded
        successfully, False if it could not be read or rewritten.
    """
    # detect for premature ending
    try:
        with open(img_path, 'rb') as im:
            im.seek(-2, 2)  # raises OSError for files shorter than two bytes
            has_end_marker = im.read() == b'\xff\xd9'
    except (IOError, SyntaxError) as e:
        print(e)
        print("Unable to load/write Image : {} . Image might be destroyed".format(img_path) )
        return False

    if has_end_marker:
        print('Image OK :', img_name)
        return True

    # fix image - done after the read handle is closed so the file is not
    # overwritten while it is still open
    try:
        img = cv2.imread(img_path)
        # imread signals an undecodable file by returning None instead of raising;
        # imwrite signals a failed write (e.g. read-only input folder) by returning False
        repaired = img is not None and cv2.imwrite(img_path, img)
    except (IOError, SyntaxError, cv2.error) as e:
        print(e)
        repaired = False

    if not repaired:
        print("Unable to load/write Image : {} . Image might be destroyed".format(img_path) )
        return False

    print('FIXED corrupted image :', img_name)
    return True


# Run the JPEG check on every '.jpg' entry of dir_path.
# The suffix test is case-sensitive; change it for other extensions ('JPG', 'PNG', ...).
for path in os.listdir(dir_path):
    if not path.endswith('.jpg'):
        continue
    img_path = os.path.join(dir_path, path)
    detect_and_fix(img_path=img_path, img_name=path)

print("Process Finished")
    


In [None]:
# Folder scanned by the JPEG check loop that follows (same folder as type2_dir).
dir_path = r'../input/type2add/Type_2'


In [None]:
def detect_and_fix(img_path, img_name):
    """Check a JPEG for a premature end and try to repair it by re-encoding.

    A complete JPEG ends with the end-of-image marker ``FF D9``. When the last
    two bytes differ, the image is decoded with OpenCV and written back to the
    same path.

    Args:
        img_path: Path of the image file to inspect (rewritten in place on repair).
        img_name: Name used in the progress messages.

    Returns:
        True if the file already ends with the marker or was re-encoded
        successfully, False if it could not be read or rewritten.
    """
    # detect for premature ending
    try:
        with open(img_path, 'rb') as im:
            im.seek(-2, 2)  # raises OSError for files shorter than two bytes
            has_end_marker = im.read() == b'\xff\xd9'
    except (IOError, SyntaxError) as e:
        print(e)
        print("Unable to load/write Image : {} . Image might be destroyed".format(img_path) )
        return False

    if has_end_marker:
        print('Image OK :', img_name)
        return True

    # fix image - done after the read handle is closed so the file is not
    # overwritten while it is still open
    try:
        img = cv2.imread(img_path)
        # imread signals an undecodable file by returning None instead of raising;
        # imwrite signals a failed write (e.g. read-only input folder) by returning False
        repaired = img is not None and cv2.imwrite(img_path, img)
    except (IOError, SyntaxError, cv2.error) as e:
        print(e)
        repaired = False

    if not repaired:
        print("Unable to load/write Image : {} . Image might be destroyed".format(img_path) )
        return False

    print('FIXED corrupted image :', img_name)
    return True


# Run the JPEG check on every '.jpg' entry of dir_path.
# The suffix test is case-sensitive; change it for other extensions ('JPG', 'PNG', ...).
for path in os.listdir(dir_path):
    if not path.endswith('.jpg'):
        continue
    img_path = os.path.join(dir_path, path)
    detect_and_fix(img_path=img_path, img_name=path)

print("Process Finished")

In [None]:
# Folder scanned by the JPEG check loop that follows (same folder as type3_dir).
dir_path = r'../input/type3addkjanjkanajnnkjhjj/Type_3_add'

In [None]:
def detect_and_fix(img_path, img_name):
    """Check a JPEG for a premature end and try to repair it by re-encoding.

    A complete JPEG ends with the end-of-image marker ``FF D9``. When the last
    two bytes differ, the image is decoded with OpenCV and written back to the
    same path.

    Args:
        img_path: Path of the image file to inspect (rewritten in place on repair).
        img_name: Name used in the progress messages.

    Returns:
        True if the file already ends with the marker or was re-encoded
        successfully, False if it could not be read or rewritten.
    """
    # detect for premature ending
    try:
        with open(img_path, 'rb') as im:
            im.seek(-2, 2)  # raises OSError for files shorter than two bytes
            has_end_marker = im.read() == b'\xff\xd9'
    except (IOError, SyntaxError) as e:
        print(e)
        print("Unable to load/write Image : {} . Image might be destroyed".format(img_path) )
        return False

    if has_end_marker:
        print('Image OK :', img_name)
        return True

    # fix image - done after the read handle is closed so the file is not
    # overwritten while it is still open
    try:
        img = cv2.imread(img_path)
        # imread signals an undecodable file by returning None instead of raising;
        # imwrite signals a failed write (e.g. read-only input folder) by returning False
        repaired = img is not None and cv2.imwrite(img_path, img)
    except (IOError, SyntaxError, cv2.error) as e:
        print(e)
        repaired = False

    if not repaired:
        print("Unable to load/write Image : {} . Image might be destroyed".format(img_path) )
        return False

    print('FIXED corrupted image :', img_name)
    return True


# Run the JPEG check on every '.jpg' entry of dir_path.
# The suffix test is case-sensitive; change it for other extensions ('JPG', 'PNG', ...).
for path in os.listdir(dir_path):
    if not path.endswith('.jpg'):
        continue
    img_path = os.path.join(dir_path, path)
    detect_and_fix(img_path=img_path, img_name=path)

print("Process Finished")

All three folders were checked for corrupted files. They are all clean.

In [None]:
# Combine the three classes into one frame of file paths and labels,
# then shuffle the rows with a fixed seed so the order is reproducible.
files = {'filepath': type1_files + type2_files + type3_files,
          'label': ['Type 1']* len(type1_files) + ['Type 2']* len(type2_files) + ['Type 3']* len(type3_files)}

files_df = pd.DataFrame(files).sample(frac=1, random_state= 1).reset_index(drop=True)
# (the former `files_df['filepath'][10]` line was removed: its value was discarded
# and it raised KeyError whenever fewer than 11 files were found)
files_df

# Data Cleaning and Exploration

In [None]:
# describe the dataframe
# (summary statistics for the 'filepath' and 'label' columns)
files_df.describe()

In [None]:
# Number of rows whose file path already appeared earlier in the frame.
int(files_df['filepath'].duplicated().sum())

In [None]:
# check for damaged files
# Collect the index label of every row whose image cannot be opened by Pillow.
bad_files = []
# iterate (index, path) pairs directly: avoids a full-frame lookup per file and
# records the correct row even if a path appears more than once
for index, path in files_df['filepath'].items():
    try:
        # the context manager closes the file handle; verify() asks Pillow to
        # check the file instead of relying on the header parse done by open()
        with Image.open(path) as img:
            img.verify()
    except Exception:
        # best-effort scan: any failure marks the file as damaged, while
        # KeyboardInterrupt/SystemExit are no longer swallowed
        bad_files.append(index)
print(len(bad_files))

In [None]:
# show the index labels of the damaged files
print(bad_files)
# drop the damaged files; errors='ignore' keeps this cell re-runnable after the
# rows have already been removed (a second in-place drop raised KeyError)
files_df = files_df.drop(index=bad_files, errors='ignore')

In [None]:
# check length of files in dataframe
# (number of rows left after the damaged files were dropped)
len(files_df)

In [None]:
# check unique labels
# (the frame is built with 'Type 1', 'Type 2' and 'Type 3')
files_df['label'].unique()

In [None]:
# get count of each type
# Name the counts Series explicitly before converting it to a frame: since
# pandas 2.0 value_counts() calls its result 'count' rather than 'label', so
# rename(columns={'label': ...}) silently did nothing and the plotting cell
# failed on type_count['Num_Values'].
type_count = files_df['label'].value_counts().rename('Num_Values').to_frame()
type_count

In [None]:
# Horizontal bar chart: number of images per class.
fig, ax = plt.subplots(figsize=(15, 6))
sns.barplot(x=type_count['Num_Values'], y=type_count.index.to_list(), ax=ax)
ax.set_title('Cervical Cancer Type Distribution')
ax.grid(True)
plt.show()

### The data distribution plot shows that type 1 class has more datapoints than other types, with type 3 having the least datapoints.

### A pie plot is useful in visualizing the percentage of data distribution.


In [None]:
# Donut-style pie chart of the class shares, labelled with the class names.
type_names = type_count.index.to_list()
pie_plot = go.Pie(
    labels=type_names,
    values=type_count.values.flatten(),
    hole=0.2,
    text=type_names,
    textposition='auto',
)
fig = go.Figure(data=[pie_plot])
fig.update_layout(title_text='Pie Plot of Type Distribution')
fig.show()

In [None]:
# display sample images of types
# One figure per class, showing up to five images side by side.
for label in ('Type 1', 'Type 2', 'Type 3'):
    filepaths = files_df[files_df['label']==label]['filepath'].values[:5]
    fig = plt.figure(figsize= (15, 6))
    for i, path in enumerate(filepaths):
        img = cv2.imread(path)
        if img is None:
            # imread returns None for files it cannot decode; leave the panel empty
            print('Could not read image :', path)
            continue
        # imread yields BGR, matplotlib expects RGB (same channel swap as before,
        # now spelled with the constant that matches the direction)
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        img = cv2.resize(img, (224, 224))
        ax = fig.add_subplot(1, 5, i+1)
        ax.imshow(img)
        ax.axis('off')
        ax.set_title(label)
    # spacing only needs to be set once per figure
    fig.subplots_adjust(hspace=0.5)

# Data Processing

In [None]:

def load_images(dataframe, target_size=(224, 224), blur_kernel=(5, 5)):
    """Load, resize and blur every image listed in ``dataframe``.

    Args:
        dataframe: Frame with a 'filepath' column (image paths) and a 'label' column.
        target_size: (width, height) passed to ``cv2.resize``. Defaults to (224, 224).
        blur_kernel: Kernel size passed to ``cv2.GaussianBlur``. Defaults to (5, 5).

    Returns:
        Tuple ``(features, labels)`` of NumPy arrays: the processed images in
        RGB channel order and the matching labels, in the frame's row order.

    Raises:
        IOError: If OpenCV cannot read one of the listed files.
    """
    filepaths = dataframe['filepath'].values
    labels = dataframe['label'].values

    features = []
    for path in filepaths:
        img = cv2.imread(path)
        if img is None:
            # imread returns None instead of raising; fail with the offending
            # path rather than an opaque cvtColor assertion error
            raise IOError("Unable to read image: {}".format(path))
        # imread yields BGR; swap to RGB (numerically the same swap as before)
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        img = cv2.resize(img, target_size)
        img = cv2.GaussianBlur(img, blur_kernel, 0)
        features.append(img)
    return np.array(features), np.array(labels)

In [None]:
#  split the data into train  and validation set
# First split: 70% train / 30% held out, stratified by label.
train_df, eval_df = train_test_split(files_df, test_size= 0.3, stratify= files_df['label'], random_state= 1)
# Second split: the held-out 30% is divided 80/20 into validation and test,
# i.e. roughly 24% validation and 6% test of the whole dataset.
val_df, test_df = train_test_split(eval_df, test_size= 0.2, stratify= eval_df['label'], random_state= 1)
print(len(train_df), len(val_df), len(test_df))

In [None]:
# load training and evaluation data
# Every image of each split is read, resized and blurred by load_images and kept in memory.
train_features, train_labels = load_images(train_df)
val_features, val_labels = load_images(val_df)
test_features, test_labels = load_images(test_df)

In [None]:
# check lengths of training and evaluation  sets
# (features and labels of the same split should have equal lengths)
len(train_features), len(train_labels), len(test_features), len(test_labels), len(val_features), len(val_labels) 

In [None]:
# get image shape
# Shape of the first training image as returned by load_images.
InputShape = train_features[0].shape
print(InputShape)

In [None]:
# Flatten each training image into one row of img_cols*img_rows*3 values
# (the 2-D layout is what SMOTE.fit_resample receives further down).
# Only the training set is flattened; validation and test features keep their image shape.
img_cols,img_rows = 224,224
train_features= train_features.reshape(train_features.shape[0], img_cols*img_rows*3)

In [None]:
# Scale pixel values by 1/255 for all three splits.
X_train, X_val, X_test = (
    pixels / 255 for pixels in (train_features, val_features, test_features)
)

In [None]:
# Short aliases for the label arrays of each split.
y_train, y_val, y_test = train_labels, val_labels, test_labels

In [None]:
# number of training labels
y_train.shape

In [None]:
# (samples, flattened pixels) after the reshape above
X_train.shape

In [None]:
# Class distribution of the training labels before oversampling.
from collections import Counter
counter = Counter(y_train)
print(counter)

In [None]:
from imblearn.over_sampling import SMOTE
# Oversample the minority classes of the flattened training images.
# A fixed random_state makes the synthetic samples reproducible across runs,
# matching the random_state=1 used for shuffling and splitting.
X_resample, y_resampled = SMOTE(random_state=1).fit_resample(X_train, y_train)

In [None]:
# Restore the flattened (resampled) rows to image-shaped arrays with 3 channels.
X_resample = X_resample.reshape(X_resample.shape[0], img_cols,img_rows,3)

In [None]:
# shape of the resampled training images
X_resample.shape

In [None]:
# Class distribution of the training labels after SMOTE oversampling.
from collections import Counter
counter = Counter(y_resampled)
print(counter)

In [None]:
# Training configuration.
BATCH_SIZE = 32            # samples per generator batch
NUM_CLASSES = 3            # 'Type 1', 'Type 2', 'Type 3'
EPOCHS = 50                # NOTE(review): model.fit below hard-codes epochs=50 instead of using this
INPUT_SHAPE = (224, 224, 3)  # NOTE(review): the VGG16 cell hard-codes the same tuple

In [None]:
# Encode the string labels as integer class ids with a fixed class list.
le = LabelEncoder()
le.fit(['Type 1', 'Type 2', 'Type 3'])
Y_train, Y_val, Y_test = (
    le.transform(split_labels) for split_labels in (y_resampled, y_val, y_test)
)

In [None]:
# integer-encoded training labels
Y_train

In [None]:
# initialize image data generator for training and evaluation sets

# Training generator: random rotation, zoom, shifts, shear and flips.
train_datagen = ImageDataGenerator(
                                rotation_range = 40,
                                zoom_range = 0.2,
                                width_shift_range=0.2,
                                height_shift_range=0.2,
                                shear_range=0.2,
                                horizontal_flip=True,
                                vertical_flip = True)
# Generators without augmentation; the flow cell below only uses eval_datagen.
test_datagen = ImageDataGenerator()
eval_datagen = ImageDataGenerator()

In [None]:
# apply data augmentation to features
# Only train_gen is built from the augmenting generator; the validation and test
# batches come from the plain eval_datagen.
BATCH_SIZE= 32
train_gen = train_datagen.flow(X_resample, Y_train, batch_size= BATCH_SIZE)
val_gen = eval_datagen.flow(X_val, Y_val, batch_size= BATCH_SIZE)
test_gen = eval_datagen.flow(X_test, Y_test, batch_size= BATCH_SIZE)

In [None]:
# Pull a single batch from the training generator and report its shapes.
data_batch, labels_batch = next(iter(train_gen))
print('data batch shape: {} \n labels batch shape: {}'.format(data_batch.shape, labels_batch.shape))

# Model building

In [None]:
# initialize pretrained vgg model base
# ImageNet weights, without the fully connected top, for 224x224 RGB input.
conv_base = VGG16(weights= 'imagenet', include_top= False, input_shape= (224, 224, 3))
conv_base.summary()

In [None]:
# show trainable layers before freezing
# (prints the number of trainable weight tensors, not the number of layers)
print('This is the number of trainable weights '
'before freezing layers in the conv base:', len(conv_base.trainable_weights))

In [None]:
# Freeze every layer of the pretrained base except the last four.
frozen_layers = conv_base.layers[:-4]
for layer in frozen_layers:
    layer.trainable = False

# Report the trainable flag of each layer, then the updated summary.
for layer in conv_base.layers:
    print(layer, layer.trainable)
conv_base.summary()

In [None]:
# show trainable layers after freezing
# (prints the number of trainable weight tensors, not the number of layers)
print('This is the number of trainable weights '
'after freezing layers in the conv base:', len(conv_base.trainable_weights))

In [None]:
# Stack a small classification head on top of the VGG16 base:
# flatten -> 1024-unit ReLU layer -> 50% dropout -> 3-way softmax.
model = Sequential()
model.add(conv_base)
model.add(Flatten())
model.add(Dense(1024, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(3, activation='softmax'))
model.summary()

In [None]:
# compile model
# The sparse loss takes the integer class ids produced by the LabelEncoder
# directly, so the labels are not one-hot encoded.
model.compile(optimizer= Adam(0.0001), loss= 'sparse_categorical_crossentropy', metrics= ['accuracy'])

In [None]:
# show model summary
# (same summary as printed when the model was built)
model.summary()

In [None]:
# define training steps
# Take the step counts from the generators themselves. train_gen iterates over
# the SMOTE-resampled array, which is larger than train_df, so
# len(train_df)//BATCH_SIZE skipped part of the data every epoch. len() of a
# flow() iterator is the number of batches in one full pass (last partial
# batch included), which also avoids 0 validation steps for a small val set.
TRAIN_STEPS = len(train_gen)
VAL_STEPS = len(val_gen)

In [None]:
# Multiply the learning rate by 0.2 after 10 epochs without val_loss improvement (floor 1e-5).
reduceLR = ReduceLROnPlateau(monitor='val_loss', patience=10, verbose= 1, mode='min', factor=  0.2, min_lr = 1e-5)

# Stop after 20 epochs without val_loss improvement and restore the best weights.
early_stopping = EarlyStopping(monitor='val_loss', patience = 20, verbose=1, mode='min', restore_best_weights= True)

# Save to this file whenever val_loss reaches a new minimum.
checkpoint = ModelCheckpoint('cervicalModel_noaug.weights.hdf5', monitor='val_loss', verbose=1,save_best_only=True, mode= 'min')

In [None]:
# train model
# Trains on the augmented generator and validates on val_gen with the three callbacks above.
history = model.fit(train_gen, steps_per_epoch= TRAIN_STEPS, validation_data=val_gen, validation_steps=VAL_STEPS, epochs= 50,
                   callbacks= [reduceLR, early_stopping, checkpoint])

In [None]:
# read training history into dataframe
# (one row per epoch; the plots below read accuracy, val_accuracy, loss and val_loss)
history_df = pd.DataFrame(history.history)

In [None]:
# Plot the training history: accuracy on the left panel, loss on the right.
fig, (acc_ax, loss_ax) = plt.subplots(1, 2, figsize=(15, 6))

# accuracy curves
for column in ('accuracy', 'val_accuracy'):
    acc_ax.plot(history_df[column], label=column)
acc_ax.set_xlabel('Epochs')
acc_ax.set_ylabel('Accuracy')
acc_ax.set_title('Training and Validation Accuracy History')
acc_ax.legend()

# loss curves
for column in ('loss', 'val_loss'):
    loss_ax.plot(history_df[column], label=column)
loss_ax.set_xlabel('Epochs')
loss_ax.set_ylabel('Loss')
loss_ax.set_title('Training and Validation Loss History')
loss_ax.legend()

plt.show()

# Model Evaluation

In [None]:
# load best weights into model
# Use the file written by the ModelCheckpoint callback in this notebook; the
# previous name ('cervicalModel_aug_moredata.weights.hdf5') is not produced by
# any cell here, so a fresh run failed or evaluated stale weights.
model.load_weights('./cervicalModel_noaug.weights.hdf5')

In [None]:
# evaluate model on test set
# (reports the compiled loss and accuracy over the test generator)
model.evaluate(test_gen)

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import cohen_kappa_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix

In [None]:
# Class probabilities for the test images, reduced to the most likely class id.
predict_x = model.predict(X_test)
y_pred = predict_x.argmax(axis=1)
y_pred

In [None]:
# ground-truth class ids of the test set (LabelEncoder output)
y_true = Y_test
y_true

In [None]:
# fraction of test images whose predicted class matches the true class
accuracy = accuracy_score(y_true,y_pred)
print('Accuracy: %f' % accuracy)

In [None]:
# micro-averaged precision: counts are pooled over all classes before dividing
precision = precision_score(y_true,y_pred, average='micro')
print('Precision: %f' % precision)

In [None]:
# micro-averaged recall: counts are pooled over all classes before dividing
recall = recall_score(y_true,y_pred, average='micro')
print('Recall: %f' % recall)

In [None]:
# Summary metrics on the test set. The formulas in the comments are the
# per-class definitions; average='weighted' combines the per-class scores
# weighted by each class's number of true samples.
# This overwrites the micro-averaged precision/recall computed above.
# accuracy: (tp + tn) / (p + n)
accuracy = accuracy_score(y_true,y_pred)
print('Accuracy: %f' % accuracy)
# precision tp / (tp + fp)
precision = precision_score(y_true,y_pred, average='weighted')
print('Precision: %f' % precision)
# recall: tp / (tp + fn)
recall = recall_score(y_true,y_pred, average='weighted')
print('Recall: %f' % recall)
# f1: 2 tp / (2 tp + fp + fn)
f1 = f1_score(y_true,y_pred, average='weighted')
print('F1 score: %f' % f1)

In [None]:
# Confusion matrix for actual and predicted values.
# Rows are true classes and columns are predicted classes, in the order 0, 1, 2
# (the LabelEncoder ids of 'Type 1', 'Type 2', 'Type 3').
matrix = confusion_matrix(y_true,y_pred, labels=[0,1,2])
print('Confusion matrix :')
print(matrix)