In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from PIL import Image
from glob import glob
import sklearn
import cv2
import gc
import os

import tensorflow as tf
from tensorflow.keras.applications import ResNet50
from tensorflow import keras
from keras import layers
from keras.callbacks import EarlyStopping, ReduceLROnPlateau

from sklearn.model_selection import train_test_split
from sklearn import metrics

import warnings
warnings.filterwarnings('ignore')


In [None]:
# Root folder of the dataset; every sub-directory holds the images of one class
path = '/kaggle/input/lung-and-colon-cancer-histopathological-images/lung_colon_image_set/lung_image_sets'

# os.listdir() returns entries in arbitrary order, so sort them: the position of
# a name in `classes` is used as its integer label further down, and it has to
# be the same on every run. Plain files are dropped so only real class
# directories are kept.
classes = sorted(
    entry for entry in os.listdir(path)
    if os.path.isdir(os.path.join(path, entry))
)

# Print the list of classes
print(classes)

In [None]:
# Preview: one figure per class, each showing three randomly chosen images
for category in classes:
    # Folder holding every image of this class
    image_dir = f'{path}/{category}'
    file_names = os.listdir(image_dir)

    # One row of three panels with the class name as the figure title
    fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(15, 5))
    fig.suptitle(f'Images for {category} category . . . .', fontsize=20)

    for axis in axes:
        # Draw a random position in the file list (repeats are possible)
        pick = np.random.randint(0, len(file_names))
        # Load the file with PIL and hand it to matplotlib as a NumPy array
        picture = Image.open(f'{image_dir}/{file_names[pick]}')
        axis.imshow(np.array(picture))
        axis.axis('off')

    # Render the figure for this class before moving on to the next one
    plt.show()


In [None]:
# --- Training configuration -------------------------------------------------

# Number of samples processed per forward/backward pass
BATCH_SIZE = 64

# Number of full passes over the training data
EPOCHS = 10

# Fraction of the data held out from training (0.2 -> 80% train / 20% held out)
SPLIT = 0.2

# Side length, in pixels, that every image is resized to before reaching the model
IMG_SIZE = 256

In [None]:
# Maximum number of images loaded from each class directory
MAX_IMAGES_PER_CLASS = 2000

# Resized image arrays
X = []
# Integer class index of each image in X (position of its class in `classes`)
Y = []

# i: class index, category: class directory name
for i, category in enumerate(classes):
    # glob() returns matches in arbitrary order; sorting makes the subset kept
    # by the per-class cap the same on every run
    image_paths = sorted(glob(f'{path}/{category}/*.jpeg'))
    loaded = 0

    for image_path in image_paths:
        # Stop once the per-class cap has been reached
        if loaded >= MAX_IMAGES_PER_CLASS:
            break

        # cv2.imread() does not raise on a missing or corrupt file, it returns
        # None; skip such files instead of crashing inside cv2.resize()
        img = cv2.imread(image_path)
        if img is None:
            print(f'Skipping unreadable image: {image_path}')
            continue

        # Resize to the model's input size and record the image with its label
        X.append(cv2.resize(img, (IMG_SIZE, IMG_SIZE)))
        Y.append(i)

        loaded += 1

# Stack the list of images into a single NumPy array
X = np.asarray(X)

# One-hot encode the labels. Indexing an identity matrix always yields exactly
# len(classes) columns (matching the model's output layer) even if some class
# contributed no images, and gives a fixed float dtype.
one_hot_encoded_Y = np.eye(len(classes), dtype=np.float32)[np.asarray(Y, dtype=int)]

In [None]:
# Hold out a SPLIT fraction of the samples for validation.
# A fixed random_state makes the shuffle, and therefore the split, repeatable.
split_options = {'test_size': SPLIT, 'random_state': 2022}
X_train, X_val, Y_train, Y_val = train_test_split(
    X, one_hot_encoded_Y, **split_options
)

# Sanity check: shapes of the training and validation image arrays
print(X_train.shape, X_val.shape)

In [None]:
# Pre-trained ResNet50 backbone
# NOTE(review): no preprocess_input call is visible in this notebook, so the
# backbone receives the arrays exactly as loaded - confirm that is intended.
base_model = ResNet50(
    # Start from the ImageNet weights
    weights="imagenet",
    # Drop ResNet50's own fully connected classifier; a custom head is added below
    include_top=False,
    # Average the final convolutional feature maps over their spatial
    # dimensions so the backbone outputs one feature vector per image
    pooling='avg',
    # Square images with 3 colour channels
    input_shape=(IMG_SIZE, IMG_SIZE, 3),
)

# Classification head stacked on top of the backbone
classifier_head = [
    # Fully connected layer: 256 units, ReLU activation
    layers.Dense(256, activation='relu'),
    # Normalise the activations passed to the next layer
    layers.BatchNormalization(),
    # Fully connected layer: 128 units, ReLU activation
    layers.Dense(128, activation='relu'),
    # Randomly zero 30% of the inputs during training to limit overfitting
    layers.Dropout(0.3),
    layers.BatchNormalization(),
    # Output layer: one unit per class, softmax turns scores into probabilities
    layers.Dense(len(classes), activation='softmax'),
]

# Full network: backbone first, then the head, applied in order
model = keras.models.Sequential([base_model, *classifier_head])

In [None]:
# Print a layer-by-layer summary of the model built above
model.summary()

In [None]:
# Draw the model as a diagram of connected layers, annotated with each
# layer's shapes, dtype and activation function
diagram_options = dict(
    show_shapes=True,
    show_dtype=True,
    show_layer_activations=True,
)
keras.utils.plot_model(model, **diagram_options)

In [None]:
# Adam is the algorithm that updates the model's weights during training
adam = keras.optimizers.Adam(
    learning_rate=1e-3,
    beta_1=0.9,
    beta_2=0.999,
    amsgrad=False,
)

# Configure the model for training
model.compile(
    optimizer=adam,
    # Categorical cross-entropy matches the one-hot encoded targets
    loss='categorical_crossentropy',
    # Report accuracy alongside the loss
    metrics=['accuracy'],
)


In [None]:
class CustomStop(tf.keras.callbacks.Callback):
    """Callback that stops training once validation accuracy exceeds 97%."""

    def on_epoch_end(self, epoch, logs=None):
        """Run the accuracy check after every epoch.

        Keras only calls the standard hook names on a callback, so the check
        has to be reachable from `on_epoch_end`; a method with any other name
        is never invoked during `fit`.

        Args:
            epoch: Index of the epoch that just finished.
            logs: Metric values for that epoch, or None.
        """
        self.check_validation_accuracy(epoch, logs)

    def check_validation_accuracy(self, epoch, logs=None):
        """Request a stop when `val_accuracy` in `logs` is above 0.97.

        Args:
            epoch: Index of the epoch that just finished (unused).
            logs: Metric values for that epoch; may be None or lack
                `val_accuracy`, in which case nothing happens.
        """
        # `logs` may be None or lack the metric (e.g. no validation data);
        # treat that as "threshold not reached" instead of comparing None
        val_accuracy = (logs or {}).get('val_accuracy')
        if val_accuracy is not None and val_accuracy > 0.97:
            print('\n Validation accuracy has reached up to 97%, stopping further training.')
            self.model.stop_training = True

# Stop when validation accuracy has not improved for 3 epochs, and roll the
# model back to the weights of its best epoch.
early_stopping = EarlyStopping(
    monitor='val_accuracy',
    patience=3,
    restore_best_weights=True,
)

# Halve the learning rate when validation loss has not improved for 2 epochs;
# verbose=1 prints a message whenever that happens.
reduce_learning_rate = ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.5,
    patience=2,
    verbose=1,
)

In [None]:
# Callbacks consulted by Keras while training runs
training_callbacks = [early_stopping, reduce_learning_rate, CustomStop()]

# Train on the training split and evaluate on the validation split after
# every epoch; the returned object records the per-epoch loss and metrics
history = model.fit(
    x=X_train,
    y=Y_train,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    validation_data=(X_val, Y_val),
    callbacks=training_callbacks,
    verbose=1,
)

In [None]:
# One row per epoch, one column per recorded loss/metric
history_df = pd.DataFrame(history.history)

# First figure: training vs. validation loss
# Second figure: training vs. validation accuracy
for curve_columns in (['loss', 'val_loss'], ['accuracy', 'val_accuracy']):
    history_df[curve_columns].plot()

plt.show()

In [None]:
# Per-class probabilities predicted for every validation image
Y_pred_proba = model.predict(X_val)
# Predicted class index = position of the highest probability
Y_pred = np.argmax(Y_pred_proba, axis=1)

# Convert the one-hot validation labels back to class indices.
# The ndim guard keeps this cell re-runnable: once Y_val has been converted it
# is 1-D, and calling argmax(axis=1) on it a second time would raise an error.
if np.ndim(Y_val) > 1:
    Y_val = np.argmax(Y_val, axis=1)

In [None]:
# Confusion matrix of true class indices (rows) against predicted ones (columns)
confusion = metrics.confusion_matrix(Y_val, Y_pred)
print(confusion)

In [None]:
# Text report of per-class metrics, labelled with the class directory names
report = metrics.classification_report(Y_val, Y_pred, target_names=classes)
print(report)