# EfficientNet B5
## Let's Begin....

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns


import os
import random

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
import cv2

import tensorflow as tf
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.applications import VGG19
from tensorflow.keras.optimizers import Adam, Adamax
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Activation, Dropout, BatchNormalization
from tensorflow.keras import regularizers
from tensorflow.keras.regularizers import l1, l2

In [None]:
# Directory paths, one per dataset split.
# Each variable points at the sub-folder of the split it is named after.
train_dir = '/kaggle/input/lungcancer4types-imagedataset/Data/train'
test_dir = '/kaggle/input/lungcancer4types-imagedataset/Data/test'
valid_dir = '/kaggle/input/lungcancer4types-imagedataset/Data/valid'

In [None]:
import os
import pandas as pd

def create_dataframe(folder_path):
    """Build a DataFrame listing every image file under a labelled folder tree.

    ``folder_path`` is expected to contain one sub-directory per class; the
    sub-directory name is used as the label for every file directly inside it.
    Entries of ``folder_path`` that are not directories are ignored, and so
    are entries inside a label folder that are not regular files.

    Labels and files are visited in sorted order so the resulting row order
    is reproducible (``os.listdir`` returns entries in arbitrary order).

    Args:
        folder_path: Path to the directory holding the per-label folders.

    Returns:
        pandas.DataFrame with two columns: 'Image_Path' (full path to the
        file) and 'Label' (name of the containing sub-directory). The frame
        is empty when no files are found.
    """
    image_paths = []
    image_labels = []

    for label in sorted(os.listdir(folder_path)):
        label_path = os.path.join(folder_path, label)

        # Only sub-directories represent classes; skip stray files at this level
        if not os.path.isdir(label_path):
            continue

        for image in sorted(os.listdir(label_path)):
            image_path = os.path.join(label_path, image)

            # Skip nested directories: only regular files can be loaded as images
            if os.path.isfile(image_path):
                image_paths.append(image_path)
                image_labels.append(label)

    return pd.DataFrame({'Image_Path': image_paths, 'Label': image_labels})

# Root folder that contains the 'train', 'test' and 'valid' sub-folders
data_folder = '/kaggle/input/lungcancer4types-imagedataset/Data'

# Build one DataFrame per split (in train, test, valid order) with create_dataframe
train_df, test_df, valid_df = (
    create_dataframe(os.path.join(data_folder, split))
    for split in ('train', 'test', 'valid')
)

# Show the first rows of the training DataFrame for inspection
print("Train DataFrame:", train_df.head(), sep="\n")

In [None]:
# Show the first rows of the test DataFrame under its heading
print("\nTest DataFrame:", test_df.head(), sep="\n")

In [None]:
# Show the first rows of the validation DataFrame under its heading
print("\nValid DataFrame:", valid_df.head(), sep="\n")

In [None]:
# Count the distinct values in the 'Label' column of the training DataFrame
num_classes = train_df['Label'].nunique(dropna=False)

# Report the class count, then the image count (number of rows in the DataFrame)
print(f"We have {num_classes} classes")
print(f"We have {len(train_df)} images")

In [None]:
# Count the distinct values in the 'Label' column of the test DataFrame
num_classes = test_df['Label'].nunique(dropna=False)

# Report the class count, then the image count (number of rows in the DataFrame)
print(f"We have {num_classes} classes")
print(f"We have {len(test_df)} images")

In [None]:
# Count the distinct values in the 'Label' column of the validation DataFrame
num_classes = valid_df['Label'].nunique(dropna=False)

# Report the class count, then the image count (number of rows in the DataFrame)
print(f"We have {num_classes} classes")
print(f"We have {len(valid_df)} images")

In [None]:
# Target size every image is resized to by the generators
img_size = (224, 224)

# Three colour channels, loaded in 'rgb' colour mode
channels = 3
color = 'rgb'

# Model input shape: the image size followed by the channel count
img_shape = (*img_size, channels)

# Batch size used by the training and validation generators
batch_size = 32

# Number of rows in the test DataFrame
ts_length = len(test_df)

# Test batch size = the largest divisor of ts_length that does not exceed 80,
# so the test set splits into equally sized batches with nothing left over
test_batch_size = max(
    divisor
    for divisor in range(1, min(ts_length, 80) + 1)
    if ts_length % divisor == 0
)

# Number of batches needed to cover the whole test set exactly once
test_steps = ts_length // test_batch_size

# Identity preprocessing hook handed to ImageDataGenerator: the image is returned untouched
def scalar(img):
    """Return ``img`` unchanged (no preprocessing is applied)."""
    return img


In [None]:
# Training ImageDataGenerator: 'scalar' preprocessing plus random horizontal flips.
# It is built once; a second identical construction would only be discarded.
tr_gen = ImageDataGenerator(preprocessing_function=scalar, horizontal_flip=True)

# Validation/test ImageDataGenerator: 'scalar' preprocessing only, no augmentation
ts_gen = ImageDataGenerator(preprocessing_function=scalar)

# Generate a flow from DataFrame for training data (shuffled, batch_size per batch)
train_gen = tr_gen.flow_from_dataframe(
    train_df,
    x_col='Image_Path',
    y_col='Label',
    target_size=img_size,
    class_mode='categorical',
    color_mode=color,
    shuffle=True,
    batch_size=batch_size
)

# Generate a flow from DataFrame for validation data (shuffled, batch_size per batch)
valid_gen = ts_gen.flow_from_dataframe(
    valid_df,
    x_col='Image_Path',
    y_col='Label',
    target_size=img_size,
    class_mode='categorical',
    color_mode=color,
    shuffle=True,
    batch_size=batch_size
)

# Generate a flow from DataFrame for test data.
# shuffle=False keeps the batches in DataFrame row order, and test_batch_size
# divides the test set evenly so test_steps batches cover it exactly once.
test_gen = ts_gen.flow_from_dataframe(
    test_df,
    x_col='Image_Path',
    y_col='Label',
    target_size=img_size,
    class_mode='categorical',
    color_mode=color,
    shuffle=False,
    batch_size=test_batch_size
)


In [None]:
# EfficientNetB5 backbone with ImageNet weights, without its fully connected top;
# pooling='max' applies global max pooling to the backbone output
base_model = tf.keras.applications.efficientnet.EfficientNetB5(
    weights="imagenet",
    include_top=False,
    pooling='max',
    input_shape=img_shape,
)

# Assemble the full model layer by layer with the Sequential API
model = Sequential()
model.add(base_model)

# Normalise the pooled backbone features
model.add(BatchNormalization(axis=-1, momentum=0.99, epsilon=0.001))

# Regularised hidden layer: L2 on the kernel, L1 on the activations and the bias
model.add(
    Dense(
        256,
        activation='relu',
        kernel_regularizer=regularizers.l2(0.016),
        activity_regularizer=regularizers.l1(0.006),
        bias_regularizer=regularizers.l1(0.006),
    )
)

# Dropout with a fixed seed
model.add(Dropout(rate=0.45, seed=123))

# Softmax output layer with 4 units for the multi-class prediction
model.add(Dense(4, activation='softmax'))

# Compile with the Adamax optimizer, categorical cross-entropy loss and accuracy metric
model.compile(
    loss='categorical_crossentropy',
    optimizer=Adamax(learning_rate=0.001),
    metrics=['accuracy'],
)

# Print the layer-by-layer summary of the assembled model
model.summary()


In [None]:
# Display the configuration of the optimizer attached to the compiled model
# (the Adamax instance passed to model.compile, not something owned by the base model)
model.optimizer.get_config()

In [None]:
# Stop training when the validation loss has not improved for 'patience' consecutive epochs.
# restore_best_weights=True puts the best-epoch weights back into the model when
# training stops, so later evaluation uses the same weights the checkpoint saved
# rather than the weights of the last (worse) epoch.
early_stop = EarlyStopping(monitor='val_loss',
                           patience=5,
                           restore_best_weights=True,
                           verbose=1)

# Save the weights to disk every time the validation loss reaches a new minimum
checkpoint = ModelCheckpoint('model_weights_efficient_B5_2.h5',
                             monitor='val_loss',
                             save_best_only=True,
                             save_weights_only=True,
                             mode='min',
                             verbose=1)

# Train the full model (EfficientNetB5 backbone + classification head) from the generators
# - x: Training generator
# - steps_per_epoch: one full pass over the training generator per epoch;
#   len(train_gen) is its number of batches, so the value follows the dataset
#   size and batch size instead of being hard-coded
# - epochs: Upper bound on the number of epochs (early stopping may end training sooner)
# - callbacks: Early stopping and model checkpoint
# - validation_data: Validation generator evaluated at the end of each epoch
history = model.fit(x=train_gen,
                    steps_per_epoch=len(train_gen),
                    epochs=100,
                    callbacks=[early_stop, checkpoint],
                    validation_data=valid_gen)

In [None]:
# Total number of samples in the test DataFrame
ts_length = len(test_df)

# Largest divisor of ts_length that does not exceed 80 (the same rule used when
# test_gen was created), so the test set splits into equal batches
test_batch_size = max(
    divisor
    for divisor in range(1, min(ts_length, 80) + 1)
    if ts_length % divisor == 0
)

# Number of batches that cover the test set exactly once
test_steps = ts_length // test_batch_size

# Evaluate on the training and validation generators over one full pass each.
# test_steps must NOT be used here: it is derived from test_batch_size, while
# these generators were created with batch_size, so it would cover the wrong
# number of samples (too few, or more batches than the generator holds).
train_score = model.evaluate(train_gen, verbose=1)
valid_score = model.evaluate(valid_gen, verbose=1)

# Evaluate on the test generator; test_steps batches of test_batch_size cover it exactly
test_score = model.evaluate(test_gen, steps=test_steps, verbose=1)

# Each score is [loss, accuracy], matching the loss and metric given to model.compile

# Print the evaluation results for the training dataset
print("Train Loss: ", train_score[0])
print("Train Accuracy: ", train_score[1])
print('-' * 20)

# Print the evaluation results for the validation dataset
print("Validation Loss: ", valid_score[0])
print("Validation Accuracy: ", valid_score[1])
print('-' * 20)

# Print the evaluation results for the test dataset
print("Test Loss: ", test_score[0])
print("Test Accuracy: ", test_score[1])

# EfficientNet B5
## (The model above is EfficientNetB5, which shows the best accuracy compared to the other models)
## Train Accuracy: 100%
## Validation Accuracy: 90.2%
## Test Accuracy: 91.11%

# VGG19
## Train Accuracy: 100%
## Validation Accuracy: 80.56%
## Test Accuracy: 79.05%

# VGG16
## Train Accuracy: 100%
## Validation Accuracy:  79.16%
## Test Accuracy:  76.19%