In [None]:
# Mount Google Drive into the Colab runtime at /content/drive; the dataset
# directories used later in this notebook are given as paths under that mount.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import tensorflow as tf
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.applications import ResNet50, EfficientNetB0, InceptionV3
from tensorflow.keras.applications import DenseNet121
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, GlobalAveragePooling2D
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import cv2
import os
import pandas as pd

In [None]:
def define_paths(dir):
    """Collect file paths and class labels from a class-per-folder directory.

    Args:
        dir: Directory whose immediate sub-directories are named after the
            classes and contain that class's files. (The name shadows the
            built-in ``dir``; it is kept so keyword callers keep working.)

    Returns:
        A ``(filepaths, labels)`` tuple of equal-length lists, where
        ``labels[i]`` is the name of the sub-directory holding ``filepaths[i]``.
        Entries are ordered by class folder name, then by file name.
    """
    filepaths = []
    labels = []
    # os.listdir() returns entries in arbitrary order; sort for reproducibility.
    for fold in sorted(os.listdir(dir)):
        foldpath = os.path.join(dir, fold)
        # A stray file next to the class folders (e.g. .DS_Store) would make
        # os.listdir(foldpath) raise NotADirectoryError, so skip non-directories.
        if not os.path.isdir(foldpath):
            continue
        for file in sorted(os.listdir(foldpath)):
            fpath = os.path.join(foldpath, file)
            # Only regular files are samples; nested folders such as
            # .ipynb_checkpoints are ignored.
            if os.path.isfile(fpath):
                filepaths.append(fpath)
                labels.append(fold)
    return filepaths, labels
def define_df(files, classes):
    """Pair file paths with their class labels in a two-column DataFrame.

    Args:
        files: Sequence of file paths; becomes the ``filepaths`` column.
        classes: Sequence of class labels; becomes the ``labels`` column.

    Returns:
        DataFrame with the columns ``filepaths`` and ``labels``, in that order.
    """
    named_columns = [
        pd.Series(values, name=column_name)
        for column_name, values in (('filepaths', files), ('labels', classes))
    ]
    # Column-wise concatenation: each Series becomes one column.
    return pd.concat(named_columns, axis=1)

def create_df(tr_dir, val_dir, ts_dir):
    """Build one file/label DataFrame per dataset split.

    Args:
        tr_dir: Directory of the training split.
        val_dir: Directory of the validation split.
        ts_dir: Directory of the test split.

    Returns:
        ``(train_df, valid_df, test_df)``, each produced by running
        ``define_paths`` on the split directory and passing the result to
        ``define_df``.
    """
    def split_frame(split_dir):
        # Scan one split directory and tabulate what was found.
        found_files, found_labels = define_paths(split_dir)
        return define_df(found_files, found_labels)

    train_df = split_frame(tr_dir)
    valid_df = split_frame(val_dir)
    test_df = split_frame(ts_dir)
    return train_df, valid_df, test_df

In [None]:
def create_gens(train_df, valid_df, test_df, batch_size):
    """Create Keras image generators for the train, validation and test splits.

    Args:
        train_df: DataFrame with a ``filepaths`` and a ``labels`` column.
        valid_df: Same layout, validation split.
        test_df: Same layout, test split. Must not be empty.
        batch_size: Batch size of the train and validation generators.

    Returns:
        ``(train_gen, valid_gen, test_gen)``. Images are resized to 224x224 RGB
        and labels are one-hot encoded (``class_mode='categorical'``). Only the
        training generator augments (random horizontal flips). The test
        generator is not shuffled.

    Raises:
        ValueError: If ``test_df`` is empty, because no test batch size exists.
    """
    img_size = (224, 224)
    max_test_batch = 80

    ts_length = len(test_df)
    if ts_length == 0:
        raise ValueError('test_df is empty: cannot derive a test batch size')
    # Largest divisor of the test-set size that does not exceed max_test_batch,
    # so the test batches cover the test set exactly with no partial batch.
    test_batch_size = max(
        d for d in range(1, min(ts_length, max_test_batch) + 1) if ts_length % d == 0
    )

    def scalar(img):
        # NOTE(review): identity function, so pixel values are passed on
        # unchanged. Confirm each backbone receives the input scaling it expects.
        return img

    tr_gen = ImageDataGenerator(preprocessing_function=scalar, horizontal_flip=True)
    ts_gen = ImageDataGenerator(preprocessing_function=scalar)

    # Options shared by all three generators.
    common = dict(x_col='filepaths', y_col='labels', target_size=img_size,
                  class_mode='categorical', color_mode='rgb')

    train_gen = tr_gen.flow_from_dataframe(train_df, shuffle=True, batch_size=batch_size, **common)
    valid_gen = ts_gen.flow_from_dataframe(valid_df, shuffle=True, batch_size=batch_size, **common)
    # shuffle=False keeps predictions aligned with test_gen.classes.
    test_gen = ts_gen.flow_from_dataframe(test_df, shuffle=False, batch_size=test_batch_size, **common)
    return train_gen, valid_gen, test_gen

**Load and Preprocess Dataset**

Assuming you have your dataset in different folders for each class, use ImageDataGenerator to load and preprocess the data:

In [None]:
# Get Dataframes
# Split directories on the mounted Google Drive. create_df lists the
# sub-folders of each one and uses the folder names as class labels.
train_dir = '/content/drive/MyDrive/DSE Project/Data/train'
test_dir = '/content/drive/MyDrive/DSE Project/Data/test'
valid_dir = '/content/drive/MyDrive/DSE Project/Data/valid'
# Argument order is (train, valid, test), which differs from the order in
# which the paths are declared above.
train_df, valid_df, test_df = create_df(train_dir, valid_dir, test_dir)

# Get Generators
# batch_size applies to the train and validation generators; create_gens
# derives the test batch size itself from the number of test images.
batch_size = 40
train_data, valid_data, test_data = create_gens(train_df, valid_df, test_df, batch_size)

**Visualize Data Composition**


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

# class_indices maps class name -> integer index. Order the names by index so
# that labels[i] is the name of class i.
labels = sorted(train_data.class_indices, key=train_data.class_indices.get)
# Despite the name, this is presumably one integer class index per training
# image (not counts); the name is kept in case other cells use it.
class_counts = train_data.labels

# Plot by class *name* with an explicit order. Positioning tick labels by index
# (the previous approach) silently mislabels bars if a class has no samples.
class_names = [labels[int(idx)] for idx in class_counts]
ax = sns.countplot(x=class_names, order=labels)
ax.set_title('Dataset Composition')
ax.set_xlabel('Class')
ax.set_ylabel('Number of Images')
ax.tick_params(axis='x', rotation=45)
plt.show()


**Define the Models (ResNet50, EfficientNetB0, InceptionV3, DenseNet121)**

Create models using ResNet50, EfficientNetB0, InceptionV3, and DenseNet121 as base models:

In [None]:
def build_model(base_model, num_classes=4, dropout_rate=0.5):
    """Attach a classification head to a convolutional base and compile it.

    The head is global average pooling -> dropout -> softmax dense layer.

    Args:
        base_model: Feature extractor placed first in the Sequential model
            (here, a Keras application created with ``include_top=False``).
        num_classes: Number of output classes. Defaults to 4 (adenocarcinoma,
            large-cell, squamous-cell, normal).
        dropout_rate: Fraction of pooled features dropped during training.
            Defaults to 0.5.

    Returns:
        The model compiled with the Adam optimizer, categorical cross-entropy
        loss and the accuracy metric.

    Raises:
        ValueError: If ``num_classes`` is below 2 or ``dropout_rate`` is
            outside ``[0, 1)``.
    """
    if num_classes < 2:
        raise ValueError(f'num_classes must be at least 2, got {num_classes}')
    if not 0.0 <= dropout_rate < 1.0:
        raise ValueError(f'dropout_rate must be in [0, 1), got {dropout_rate}')

    model = Sequential([
        base_model,
        GlobalAveragePooling2D(),
        Dropout(dropout_rate),
        Dense(num_classes, activation='softmax'),
    ])

    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model

# Create base models with pre-trained weights
# include_top=False leaves out each network's own classification layers;
# build_model adds the new pooling/dropout/softmax head on top.
resnet_base = ResNet50(weights='imagenet', include_top=False, input_shape=(224, 224, 3))
efficientnet_base = EfficientNetB0(weights='imagenet', include_top=False, input_shape=(224, 224, 3))
inceptionv3_base = InceptionV3(weights='imagenet', include_top=False, input_shape=(224, 224, 3))
densenet_base = DenseNet121(weights='imagenet', include_top=False, input_shape=(224, 224, 3))

resnet_model = build_model(resnet_base)
efficientnet_model = build_model(efficientnet_base)
inceptionv3_model = build_model(inceptionv3_base)
# NOTE(review): unlike the three lines above, this rebinds `densenet_base` to
# the compiled classifier instead of creating `densenet_model`. Later cells
# train and evaluate the DenseNet classifier through the name `densenet_base`,
# so a rename has to be applied to all of them together.
densenet_base = build_model(densenet_base)

**Train the models**

In [None]:
from sklearn.utils import class_weight

# 'balanced' weights give rarer classes a larger weight, so under-represented
# classes contribute more to the loss.
train_class_ids = train_data.classes
balanced_weights = class_weight.compute_class_weight(
    'balanced',
    classes=np.unique(train_class_ids),
    y=train_class_ids,
)
# Keras expects a {class index: weight} mapping.
class_weights = dict(enumerate(balanced_weights))

# Every model is trained with exactly the same settings.
fit_options = dict(validation_data=valid_data, epochs=50, class_weight=class_weights)

# ResNet50
resnet_history = resnet_model.fit(train_data, **fit_options)

# EfficientNetB0
efficientnet_history = efficientnet_model.fit(train_data, **fit_options)

# InceptionV3
inceptionv3_history = inceptionv3_model.fit(train_data, **fit_options)

# DenseNet121 (the compiled classifier is bound to the name `densenet_base`)
densenet_history = densenet_base.fit(train_data, **fit_options)


**Plot Training, Validation Accuracy/Loss for Each Model**

In [None]:
def plot_history(history, model_name):
    """Show training vs. validation accuracy and loss curves side by side.

    Args:
        history: Object whose ``history`` attribute is a dict holding the
            per-epoch lists ``accuracy``, ``val_accuracy``, ``loss`` and
            ``val_loss`` (e.g. the return value of ``model.fit``).
        model_name: Name used as the prefix of both panel titles.
    """
    curves = history.history
    train_acc, valid_acc = curves['accuracy'], curves['val_accuracy']
    train_loss, valid_loss = curves['loss'], curves['val_loss']

    # One x position per recorded epoch, shared by all four curves.
    epoch_axis = range(len(train_acc))

    _, (acc_ax, loss_ax) = plt.subplots(1, 2, figsize=(12, 8))

    # Left panel: accuracy.
    acc_ax.plot(epoch_axis, train_acc, label='Training Accuracy')
    acc_ax.plot(epoch_axis, valid_acc, label='Validation Accuracy')
    acc_ax.legend(loc='lower right')
    acc_ax.set_title(f'{model_name} - Training and Validation Accuracy')

    # Right panel: loss.
    loss_ax.plot(epoch_axis, train_loss, label='Training Loss')
    loss_ax.plot(epoch_axis, valid_loss, label='Validation Loss')
    loss_ax.legend(loc='upper right')
    loss_ax.set_title(f'{model_name} - Training and Validation Loss')

    plt.show()

# Plot the learning curves of every trained model:
# ResNet50, EfficientNetB0, InceptionV3 and DenseNet121.
for fitted_history, display_name in (
    (resnet_history, 'ResNet50'),
    (efficientnet_history, 'EfficientNetB0'),
    (inceptionv3_history, 'InceptionV3'),
    (densenet_history, 'DenseNet121'),
):
    plot_history(fitted_history, display_name)

**Evaluate and Compare Models on Test Data**

In [None]:
# @title
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix, classification_report, roc_curve, auc
from sklearn.preprocessing import label_binarize
from tensorflow.keras.utils import to_categorical

def evaluate_model(model, test_data, model_name, labels):
    """Evaluate a trained classifier on the test generator and plot diagnostics.

    Prints the test accuracy and a classification report, and shows a
    confusion-matrix heatmap and ROC curves (one curve per class, or a single
    curve when there are exactly two classes).

    Args:
        model: Compiled model whose ``evaluate`` returns ``(loss, accuracy)``
            and whose ``predict`` returns one score per class for each sample.
        test_data: Test generator exposing the true class indices as
            ``.classes``. Assumed to iterate in a fixed order (shuffle off) so
            that predictions line up with ``.classes``.
        model_name: Name used in printed output and figure titles.
        labels: Class names ordered by class index (``labels[i]`` names class i).
    """
    # Evaluate the model on test data
    test_loss, test_acc = model.evaluate(test_data)
    print(f'{model_name} - Test Accuracy: {test_acc*100:.2f}%')

    # Per-class scores -> predicted class index
    Y_pred = model.predict(test_data)
    y_pred = np.argmax(Y_pred, axis=1)

    y_true = np.asarray(test_data.classes)
    n_classes = len(labels)
    # Pin the label set explicitly. Otherwise scikit-learn infers it from the
    # data, and a class missing from both y_true and y_pred would shrink the
    # matrix/report and misalign (or reject) the class names.
    class_ids = np.arange(n_classes)

    # Confusion matrix (rows: true class, columns: predicted class)
    cm = confusion_matrix(y_true, y_pred, labels=class_ids)
    plt.figure(figsize=(8,6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=labels, yticklabels=labels)
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.title(f'{model_name} - Confusion Matrix')
    plt.show()

    # Classification report
    print(f'{model_name} - Classification Report')
    print(classification_report(y_true, y_pred, labels=class_ids, target_names=labels))

    # ROC curves and AUC; both branches draw into one explicitly created figure
    plt.figure()
    if n_classes == 2:
        # Binary case: the score of class 1 is the positive-class score.
        fpr, tpr, _ = roc_curve(y_true, Y_pred[:, 1])
        roc_auc = auc(fpr, tpr)
        plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc:.2f})')
    else:
        # One-vs-rest: binarize the true labels into one indicator column per class.
        y_true_bin = label_binarize(y_true, classes=class_ids)
        for i in range(n_classes):
            # The ROC curve is undefined for a class with no positive test samples.
            if not y_true_bin[:, i].any():
                continue
            fpr, tpr, _ = roc_curve(y_true_bin[:, i], Y_pred[:, i])
            roc_auc = auc(fpr, tpr)
            plt.plot(fpr, tpr, lw=2, label=f'Class {labels[i]} (area = {roc_auc:.2f})')

    # Chance-level diagonal for reference
    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(f'{model_name} - Receiver Operating Characteristic')
    plt.legend(loc="lower right")
    plt.show()

# Evaluate all models.
# Class names are read from the test generator and ordered by class index, so
# they match the integer labels in test_data.classes. A hand-written list can
# silently mislabel classes: Keras assigns indices in sorted class-name order,
# where 'normal' comes before 'squamous-cell-carcinoma'.
# NOTE(review): assumes the train and test folders sort into the same class
# order, so that model output column i is test class i - confirm.
labels = sorted(test_data.class_indices, key=test_data.class_indices.get)

for trained_model, trained_name in (
    (resnet_model, 'ResNet50'),
    (efficientnet_model, 'EfficientNetB0'),
    (inceptionv3_model, 'InceptionV3'),
    (densenet_base, 'DenseNet121'),  # compiled classifier is bound to `densenet_base`
):
    evaluate_model(trained_model, test_data, trained_name, labels)

**Select the Best Model**

After evaluating the test accuracy of all models and reviewing the confusion matrix and classification report, select the model with the highest accuracy or performance metrics.


**Hyperparameter Tuning (On Best Model, e.g., DenseNet121)**

In [None]:
!pip install keras-tuner

In [None]:
from tensorflow.keras.applications import DenseNet121
from keras_tuner import RandomSearch

def build_hyper_model(hp):
    """Model-building callback for the tuner: DenseNet121 base plus a tunable head.

    Hyperparameters sampled from ``hp``:
        dropout_rate: float from 0.3 to 0.7 in steps of 0.1.
        units: int from 128 to 512 in steps of 64 (width of the hidden layer).
        learning_rate: one of 1e-2, 1e-3, 1e-4 (Adam optimizer).

    Args:
        hp: Hyperparameter container supplied by the tuner.

    Returns:
        A compiled Sequential model ending in a 4-way softmax, using
        categorical cross-entropy loss and the accuracy metric.
    """
    backbone = DenseNet121(weights='imagenet', include_top=False, input_shape=(224, 224, 3))

    # Sample the search space up front, in the same order as the layers use it.
    dropout_rate = hp.Float('dropout_rate', min_value=0.3, max_value=0.7, step=0.1)
    hidden_units = hp.Int('units', min_value=128, max_value=512, step=64)
    learning_rate = hp.Choice('learning_rate', values=[1e-2, 1e-3, 1e-4])

    stack = [
        backbone,
        GlobalAveragePooling2D(),
        Dropout(dropout_rate),
        Dense(hidden_units, activation='relu'),
        Dense(4, activation='softmax'),  # 4 classes
    ]
    model = Sequential(stack)

    optimizer = tf.keras.optimizers.Adam(learning_rate)
    model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])

    return model

# NOTE(review): KerasTuner reloads an existing project found in this directory.
# Trials from earlier runs (including any scored on the test set) would be
# reused unless the directory is deleted or overwrite=True is passed - confirm.
tuner = RandomSearch(build_hyper_model, objective='val_accuracy', max_trials=5, executions_per_trial=1,
                     directory='hyperparam_tuning', project_name='best_model_tuning')

# Perform hyperparameter tuning.
# Trials are scored on the validation split. Scoring them on test_data would
# leak the test set into model selection and make any later test-set result
# optimistically biased.
tuner.search(train_data, validation_data=valid_data, epochs=10)

# Get the optimal hyperparameters
best_hp = tuner.get_best_hyperparameters()[0]
print(f"Best hyperparameters: Dropout Rate = {best_hp.get('dropout_rate')}, Units = {best_hp.get('units')}, Learning Rate = {best_hp.get('learning_rate')}")


In [None]:
import matplotlib.pyplot as plt

def plot_history(history, title):
    """Draw accuracy and loss learning curves in two side-by-side panels.

    Note: this redefines the ``plot_history`` declared in an earlier cell.
    Once this cell has run, every later call uses this version.

    Args:
        history: Training history object returned by ``model.fit``; its
            ``history`` dict must contain ``accuracy``, ``val_accuracy``,
            ``loss`` and ``val_loss``.
        title: Text appended to both panel titles.
    """
    fig, (acc_ax, loss_ax) = plt.subplots(1, 2, figsize=(10, 5))

    # Accuracy panel
    acc_ax.plot(history.history['accuracy'], label='Training Accuracy')
    acc_ax.plot(history.history['val_accuracy'], label='Validation Accuracy')
    acc_ax.set_title(f'Model Accuracy - {title}')
    acc_ax.set_ylabel('Accuracy')
    acc_ax.set_xlabel('Epoch')
    acc_ax.legend(loc='lower right')

    # Loss panel
    loss_ax.plot(history.history['loss'], label='Training Loss')
    loss_ax.plot(history.history['val_loss'], label='Validation Loss')
    loss_ax.set_title(f'Model Loss - {title}')
    loss_ax.set_ylabel('Loss')
    loss_ax.set_xlabel('Epoch')
    loss_ax.legend(loc='upper right')

    fig.tight_layout()
    plt.show()

**Retrain the Best Tuned Model and Plot the Results**

In [None]:
# Get the best model from tuning
best_model = tuner.get_best_models(num_models=1)[0]

# Train the best model further with the tuned hyperparameters.
# Validation uses the validation split; monitoring on test_data would leak the
# test set into training decisions and leave no unbiased data for evaluation.
history = best_model.fit(train_data, validation_data=valid_data, epochs=10)

In [None]:
# Plot the results of the tuned model
# `history` is the object returned by best_model.fit in the previous cell.
# The arguments are positional because the notebook defines plot_history twice
# with different names for the second parameter.
plot_history(history, 'Best Tuned DenseNet121')

**Final Steps: Save the Best Model**

In [None]:
# Save the tuned model. The path is relative, so the file is written to the
# current working directory.
# NOTE(review): that is not under the mounted /content/drive unless the working
# directory was changed - confirm the file ends up somewhere persistent.
best_model.save('best_lung_cancer_ct_model_tuned.keras')