In [None]:
import tensorflow as tf

class MyImageDataGenerator(tf.keras.utils.Sequence):
    """Thin ``Sequence`` wrapper around an existing indexable batch source.

    Length, batch lookup and the end-of-epoch hook are all delegated to the
    wrapped object, so the wrapper behaves like the object it wraps when it
    is handed to Keras.

    Args:
        image_data_generator: Sized, indexable batch source; ``len()`` and
            ``[index]`` are called on it directly.
        **kwargs: Extra options. They are stored unchanged on ``self.kwargs``
            and are not forwarded to the base class.
    """

    def __init__(self, image_data_generator, **kwargs):
        super().__init__()  # Call super().__init__() to avoid the warning
        self.image_data_generator = image_data_generator
        self.kwargs = kwargs

    def __len__(self):
        """Return the number of batches reported by the wrapped source."""
        return len(self.image_data_generator)

    def __getitem__(self, index):
        """Return batch ``index`` from the wrapped source."""
        return self.image_data_generator[index]

    def on_epoch_end(self):
        """Forward the end-of-epoch hook to the wrapped source, if it has one.

        Without this override the base-class hook runs instead and the wrapped
        object is never told that an epoch finished, so any per-epoch work it
        does (such as reshuffling its sample order) would be skipped.
        """
        hook = getattr(self.image_data_generator, "on_epoch_end", None)
        if callable(hook):
            hook()

In [None]:
import os
import numpy as np
import tensorflow as tf
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report, confusion_matrix
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.applications import VGG16
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, GlobalAveragePooling2D, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.regularizers import l2
from PIL import Image
import warnings
warnings.filterwarnings("ignore", category=UserWarning, message="Your `PyDataset` class should call `super().__init__\\(\\*\\*kwargs\\)`")

In [None]:
# Dataset root and its train / test sub-folders
base_path = '/content/drive/MyDrive/LungcancerDataSet/Data'
train_path = os.path.join(base_path, 'train')
test_path = os.path.join(base_path, 'test')

# Size, in pixels, used for every image fed to the network
IMG_WIDTH = 128
IMG_HEIGHT = 128

# Fix the NumPy and TensorFlow global seeds so repeated runs are comparable
np.random.seed(42)
tf.random.set_seed(42)

# --- Data Cleaning: Verify and clean corrupted files ---
def clean_data(directory):
    """Delete every file under ``directory`` that fails image verification.

    The tree is walked recursively. Each file is opened with ``Image.open``
    and checked with ``verify()``; a file for which either step raises
    ``IOError`` or ``SyntaxError`` is reported on stdout and removed from disk.

    Args:
        directory: Root folder to scan.

    Returns:
        None. The function works purely through its side effects (printing
        and deleting files).
    """
    for root, _, files in os.walk(directory):
        for file in files:
            img_path = os.path.join(root, file)
            try:
                # The context manager closes the file handle on success and on
                # failure, so no handle is leaked and the file is no longer
                # open when it has to be deleted below.
                with Image.open(img_path) as img:
                    img.verify()  # Verify if the image is not corrupted
            except (IOError, SyntaxError):
                print(f"Removing corrupted file: {img_path}")
                os.remove(img_path)

clean_data(train_path)
clean_data(test_path)

# --- Data Integration: index the training images for K-Fold Cross Validation ---
# Only `train_path` is read here. Each sub-directory name becomes the class
# label of the files it contains; the validation split is carved out of this
# table later by the cross-validation loop.
filepaths = []
labels = []
# os.listdir returns entries in arbitrary order; sorting keeps the row order
# (and therefore the seeded fold assignment) the same on every machine.
for class_dir in sorted(os.listdir(train_path)):
    class_path = os.path.join(train_path, class_dir)
    if not os.path.isdir(class_path):
        # A stray file next to the class folders would make the inner
        # os.listdir call fail, so it is skipped.
        continue
    for file in sorted(os.listdir(class_path)):
        file_path = os.path.join(class_path, file)
        if os.path.isfile(file_path):
            filepaths.append(file_path)
            labels.append(class_dir)

data = pd.DataFrame({"filepaths": filepaths, "labels": labels})

# --- Data Augmentation and Normalization ---
# Random geometric transforms used by the augmenting generator
augmentation_options = dict(
    rotation_range=40,
    width_shift_range=0.2,
    height_shift_range=0.2,
    shear_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True,
    fill_mode='nearest',
)

# Augmenting generator: rescales pixel values and applies the transforms above
datagen = ImageDataGenerator(rescale=1./255, **augmentation_options)

# Plain generator: rescales pixel values only
test_datagen = ImageDataGenerator(rescale=1./255)

# --- Stratified 8-Fold Cross Validation ---
# For each fold: build train/validation generators, fine-tune a VGG16-based
# classifier, save it, record its validation accuracy and plot its curves.
X = data['filepaths']
y = data['labels']

# One fixed, sorted class list so every generator in every fold uses the same
# label-to-index mapping.
class_names = sorted(y.unique())

kf = StratifiedKFold(n_splits=8, shuffle=True, random_state=42)
fold_no = 1
fold_results = []

for train_index, val_index in kf.split(X, y):
    print(f"\nTraining fold {fold_no}")

    # Release the Keras state left behind by the previous fold; otherwise a
    # fresh VGG16 is stacked on top of the old ones on every iteration.
    tf.keras.backend.clear_session()

    # Create train and validation splits
    train_data = data.iloc[train_index]
    val_data = data.iloc[val_index]

    # NOTE: Keras reads target_size as (height, width); both values are equal
    # here, so the order has no effect.
    train_generator = datagen.flow_from_dataframe(
        train_data,
        x_col='filepaths',
        y_col='labels',
        target_size=(IMG_WIDTH, IMG_HEIGHT),
        batch_size=32,
        class_mode='categorical',
        classes=class_names
    )

    # Validation images are only rescaled, never augmented, and are read in a
    # fixed order, so the monitored val_loss / val_accuracy measure the model
    # rather than the random distortions.
    val_generator = test_datagen.flow_from_dataframe(
        val_data,
        x_col='filepaths',
        y_col='labels',
        target_size=(IMG_WIDTH, IMG_HEIGHT),
        batch_size=32,
        class_mode='categorical',
        classes=class_names,
        shuffle=False
    )

    # Load pre-trained VGG16 model
    base_model = VGG16(weights='imagenet', include_top=False, input_shape=(IMG_WIDTH, IMG_HEIGHT, 3))

    # Fine-tune VGG16: freeze the first 10 layers, leave the rest trainable
    for layer in base_model.layers[:10]:
        layer.trainable = False

    # Define the model
    model = Sequential([
        base_model,
        GlobalAveragePooling2D(),
        Dropout(0.5),
        Dense(256, activation='relu', kernel_regularizer=l2(0.02)),
        BatchNormalization(),
        Dropout(0.4),
        Dense(128, activation='relu', kernel_regularizer=l2(0.02)),
        Dropout(0.3),
        Dense(64, activation='relu'),
        # One output unit per class known to the training generator
        Dense(len(train_generator.class_indices), activation='softmax')
        ])
    # Compile the model
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4),
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])

    # Callbacks: stop after 5 epochs without val_loss improvement (restoring
    # the best weights) and halve the learning rate after 2 such epochs.
    early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
    lr_scheduler = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=2, min_lr=1e-6)

    # Train the model
    history = model.fit(
        train_generator,
        validation_data=val_generator,
        epochs=25,
        callbacks=[early_stopping, lr_scheduler]
    )

    # Save the model for the fold
    model.save(f'best_lung_cancer_cnn_model_fold{fold_no}.h5')

    # Evaluate on the validation set
    val_loss, val_accuracy = model.evaluate(val_generator)
    fold_results.append(val_accuracy)
    print(f"Fold {fold_no} Validation Accuracy: {val_accuracy:.2f}")

    # Plot training and validation accuracy/loss
    plt.figure(figsize=(12, 6))

    plt.subplot(1, 2, 1)
    plt.plot(history.history['accuracy'], label='Training Accuracy')
    plt.plot(history.history['val_accuracy'], label='Validation Accuracy')
    plt.title(f'Training and Validation Accuracy (Fold {fold_no})')
    plt.xlabel('Epochs')
    plt.ylabel('Accuracy')
    plt.legend()

    plt.subplot(1, 2, 2)
    plt.plot(history.history['loss'], label='Training Loss')
    plt.plot(history.history['val_loss'], label='Validation Loss')
    plt.title(f'Training and Validation Loss (Fold {fold_no})')
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.legend()

    plt.tight_layout()
    plt.show()

    fold_no += 1

print("\nK-Fold Cross-Validation Completed")

# Evaluate on the held-out test set.
# NOTE: `model` is the single model object left in scope by the most recent
# training step, not an ensemble or average over folds.
test_generator = test_datagen.flow_from_directory(
    test_path,
    target_size=(IMG_WIDTH, IMG_HEIGHT),
    batch_size=32,
    class_mode='categorical',
    shuffle=False  # keep file order fixed so predictions line up with .classes
)

final_test_loss, final_test_accuracy = model.evaluate(test_generator)
print(f"Final Test Accuracy after K-Fold: {final_test_accuracy:.2f}")

# Generate predictions and classification report
y_true = test_generator.classes
y_pred_prob = model.predict(test_generator)
y_pred = np.argmax(y_pred_prob, axis=1)

# Classification report
print("\nClassification Report:")
class_labels = list(test_generator.class_indices.keys())
# Passing the full index list keeps the report and the matrix aligned with
# `class_labels` even when a class is missing from both y_true and y_pred.
label_ids = list(range(len(class_labels)))
print(classification_report(y_true, y_pred, labels=label_ids, target_names=class_labels))

# Confusion matrix
conf_matrix = confusion_matrix(y_true, y_pred, labels=label_ids)
plt.figure(figsize=(8, 6))
plt.imshow(conf_matrix, cmap='Blues', interpolation='nearest')
plt.title("Confusion Matrix")
plt.colorbar()
# Name the rows/columns and print the counts so the figure can be read alone
plt.xticks(label_ids, class_labels, rotation=45, ha='right')
plt.yticks(label_ids, class_labels)
text_threshold = conf_matrix.max() / 2 if conf_matrix.size else 0
for row in label_ids:
    for col in label_ids:
        count = conf_matrix[row, col]
        plt.text(col, row, str(count), ha='center', va='center',
                 color='white' if count > text_threshold else 'black')
plt.xlabel('Predicted Labels')
plt.ylabel('True Labels')
plt.tight_layout()
plt.show()

# Print average accuracy across folds
print(f"\nAverage Validation Accuracy Across Folds: {np.mean(fold_results):.2f}")
