# **CNN with KFold**

In [None]:
import os
from PIL import Image
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
from tensorflow.keras import layers, models, regularizers
from tensorflow.keras.applications import VGG16
from sklearn.model_selection import KFold
from sklearn.metrics import classification_report, confusion_matrix
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
import warnings

# Ignore all warnings
warnings.filterwarnings("ignore")
warnings.filterwarnings("ignore", message=".*Tracing is expensive.*")

# Function to load images from a folder
def load_images_from_folder(folder, label, target_size=(100, 100)):
    """Load every readable image in *folder* as a resized grayscale array.

    Args:
        folder: Directory whose entries are the images of one class.
        label: Value appended to the label list once per loaded image.
        target_size: Size tuple handed to ``Image.resize``.

    Returns:
        An ``(images, labels)`` tuple of equal-length lists: one 2-D NumPy
        array per loaded image and the matching copies of *label*.

    Raises:
        FileNotFoundError: If *folder* does not exist.
    """
    # Fail early with a clear message if the folder doesn't exist
    if not os.path.exists(folder):
        raise FileNotFoundError(f"Folder not found: {folder}")

    images = []
    labels = []
    # os.listdir returns entries in arbitrary order; sorting keeps the sample
    # order independent of the filesystem.
    for filename in sorted(os.listdir(folder)):
        path = os.path.join(folder, filename)
        # Sub-directories (e.g. notebook checkpoint folders) are not images
        if not os.path.isfile(path):
            continue
        try:
            # The context manager closes the underlying file handle promptly
            with Image.open(path) as img:
                # Convert to grayscale, resize to a fixed size, copy to NumPy
                pixels = np.array(img.convert('L').resize(target_size))
        except OSError:
            # Image.open raises (it never returns None) for files it cannot
            # read; PIL's UnidentifiedImageError is an OSError subclass.
            print(f"Skipping unreadable image: {path}")
            continue
        images.append(pixels)
        labels.append(label)
    return images, labels


# Define the paths to the folders (one folder per class)
benign_folder = '/content/drive/MyDrive/lung_cancer_model/Benign'
malignant_folder = '/content/drive/MyDrive/lung_cancer_model/Malignant'
adenocarcinoma_folder = '/content/drive/MyDrive/lung_cancer_model/Adenocarcinoma'
large_cell_carcinoma_folder = '/content/drive/MyDrive/lung_cancer_model/Large.Cell.Carcinoma'
normal_folder = '/content/drive/MyDrive/lung_cancer_model/Normal'
squamous_cell_carcinoma_folder = '/content/drive/MyDrive/lung_cancer_model/Squamous.Cell.Carcinoma'

# Load images and labels for each category; the integer is the class label
benign_images, benign_labels = load_images_from_folder(benign_folder, 0)
malignant_images, malignant_labels = load_images_from_folder(malignant_folder, 1)
adenocarcinoma_images, adenocarcinoma_labels = load_images_from_folder(adenocarcinoma_folder, 2)
large_cell_carcinoma_images, large_cell_carcinoma_labels = load_images_from_folder(large_cell_carcinoma_folder, 3)
normal_images, normal_labels = load_images_from_folder(normal_folder, 4)
squamous_cell_carcinoma_images, squamous_cell_carcinoma_labels = load_images_from_folder(squamous_cell_carcinoma_folder, 5)

# Merge the per-class lists into single arrays, class after class.
# Flattening the Python lists first (rather than concatenating six arrays)
# keeps this working when one class folder yields no images: an empty list
# would otherwise become a 1-D array and clash with the 3-D ones.
per_class_images = [benign_images, malignant_images, adenocarcinoma_images,
                    large_cell_carcinoma_images, normal_images, squamous_cell_carcinoma_images]
per_class_labels = [benign_labels, malignant_labels, adenocarcinoma_labels,
                    large_cell_carcinoma_labels, normal_labels, squamous_cell_carcinoma_labels]
images = np.array([img for group in per_class_images for img in group])
labels = np.array([lab for group in per_class_labels for lab in group])
if images.size == 0:
    raise ValueError("No images were loaded from any of the class folders")

# Normalize the pixel values to range [0, 1]; float32 halves the memory of
# the float64 array a plain division would produce.
images = images.astype("float32") / 255.0

# Reshape the input data to fit the CNN input shape (single grayscale channel)
height, width = images[0].shape
images = images.reshape((-1, height, width, 1))

# Define the number of folds
n_splits = 16

# Initialize KFold (seeded, so the same folds are produced on every split call)
kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

# Per-fold training histories: one list of per-epoch values for each fold
train_accuracy = []
train_loss = []
val_accuracy = []
val_loss = []

# Initialize lists to store true and predicted labels for all folds
all_true_labels = []
all_predicted_labels = []

# Seeded generator used to shuffle each fold's training indices (see below)
cnn_shuffle_rng = np.random.default_rng(42)

# Iterate over the splits
for fold, (train_index, test_index) in enumerate(kf.split(images)):
    print(f"Fold {fold+1}/{n_splits}")

    # validation_split takes the LAST samples of the training arrays, before
    # Keras shuffles them. The dataset is stored class after class, so
    # without this shuffle the validation slice would be dominated by the
    # last classes and val_loss would not reflect all of them.
    train_index = cnn_shuffle_rng.permutation(train_index)

    # Split the dataset into training and testing sets for this fold
    X_train, X_test = images[train_index], images[test_index]
    y_train, y_test = labels[train_index], labels[test_index]

    # Define data augmentation
    data_augmentation = tf.keras.Sequential([
        layers.RandomFlip("horizontal_and_vertical"),
        layers.RandomRotation(0.2),
        layers.RandomZoom(0.1),
    ])

    # A fresh model per fold. The augmentation layers are part of the model,
    # so they draw new random transforms each training step and are inactive
    # for validation, evaluate() and predict().
    model = models.Sequential([
        layers.Input(shape=(height, width, 1)),
        data_augmentation,
        layers.Conv2D(32, (3, 3), activation='relu', kernel_regularizer=regularizers.l2(0.001)),
        layers.BatchNormalization(),
        layers.MaxPooling2D((2, 2)),
        layers.Conv2D(64, (3, 3), activation='relu', kernel_regularizer=regularizers.l2(0.001)),
        layers.BatchNormalization(),
        layers.MaxPooling2D((2, 2)),
        layers.Conv2D(128, (3, 3), activation='relu', kernel_regularizer=regularizers.l2(0.001)),
        layers.BatchNormalization(),
        layers.MaxPooling2D((2, 2)),
        layers.Flatten(),
        layers.Dense(128, activation='relu'),
        layers.Dropout(0.5),
        layers.Dense(6, activation='softmax')
    ])

    # Compile the model (integer labels -> sparse categorical cross-entropy)
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])

    # Define callbacks: stop when val_loss stalls, halve the LR on plateaus
    early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
    reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=3, verbose=1)

    # Train the model
    history = model.fit(X_train, y_train, epochs=40, batch_size=64, validation_split=0.1, verbose=1,
                        callbacks=[early_stopping, reduce_lr])

    # Evaluate the model on the held-out fold
    test_loss, test_accuracy = model.evaluate(X_test, y_test)
    print(f"Fold {fold+1} Accuracy: {test_accuracy*100:.2f}%")

    # Make predictions
    predictions_prob = model.predict(X_test)
    predictions = np.argmax(predictions_prob, axis=1)

    # Append true and predicted labels for this fold
    all_true_labels.extend(y_test)
    all_predicted_labels.extend(predictions)

    # Histories can differ in length between folds because of early stopping
    train_accuracy.append(history.history['accuracy'])
    train_loss.append(history.history['loss'])
    val_accuracy.append(history.history['val_accuracy'])
    val_loss.append(history.history['val_loss'])

# Add VGG16 model for comparison
print("\nEvaluating VGG16 model")

# Load VGG16 base model; it is frozen, so only the dense head is trained
vgg16_base = VGG16(weights='imagenet', include_top=False, input_shape=(height, width, 3))
vgg16_base.trainable = False

# Prepare RGB images for VGG16
images_rgb = np.repeat(images, 3, axis=-1)  # Convert grayscale to RGB

# The ImageNet weights expect inputs run through VGG16's own preprocessing
vgg16_preprocess = tf.keras.applications.vgg16.preprocess_input

# Iterate over the splits for VGG16
vgg16_true_labels = []
vgg16_predicted_labels = []

# Seeded generator used to shuffle each fold's training indices (see below)
vgg16_shuffle_rng = np.random.default_rng(42)

for fold, (train_index, test_index) in enumerate(kf.split(images_rgb)):
    print(f"VGG16 Fold {fold+1}/{n_splits}")

    # validation_split takes the LAST samples of the training arrays, before
    # Keras shuffles them; the dataset is stored class after class, so
    # shuffle the indices to get a validation slice covering all classes.
    train_index = vgg16_shuffle_rng.permutation(train_index)

    # Split the dataset into training and testing sets for this fold
    X_train, X_test = images_rgb[train_index], images_rgb[test_index]
    y_train, y_test = labels[train_index], labels[test_index]

    # preprocess_input works on 0-255 RGB values. The multiplication creates
    # new arrays, so X_train / X_test keep their [0, 1] values (X_test is
    # plotted by the sample-display cells further down).
    X_train_vgg = vgg16_preprocess(X_train * 255.0)
    X_test_vgg = vgg16_preprocess(X_test * 255.0)

    # Define the VGG16-based model: shared frozen base + a fresh head per fold
    vgg16_model = models.Sequential([
        vgg16_base,
        layers.Flatten(),
        layers.Dense(256, activation='relu'),
        layers.Dropout(0.5),
        layers.Dense(6, activation='softmax')
    ])

    # Compile the VGG16 model
    vgg16_model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
                        loss='sparse_categorical_crossentropy',
                        metrics=['accuracy'])

    # Same callbacks as the CNN loop so both models train under one regime
    vgg16_early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
    vgg16_reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=3, verbose=1)

    # Train the VGG16 model
    vgg16_model.fit(X_train_vgg, y_train, epochs=40, batch_size=64, validation_split=0.1, verbose=1,
                    callbacks=[vgg16_early_stopping, vgg16_reduce_lr])

    # Evaluate the VGG16 model on test data
    vgg16_test_loss, vgg16_test_accuracy = vgg16_model.evaluate(X_test_vgg, y_test)
    print(f"VGG16 Fold {fold+1} Accuracy: {vgg16_test_accuracy*100:.2f}%")

    # Make predictions
    vgg16_predictions_prob = vgg16_model.predict(X_test_vgg)
    vgg16_predictions = np.argmax(vgg16_predictions_prob, axis=1)

    # Append true and predicted labels for this fold
    vgg16_true_labels.extend(y_test)
    vgg16_predicted_labels.extend(vgg16_predictions)

# Aggregate VGG16 results over all folds: text report, then heatmap
print("\nVGG16 Classification Report:")
vgg16_report = classification_report(vgg16_true_labels, vgg16_predicted_labels)
print(vgg16_report)

print("VGG16 Confusion Matrix:")
vgg16_conf_matrix = confusion_matrix(vgg16_true_labels, vgg16_predicted_labels)
vgg16_heatmap_ax = sns.heatmap(vgg16_conf_matrix, annot=True, fmt="d", cmap="Blues")
# Label the axes the heatmap was drawn on
vgg16_heatmap_ax.set_xlabel("Predicted")
vgg16_heatmap_ax.set_ylabel("True")
plt.show()

# Aggregate results of the custom CNN over all folds, same layout
print("\nClassification Report:")
cnn_report = classification_report(all_true_labels, all_predicted_labels)
print(cnn_report)

print("Confusion Matrix:")
conf_matrix = confusion_matrix(all_true_labels, all_predicted_labels)
cnn_heatmap_ax = sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="Blues")
cnn_heatmap_ax.set_xlabel("Predicted")
cnn_heatmap_ax.set_ylabel("True")
plt.show()

# **TRAINING CURVES (ACCURACY AND LOSS)**

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

def _pad_with_nan(histories, length):
    """Return copies of the per-fold histories right-padded with NaN to *length*."""
    return [list(values) + [np.nan] * (length - len(values)) for values in histories]


# Get the maximum number of epochs across all folds (folds may stop early)
max_epochs = max(len(acc) for acc in train_accuracy)

# Pad shorter histories with NaN to make them equal length; nanmean below
# then averages each epoch over only the folds that reached it.
train_accuracy_padded = _pad_with_nan(train_accuracy, max_epochs)
val_accuracy_padded = _pad_with_nan(val_accuracy, max_epochs)
train_loss_padded = _pad_with_nan(train_loss, max_epochs)
val_loss_padded = _pad_with_nan(val_loss, max_epochs)


# Create the epochs range based on the maximum number of epochs
epochs = range(max_epochs)

fig, ax = plt.subplots(1, 2)
fig.set_size_inches(20, 10)

# Plot training and validation accuracy. The val_* histories are measured on
# the validation split of each training fold, not on the held-out test fold,
# so they are labelled "Validation".
ax[0].plot(epochs, np.nanmean(train_accuracy_padded, axis=0), 'go-', label='Training Accuracy')
ax[0].plot(epochs, np.nanmean(val_accuracy_padded, axis=0), 'ro-', label='Validation Accuracy')
ax[0].set_title('Training & Validation Accuracy')
ax[0].legend()
ax[0].set_xlabel("Epochs")
ax[0].set_ylabel("Accuracy")

# Plot training and validation loss
ax[1].plot(epochs, np.nanmean(train_loss_padded, axis=0), 'g-o', label='Training Loss')
ax[1].plot(epochs, np.nanmean(val_loss_padded, axis=0), 'r-o', label='Validation Loss')
ax[1].set_title('Training & Validation Loss')
ax[1].legend()
ax[1].set_xlabel("Epochs")
ax[1].set_ylabel("Loss")

plt.show()

# **CLASSIFICATION REPORT AND SAMPLE PREDICTIONS**

In [None]:
print("Classification Report:")

# Class names indexed by numeric label (position i names label i)
target_names = ['Benign', 'Malignant', 'Adenocarcinoma', 'Large Cell Carcinoma', 'Normal', 'Squamous Cell Carcinoma']

# Passing labels explicitly pins each name to its label. Without it the
# report infers the label set from the data and raises if a class is missing,
# because the number of names no longer matches.
print(classification_report(
    all_true_labels,
    all_predicted_labels,
    labels=list(range(len(target_names))),
    target_names=target_names,
))

In [None]:
# Find indices of correct predictions
# NOTE(review): `predictions` is left over from the last fold of the CNN
# loop, while X_test / y_test were last assigned by the VGG16 loop. They line
# up only because both loops iterate the same seeded KFold splitter.
correct_indices = np.where(predictions == y_test)[0]

# Define category names (position i names label i)
category_names = ['Benign', 'Malignant', 'Adenocarcinoma','Large Cell Carcinoma','Normal', 'Squamous Cell Carcinoma']

# Display correct predictions (at most six, on a 3x2 grid)
plt.figure(figsize=(10, 10))
plt.suptitle('Correctly Predicted Samples', fontsize=16)
for i, idx in enumerate(correct_indices[:6]):
    plt.subplot(3, 2, i + 1)
    plt.xticks([])
    plt.yticks([])
    sample = X_test[idx]
    # Handle both RGB and single-channel test images instead of assuming
    # three channels, matching the incorrect-predictions cell.
    if sample.shape[-1] == 3:
        plt.imshow(sample.reshape(height, width, 3), interpolation='none')
    else:
        plt.imshow(sample.reshape(height, width), cmap="gray", interpolation='none')
    plt.title("Predicted {}, Actual {}".format(category_names[predictions[idx]], category_names[y_test[idx]]), fontsize=8)
plt.tight_layout()
plt.show()

In [None]:
# Positions in the current test fold where prediction and truth disagree
incorrect_indices = np.where(predictions != y_test)[0]

# Show up to six misclassified samples on a 3x2 grid
plt.figure(figsize=(10, 10))
plt.suptitle('Incorrectly Predicted Samples', fontsize=16)
for slot, sample_idx in enumerate(incorrect_indices[:6]):
    plt.subplot(3, 2, slot + 1)
    plt.xticks([])
    plt.yticks([])

    sample = X_test[sample_idx]
    channel_count = sample.shape[-1]
    # Three channels are drawn as RGB; anything else as a 2-D image
    if channel_count == 3:
        pixels = sample.reshape(height, width, 3)
    else:
        pixels = sample.reshape(height, width)
    # Only single-channel samples get the gray colormap
    if channel_count == 1:
        colormap = "gray"
    else:
        colormap = None
    plt.imshow(pixels, cmap=colormap, interpolation='none')

    predicted_name = category_names[predictions[sample_idx]]
    actual_name = category_names[y_test[sample_idx]]
    plt.title("Predicted {}, Actual {}".format(predicted_name, actual_name), fontsize=8)
plt.tight_layout()
plt.show()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix

# Human-readable class names; position i names numeric label i
category_names = [
    'Benign',
    'Malignant',
    'Adenocarcinoma',
    'Large Cell Carcinoma',
    'Normal',
    'Squamous Cell Carcinoma',
]

# Translate the numeric labels collected over all folds into class names
all_true_categories = [category_names[true_label] for true_label in all_true_labels]
all_predicted_categories = [category_names[predicted_label] for predicted_label in all_predicted_labels]

# Confusion matrix with rows/columns in category_names order
cm = confusion_matrix(all_true_categories, all_predicted_categories, labels=category_names)

# Draw the matrix as an annotated heatmap on its own figure
cm_figure, cm_axes = plt.subplots(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt="d",
    cmap="Blues",
    cbar=False,
    xticklabels=category_names,
    yticklabels=category_names,
    ax=cm_axes,
)
cm_axes.set_title('Confusion Matrix')
cm_axes.set_xlabel('Predicted')
cm_axes.set_ylabel('True')
plt.show()
