In [None]:
# Colab-only step: prompt the user to upload the Kaggle API token (kaggle.json).
# A later cell sets KAGGLE_CONFIG_DIR to /content, so the token is expected to
# end up there -- presumably the runtime's working directory; verify if the
# download step reports missing credentials.
from google.colab import files
files.upload()  # Upload the kaggle.json file

In [None]:
# Point KAGGLE_CONFIG_DIR at /content so the Kaggle CLI invoked in the next cell
# looks for kaggle.json there.
import os
os.environ['KAGGLE_CONFIG_DIR'] = "/content"

In [None]:
# Download the LungHist700 dataset archive with the Kaggle CLI; the following
# cell expects the result to be named lunghist700.zip in the working directory.
!kaggle datasets download -d abdullahhasansajjad/lunghist700


In [None]:
!unzip lunghist700.zip -d /content/LungHist700

In [None]:
import os
import random
import shutil
import numpy as np
import matplotlib.pyplot as plt
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, RocCurveDisplay
import tensorflow as tf
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras import models, layers
from tensorflow.keras.applications import MobileNetV2
from tensorflow.keras.optimizers import Adam

# === 1. Data Splitting ===
def split_dataset(original_data_dir, output_dir, train_ratio=0.8, val_ratio=0.1, test_ratio=0.1, seed=42):
    """Copy a class-per-folder dataset into ``train``/``val``/``test`` folders.

    Every sub-directory of ``original_data_dir`` is treated as one class. Its
    files are shuffled with a seeded RNG and copied (not moved) into
    ``output_dir/<split>/<class_name>/``.

    Args:
        original_data_dir: Directory containing one sub-directory per class.
        output_dir: Destination root; created if it does not exist.
        train_ratio: Fraction of each class copied to ``train`` (rounded down).
        val_ratio: Fraction of each class copied to ``val`` (rounded down).
        test_ratio: Accepted for API symmetry only. The test split always
            receives whatever remains after the train and val files are taken,
            so that no file is dropped by rounding.
        seed: Seed passed to ``random.seed`` (this reseeds the global RNG).

    Raises:
        ValueError: If a ratio is negative or ``train_ratio + val_ratio``
            exceeds 1, which cannot describe a valid split.

    Note:
        Existing files under ``output_dir`` are not removed. Re-running with a
        different seed or different ratios into the same directory can leave
        the same image in more than one split.
    """
    # Small tolerance so sums such as 0.7 + 0.3 are not rejected due to float rounding.
    if train_ratio < 0 or val_ratio < 0 or train_ratio + val_ratio > 1 + 1e-9:
        raise ValueError(
            "train_ratio and val_ratio must be non-negative and sum to at most 1 "
            f"(got train_ratio={train_ratio}, val_ratio={val_ratio})"
        )

    random.seed(seed)
    os.makedirs(output_dir, exist_ok=True)
    train_dir = os.path.join(output_dir, "train")
    val_dir = os.path.join(output_dir, "val")
    test_dir = os.path.join(output_dir, "test")
    os.makedirs(train_dir, exist_ok=True)
    os.makedirs(val_dir, exist_ok=True)
    os.makedirs(test_dir, exist_ok=True)

    # os.listdir returns entries in arbitrary order; sorting both the class list
    # and each file list makes the seeded shuffle reproducible across machines.
    for class_name in sorted(os.listdir(original_data_dir)):
        class_path = os.path.join(original_data_dir, class_name)
        if not os.path.isdir(class_path):
            continue
        # Keep regular files only: shutil.copy cannot copy a nested directory.
        images = [
            entry for entry in sorted(os.listdir(class_path))
            if os.path.isfile(os.path.join(class_path, entry))
        ]
        random.shuffle(images)

        n = len(images)
        n_train = int(n * train_ratio)
        n_val = int(n * val_ratio)

        train_images = images[:n_train]
        val_images = images[n_train:n_train + n_val]
        test_images = images[n_train + n_val:]  # remainder goes to test

        for split_dir, split_images in zip(
            [train_dir, val_dir, test_dir],
            [train_images, val_images, test_images]
        ):
            class_out_dir = os.path.join(split_dir, class_name)
            os.makedirs(class_out_dir, exist_ok=True)
            for img in split_images:
                shutil.copy(os.path.join(class_path, img), os.path.join(class_out_dir, img))

    print("Dataset successfully split!")

# === Paths ===
# Source folder: split_dataset treats every sub-directory found here as one class.
original_data_dir = "/content/LungHist700/LungHist700_combined/data/images/"
# Destination root: split_dataset creates train/, val/ and test/ beneath it, and
# the generator cell reads those three folders back via this same variable.
output_dir = "/content/output_split"


# Copy the images into the three splits using the function's default ratios and seed.
split_dataset(original_data_dir, output_dir)

In [None]:
# === 2. Data Augmentation & Generators ===
IMG_SIZE = (224, 224)
BATCH_SIZE = 32

# Input scaling: the ImageNet-pretrained MobileNetV2 used later in this notebook
# expects pixels in [-1, 1], which is what mobilenet_v2.preprocess_input produces.
# The previous `rescale=1./255` fed it [0, 1] inputs, a range the pretrained
# filters were not trained on. The same function is applied to all three splits
# so that train, validation and test images are scaled identically.

# Training images additionally get light geometric augmentation.
train_datagen = ImageDataGenerator(
    preprocessing_function=tf.keras.applications.mobilenet_v2.preprocess_input,
    horizontal_flip=True,
    rotation_range=15,
    width_shift_range=0.1,
    height_shift_range=0.1,
    zoom_range=0.1,
    shear_range=0.1,
    fill_mode='nearest'
)

# Validation and test images are only scaled, never augmented.
val_test_datagen = ImageDataGenerator(
    preprocessing_function=tf.keras.applications.mobilenet_v2.preprocess_input
)

train_generator = train_datagen.flow_from_directory(
    os.path.join(output_dir, "train"),
    target_size=IMG_SIZE,
    batch_size=BATCH_SIZE,
    class_mode='categorical',
    shuffle=True
)

# shuffle=False keeps batches in file order, so `.classes` lines up with predictions.
val_generator = val_test_datagen.flow_from_directory(
    os.path.join(output_dir, "val"),
    target_size=IMG_SIZE,
    batch_size=BATCH_SIZE,
    class_mode='categorical',
    shuffle=False
)

test_generator = val_test_datagen.flow_from_directory(
    os.path.join(output_dir, "test"),
    target_size=IMG_SIZE,
    batch_size=BATCH_SIZE,
    class_mode='categorical',
    shuffle=False
)

In [None]:
# === 3. Compute Class Weights ===
# The directory iterator already holds one integer label per training file in
# `.classes`. Reading it directly avoids decoding and augmenting every training
# image just to recover its label, and does not depend on how the (endless)
# generator batches its final partial batch.
y_train = np.asarray(train_generator.classes)

# Classes that actually occur in the training split.
present_classes = np.unique(y_train)

class_weights = compute_class_weight(
    class_weight='balanced',
    classes=present_classes,
    y=y_train
)
# Key the weights by the real class index rather than by position, so the mapping
# stays correct even if some class has no training samples. Plain int/float
# values keep the printed dictionary readable.
class_weight_dict = {int(c): float(w) for c, w in zip(present_classes, class_weights)}
print("Class weights:", class_weight_dict)

In [None]:
from tensorflow.keras.applications import MobileNetV2
from tensorflow.keras import layers, models
from tensorflow.keras.optimizers import Adam

# === 1. Load Pretrained Base ===
# ImageNet-pretrained MobileNetV2 without its classification head; every layer
# is frozen so that only the new head defined below is trained.
base_model = MobileNetV2(
    include_top=False,
    weights='imagenet',
    input_shape=(*IMG_SIZE, 3),
)
base_model.trainable = False

# === 2. Build Model on Top ===
# Head: global average pooling -> 128-unit ReLU layer -> dropout -> softmax with
# one unit per class discovered by the training generator.
model = models.Sequential()
model.add(base_model)
model.add(layers.GlobalAveragePooling2D())
model.add(layers.Dense(128, activation='relu'))
model.add(layers.Dropout(0.3))
model.add(layers.Dense(train_generator.num_classes, activation='softmax'))

# === 3. Compile the Model ===
# Categorical cross-entropy matches the one-hot labels produced by
# class_mode='categorical' in the generators.
model.compile(
    loss='categorical_crossentropy',
    optimizer=Adam(learning_rate=1e-4),
    metrics=['accuracy'],
)

# === 4. Train the Model ===
# class_weight re-weights each sample's loss using the weights computed earlier.
history = model.fit(
    train_generator,
    epochs=60,
    validation_data=val_generator,
    class_weight=class_weight_dict,
)


In [None]:
# === 6. Evaluate on Test Set ===
test_loss, test_acc = model.evaluate(test_generator)
print(f"Test accuracy: {test_acc:.4f}")

# Predictions and true labels.
# test_generator was created with shuffle=False, so the prediction order matches
# the file order behind `test_generator.classes`.
y_pred_probs = model.predict(test_generator)
y_pred = np.argmax(y_pred_probs, axis=1)
y_true = test_generator.classes

# Class names as a concrete list (not a dict view), plus the full set of label ids.
# Passing `labels` explicitly keeps the report and the confusion matrix at one
# row/column per class even when a class is absent from both the test labels and
# the predictions; otherwise scikit-learn infers fewer labels and they no longer
# line up with `target_names` or the heatmap tick labels.
class_names = list(train_generator.class_indices.keys())
class_labels = list(range(len(class_names)))

# Classification report
print("Classification Report:")
print(classification_report(y_true, y_pred, labels=class_labels, target_names=class_names))

# Confusion matrix
cm = confusion_matrix(y_true, y_pred, labels=class_labels)
print("Confusion Matrix:\n", cm)

# Plot Confusion Matrix
import seaborn as sns

plt.figure(figsize=(7, 5))
sns.heatmap(cm, annot=True, fmt="d",
            xticklabels=class_names,
            yticklabels=class_names)
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('Confusion Matrix')
plt.show()


In [None]:
# === 7. Plot Training Graphs ===
def plot_history(h):
    """Plot per-epoch accuracy and loss curves side by side.

    Args:
        h: Object exposing a ``history`` dict that maps metric names to
            per-epoch value lists (e.g. the return value of ``model.fit``).
            It must contain ``'accuracy'`` and ``'loss'``. ``'val_accuracy'``
            and ``'val_loss'`` are drawn only when present, so a run trained
            without validation data no longer raises ``KeyError``.

    Returns:
        None. The figure is left as the current matplotlib figure.
    """
    fig, axes = plt.subplots(1, 2, figsize=(12, 4))

    # (history key, legend suffix, panel title) for the left and right panels.
    panels = (('accuracy', 'acc', 'Accuracy'), ('loss', 'loss', 'Loss'))
    for ax, (metric, suffix, title) in zip(axes, panels):
        ax.plot(h.history[metric], label=f'train_{suffix}')
        val_key = f'val_{metric}'
        if val_key in h.history:
            ax.plot(h.history[val_key], label=f'val_{suffix}')
        ax.set_xlabel('Epoch')
        ax.set_ylabel(title)
        ax.set_title(title)
        ax.legend()

    fig.tight_layout()

plot_history(history)
# `history_fine` is not defined by any cell visible in this notebook (it would come
# from a separate fine-tuning stage). Plot it only when it exists, so the notebook
# still survives Restart & Run All instead of stopping here with a NameError.
if 'history_fine' in globals():
    plot_history(history_fine)

In [None]:
# === 8. ROC-AUC (One-vs-Rest) for multi-class ===
from sklearn.preprocessing import label_binarize

n_classes = train_generator.num_classes
class_names = list(train_generator.class_indices.keys())

# One indicator column per class for the one-vs-rest computation.
y_true_bin = label_binarize(y_true, classes=list(range(n_classes)))
if n_classes == 2:
    # For a two-class problem label_binarize returns a single column (the positive
    # class). Expand it to two columns so the per-class indexing below stays valid.
    y_true_bin = np.hstack([1 - y_true_bin, y_true_bin])

# Compute ROC AUC for each class (class i versus all other classes).
roc_auc = {}
for i in range(n_classes):
    roc_auc[i] = roc_auc_score(y_true_bin[:, i], y_pred_probs[:, i])

print("ROC-AUC scores per class:")
for cls, score in zip(class_names, roc_auc.values()):
    print(f"{cls}: {score:.3f}")

# Plot ROC Curves.
# All curves are drawn on one shared Axes. Without `ax=`, from_predictions opens a
# new figure per class, which left the 8x6 figure empty and put the title on the
# last class's plot only.
fig, ax = plt.subplots(figsize=(8, 6))
for i, class_name in enumerate(class_names):
    RocCurveDisplay.from_predictions(
        y_true_bin[:, i], y_pred_probs[:, i], name=class_name, ax=ax
    )
ax.set_title("ROC Curves for each class")
plt.show()

In [None]:
from sklearn.metrics import accuracy_score

# Overall test-set accuracy. `y_true` and `y_pred` come from the evaluation cell:
# y_true is test_generator.classes and y_pred is the argmax of the predicted
# class probabilities, so this cell must run after that one.
accuracy = accuracy_score(y_true, y_pred)
print(f"Accuracy: {accuracy*100:.2f}%")
