In [None]:
import os, shutil, random
import pandas as pd
import tensorflow as tf
from tensorflow.keras import layers, models
from tensorflow.keras.preprocessing.image import ImageDataGenerator
import matplotlib.pyplot as plt
import numpy as np
import cv2
from sklearn.metrics import confusion_matrix, classification_report, roc_curve, auc
from sklearn.model_selection import train_test_split
from PIL import Image


# 1. Dataset paths (the Kaggle dataset must already be downloaded into base_dir)


# Use a raw string or forward slashes to avoid unicode escape issues on Windows
# Dataset root. Set the SKIN_CANCER_BASE_DIR environment variable to run on
# another machine; when it is unset or empty the original checkout path is used.
base_dir = os.environ.get("SKIN_CANCER_BASE_DIR") or (
    r"C:\Users\rites\Downloads\Projects\Skin-Cancer-Detection-using-Basic-CNN--Research-Prototype-"
)
images_dir = os.path.join(base_dir, "images")              # raw source images
metadata_path = os.path.join(base_dir, "GroundTruth.csv")  # label CSV read by prepare_dataset
train_dir = os.path.join(base_dir, "train")                # prepared training split
val_dir = os.path.join(base_dir, "val")                    # prepared validation split


# 2. Split + Resize helper

# Create the <split>/<class> folder layout up front so later reads and writes
# never hit a missing directory.
class_folders = [
    os.path.join(base_dir, split_name, class_name)
    for split_name in ("train", "val")
    for class_name in ("benign", "malignant")
]
for folder in class_folders:
    os.makedirs(folder, exist_ok=True)

# Class folders of the training split; checked below to detect a dataset that
# has already been prepared.
train_benign_path, train_malignant_path = (
    os.path.join(train_dir, class_name) for class_name in ("benign", "malignant")
)

def has_images(directory):
    """Return True if ``directory`` directly contains at least one image file.

    Only the directory's own entries are inspected (no recursion). An entry
    counts as an image when its name ends, case-insensitively, with
    ``.png``, ``.jpg``, ``.jpeg`` or ``.bmp``. A missing path, or a path that
    is not a directory, yields False.
    """
    # isdir (not exists): a regular file passed by mistake must return False
    # rather than make os.listdir raise NotADirectoryError.
    if not os.path.isdir(directory):
        return False
    image_suffixes = ('.png', '.jpg', '.jpeg', '.bmp')
    # any() stops at the first match instead of building the full file list.
    return any(name.lower().endswith(image_suffixes) for name in os.listdir(directory))


def prepare_dataset(split_ratio=0.8, size=(128, 128)):
    """Build the resized, stratified train/val image folders from the raw data.

    Reads ``metadata_path``, keeps the rows whose ``<image>.jpg`` exists in
    ``images_dir``, labels a row "malignant" when the sum of its MEL, BCC and
    AKIEC columns is positive (otherwise "benign"), splits the rows with a
    fixed seed, and writes an RGB copy of each image resized to ``size`` into
    ``base_dir/<split>/<label>/<image>.jpg``. Destination files that already
    exist are left alone, so the function can be re-run safely.

    Args:
        split_ratio: Fraction of the usable rows assigned to the training split.
        size: ``(width, height)`` passed to ``PIL.Image.resize``.

    Returns:
        False if the images directory, the CSV, a required CSV column, or all
        matching image files are missing; True otherwise. Individual images
        that cannot be processed are reported and skipped (best effort).
    """
    if not os.path.exists(images_dir):
        print(f"⚠️  Images directory not found at {images_dir}")
        return False
    if not os.path.exists(metadata_path):
        print(f"⚠️  GroundTruth.csv not found at {metadata_path}")
        return False

    df = pd.read_csv(metadata_path)

    malignant_cols = ["MEL", "BCC", "AKIEC"]
    # Report a malformed CSV through the same False/print channel as the other
    # precondition failures instead of surfacing a bare KeyError.
    missing_cols = [col for col in ["image", *malignant_cols] if col not in df.columns]
    if missing_cols:
        print(f"⚠️  GroundTruth.csv is missing required columns: {missing_cols}")
        return False

    df['filepath'] = df['image'].apply(lambda x: os.path.join(images_dir, f"{x}.jpg"))
    # .copy() detaches the filtered rows from the original frame so that adding
    # the label column below does not trigger pandas' SettingWithCopyWarning.
    df = df[df['filepath'].apply(os.path.exists)].copy()
    if df.empty:
        print("⚠️  No matching images found between GroundTruth.csv and the images directory")
        return False

    df['label'] = np.where(df[malignant_cols].sum(axis=1) > 0, "malignant", "benign")

    # Stratify on the label so both splits keep the same class proportions.
    train_df, val_df = train_test_split(
        df,
        test_size=1 - split_ratio,
        stratify=df['label'],
        random_state=42
    )

    failed = 0
    for split_name, split_df in [("train", train_df), ("val", val_df)]:
        # Iterate the three needed columns directly; cheaper than iterrows().
        rows = zip(split_df['image'], split_df['filepath'], split_df['label'])
        for image_id, src_path, label in rows:
            dst_path = os.path.join(base_dir, split_name, label, f"{image_id}.jpg")
            if os.path.exists(dst_path):
                continue
            try:
                with Image.open(src_path) as img:
                    img.convert('RGB').resize(size).save(dst_path)
            except Exception as e:
                # Deliberately best effort: one unreadable image must not abort
                # the whole preparation run.
                failed += 1
                print(f"Warning: Could not process {src_path}: {e}")

    if failed:
        print(f"Warning: {failed} image(s) could not be processed and were skipped")
    print("✅ Dataset prepared: train/ and val/ folders with resized images")
    return True


# Reuse the prepared folders only when every split/class folder already holds
# images. Checking the training split alone would skip preparation when val/
# is empty or incomplete, and the validation generator would then fail later.
required_dirs = [
    train_benign_path,
    train_malignant_path,
    os.path.join(val_dir, "benign"),
    os.path.join(val_dir, "malignant"),
]
if all(has_images(required_dir) for required_dir in required_dirs):
    print("✅ Using existing train/val data structure")
elif not prepare_dataset():
    raise ValueError("Dataset preparation failed. Please ensure images and GroundTruth.csv are available.")

# 3. Data Generators

# Augment only the training stream; validation images are just rescaled.
train_datagen = ImageDataGenerator(rescale=1./255,
                                   rotation_range=20,
                                   horizontal_flip=True,
                                   vertical_flip=True)

val_datagen = ImageDataGenerator(rescale=1./255)

# Settings shared by both directory iterators, kept in one place so the two
# streams cannot drift apart.
flow_kwargs = dict(target_size=(128, 128), batch_size=32, class_mode='binary')

train_gen = train_datagen.flow_from_directory(train_dir, **flow_kwargs)

# shuffle=False: validation batches come in a fixed file order, so evaluation
# is reproducible from run to run.
val_gen = val_datagen.flow_from_directory(val_dir, shuffle=False, **flow_kwargs)

# Check if data was loaded successfully
if train_gen.samples == 0 or val_gen.samples == 0:
    raise ValueError("No images found in train or validation directories. Please ensure data is properly loaded.")

print(f" Loaded {train_gen.samples} training images and {val_gen.samples} validation images")


# 4. CNN Model

# Three Conv/MaxPool stages followed by a dense head with a single sigmoid
# unit (binary output). The input shape is declared with an explicit Input
# layer: passing input_shape= to the first Conv2D of a Sequential model makes
# recent Keras versions emit a UserWarning.
model = models.Sequential([
    layers.Input(shape=(128, 128, 3)),
    layers.Conv2D(32, (3, 3), activation='relu'),
    layers.MaxPooling2D(2, 2),
    layers.Conv2D(64, (3, 3), activation='relu'),
    layers.MaxPooling2D(2, 2),
    layers.Conv2D(128, (3, 3), activation='relu'),
    layers.MaxPooling2D(2, 2),
    layers.Flatten(),
    layers.Dense(128, activation='relu'),
    layers.Dropout(0.5),  # regularise the dense head
    layers.Dense(1, activation='sigmoid')
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])


# 5. Training

# Weight each class inversely to its frequency in the training split. In the
# recorded run accuracy sat at ~0.805 for the first epochs, which is what a
# model that always predicts the majority class would score — presumably the
# benign/malignant imbalance; without weighting the loss rewards that shortcut.
class_counts = np.bincount(train_gen.classes)
class_weight = {
    class_index: float(train_gen.samples / (len(class_counts) * count))
    for class_index, count in enumerate(class_counts)
    if count > 0  # avoid dividing by zero for a class with no training images
}

history = model.fit(train_gen, epochs=10, validation_data=val_gen, class_weight=class_weight)


# 6. Evaluation
# Learning curves: training vs. validation accuracy per epoch.
plt.plot(history.history['accuracy'], label='train acc')
plt.plot(history.history['val_accuracy'], label='val acc')
plt.legend(); plt.show()

# Visit every validation batch exactly once by index. Iterating the generator
# directly never terminates on its own, so it needs a manual break condition.
y_true, y_score = [], []
for batch_index in range(len(val_gen)):
    images, labels = val_gen[batch_index]
    # model(...) returns a TensorFlow tensor; convert it to NumPy before using
    # array methods (tensors have no .astype by default).
    probs = np.asarray(model(images, training=False)).reshape(-1)
    y_true.extend(labels.astype(int))
    y_score.extend(probs)

y_true = np.asarray(y_true)
y_score = np.asarray(y_score)
y_pred = (y_score > 0.5).astype(int)  # hard labels at the 0.5 threshold

print("Confusion Matrix:\n", confusion_matrix(y_true, y_pred))
print("Classification Report:\n", classification_report(y_true, y_pred))

# The ROC curve needs the continuous scores: thresholded 0/1 predictions give
# a single operating point and a misleading AUC.
fpr, tpr, _ = roc_curve(y_true, y_score)
roc_auc = auc(fpr, tpr)
plt.plot(fpr, tpr, label=f"AUC = {roc_auc:.2f}")
plt.plot([0,1],[0,1],'--')
plt.xlabel("False Positive Rate"); plt.ylabel("True Positive Rate")
plt.legend(); plt.show()

# 7. Grad-CAM Visualization (Optional - requires a test image)

def make_gradcam_heatmap(img_array, model, last_conv_layer_name):
    """Compute a Grad-CAM heatmap for output unit 0 of ``model``.

    Args:
        img_array: Batch containing the preprocessed image to explain; only
            the first item's activations are used for the map.
        model: Keras model; the gradient of ``model.output[:, 0]`` is taken.
        last_conv_layer_name: Name of the layer whose activations are weighted
            by their channel-averaged gradients.

    Returns:
        NumPy array with the spatial size of the chosen layer's feature map,
        scaled so its maximum is 1. All zeros when no location contributes
        positively.
    """
    # model.inputs is already a list; wrapping it in another list creates a
    # nested structure that newer Keras versions warn about or reject.
    grad_model = tf.keras.models.Model(
        model.inputs, [model.get_layer(last_conv_layer_name).output, model.output]
    )
    with tf.GradientTape() as tape:
        conv_outputs, predictions = grad_model(img_array)
        loss = predictions[:, 0]
    grads = tape.gradient(loss, conv_outputs)

    # One importance weight per channel: the gradient averaged over batch and
    # spatial positions.
    pooled_grads = tf.reduce_mean(grads, axis=(0, 1, 2))

    # Channel-weighted sum of the first image's activations -> (H, W, 1).
    heatmap = conv_outputs[0] @ pooled_grads[..., tf.newaxis]
    # Squeeze only the trailing channel axis so a 1-pixel-wide map keeps 2 dims.
    heatmap = tf.nn.relu(tf.squeeze(heatmap, axis=-1)).numpy()

    # Normalise by the post-ReLU peak. Guard the all-non-positive case, which
    # would otherwise divide by zero (or by a negative maximum).
    peak = heatmap.max()
    if peak <= 0:
        return np.zeros_like(heatmap)
    return heatmap / peak

# Try to find a sample image for Grad-CAM
# Candidate locations, in priority order: explicit sample files first, then
# the prepared class folders.
sample_paths = [
    "./data/sample.jpg",
    "./data/sample.png",
    os.path.join(train_dir, "benign"),
    os.path.join(train_dir, "malignant"),
    os.path.join(val_dir, "benign"),
    os.path.join(val_dir, "malignant")
]

image_suffixes = ('.png', '.jpg', '.jpeg')
img_path = None
for path in sample_paths:
    if os.path.isfile(path) and path.lower().endswith(image_suffixes):
        img_path = path
        break
    elif os.path.isdir(path):
        # sorted(): os.listdir order is arbitrary, so sort to always pick the
        # same image for a given folder content.
        files = sorted(f for f in os.listdir(path) if f.lower().endswith(image_suffixes))
        if files:
            img_path = os.path.join(path, files[0])
            break

if img_path and os.path.exists(img_path):
    try:
        last_conv_layer_name = [layer.name for layer in model.layers if isinstance(layer, layers.Conv2D)][-1]
        img = tf.keras.preprocessing.image.load_img(img_path, target_size=(128,128))
        img_array = tf.keras.preprocessing.image.img_to_array(img)
        # Same preprocessing as the generators: add a batch axis, scale to [0, 1].
        img_array = np.expand_dims(img_array, axis=0) / 255.0

        # np.asarray accepts both an ndarray and a tensor; calling .numpy() on
        # the returned ndarray raised AttributeError and skipped the overlay.
        heatmap = np.asarray(
            make_gradcam_heatmap(img_array, model, last_conv_layer_name),
            dtype=np.float32,
        )
        # Keep values in [0, 1] and free of NaN/inf before the uint8 conversion.
        heatmap = np.clip(np.nan_to_num(heatmap), 0.0, 1.0)
        plt.matshow(heatmap)
        plt.title("Grad-CAM Heatmap")
        plt.show()

        img_cv = cv2.imread(img_path)
        if img_cv is not None:
            # cv2.resize takes (width, height).
            heatmap_resized = cv2.resize(heatmap, (img_cv.shape[1], img_cv.shape[0]))
            heatmap_resized = np.uint8(255 * heatmap_resized)
            heatmap_colored = cv2.applyColorMap(heatmap_resized, cv2.COLORMAP_JET)
            superimposed_img = cv2.addWeighted(img_cv, 0.6, heatmap_colored, 0.4, 0)
            # Show with matplotlib like the other figures: cv2.imshow/waitKey(0)
            # blocks until a key press and fails without a GUI. OpenCV images
            # are BGR, matplotlib expects RGB.
            plt.imshow(cv2.cvtColor(superimposed_img, cv2.COLOR_BGR2RGB))
            plt.title("Grad-CAM")
            plt.axis("off")
            plt.show()
    except Exception as e:
        # The visualisation is optional; never let it abort the run.
        print(f" Grad-CAM visualization skipped: {e}")
else:
    print("Grad-CAM visualization skipped: No sample image found")

# 8. Save Model

# Write the trained model to disk. The path is relative, so the file lands in
# the current working directory.
model.save(filepath="skin_cancer_cnn.h5")
print(" Model saved as skin_cancer_cnn.h5")

✅ Dataset prepared: train/ and val/ folders with resized images
Found 8012 images belonging to 2 classes.
Found 2003 images belonging to 2 classes.
 Loaded 8012 training images and 2003 validation images


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/10
[1m251/251[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m285s[0m 1s/step - accuracy: 0.8049 - loss: 0.4720 - val_accuracy: 0.8048 - val_loss: 0.4283
Epoch 2/10
[1m251/251[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m126s[0m 503ms/step - accuracy: 0.8049 - loss: 0.4544 - val_accuracy: 0.8048 - val_loss: 0.4932
Epoch 3/10
[1m251/251[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m133s[0m 467ms/step - accuracy: 0.8049 - loss: 0.4269 - val_accuracy: 0.8048 - val_loss: 0.4070
Epoch 4/10
[1m251/251[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m138s[0m 449ms/step - accuracy: 0.8049 - loss: 0.4385 - val_accuracy: 0.8048 - val_loss: 0.4128
Epoch 5/10
[1m251/251[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m133s[0m 528ms/step - accuracy: 0.8045 - loss: 0.4093 - val_accuracy: 0.8058 - val_loss: 0.4016
Epoch 6/10
[1m251/251[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m135s[0m 498ms/step - accuracy: 0.8067 - loss: 0.3981 - val_accuracy: 0.8088 - val_loss: 0.3803
Epoch 7