In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from PIL import Image
import kagglehub
import os
from google.colab import userdata
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.neural_network import MLPClassifier

# for images
import tensorflow as tf
from tensorflow.keras import layers, models, applications

# for balancing
from sklearn.utils.class_weight import compute_class_weight

# ISIC Data

In [None]:
# import datasets
ISIC = kagglehub.dataset_download("nodoubttome/skin-cancer9-classesisic")
print(ISIC)


Using Colab cache for faster access to the 'skin-cancer9-classesisic' dataset.
/kaggle/input/skin-cancer9-classesisic


In [None]:
def create_image_dataframe(dataset_path):
    """Create a DataFrame with image paths and labels from folder structure"""
    data = []

    for root, dirs, files in os.walk(dataset_path):
        # Skip the root directory
        if root == dataset_path:
            continue

        # Get class name from folder name
        class_name = os.path.basename(root)

        for file in files:
            if file.lower().endswith(('.jpg', '.jpeg', '.png')):
                full_path = os.path.join(root, file)
                data.append({
                    'image_path': full_path,
                    'class': class_name,
                    'filename': file
                })

    return pd.DataFrame(data)

# Try to create DataFrame from folder structure
try:
    image_df = create_image_dataframe(ISIC)
    if not image_df.empty:
        print(f"Created image DataFrame: {image_df.shape}")
        print(f"Classes found: {image_df['class'].unique()}")
        print("\nSample data:")
        print(image_df.head())

        # Show class distribution
        print("\nClass distribution:")
        print(image_df['class'].value_counts())
except Exception as e:
    print(f"Could not create image DataFrame: {e}")

Created image DataFrame: (2357, 3)
Classes found: ['pigmented benign keratosis' 'melanoma' 'vascular lesion'
 'actinic keratosis' 'squamous cell carcinoma' 'basal cell carcinoma'
 'seborrheic keratosis' 'dermatofibroma' 'nevus']

Sample data:
                                          image_path  \
0  /kaggle/input/skin-cancer9-classesisic/Skin ca...   
1  /kaggle/input/skin-cancer9-classesisic/Skin ca...   
2  /kaggle/input/skin-cancer9-classesisic/Skin ca...   
3  /kaggle/input/skin-cancer9-classesisic/Skin ca...   
4  /kaggle/input/skin-cancer9-classesisic/Skin ca...   

                        class          filename  
0  pigmented benign keratosis  ISIC_0024371.jpg  
1  pigmented benign keratosis  ISIC_0024358.jpg  
2  pigmented benign keratosis  ISIC_0024337.jpg  
3  pigmented benign keratosis  ISIC_0024382.jpg  
4  pigmented benign keratosis  ISIC_0024420.jpg  

Class distribution:
class
pigmented benign keratosis    478
melanoma                      454
basal cell carcinoma     

# SIIM ISIC Data

In [None]:
# Download the ISIC 2020 resized dataset (256x256)
ISIC_2020 = kagglehub.dataset_download("nischaydnk/isic-2020-jpg-256x256-resized")
print("ISIC 2020 (256x256) dataset downloaded to:", ISIC_2020)

Using Colab cache for faster access to the 'isic-2020-jpg-256x256-resized' dataset.
ISIC 2020 (256x256) dataset downloaded to: /kaggle/input/isic-2020-jpg-256x256-resized


# PREPRCESSING RUN ALL

In [None]:
csv_files = []
for root, dirs, files in os.walk(ISIC_2020):
    for file in files:
        if file.endswith('.csv'):
            csv_files.append(os.path.join(root, file))

print("Found CSV files:")
for csv_file in csv_files:
    print(f"- {csv_file}")


# Load the CSV file into a new variable name to avoid conflict
if csv_files:
    metadata_df = pd.read_csv(csv_files[0]) # Keep metadata_df name for the ISIC_2020 metadata
    print(f"\nLoaded dataset shape: {metadata_df.shape}")
    print(f"Columns: {list(metadata_df.columns)}")
    print("\nFirst few rows:")
    print(metadata_df.head())
else:
    print("No CSV files found for ISIC 2020.")
    metadata_df = pd.DataFrame() # Create empty DataFrame if no CSV found


Found CSV files:
- /kaggle/input/isic-2020-jpg-256x256-resized/train-metadata.csv

Loaded dataset shape: (33126, 4)
Columns: ['Unnamed: 0', 'isic_id', 'patient_id', 'target']

First few rows:
   Unnamed: 0       isic_id  patient_id  target
0           0  ISIC_2637011  IP_7279968       0
1           1  ISIC_0015719  IP_3075186       0
2           2  ISIC_0052212  IP_2842074       0
3           3  ISIC_0068279  IP_6890425       0
4           4  ISIC_0074268  IP_8723313       0


In [None]:
# Join the two datasets

# Define a mapping for the first dataset's classes to a binary target
class_to_target = {
    'nevus': 0,
    'seborrheic keratosis': 0,
    'pigmented benign keratosis': 0,
    'melanoma': 1,
    'basal cell carcinoma': 1,
    'actinic keratosis': 0, # Actinic keratosis is pre-malignant, but often treated as benign in datasets
    'vascular lesion': 0,
    'dermatofibroma': 0,
    'squamous cell carcinoma': 1
}

# Apply the mapping to the first dataset
# Ensure image_df from cell 1VvSVQBHh1m_ is used and it has the 'class' column
if 'class' in image_df.columns:
    image_df['target'] = image_df['class'].map(class_to_target)
    # Keep only relevant columns for merging and add source column
    image_df_subset = image_df[['image_path', 'target']].copy()
    image_df_subset.rename(columns={'target': 'label'}, inplace=True) # Rename to avoid confusion with the second dataset's 'target'
    image_df_subset['source'] = 'ISIC' # Add source column
else:
    # This case should ideally not happen if image_df from folder structure is used
    print("Error: 'class' column not found in the image_df from folder structure. Cannot create binary labels for merging.")
    # Create an empty DataFrame with the expected columns to avoid errors during concatenation
    image_df_subset = pd.DataFrame(columns=['image_path', 'label', 'source'])


# For the second dataset, we already have the 'target' column (0 or 1) in metadata_df
if 'target' in metadata_df.columns and 'isic_id' in metadata_df.columns:
    # Construct image paths using 'isic_id' and the base path of the ISIC 2020 dataset
    # --- Debug Print for ISIC_2020 Path ---
    print(f"\nDebug: ISIC_2020 path is: {ISIC_2020}")
    # --- End Debug Print ---
    metadata_df_subset = metadata_df[['isic_id', 'target']].copy()
    # Assuming images are in a 'train' subdir and are .jpg within the ISIC_2020 path
    metadata_df_subset['image_path'] = metadata_df_subset['isic_id'].apply(lambda x: os.path.join(ISIC_2020, 'train-image/image', x + '.jpg'))
    metadata_df_subset.rename(columns={'target': 'label'}, inplace=True) # Rename to match the first dataset subset
    metadata_df_subset = metadata_df_subset[['image_path', 'label']] # Reorder columns
    metadata_df_subset['source'] = 'ISIC_2020' # Add source column

else:
    print("Warning: 'target' or 'isic_id' column not found in the second dataset. Cannot use its labels for merging.")
    # Create an empty DataFrame with the expected columns to avoid errors during concatenation
    metadata_df_subset = pd.DataFrame(columns=['image_path', 'label', 'source'])

# --- Debugging Print Statements ---
print("\n--- Debug Info Before Concatenation ---")
print("image_df_subset shape:", image_df_subset.shape)
print("image_df_subset head:")
display(image_df_subset.head())
print("\nmetadata_df_subset shape:", metadata_df_subset.shape)
print("metadata_df_subset head:")
display(metadata_df_subset.head())
print("--- End Debug Info ---")
# --- End Debugging Print Statements ---


# Concatenate the two dataframes
# Ensure both dataframes have the 'image_path', 'label', and 'source' columns before concatenating
combined_df = pd.concat([image_df_subset, metadata_df_subset], ignore_index=True)

# Drop rows where 'label' is NaN to ensure no missing labels
combined_df.dropna(subset=['label'], inplace=True)

print("Combined DataFrame shape:", combined_df.shape)
print("Combined DataFrame head:")
display(combined_df.head())

# Display class distribution in the combined dataset
print("\nClass distribution in combined dataset:")
print(combined_df['label'].value_counts())

# You can now use 'combined_df' for further preprocessing and model training


Debug: ISIC_2020 path is: /kaggle/input/isic-2020-jpg-256x256-resized

--- Debug Info Before Concatenation ---
image_df_subset shape: (2357, 3)
image_df_subset head:


Unnamed: 0,image_path,label,source
0,/kaggle/input/skin-cancer9-classesisic/Skin ca...,0,ISIC
1,/kaggle/input/skin-cancer9-classesisic/Skin ca...,0,ISIC
2,/kaggle/input/skin-cancer9-classesisic/Skin ca...,0,ISIC
3,/kaggle/input/skin-cancer9-classesisic/Skin ca...,0,ISIC
4,/kaggle/input/skin-cancer9-classesisic/Skin ca...,0,ISIC



metadata_df_subset shape: (33126, 3)
metadata_df_subset head:


Unnamed: 0,image_path,label,source
0,/kaggle/input/isic-2020-jpg-256x256-resized/tr...,0,ISIC_2020
1,/kaggle/input/isic-2020-jpg-256x256-resized/tr...,0,ISIC_2020
2,/kaggle/input/isic-2020-jpg-256x256-resized/tr...,0,ISIC_2020
3,/kaggle/input/isic-2020-jpg-256x256-resized/tr...,0,ISIC_2020
4,/kaggle/input/isic-2020-jpg-256x256-resized/tr...,0,ISIC_2020


--- End Debug Info ---
Combined DataFrame shape: (35483, 3)
Combined DataFrame head:


Unnamed: 0,image_path,label,source
0,/kaggle/input/skin-cancer9-classesisic/Skin ca...,0,ISIC
1,/kaggle/input/skin-cancer9-classesisic/Skin ca...,0,ISIC
2,/kaggle/input/skin-cancer9-classesisic/Skin ca...,0,ISIC
3,/kaggle/input/skin-cancer9-classesisic/Skin ca...,0,ISIC
4,/kaggle/input/skin-cancer9-classesisic/Skin ca...,0,ISIC



Class distribution in combined dataset:
label
0    33856
1     1627
Name: count, dtype: int64


In [None]:
import tensorflow as tf
import os

# 0 = Benign, 1 = Malignant

# Preprocessing

# Split the data into training and validation sets using the combined_df
train_df, val_df = train_test_split(combined_df, test_size=0.2, stratify=combined_df['label'], random_state=42)

# Define image size
IMG_WIDTH = 256
IMG_HEIGHT = 256

# Function to load and preprocess images
def load_image(image_path, label):
    img = tf.io.read_file(image_path)
    img = tf.image.decode_jpeg(img, channels=3) # Decode JPG image
    img = tf.image.convert_image_dtype(img, tf.float32) # Convert to float [0, 1]
    img = tf.image.resize(img, [IMG_HEIGHT, IMG_WIDTH]) # Resize image
    return img, label

# Create tf.data.Dataset from dataframes
train_ds = tf.data.Dataset.from_tensor_slices((train_df['image_path'], train_df['label']))
val_ds = tf.data.Dataset.from_tensor_slices((val_df['image_path'], val_df['label']))

# Apply the loading and preprocessing function to the datasets
AUTOTUNE = tf.data.AUTOTUNE

train_ds = train_ds.map(load_image, num_parallel_calls=AUTOTUNE)
val_ds = val_ds.map(load_image, num_parallel_calls=AUTOTUNE)

# Configure datasets for performance
BATCH_SIZE = 32

train_ds = train_ds.cache().shuffle(1000).batch(BATCH_SIZE).prefetch(buffer_size=AUTOTUNE)
val_ds = val_ds.cache().batch(BATCH_SIZE).prefetch(buffer_size=AUTOTUNE)


# Balance dataset (calculate class weights based on the combined training data)
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(train_df['label']),
    y=train_df['label']
)
class_weights = dict(enumerate(class_weights))
print("Class weights:", class_weights)

# Data augmentation
data_augmentation = tf.keras.Sequential([
    tf.keras.layers.RandomFlip("horizontal_and_vertical"),
    tf.keras.layers.RandomRotation(0.1),
    tf.keras.layers.RandomZoom(0.1)
])

train_ds = train_ds.map(lambda x, y: (data_augmentation(x, training=True), y), num_parallel_calls=AUTOTUNE)

Class weights: {0: np.float64(0.5240363314133806), 1: np.float64(10.900921658986174)}


In [None]:
def load_and_preprocess_image_for_cnn(row, target_size=(256, 256)):
    """Loads, decodes, and preprocesses an image for CNN input based on source."""
    image_path = row['image_path']
    source = row['source']

    # Determine the correct base path based on the source
    if source == 'ISIC':
        base_path = ISIC
    elif source == 'ISIC_2020':
        base_path = ISIC_2020
    else:
        print(f"Warning: Unknown source '{source}' for image '{image_path}'. Cannot load.")
        return None

    # Construct the full image path
    full_image_path = os.path.join(base_path, os.path.basename(image_path))


    try:
        img = Image.open(full_image_path).convert('RGB')  # Ensure image is in RGB format
        img = img.resize(target_size)
        return np.array(img)
    except Exception as e:
        print(f"Error loading image {full_image_path}: {e}")
        return None

# Define the target size for the images
target_size = (IMG_WIDTH, IMG_HEIGHT)

In [None]:
# Use combined_df for splitting
X = combined_df['image_path']
y = combined_df['label']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

print("Training set size:", len(X_train))
print("Testing set size:", len(X_test))

Training set size: 24838
Testing set size: 10645


# PREPROCESSING END

In [None]:
import os

# Check if image files in train_df exist
print("Checking if image files in the training dataset exist...")
all_files_exist = True
checked_count = 0
for image_path in train_df['image_path']:
    if not os.path.exists(image_path):
        all_files_exist = False
        # Stop after finding a few missing files to avoid excessive output
        checked_count += 1
        if checked_count <= 5:
            print(f"Image file not found: {image_path}")

if all_files_exist:
    print("All checked image files in the training dataset exist.")
elif checked_count > 0:
    print("Some image files were not found. Please check the paths.")
else:
    print("No image files to check in the training dataset.")

Checking if image files in the training dataset exist...
All checked image files in the training dataset exist.


# EfficientNet Model for Skin Cancer Classification
We'll use EfficientNetB0 as our base model with transfer learning to classify skin lesions as benign (0) or malignant (1).

In [None]:
# Import required libraries for EfficientNet
from tensorflow.keras.applications import EfficientNetB0
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint
from tensorflow.keras.metrics import AUC, Precision, Recall

def create_efficientnet_model():
    # Load EfficientNetB0 with pre-trained weights
    base_model = EfficientNetB0(
        weights='imagenet',
        include_top=False,
        input_shape=(IMG_HEIGHT, IMG_WIDTH, 3)
    )
    
    # Fine-tune the last few layers
    # Unfreeze the last 30 layers
    for layer in base_model.layers[-30:]:
        layer.trainable = True
    
    # Keep all other layers frozen
    for layer in base_model.layers[:-30]:
        layer.trainable = False
    
    # Create the model
    model = tf.keras.Sequential([
        # Input layer
        tf.keras.layers.Input(shape=(IMG_HEIGHT, IMG_WIDTH, 3)),
        
        # Data augmentation layers (already applied in preprocessing)
        
        # Base EfficientNet model
        base_model,
        
        # Additional layers for classification
        tf.keras.layers.GlobalAveragePooling2D(),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.Dropout(0.5),
        tf.keras.layers.Dense(256, activation='relu'),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.Dropout(0.3),
        tf.keras.layers.Dense(128, activation='relu'),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.Dropout(0.2),
        tf.keras.layers.Dense(1, activation='sigmoid')  # Binary classification
    ])
    
    return model

# Create the model
model = create_efficientnet_model()

# Compile the model with appropriate metrics for medical imaging
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4),
    loss='binary_crossentropy',
    metrics=[
        'accuracy',
        AUC(name='auc'),
        Precision(name='precision'),
        Recall(name='recall')
    ]
)

# Display model summary
model.summary()

In [None]:
# Define callbacks for training
callbacks = [
    # Early stopping to prevent overfitting
    EarlyStopping(
        monitor='val_auc',
        mode='max',
        patience=10,
        restore_best_weights=True,
        verbose=1
    ),
    
    # Reduce learning rate when training plateaus
    ReduceLROnPlateau(
        monitor='val_loss',
        factor=0.2,
        patience=5,
        min_lr=1e-7,
        verbose=1
    ),
    
    # Save the best model
    ModelCheckpoint(
        'best_efficientnet_skin_cancer_model.h5',
        monitor='val_auc',
        mode='max',
        save_best_only=True,
        verbose=1
    )
]

# Train the model
epochs = 50

print("Starting model training...")
history = model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=epochs,
    callbacks=callbacks,
    class_weight=class_weights,
    verbose=1
)

In [None]:
# Plot training history
plt.style.use('seaborn')
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
fig.suptitle('Training Metrics', fontsize=16)

# Plot accuracy
axes[0, 0].plot(history.history['accuracy'], label='Train')
axes[0, 0].plot(history.history['val_accuracy'], label='Validation')
axes[0, 0].set_title('Model Accuracy')
axes[0, 0].set_xlabel('Epoch')
axes[0, 0].set_ylabel('Accuracy')
axes[0, 0].legend()

# Plot loss
axes[0, 1].plot(history.history['loss'], label='Train')
axes[0, 1].plot(history.history['val_loss'], label='Validation')
axes[0, 1].set_title('Model Loss')
axes[0, 1].set_xlabel('Epoch')
axes[0, 1].set_ylabel('Loss')
axes[0, 1].legend()

# Plot AUC
axes[1, 0].plot(history.history['auc'], label='Train')
axes[1, 0].plot(history.history['val_auc'], label='Validation')
axes[1, 0].set_title('Model AUC')
axes[1, 0].set_xlabel('Epoch')
axes[1, 0].set_ylabel('AUC')
axes[1, 0].legend()

# Plot Precision and Recall
axes[1, 1].plot(history.history['precision'], label='Precision')
axes[1, 1].plot(history.history['recall'], label='Recall')
axes[1, 1].set_title('Precision and Recall')
axes[1, 1].set_xlabel('Epoch')
axes[1, 1].set_ylabel('Score')
axes[1, 1].legend()

plt.tight_layout()
plt.show()

# Create test dataset
test_ds = tf.data.Dataset.from_tensor_slices((X_test, y_test))
test_ds = test_ds.map(load_image, num_parallel_calls=AUTOTUNE)
test_ds = test_ds.batch(BATCH_SIZE).prefetch(buffer_size=AUTOTUNE)

# Evaluate model on test set
print("\nEvaluating model on test set...")
test_results = model.evaluate(test_ds, verbose=1)
print("\nTest Results:")
for metric_name, value in zip(model.metrics_names, test_results):
    print(f"{metric_name}: {value:.4f}")

# Get predictions
print("\nGenerating predictions...")
y_pred_proba = model.predict(test_ds)
y_pred = (y_pred_proba > 0.5).astype(int)
y_test_array = np.array(list(y_test))

In [None]:
# Generate detailed classification metrics
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc
import seaborn as sns

# Print classification report
print("\nClassification Report:")
print(classification_report(y_test_array, y_pred))

# Plot confusion matrix
plt.figure(figsize=(8, 6))
cm = confusion_matrix(y_test_array, y_pred)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.title('Confusion Matrix')
plt.ylabel('True Label')
plt.xlabel('Predicted Label')
plt.show()

# Plot ROC curve
fpr, tpr, _ = roc_curve(y_test_array, y_pred_proba)
roc_auc = auc(fpr, tpr)

plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (AUC = {roc_auc:.2f})')
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc="lower right")
plt.show()

# Display some example predictions
def plot_example_predictions(num_examples=5):
    plt.figure(figsize=(15, 3))
    for i, (image, label) in enumerate(test_ds.take(1)):
        if i >= num_examples:
            break
            
        for j in range(min(num_examples, len(image))):
            plt.subplot(1, num_examples, j + 1)
            plt.imshow(image[j])
            plt.axis('off')
            true_label = 'Malignant' if label[j] == 1 else 'Benign'
            pred_label = 'Malignant' if y_pred[i * BATCH_SIZE + j] == 1 else 'Benign'
            color = 'green' if label[j] == y_pred[i * BATCH_SIZE + j] else 'red'
            plt.title(f'True: {true_label}\nPred: {pred_label}', color=color)
    
    plt.tight_layout()
    plt.show()

print("\nDisplaying example predictions...")
plot_example_predictions()