In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from PIL import Image
import kagglehub
import os
from google.colab import userdata
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.neural_network import MLPClassifier

# for images
import tensorflow as tf
from tensorflow.keras import layers, models, applications

# for balancing
from sklearn.utils.class_weight import compute_class_weight

ModuleNotFoundError: No module named 'kagglehub'

# ISIC Data

In [None]:
# import datasets
ISIC = kagglehub.dataset_download("nodoubttome/skin-cancer9-classesisic")
print(ISIC)


/root/.cache/kagglehub/datasets/nodoubttome/skin-cancer9-classesisic/versions/1


In [None]:
def create_image_dataframe(dataset_path):
    """Create a DataFrame with image paths and labels from folder structure"""
    data = []

    for root, dirs, files in os.walk(dataset_path):
        # Skip the root directory
        if root == dataset_path:
            continue

        # Get class name from folder name
        class_name = os.path.basename(root)

        for file in files:
            if file.lower().endswith(('.jpg', '.jpeg', '.png')):
                full_path = os.path.join(root, file)
                data.append({
                    'image_path': full_path,
                    'class': class_name,
                    'filename': file
                })

    return pd.DataFrame(data)

# Try to create DataFrame from folder structure
try:
    image_df = create_image_dataframe(ISIC)
    if not image_df.empty:
        print(f"Created image DataFrame: {image_df.shape}")
        print(f"Classes found: {image_df['class'].unique()}")
        print("\nSample data:")
        print(image_df.head())

        # Show class distribution
        print("\nClass distribution:")
        print(image_df['class'].value_counts())
except Exception as e:
    print(f"Could not create image DataFrame: {e}")

Created image DataFrame: (2357, 3)
Classes found: ['nevus' 'seborrheic keratosis' 'pigmented benign keratosis' 'melanoma'
 'basal cell carcinoma' 'actinic keratosis' 'vascular lesion'
 'dermatofibroma' 'squamous cell carcinoma']

Sample data:
                                          image_path  class          filename
0  /root/.cache/kagglehub/datasets/nodoubttome/sk...  nevus  ISIC_0000016.jpg
1  /root/.cache/kagglehub/datasets/nodoubttome/sk...  nevus  ISIC_0000007.jpg
2  /root/.cache/kagglehub/datasets/nodoubttome/sk...  nevus  ISIC_0000014.jpg
3  /root/.cache/kagglehub/datasets/nodoubttome/sk...  nevus  ISIC_0000003.jpg
4  /root/.cache/kagglehub/datasets/nodoubttome/sk...  nevus  ISIC_0000006.jpg

Class distribution:
class
pigmented benign keratosis    478
melanoma                      454
basal cell carcinoma          392
nevus                         373
squamous cell carcinoma       197
vascular lesion               142
actinic keratosis             130
dermatofibroma          

# SIIM ISIC Data

In [None]:
# Download the ISIC 2020 resized dataset (256x256)
ISIC_2020 = kagglehub.dataset_download("nischaydnk/isic-2020-jpg-256x256-resized")
print("ISIC 2020 (256x256) dataset downloaded to:", ISIC_2020)

ISIC 2020 (256x256) dataset downloaded to: /root/.cache/kagglehub/datasets/nischaydnk/isic-2020-jpg-256x256-resized/versions/1


# PREPRCESSING RUN ALL

In [None]:
csv_files = []
for root, dirs, files in os.walk(ISIC_2020):
    for file in files:
        if file.endswith('.csv'):
            csv_files.append(os.path.join(root, file))

print("Found CSV files:")
for csv_file in csv_files:
    print(f"- {csv_file}")


# Load the CSV file into a new variable name to avoid conflict
if csv_files:
    metadata_df = pd.read_csv(csv_files[0]) # Keep metadata_df name for the ISIC_2020 metadata
    print(f"\nLoaded dataset shape: {metadata_df.shape}")
    print(f"Columns: {list(metadata_df.columns)}")
    print("\nFirst few rows:")
    print(metadata_df.head())
else:
    print("No CSV files found for ISIC 2020.")
    metadata_df = pd.DataFrame() # Create empty DataFrame if no CSV found


Found CSV files:
- /root/.cache/kagglehub/datasets/nischaydnk/isic-2020-jpg-256x256-resized/versions/1/train-metadata.csv

Loaded dataset shape: (33126, 4)
Columns: ['Unnamed: 0', 'isic_id', 'patient_id', 'target']

First few rows:
   Unnamed: 0       isic_id  patient_id  target
0           0  ISIC_2637011  IP_7279968       0
1           1  ISIC_0015719  IP_3075186       0
2           2  ISIC_0052212  IP_2842074       0
3           3  ISIC_0068279  IP_6890425       0
4           4  ISIC_0074268  IP_8723313       0


In [None]:
# Join the two datasets

# Define a mapping for the first dataset's classes to a binary target
class_to_target = {
    'nevus': 0,
    'seborrheic keratosis': 0,
    'pigmented benign keratosis': 0,
    'melanoma': 1,
    'basal cell carcinoma': 1,
    'actinic keratosis': 0, # Actinic keratosis is pre-malignant, but often treated as benign in datasets
    'vascular lesion': 0,
    'dermatofibroma': 0,
    'squamous cell carcinoma': 1
}

# Use the image_df created from folder structure (cell 1VvSVQBHh1m_) for the first dataset
# Apply the mapping to the first dataset
# Ensure image_df from cell 1VvSVQBHh1m_ is used and it has the 'class' column
if 'class' in image_df.columns:
    image_df['target'] = image_df['class'].map(class_to_target)
    # Keep only relevant columns for merging and add source column
    image_df_subset = image_df[['image_path', 'target']].copy()
    image_df_subset.rename(columns={'target': 'label'}, inplace=True) # Rename to avoid confusion with the second dataset's 'target'
    image_df_subset['source'] = 'ISIC' # Add source column
else:
    # This case should ideally not happen if image_df from folder structure is used
    print("Error: 'class' column not found in the image_df from folder structure. Cannot create binary labels for merging.")
    # Create an empty DataFrame with the expected columns to avoid errors during concatenation
    image_df_subset = pd.DataFrame(columns=['image_path', 'label', 'source'])


# For the second dataset, we already have the 'target' column (0 or 1) in metadata_df
if 'target' in metadata_df.columns and 'isic_id' in metadata_df.columns:
    # Construct image paths using 'isic_id' and the base path of the ISIC 2020 dataset
    metadata_df_subset = metadata_df[['isic_id', 'target']].copy()
    # Assuming images are in a 'train' subdir and are .jpg within the ISIC_2020 path
    metadata_df_subset['image_path'] = metadata_df_subset['isic_id'].apply(lambda x: os.path.join(ISIC_2020, 'train', x + '.jpg'))
    metadata_df_subset.rename(columns={'target': 'label'}, inplace=True) # Rename to match the first dataset subset
    metadata_df_subset = metadata_df_subset[['image_path', 'label']] # Reorder columns
    metadata_df_subset['source'] = 'ISIC_2020' # Add source column

else:
    print("Warning: 'target' or 'isic_id' column not found in the second dataset. Cannot use its labels for merging.")
    # Create an empty DataFrame with the expected columns to avoid errors during concatenation
    metadata_df_subset = pd.DataFrame(columns=['image_path', 'label', 'source'])


# Concatenate the two dataframes
# Ensure both dataframes have the 'image_path', 'label', and 'source' columns before concatenating
combined_df = pd.concat([image_df_subset, metadata_df_subset], ignore_index=True)

# Drop rows where 'label' is NaN to ensure no missing labels
combined_df.dropna(subset=['label'], inplace=True)

print("Combined DataFrame shape:", combined_df.shape)
print("Combined DataFrame head:")
display(combined_df.head())

# Display class distribution in the combined dataset
print("\nClass distribution in combined dataset:")
print(combined_df['label'].value_counts())

# You can now use 'combined_df' for further preprocessing and model training

Combined DataFrame shape: (35483, 3)
Combined DataFrame head:


Unnamed: 0,image_path,label,source
0,/root/.cache/kagglehub/datasets/nodoubttome/sk...,0,ISIC
1,/root/.cache/kagglehub/datasets/nodoubttome/sk...,0,ISIC
2,/root/.cache/kagglehub/datasets/nodoubttome/sk...,0,ISIC
3,/root/.cache/kagglehub/datasets/nodoubttome/sk...,0,ISIC
4,/root/.cache/kagglehub/datasets/nodoubttome/sk...,0,ISIC



Class distribution in combined dataset:
label
0    33856
1     1627
Name: count, dtype: int64


In [None]:
import tensorflow as tf
import os

# 0 = Benign, 1 = Malignant

# Preprocessing

# Split the data into training and validation sets using the combined_df
train_df, val_df = train_test_split(combined_df, test_size=0.2, stratify=combined_df['label'], random_state=42)

# Define image size
IMG_WIDTH = 256
IMG_HEIGHT = 256

# Function to load and preprocess images
def load_image(image_path, label):
    img = tf.io.read_file(image_path)
    img = tf.image.decode_jpeg(img, channels=3) # Decode JPG image
    img = tf.image.convert_image_dtype(img, tf.float32) # Convert to float [0, 1]
    img = tf.image.resize(img, [IMG_HEIGHT, IMG_WIDTH]) # Resize image
    return img, label

# Create tf.data.Dataset from dataframes
train_ds = tf.data.Dataset.from_tensor_slices((train_df['image_path'], train_df['label']))
val_ds = tf.data.Dataset.from_tensor_slices((val_df['image_path'], val_df['label']))

# Apply the loading and preprocessing function to the datasets
AUTOTUNE = tf.data.AUTOTUNE

train_ds = train_ds.map(load_image, num_parallel_calls=AUTOTUNE)
val_ds = val_ds.map(load_image, num_parallel_calls=AUTOTUNE)

# Configure datasets for performance
BATCH_SIZE = 32

train_ds = train_ds.cache().shuffle(1000).batch(BATCH_SIZE).prefetch(buffer_size=AUTOTUNE)
val_ds = val_ds.cache().batch(BATCH_SIZE).prefetch(buffer_size=AUTOTUNE)


# Balance dataset (calculate class weights based on the combined training data)
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(train_df['label']),
    y=train_df['label']
)
class_weights = dict(enumerate(class_weights))
print("Class weights:", class_weights)

# Data augmentation
data_augmentation = tf.keras.Sequential([
    tf.keras.layers.RandomFlip("horizontal_and_vertical"),
    tf.keras.layers.RandomRotation(0.1),
    tf.keras.layers.RandomZoom(0.1)
])

train_ds = train_ds.map(lambda x, y: (data_augmentation(x, training=True), y), num_parallel_calls=AUTOTUNE)

Class weights: {0: np.float64(0.5240363314133806), 1: np.float64(10.900921658986174)}


In [None]:
def load_and_preprocess_image_for_cnn(row, target_size=(256, 256)):
    """Loads, decodes, and preprocesses an image for CNN input based on source."""
    image_path = row['image_path']
    source = row['source']

    # Determine the correct base path based on the source
    if source == 'ISIC':
        base_path = ISIC
    elif source == 'ISIC_2020':
        base_path = ISIC_2020
    else:
        print(f"Warning: Unknown source '{source}' for image '{image_path}'. Cannot load.")
        return None

    # Construct the full image path
    full_image_path = os.path.join(base_path, os.path.basename(image_path))


    try:
        img = Image.open(full_image_path).convert('RGB')  # Ensure image is in RGB format
        img = img.resize(target_size)
        return np.array(img)
    except Exception as e:
        print(f"Error loading image {full_image_path}: {e}")
        return None

# Define the target size for the images
target_size = (IMG_WIDTH, IMG_HEIGHT)

In [None]:
# Use combined_df for splitting
X = combined_df['image_path']
y = combined_df['label']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

print("Training set size:", len(X_train))
print("Testing set size:", len(X_test))

Training set size: 24838
Testing set size: 10645


# PREPROCESSING END

# EfficientNet CNN Model

In [None]:
# Import EfficientNet
from tensorflow.keras.applications import EfficientNetB0
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.metrics import AUC

def create_efficientnet_model():
    # Load EfficientNetB0 with pre-trained weights
    base_model = EfficientNetB0(
        weights='imagenet',
        include_top=False,
        input_shape=(IMG_HEIGHT, IMG_WIDTH, 3)
    )
    
    # Freeze the base model initially
    base_model.trainable = False
    
    # Create the model
    model = tf.keras.Sequential([
        # Data augmentation layers (already defined in preprocessing)
        # Base EfficientNet model
        base_model,
        # Add custom layers
        tf.keras.layers.GlobalAveragePooling2D(),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.Dropout(0.5),
        tf.keras.layers.Dense(128, activation='relu'),
        tf.keras.layers.Dropout(0.3),
        tf.keras.layers.Dense(1, activation='sigmoid')  # Binary classification
    ])
    
    return model

# Create the model
model = create_efficientnet_model()

# Compile the model
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
    loss='binary_crossentropy',
    metrics=['accuracy', AUC(name='auc')]
)

# Display model summary
model.summary()

KeyboardInterrupt: 

In [None]:
# Define callbacks
callbacks = [
    EarlyStopping(
        monitor='val_auc',
        mode='max',
        patience=5,
        restore_best_weights=True
    ),
    ReduceLROnPlateau(
        monitor='val_loss',
        factor=0.2,
        patience=3,
        min_lr=1e-6
    )
]

# Train the model
epochs = 20

history = model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=epochs,
    callbacks=callbacks,
    class_weight=class_weights
)

# Save the model
model.save('efficientnet_skin_cancer.h5')

In [None]:
# Plot training history
plt.figure(figsize=(15, 5))

# Plot training & validation accuracy
plt.subplot(1, 3, 1)
plt.plot(history.history['accuracy'])
plt.plot(history.history['val_accuracy'])
plt.title('Model accuracy')
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
plt.legend(['Train', 'Validation'], loc='lower right')

# Plot training & validation loss
plt.subplot(1, 3, 2)
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('Model loss')
plt.ylabel('Loss')
plt.xlabel('Epoch')
plt.legend(['Train', 'Validation'], loc='upper right')

# Plot AUC
plt.subplot(1, 3, 3)
plt.plot(history.history['auc'])
plt.plot(history.history['val_auc'])
plt.title('Model AUC')
plt.ylabel('AUC')
plt.xlabel('Epoch')
plt.legend(['Train', 'Validation'], loc='lower right')

plt.tight_layout()
plt.show()

In [None]:
# Evaluate the model on the test set
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc
import seaborn as sns

# Create test dataset
test_ds = tf.data.Dataset.from_tensor_slices((X_test, y_test))
test_ds = test_ds.map(load_image, num_parallel_calls=AUTOTUNE)
test_ds = test_ds.batch(BATCH_SIZE).prefetch(buffer_size=AUTOTUNE)

# Get predictions
y_pred_proba = model.predict(test_ds)
y_pred = (y_pred_proba > 0.5).astype(int)
y_test_array = np.array(list(y_test))

# Print classification report
print("\nClassification Report:")
print(classification_report(y_test_array, y_pred))

# Plot confusion matrix
plt.figure(figsize=(8, 6))
cm = confusion_matrix(y_test_array, y_pred)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.title('Confusion Matrix')
plt.ylabel('True Label')
plt.xlabel('Predicted Label')
plt.show()

# Plot ROC curve
fpr, tpr, _ = roc_curve(y_test_array, y_pred_proba)
roc_auc = auc(fpr, tpr)

plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (AUC = {roc_auc:.2f})')
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc="lower right")
plt.show()