In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import shutil
import matplotlib
from matplotlib import pyplot as plt

### Data PreProcessing

In [2]:
# retrieve csv metadata of the HAM10000
df1 = pd.read_csv("ISIC-images/metadata.csv")
df1.head()

  df1 = pd.read_csv("ISIC-images/metadata.csv")


Unnamed: 0,isic_id,attribution,copyright_license,acquisition_day,age_approx,anatom_site_general,anatom_site_special,benign_malignant,clin_size_long_diam_mm,concomitant_biopsy,...,mel_class,mel_mitotic_index,mel_thick_mm,mel_type,mel_ulcer,melanocytic,nevus_type,patient_id,personal_hx_mm,sex
0,ISIC_0015719,The University of Queensland Diamantina Instit...,CC-BY,556.0,45.0,upper extremity,,benign,,False,...,,,,,,,,IP_3075186,True,female
1,ISIC_0052212,"ViDIR Group, Department of Dermatology, Medica...",CC-BY-NC,366.0,50.0,lower extremity,,benign,,False,...,,,,,,True,,IP_2842074,,female
2,ISIC_0068279,"Department of Dermatology, Hospital Clínic de ...",CC-BY-NC,,45.0,head/neck,,benign,,False,...,,,,,,,,IP_6890425,,female
3,ISIC_0074268,The University of Queensland Diamantina Instit...,CC-BY,659.0,55.0,upper extremity,,benign,,False,...,,,,,,,,IP_8723313,True,female
4,ISIC_0074311,The University of Queensland Diamantina Instit...,CC-BY,358.0,45.0,lower extremity,,benign,,False,...,,,,,,,,IP_2950485,True,female


In [3]:
df1.shape

(33126, 32)

In [4]:
# remmove unnecessary columns
df2 = df1[['isic_id','benign_malignant']]

df2.head()

Unnamed: 0,isic_id,benign_malignant
0,ISIC_0015719,benign
1,ISIC_0052212,benign
2,ISIC_0068279,benign
3,ISIC_0074268,benign
4,ISIC_0074311,benign


In [5]:
# check for duplicates
duplicates = df2.duplicated()
print(duplicates.sum(), "duplicate rows found.")
df3 = df2.drop_duplicates()
df3.head()

0 duplicate rows found.


Unnamed: 0,isic_id,benign_malignant
0,ISIC_0015719,benign
1,ISIC_0052212,benign
2,ISIC_0068279,benign
3,ISIC_0074268,benign
4,ISIC_0074311,benign


In [6]:
# chekc for null values
print(df3.isnull().sum())

isic_id             0
benign_malignant    0
dtype: int64


In [7]:
# check for all unique values of the diagnosis section of the dataset
df3["benign_malignant"].unique()

array(['benign', 'malignant'], dtype=object)

In [8]:
print(df3["benign_malignant"].value_counts())

benign_malignant
benign       32542
malignant      584
Name: count, dtype: int64


In [9]:
# Make benign_malignant numnerical values
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

df3['class_label'] = le.fit_transform(df3['benign_malignant'])

df3.head()

Unnamed: 0,isic_id,benign_malignant,class_label
0,ISIC_0015719,benign,0
1,ISIC_0052212,benign,0
2,ISIC_0068279,benign,0
3,ISIC_0074268,benign,0
4,ISIC_0074311,benign,0


In [10]:
# Check the mapping
mapping = {index: label for index, label in enumerate(le.classes_)}
print(mapping)

{0: 'benign', 1: 'malignant'}


In [11]:
# export mapping for later use
import json

with open('class_mapping.json', 'w') as f:
    json.dump(mapping, f)

In [12]:
df3 = df3.drop(['benign_malignant'], axis='columns')
df3.head()

Unnamed: 0,isic_id,class_label
0,ISIC_0015719,0
1,ISIC_0052212,0
2,ISIC_0068279,0
3,ISIC_0074268,0
4,ISIC_0074311,0


In [13]:
# split train
from sklearn.model_selection import train_test_split

X = df3['isic_id'] 
y = df3['class_label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,random_state=42)

# Create DataFrames for training and testing sets
train_df = pd.DataFrame({'isic_id': X_train, 'class_label': y_train})
test_df = pd.DataFrame({'isic_id': X_test, 'class_label': y_test})

# Print the resulting DataFrames
print("train_df:\n", train_df)
print("test_df:\n", test_df) 

# Print class labels
print("train_df:\n", train_df["class_label"].unique())
print("test_df:\n", test_df["class_label"].unique()) 



train_df:
             isic_id  class_label
22826  ISIC_6933350            0
20292  ISIC_6168551            0
29431  ISIC_8896474            0
15395  ISIC_4710686            0
2333   ISIC_0797476            0
...             ...          ...
16850  ISIC_5151428            0
6265   ISIC_1972876            0
11284  ISIC_3476509            0
860    ISIC_0361341            0
15795  ISIC_4831895            0

[26500 rows x 2 columns]
test_df:
             isic_id  class_label
8231   ISIC_2566365            0
21862  ISIC_6617618            0
3058   ISIC_1018265            0
7474   ISIC_2335533            0
3470   ISIC_1151910            0
...             ...          ...
2099   ISIC_0727322            0
7420   ISIC_2319387            0
26025  ISIC_7900295            0
24915  ISIC_7564393            0
27017  ISIC_8185610            0

[6626 rows x 2 columns]
train_df:
 [0 1]
test_df:
 [0 1]


### Pytorch 


In [14]:
# Import for Pytorch
import torch
from torch import nn, optim
from torchvision import datasets, models
from torchvision.transforms import v2 as T
from torch.utils.data import DataLoader, Dataset
from PIL import Image

print(torch.__version__)

# check if GPU is available
torch.cuda.is_available()

2.5.1


False

In [15]:
transform = T.Compose([
    T.Resize((224, 224)),  # Resize to match model input
    T.ToImage(),  # Convert PIL to TorchImage
    T.RandomHorizontalFlip(p=0.5),  # Random horizontal flip
    T.RandomRotation(degrees=15),  # Random rotation
    T.ColorJitter(brightness=0.3, contrast=0.3, saturation=0.3, hue=0.1),  # Adjust lighting
    T.GaussianBlur(kernel_size=5, sigma=(0.1, 2.0)),  # Add blur to mimic image compression
    T.RandomAdjustSharpness(sharpness_factor=2, p=0.5),  # Adjust sharpness
    T.RandomErasing(p=0.5, scale=(0.02, 0.1)),  # Randomly erase parts of the image
    T.ToDtype(torch.float32, scale=True),  # Convert to float tensor with normalization
    T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])  # Normalize for ResNet
])

In [16]:
# Custom Dataset
class SkinLesionDataset(Dataset):
    def __init__(self, df, transform=None):
        self.df = df
        self.transform = transform

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        image_id = self.df.iloc[idx, 0]
        class_label = self.df.iloc[idx, 1]

        img_path = f"ISIC-images/{image_id}.jpg" 

        img = Image.open(img_path).convert("RGB")
        if self.transform:
            img = self.transform(img)

        return img, class_label
        

In [17]:
# make the data set 
training_data = SkinLesionDataset(
    df = train_df,
    transform = transform
)

test_data = SkinLesionDataset(
    df = test_df,
    transform = transform
)

In [18]:
# load the dataset (batch _size 32 for GPU P100)
train_dataloader = DataLoader(training_data, batch_size=32, shuffle=True)
test_dataloader = DataLoader(test_data, batch_size=32, shuffle=False)

In [19]:
# incorporate class balancing
from sklearn.utils.class_weight import compute_class_weight

device = torch.device("mps" if torch.has_mps else "cpu")

print(f"Using device: {device}")

class_weights = compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)
class_weights = torch.tensor(class_weights, dtype=torch.float).to(device)
print("Class Weights:", class_weights)

Using device: mps
Class Weights: tensor([ 0.5089, 28.5560], device='mps:0')


  device = torch.device("mps" if torch.has_mps else "cpu")


In [20]:
def train_model(model, train_dataloader, device, number_of_epoch, class_weights=class_weights):
    """
    Function to train the model with the given dataloader.

    Args:
        model (torch.nn.Module): The model to be trained.
        train_dataloader (DataLoader): The DataLoader object for training data.
        device (torch.device): The device to train the model on (CPU or GPU).
        number_of_epoch (int): Number of epochs to train the model for.
        class_weights (tensor, optional): Class weights to handle class imbalance (default is None).
    
    Returns:
        model (torch.nn.Module): The trained model.
    """
    # Define loss and optimizer
    criterion = nn.CrossEntropyLoss(weight=class_weights)
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-5, weight_decay=1e-5)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=3)
    
    # Train the model (simple training loop)
    model.train()
    for epoch in range(number_of_epoch):  # Use more epochs for real training
        running_loss = 0.0
        for images, labels in train_dataloader:
            images, labels = images.to(device), labels.to(device)

            optimizer.zero_grad()
            outputs = model(images)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
        
        # Step the learning rate scheduler after each epoch
        scheduler.step(loss)
        
        print(f"Epoch [{epoch+1}/{number_of_epoch}], Loss: {loss.item():.4f}")
    
    return model

In [21]:
def evaluate_model(model, test_dataloader, device):
    """
    Function to evaluate the model on the test data.

    Args:
        model (torch.nn.Module): The trained model.
        test_dataloader (DataLoader): The DataLoader object for test data.
        device (torch.device): The device to evaluate the model on (CPU or GPU).
    
    Returns:
        accuracy (float): Accuracy of the model on the test set.
    """
    model.eval()
    correct, total = 0, 0
    with torch.no_grad():
        for images, labels in test_dataloader:
            images, labels = images.to(device), labels.to(device)
            outputs = model(images)
            _, preds = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (preds == labels).sum().item()

    accuracy = correct / total
    print(f"Accuracy: {accuracy:.4f}")
    return accuracy

In [22]:

def test_model(model_name, num_classes, train_dataloader, test_dataloader, device, class_weights, number_of_epoch=10):
    """
    Function to test a model by training and evaluating it.

    Args:
        model_name (str): Name of the model ('resnet18', 'resnet34', 'resnet50', etc.).
        num_classes (int): Number of classes in your dataset.
        train_dataloader (DataLoader): DataLoader for training data.
        test_dataloader (DataLoader): DataLoader for test data.
        device (torch.device): Device to run the model on (CPU or GPU).
        class_weights (tensor): Class weights to handle class imbalance.
        number_of_epoch (int): Number of epochs for training.
    
    Returns:
        accuracy (float): The model's accuracy on the test set.
    """
    print(f"Testing {model_name}...")

    # Load the model based on the model name
    if model_name == 'resnet18':
        model = models.resnet18(pretrained=True)
        model.fc = nn.Linear(model.fc.in_features, num_classes)  # Modify the final layer for your classes
    elif model_name == 'resnet34':
        model = models.resnet34(pretrained=True)
        model.fc = nn.Linear(model.fc.in_features, num_classes)
    elif model_name == 'resnet50':
        model = models.resnet50(pretrained=True)
        model.fc = nn.Linear(model.fc.in_features, num_classes)
    elif model_name == 'efficientnet_b0':
        model = models.efficientnet_b0(pretrained=True)  # EfficientNet B0 from torchvision
        model.classifier[1] = nn.Linear(model.classifier[1].in_features, num_classes)  # Modify classifier layer
    elif model_name == 'efficientnet_b3':
        model = models.efficientnet_b3(pretrained=True)  # EfficientNet B3 from torchvision
        model.classifier[1] = nn.Linear(model.classifier[1].in_features, num_classes)
    elif model_name == 'densenet121':
        model = models.densenet121(pretrained=True)  # DenseNet 121
        model.classifier = nn.Linear(model.classifier.in_features, num_classes)  # Modify classifier layer
    elif model_name == 'densenet169':
        model = models.densenet169(pretrained=True)  # DenseNet 169
        model.classifier = nn.Linear(model.classifier.in_features, num_classes)
    elif model_name == 'densenet201':
        model = models.densenet201(pretrained=True)  # DenseNet 201
        model.classifier = nn.Linear(model.classifier.in_features, num_classes)
    else:
        raise ValueError(f"Model {model_name} is not supported.")
    
    # Move model to device (GPU or CPU)
    model = model.to(device)

    # Train the model using the train_model function (with class weights)
    model = train_model(model, train_dataloader, device, number_of_epoch, class_weights)

    # Evaluate the model on the test set
    accuracy = evaluate_model(model, test_dataloader, device)
    
    return accuracy

In [23]:
# # List of models to test 'resnet18', 'resnet34', 'resnet50', 'densenet121', 'densenet169', 'densenet201', 'efficientnet_b0', 'efficientnet_b3'
# model_names = [ 'resnet18', 'densenet121', 'efficientnet_b0']

# # Dictionary to store results
# results = {}

# # Assuming the following are defined:
# # - train_dataloader: Your training data loader
# # - test_dataloader: Your testing data loader
# # - device: The device (CPU or GPU) to run the model
# # - class_weights: The computed class weights to handle imbalance
# # - num_classes: Number of classes in the dataset (e.g., 7 for HAM10000)

# # Loop through each model name, call test_model and store the results
# for model_name in model_names:
#     results[model_name] = test_model(
#         model_name=model_name, 
#         num_classes=2,  # Replace with the actual number of classes in your dataset
#         train_dataloader=train_dataloader, 
#         test_dataloader=test_dataloader,
#         device=device,  # Ensure your device is correctly defined (e.g., 'cuda' or 'cpu')
#         class_weights=class_weights,  # Include the class weights here
#         number_of_epoch=10  # Adjust the number of epochs as needed
#     )

# # Print the results for all models
# print("Results:", results)


### Model to train with

Resnet trained to 91%


In [24]:
# Train the model on Resnet50
model = models.resnet50(pretrained=True)  # Resnet50 from torchvision
model.fc = nn.Linear(model.fc.in_features, 2)
# Move model to GPU if available

model = model.to(device)



In [25]:
# Train the model
trained_model = train_model(model, train_dataloader, device, number_of_epoch=10)

# Evaluate the trained model
accuracy = evaluate_model(trained_model, test_dataloader, device)

print(accuracy)


Epoch [1/10], Loss: 0.2595
Epoch [2/10], Loss: 0.3915
Epoch [3/10], Loss: 0.2171
Epoch [4/10], Loss: 0.2725
Epoch [5/10], Loss: 0.2234
Epoch [6/10], Loss: 0.1695
Epoch [7/10], Loss: 0.0924
Epoch [8/10], Loss: 0.2791
Epoch [9/10], Loss: 0.1961
Epoch [10/10], Loss: 0.2903
Accuracy: 0.9105
0.9105040748566254


In [26]:
torch.save(model.state_dict(), "resnet50_skin_cancer.pth")
 