In [None]:
# Install the Kaggle command-line client used by the download cell below (-q keeps pip quiet)
!pip install -q kaggle

In [None]:
import os
from getpass import getpass

# Credentials for the Kaggle CLI are passed through environment variables.
# The API key must never be stored in the notebook itself: it is taken from an
# already-set KAGGLE_KEY variable, or prompted for without echoing it.
# NOTE(review): a key was previously committed in this cell; it should be
# revoked and regenerated on the Kaggle account page.
os.environ.setdefault('KAGGLE_USERNAME', 'kyriacospanas')  # Your Kaggle username
if not os.environ.get('KAGGLE_KEY'):
    os.environ['KAGGLE_KEY'] = getpass('Kaggle API key: ')  # Your Kaggle API key

In [None]:
# Download the HAM10000 archive and the separately hosted test-set archive as
# .zip files into the current working directory. The Kaggle CLI authenticates
# with the KAGGLE_USERNAME / KAGGLE_KEY environment variables set above.
!kaggle datasets download -d kmader/skin-cancer-mnist-ham10000
!kaggle datasets download -d kyriacospanas/ham10000-test

Downloading skin-cancer-mnist-ham10000.zip to /content
100% 5.20G/5.20G [03:49<00:00, 24.6MB/s]
100% 5.20G/5.20G [03:49<00:00, 24.3MB/s]
Downloading ham10000-test.zip to /content
100% 400M/401M [00:18<00:00, 25.4MB/s]
100% 401M/401M [00:18<00:00, 23.0MB/s]


In [None]:
# Extract the test-set archive into the current directory (-q suppresses the file listing)
!unzip -q ham10000-test.zip

In [None]:
# Extract the HAM10000 archive into the current directory (-q suppresses the file listing)
!unzip -q skin-cancer-mnist-ham10000.zip


In [None]:
import os
import shutil

# Delete the two lower-case image folders extracted from the archive.
# NOTE(review): presumably these duplicate the upper-case HAM10000_images_part_*
# folders and would otherwise be picked up twice by the directory walk — confirm.
for dir_path in ('/content/ham10000_images_part_1',
                 '/content/ham10000_images_part_2'):
    # Only remove what exists so the cell can be re-run without raising.
    if os.path.isdir(dir_path):
        shutil.rmtree(dir_path)

In [None]:
import os
from glob import glob

# Root folder that is searched recursively for images
training_data_dir = '../content'

# Every .jpg found anywhere below the root folder
all_image_path = [
    os.path.join(folder, name)
    for folder, _subfolders, names in os.walk(training_data_dir)
    for name in names
    if name.endswith('.jpg')
]

# Image id (file name without extension) -> full path on disk
imageID_path_dict = {
    os.path.splitext(os.path.basename(image_path))[0]: image_path
    for image_path in all_image_path
}

# Short diagnosis code used in the metadata 'dx' column -> readable class name
lesion_type_dict = {
    'nv': 'Melanocytic nevi',
    'mel': 'melanoma',
    'bkl': 'Benign keratosis-like lesions ',
    'bcc': 'Basal cell carcinoma',
    'akiec': 'Actinic keratoses',
    'vasc': 'Vascular lesions',
    'df': 'Dermatofibroma',
}


In [None]:
# Upper-case diagnosis code (as used for the test ground-truth columns) -> readable class name
lesion_type_dict_capital = dict(
    NV='Melanocytic nevi',
    MEL='melanoma',
    BKL='Benign keratosis-like lesions ',
    BCC='Basal cell carcinoma',
    AKIEC='Actinic keratoses',
    VASC='Vascular lesions',
    DF='Dermatofibroma',
)

In [None]:
import os, cv2,itertools
from tqdm import tqdm
import numpy as np

def calculate_image_mean_std(image_paths):
    """Compute per-channel mean and standard deviation over a set of images.

    Each image is read with OpenCV, resized to 224x224 and scaled to [0, 1].
    Statistics are accumulated as running sums, so only one image is held in
    memory at a time instead of a single array containing every image.

    Args:
        image_paths: sequence of image file paths.

    Returns:
        (means, stdevs): two lists of three floats each, in RGB order
        (OpenCV reads images as BGR, so the channel order is reversed).

    Raises:
        ValueError: if image_paths is empty or an image cannot be read.
    """
    image_height, image_width = 224, 224

    # Float64 accumulators keep the sums accurate over hundreds of millions of pixels.
    channel_sum = np.zeros(3, dtype=np.float64)
    channel_sq_sum = np.zeros(3, dtype=np.float64)
    num_images = 0

    for path in tqdm(image_paths):
        image = cv2.imread(path)
        if image is None:
            # cv2.imread signals a missing or undecodable file by returning None.
            raise ValueError("could not read image: {}".format(path))
        # cv2.resize expects the target size as (width, height).
        image = cv2.resize(image, (image_width, image_height))
        image = image.astype(np.float64) / 255.
        channel_sum += image.sum(axis=(0, 1))
        channel_sq_sum += np.square(image).sum(axis=(0, 1))
        num_images += 1

    if num_images == 0:
        raise ValueError("image_paths is empty; cannot compute statistics")

    # Same shape report the stacked-array implementation printed.
    print((image_height, image_width, 3, num_images))

    pixels_per_channel = num_images * image_height * image_width
    mean_bgr = channel_sum / pixels_per_channel
    # Population variance: E[x^2] - E[x]^2, clipped against tiny negative round-off.
    var_bgr = np.maximum(channel_sq_sum / pixels_per_channel - np.square(mean_bgr), 0.0)
    std_bgr = np.sqrt(var_bgr)

    # BGR -> RGB
    means = mean_bgr[::-1].tolist()
    stdevs = std_bgr[::-1].tolist()

    print("normalised Mean values = {}".format(means))
    print("normalised Std values = {}".format(stdevs))

    return means,stdevs

In [None]:
# Per-channel normalisation statistics over every .jpg collected in all_image_path.
# NOTE(review): all_image_path comes from walking the whole content directory, so it
# presumably includes the test images too — confirm that this is intended.
norm_mean,norm_std = calculate_image_mean_std(all_image_path)

100%|██████████| 11527/11527 [01:51<00:00, 103.11it/s]


(224, 224, 3, 11527)


In [None]:
import pandas as pd

# Load the HAM10000 metadata and derive three columns per row:
#   file_path     - image path looked up by image_id (missing when no image was found)
#   cell_type     - readable class name looked up from the 'dx' code
#   cell_type_idx - integer code of cell_type (pandas Categorical code)
metadata_csv = os.path.join('/content', 'HAM10000_metadata.csv')
df = pd.read_csv(metadata_csv).assign(
    file_path=lambda frame: frame['image_id'].map(imageID_path_dict.get),
    cell_type=lambda frame: frame['dx'].map(lesion_type_dict.get),
    cell_type_idx=lambda frame: pd.Categorical(frame['cell_type']).codes,
)

df.head()

In [None]:
# Missing values per column, before imputation
df.isna().sum()

In [None]:
# Replace missing ages with the mean age truncated to an integer.
# The result is assigned back to the column: calling fillna(inplace=True) on
# df['age'] is chained assignment, which warns on recent pandas and does not
# update df at all when Copy-on-Write is active.
df['age'] = df['age'].fillna(int(df['age'].mean()))

In [None]:
# Missing values per column, after filling 'age'
df.isna().sum()

In [None]:
# Number of rows per readable class name
df['cell_type'].value_counts()

In [None]:
# Number of rows per integer class code (the pandas Categorical codes of cell_type)
df['cell_type_idx'].value_counts()

In [None]:
# Oversample the smaller classes by duplicating their rows.
# data_aug_rate[i] is the total number of copies wanted for class index i;
# 0 leaves the class untouched.
data_aug_rate = [15,10,5,55,0,45,5]

# The original rows already count as one copy, hence rate - 1 extra copies.
extra_copies = [
    df.loc[df['cell_type_idx'] == class_idx, :]
    for class_idx, rate in enumerate(data_aug_rate)
    for _ in range(max(rate - 1, 0))
]

# A single pd.concat replaces the repeated DataFrame.append calls
# (DataFrame.append no longer exists in pandas 2.x) and keeps the same row order.
df_balanced = pd.concat([df, *extra_copies], ignore_index=True)
df_balanced['cell_type'].value_counts()

In [None]:
import torch
# Run on the GPU when PyTorch can see one, otherwise on the CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Hyperparameters
# NOTE(review): the cross-validation cell hard-codes batch_size=32 and epochs = 2
# instead of reading batch_size / num_epochs — confirm which values are intended.
batch_size, learning_rate, num_epochs = 128, 1e-3, 5

In [None]:
import torchvision
import torch.nn as nn

# ResNet-18 with pretrained weights; the final fully connected layer is replaced
# by a new 7-output linear layer (one output per lesion class).
num_classes = 7
model = torchvision.models.resnet18(pretrained=True)
classifier_head = nn.Linear(model.fc.in_features, num_classes)
model.fc = classifier_head.to(device)
model.to(device)

In [None]:
from torchvision import models,transforms
# norm_mean = (0.49139968, 0.48215827, 0.44653124)
# norm_std = (0.24703233, 0.24348505, 0.26158768)
# Training pipeline: resize, random flips / rotation / colour jitter for
# augmentation, then tensor conversion and per-channel normalisation.
train_steps = [
    transforms.Resize((224,224)),
    transforms.RandomHorizontalFlip(),
    transforms.RandomVerticalFlip(),
    transforms.RandomRotation(20),
    transforms.ColorJitter(brightness=0.1, contrast=0.1, hue=0.1),
    transforms.ToTensor(),
    transforms.Normalize(norm_mean, norm_std),
]
train_transform = transforms.Compose(train_steps)

# Validation pipeline: deterministic — resize, tensor conversion, normalisation only.
val_steps = [
    transforms.Resize((224,224)),
    transforms.ToTensor(),
    transforms.Normalize(norm_mean, norm_std),
]
val_transform = transforms.Compose(val_steps)


In [None]:
from torch.utils.data import DataLoader,Dataset
from PIL import Image

# Define a PyTorch Dataset for this dataset
class HAM10000(Dataset):
    """Dataset that serves (image, label) pairs from a metadata DataFrame.

    Args:
        df: DataFrame with a 'file_path' column (image path) and a
            'cell_type_idx' column (integer class code).
        transform: optional callable applied to the loaded PIL image.
    """

    def __init__(self, df, transform=None):
        self.df = df
        self.transform = transform

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        """Return (X, y) for the index-th row (positional, 0 <= index < len(self))."""
        # Positional .iloc lookup works whatever the frame's index labels are,
        # so it always agrees with __len__; label lookup needs a 0..n-1 index.
        path = self.df['file_path'].iloc[index]
        # Force 3-channel RGB so the 3-channel Normalize step always matches.
        X = Image.open(path).convert('RGB')
        y = torch.tensor(int(self.df['cell_type_idx'].iloc[index]))

        if self.transform:
            X = self.transform(X)

        return (X, y)

In [None]:
from torch import optim,nn

# Adam optimizer over all model parameters, with cross-entropy as the loss.
# The learning rate comes from the hyperparameter cell (learning_rate = 1e-3)
# rather than a second hard-coded copy of the same value.
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
criterion = nn.CrossEntropyLoss().to(device)

In [None]:
# Preview the first rows of the oversampled frame
df_balanced.head()

In [None]:
# DataFrame.size is rows x columns, i.e. the number of table cells — not the
# number of rows/images (that would be len(df_balanced)).
num_cells = df_balanced.size
print("Number of cells in df_balanced:", num_cells)

In [None]:
def train(dataloader, model, loss_fn, optimizer):
    """Run one pass over dataloader, updating model after every batch.

    Batches are moved to the global `device`. The current loss and the number
    of samples processed so far are printed on every 100th batch (starting
    with batch 0). Returns None.
    """
    n_samples = len(dataloader.dataset)
    model.train()
    for step, batch in enumerate(dataloader):
        inputs, targets = batch
        inputs = inputs.to(device)
        targets = targets.to(device)

        # Forward pass and loss
        logits = model(inputs)
        batch_loss = loss_fn(logits, targets)

        # Backward pass and parameter update
        optimizer.zero_grad()
        batch_loss.backward()
        optimizer.step()

        # Progress report only on every 100th batch
        if step % 100 != 0:
            continue
        seen = step * len(inputs)
        print(f"loss: {batch_loss.item():>7f}  [{seen:>5d}/{n_samples:>5d}]")

In [None]:
def validate(dataloader, model, loss_fn):
    """Evaluate model on dataloader without gradient tracking.

    Prints the accuracy over all samples (in percent) and the loss averaged
    over batches. Batches are moved to the global `device`. Returns None.
    """
    n_samples = len(dataloader.dataset)
    n_batches = len(dataloader)
    model.eval()
    loss_total = 0
    n_correct = 0
    with torch.no_grad():
        for inputs, targets in dataloader:
            inputs, targets = inputs.to(device), targets.to(device)
            logits = model(inputs)
            loss_total += loss_fn(logits, targets).item()
            # A prediction is the class with the highest logit
            hits = logits.argmax(1) == targets
            n_correct += hits.type(torch.float).sum().item()
    avg_loss = loss_total / n_batches
    accuracy = n_correct / n_samples
    print(f"Test Error: \n Accuracy: {(100*accuracy):>0.1f}%, Avg loss: {avg_loss:>8f} \n")

In [None]:
# Missing values per column in the oversampled frame
df_balanced.isna().sum()

In [None]:
from sklearn.model_selection import KFold
from torch import optim,nn

# Define the number of folds
num_folds = 2

# Use KFold from scikit-learn to split the data into k-folds.
# A fixed random_state makes the shuffled split reproducible across runs.
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=42)

# NOTE(review): df_balanced contains duplicated rows (oversampling), so copies of
# the same image can land in both the training and the validation part of a fold,
# which inflates validation accuracy. Consider splitting before oversampling.
# NOTE(review): `model` is created once outside this loop, so each fold continues
# training the weights left by the previous fold — confirm that is intended.

# Loop over the k-folds
for fold, (train_idx, val_idx) in enumerate(kfold.split(df_balanced)):
    # Create training and validation sets for this fold; the index is reset so
    # each subset is numbered 0..n-1.
    train_set = df_balanced.iloc[train_idx].reset_index(drop=True)
    val_set = df_balanced.iloc[val_idx].reset_index(drop=True)

    print("Length of Df:", len(df_balanced))
    print("Length of train_set:", len(train_set))
    print("Length of val_set:", len(val_set))

    # Create data loaders for this fold
    training_set = HAM10000(train_set, transform=train_transform)
    train_loader = DataLoader(training_set, batch_size=32, shuffle=True)

    validation_set = HAM10000(val_set, transform=val_transform)
    val_loader = DataLoader(validation_set, batch_size=32, shuffle=False)

    # Fresh optimizer state and loss function for the current fold
    optimizer = optim.Adam(model.parameters(), lr=1e-3)
    criterion = nn.CrossEntropyLoss().to(device)

    print("Fold:", fold + 1)
    epochs = 2
    for t in range(epochs):
        print(f"Epoch {t+1}\n-------------------------------")
        train(train_loader, model, criterion, optimizer)
        validate(val_loader, model, criterion)
    # Reported once per fold, after its last epoch
    print("Done!")

In [None]:
import matplotlib.pyplot as plt
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    Plot a confusion matrix as a heat-map with the value written in each cell.

    Args:
        cm: 2-D array; rows are true labels, columns are predicted labels.
        classes: tick labels, one per row/column.
        normalize: when True, each row is divided by its sum before plotting,
            so both the colours and the printed numbers are fractions.
        title: figure title.
        cmap: matplotlib colormap for the heat-map.
    """
    # Normalise before drawing so the image and the cell text show the same values.
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    # Cells darker than half of the maximum get white text for contrast.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        # Fractions are rounded to two decimals; raw counts are printed as they are.
        cell_text = format(cm[i, j], '.2f') if normalize else cm[i, j]
        plt.text(j, i, cell_text,
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

In [None]:
# Define a PyTorch Dataset for the test set
class SkinTestDf(Dataset):
    """Dataset that serves (image, label) pairs from the test DataFrame.

    Args:
        df: DataFrame with a 'file_path' column (image path) and a
            'cell_type_idx' column (integer class code).
        transform: optional callable applied to the loaded PIL image.
    """

    def __init__(self, df, transform=None):
        self.df = df
        self.transform = transform

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        """Return (X, y) for the index-th row (positional, 0 <= index < len(self))."""
        # Positional .iloc lookup works whatever the frame's index labels are,
        # so it always agrees with __len__; label lookup needs a 0..n-1 index.
        path = self.df['file_path'].iloc[index]
        # Force 3-channel RGB so the 3-channel Normalize step always matches.
        X = Image.open(path).convert('RGB')
        y = torch.tensor(int(self.df['cell_type_idx'].iloc[index]))

        if self.transform:
            X = self.transform(X)

        return X, y

In [None]:
# Load the one-hot test ground truth and derive the same columns as the training frame.
dfTest = pd.read_csv('ISIC2018_Task3_Test_GroundTruth.csv')

# Diagnosis columns of the file, i.e. those named after a known upper-case class code.
class_columns = [col for col in dfTest.columns if col in lesion_type_dict_capital]
is_set = dfTest[class_columns].eq(1)

# Name of the column holding the 1 in each row; rows without any 1 stay missing.
# Done once for the whole frame instead of re-mapping every column for every row.
dfTest['columns_with_1'] = is_set.idxmax(axis=1).where(is_set.any(axis=1))
dfTest['file_path'] = dfTest['image'].map(imageID_path_dict.get)
dfTest['cell_type'] = dfTest['columns_with_1'].map(lesion_type_dict_capital.get)

# Pin the category order to the sorted class names (the order pandas uses by
# default when every class is present), so the integer codes do not shift if
# a class happens to be absent from the test file.
class_names = sorted(set(lesion_type_dict_capital.values()))
dfTest['cell_type_idx'] = pd.Categorical(dfTest['cell_type'], categories=class_names).codes


In [None]:
# Preview the test frame with the derived label and path columns
dfTest.head()

In [None]:
# Test pipeline: deterministic resize, tensor conversion and normalisation (no augmentation)
test_transform = transforms.Compose([
    transforms.Resize((224,224)),
    transforms.ToTensor(),
    transforms.Normalize(norm_mean, norm_std),
])

In [None]:
# Wrap the test frame in a Dataset and iterate it in file order, one image per batch
test_dataset = SkinTestDf(dfTest, transform=test_transform)
test_loader = DataLoader(test_dataset, batch_size=1, shuffle=False)

In [None]:
from sklearn.metrics import confusion_matrix
# Collect true and predicted class indices over the whole test set
model.eval()
y_label = []
y_predict = []
with torch.no_grad():
    for images, labels in test_loader:
        outputs = model(images.to(device))
        # Predicted class = index of the highest logit
        prediction = outputs.argmax(dim=1)
        # tolist() handles any batch size; labels.item() only works with batch size 1
        y_label.extend(labels.tolist())
        y_predict.extend(prediction.cpu().tolist())

# Class codes in index order 0..6. This matches the alphabetical order of the
# readable class names used for the Categorical codes ('melanoma' is lower-case,
# so it sorts last).
plot_labels = ['akiec', 'bcc', 'bkl', 'df', 'nv', 'vasc','mel']

# compute the confusion matrix; passing labels= keeps it 7x7 and aligned with
# plot_labels even when a class never appears in the labels or predictions
confusion_mtx = confusion_matrix(y_label, y_predict, labels=list(range(len(plot_labels))))
# plot the confusion matrix
plot_confusion_matrix(confusion_mtx, plot_labels)

In [None]:
from sklearn.metrics import classification_report
# Generate a classification report (precision / recall / F1 per class).
# labels= pins the class indices so target_names always lines up with them,
# even if a class is missing from the labels and predictions.
report = classification_report(
    y_label,
    y_predict,
    labels=list(range(len(plot_labels))),
    target_names=plot_labels,
)
print(report)

In [None]:
# Per-class error rate: 1 - (correct predictions / number of true samples of that class)
label_frac_error = 1 - np.diag(confusion_mtx) / np.sum(confusion_mtx, axis=1)

# One bar per confusion-matrix row, instead of a hard-coded class count
bar_positions = np.arange(len(label_frac_error))
plt.bar(bar_positions, label_frac_error)
# Show class codes on the axis when they line up one-to-one with the rows
if len(plot_labels) == len(label_frac_error):
    plt.xticks(bar_positions, plot_labels)
plt.xlabel('True Label')
plt.ylabel('Fraction classified incorrectly')