In [1]:
# Imports

import os
import shutil
from random import shuffle
import pandas as pd
import numpy as np
from sklearn.metrics import confusion_matrix
import seaborn as sns
import plotly.express as px

import torch
from torch import nn
from torch.nn import functional as F
from torch.utils.data import DataLoader, WeightedRandomSampler
import torchvision
from torchvision import transforms as T
import pytorch_lightning as pl
from pytorch_lightning.metrics.classification import Accuracy
from pytorch_lightning.loggers import CSVLogger
from pytorch_lightning.callbacks import EarlyStopping, LearningRateMonitor

In [2]:
# Getting our images in the right folders so we can use ImageFolder

project_path = os.getcwd().replace('/notebooks', '')
csv_path = project_path + '/data/HAM10000_metadata'
metadata = pd.read_csv(csv_path)

lesion_ids = metadata['lesion_id'].unique()
shuffle(lesion_ids)
lesion_ids = list(lesion_ids)

val_size = 0.15
test_size = 0.15

def set_membership(lesion_id):
    idx = lesion_ids.index(lesion_id)
    if idx < test_size*len(lesion_ids):
        return 'test'
    elif idx < (test_size+val_size)*len(lesion_ids):
        return 'val'
    else:
        return 'train'

for i in ['train', 'val', 'test']:
    path = project_path + '/data/images/' + i
    if not os.path.exists(path):
        os.mkdir(path)

for index, row in metadata.iterrows():
    destination_set = set_membership(row['lesion_id'])
    source = project_path + '/data/images/' + row['image_id'] + '.jpg'
    destination = project_path + '/data/images/' + destination_set + '/' + row['dx'] + '/' + row['image_id'] + '.jpg'
    destination_folder = project_path + '/data/images/' + destination_set + '/' + row['dx']
    if not os.path.exists(destination_folder):
        os.mkdir(destination_folder)
    if os.path.exists(source):
        shutil.move(source, destination)

In [3]:
# Setting up our dataloaders

labels = ['akiec', 'bcc', 'bkl', 'df', 'mel', 'nv', 'vasc']

def estimate_weights_mfb(labels):
    frequencies = []
    for label in labels:
        count = len(metadata[metadata['dx'] == label]['dx'])
        frequencies.append(count)
    median_freq = np.median(frequencies)
    weights = [median_freq/freq for freq in frequencies]
    return weights

def sampler_weights(dataset, class_weights):
    weights = []
    for img, label in dataset:
        weights.append(class_weights[label])
    return torch.tensor(weights)

train_transform = T.Compose([
    T.ToTensor(),
    T.CenterCrop((448, 576)),
    T.Resize((224, 288)),
    T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    T.RandomHorizontalFlip(p=0.5),
    T.RandomRotation(90),
    T.RandomPerspective(distortion_scale=0.6, p=1.0),
    T.RandomGrayscale(p=0.1),
])

test_transform = T.Compose([
    T.ToTensor(),
    T.CenterCrop((448, 576)),
    T.Resize((224, 288)),
    T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

batch_size = 12
num_workers = 8
class_weights = estimate_weights_mfb(labels)
train_ds = torchvision.datasets.ImageFolder(project_path + '/data/images/train', transform=train_transform)
val_ds = torchvision.datasets.ImageFolder(project_path + '/data/images/val', transform=test_transform)
test_ds = torchvision.datasets.ImageFolder(project_path + '/data/images/test', transform=test_transform)
train_loader = DataLoader(train_ds, batch_size=batch_size, num_workers=num_workers, sampler=WeightedRandomSampler(sampler_weights(train_ds, class_weights), len(train_ds), replacement=True))
val_loader = DataLoader(val_ds, batch_size=batch_size, num_workers=num_workers, sampler=WeightedRandomSampler(sampler_weights(val_ds, class_weights), len(val_ds), replacement=True))
test_loader = DataLoader(test_ds, batch_size=batch_size, num_workers=num_workers, sampler=WeightedRandomSampler(sampler_weights(test_ds, class_weights), len(test_ds), replacement=True))

In [None]:
# Architecture specification

class Baseline(pl.LightningModule):
    def __init__(self, lr=1e-3, dropout=0.2):
        super().__init__()
        self.lr = lr
        self.accuracy = Accuracy()

        self.NN = nn.Sequential(
            nn.Conv2d(3, 16, 5, stride=1, padding=2),
            nn.BatchNorm2d(16),
            nn.ReLU(),
            nn.MaxPool2d(2),
            nn.Dropout(dropout),
            nn.Conv2d(16, 32, 5, stride=1, padding=2),
            nn.BatchNorm2d(32),
            nn.ReLU(),
            nn.MaxPool2d(2),
            nn.Dropout(dropout),
            nn.Conv2d(32, 64, 5, stride=1, padding=2),
            nn.BatchNorm2d(64),
            nn.ReLU(),
            nn.MaxPool2d(2),
            nn.Dropout(dropout),
            nn.Conv2d(64, 64, 5, stride=1, padding=2),
            nn.BatchNorm2d(64),
            nn.ReLU(),
            nn.MaxPool2d(2),
            nn.Dropout(dropout),
            nn.Flatten(),
            nn.Linear(64*7*9, 128),
            nn.BatchNorm1d(128),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(128, 64),
            nn.BatchNorm1d(64),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(64, 7)
        )

    def forward(self, x):
        return self.NN(x)

    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), self.lr)
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=2)
        return {
           'optimizer': optimizer,
           'lr_scheduler': scheduler,
           'monitor': 'val_loss'
        }

    def training_step(self, train_batch, batch_idx):
        x, y_true = train_batch
        y_pred = self.NN(x)
        loss = F.cross_entropy(y_pred, y_true)
        self.log('train_loss', loss, prog_bar=False)
        return loss

    def validation_step(self, val_batch, batch_idx):
        x, y_true = val_batch
        y_pred = self.NN(x)
        loss = F.cross_entropy(y_pred, y_true.view(-1))
        acc = self.accuracy(torch.argmax(y_pred, dim=1), y_true.view(-1))
        self.log('val_loss', loss, prog_bar=True)
        self.log('accuracy', acc, prog_bar=True)

In [None]:
# Init model and trainer

logger = CSVLogger("logs", name="LeNetLike", version='1.0')
callbacks = [
    EarlyStopping('accuracy', patience=5),
    LearningRateMonitor(logging_interval='step'),
    ]
trainer = pl.Trainer(gpus=1, auto_lr_find=False, max_epochs=20, logger=logger, callbacks=callbacks)
model = Baseline(lr=1e-3)

In [None]:
# Optionally use lrfinder

# lr_finder = trainer.tuner.lr_find(model, train_loader, val_loader, update_attr=True)
# print(f'Suggested LR: {lr_finder.suggestion()}')
# fig = lr_finder.plot(suggest=True)
# fig.show()

In [None]:
# Train

trainer.fit(model, train_loader, val_loader)

In [None]:
# Visualize losses/metrics

log_dir = project_path + '/notebooks/logs/LeNetLike/1.0/'
log = pd.read_csv(os.path.join(log_dir, 'metrics.csv'))

train_log = log[['step', 'train_loss']].dropna().rename(columns={'train_loss': 'loss'})
train_log['set'] = 'train'
val_log = log[['step', 'val_loss']].dropna().rename(columns={'val_loss': 'loss'})
val_log['set'] = 'val'

log = train_log.append(val_log)
log = log.sort_values(by=['step'])

fig = px.line(log, x='step', y='loss', color='set')
fig.show()

In [None]:
# Load a model we want to check out.

path = project_path + '/models/LeNetLike.pth'
#model.load_state_dict(torch.load(path))

In [None]:
# Plot a confusion matrix

y_pred = []
y_true = []

# iterate over validation data
model.eval()
with torch.no_grad():
        for inputs, labels in val_loader:
                logits = model(inputs) # Feed Network

                output = (torch.max(torch.exp(logits), 1))[1].cpu().numpy()
                y_pred.extend(output) # Save Prediction
                
                labels = labels.data.cpu().numpy()
                y_true.extend(labels) # Save Truth

# constant for classes
classes = ('akiec', 'bcc', 'bkl', 'df', 'mel', 'nv', 'vasc')

# Build confusion matrix
cf_matrix = confusion_matrix(y_true, y_pred)
norm_cf_matrix = np.zeros((7, 7))
for idx, row in enumerate(cf_matrix):
    row = row/row.sum()*100.0
    norm_cf_matrix[idx] = row
df_cm = pd.DataFrame(norm_cf_matrix, index = [i for i in classes], columns = [i for i in classes])
sns.heatmap(df_cm, annot=True, square=True, vmin=0.0, vmax=100.0)

In [None]:
# Save the model if we like it.

path = project_path + '/models/LeNetLike.pth'
#torch.save(model.state_dict(), path)

In [4]:
# ResNet time

class ResNet(pl.LightningModule):
    def __init__(self, lr=1e-3, freeze=True):
        super().__init__()
        self.lr = lr
        self.accuracy = Accuracy()
        self.accuracy2 = Accuracy(top_k=2)
        self.accuracy3 = Accuracy(top_k=3)

        self.resnet = torchvision.models.resnet50(pretrained=True)
        if freeze:
            for layer in self.resnet.named_children():
                layer[1].requires_grad = False
        self.resnet.fc = nn.Sequential(
            nn.Dropout(0.9),
            nn.Linear(2048, 512),
            nn.ReLU(),
            nn.Dropout(0.6),
            nn.Linear(512, 7)
        )

    def forward(self, x):
        return self.resnet(x)

    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), self.lr)
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=2)
        return {
           'optimizer': optimizer,
           'lr_scheduler': scheduler,
           'monitor': 'val_loss'
        }

    def training_step(self, train_batch, batch_idx):
        x, y_true = train_batch
        y_pred = self.resnet(x)
        loss = F.cross_entropy(y_pred, y_true)
        acc = self.accuracy(F.softmax(y_pred, dim=1), y_true.view(-1))
        self.log('train_loss', loss, prog_bar=False, on_step=False, on_epoch=True)
        self.log('train_accuracy', acc, prog_bar=True, on_step=False, on_epoch=True)
        return loss

    def validation_step(self, val_batch, batch_idx):
        x, y_true = val_batch
        y_pred = self.resnet(x)
        loss = F.cross_entropy(y_pred, y_true.view(-1))
        acc = self.accuracy(F.softmax(y_pred, dim=1), y_true.view(-1))
        acc2 = self.accuracy2(F.softmax(y_pred, dim=1), y_true.view(-1))
        acc3 = self.accuracy3(F.softmax(y_pred, dim=1), y_true.view(-1))
        self.log('val_loss', loss, prog_bar=True)
        self.log('val_accuracy', acc, prog_bar=True)
        self.log('top_2_val_accuracy', acc2, prog_bar=True)
        self.log('top_3_val_accuracy', acc3, prog_bar=True)

In [8]:
# Init model and trainer

logger = CSVLogger("logs", name="ResNet", version='1.0')
callbacks = [
    EarlyStopping('val_accuracy', mode='max', patience=5),
    LearningRateMonitor(logging_interval='step'),
    ]
trainer = pl.Trainer(gpus=1, auto_lr_find=False, max_epochs=50, logger=logger, callbacks=callbacks, log_every_n_steps=10)
model = ResNet(lr=1e-5, freeze=True)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores


In [None]:
# Optionally use lrfinder

# lr_finder = trainer.tuner.lr_find(model, train_loader, val_loader, update_attr=True)
# print(f'Suggested LR: {lr_finder.suggestion()}')
# fig = lr_finder.plot(suggest=True)
# fig.show()

In [9]:
# Train

trainer.fit(model, train_loader, val_loader)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

Experiment logs directory logs/ResNet/1.0 exists and is not empty. Previous log files in this directory will be deleted when the new ones are saved!


  | Name      | Type     | Params
---------------------------------------
0 | accuracy  | Accuracy | 0     
1 | accuracy2 | Accuracy | 0     
2 | accuracy3 | Accuracy | 0     
3 | resnet    | ResNet   | 24.6 M
---------------------------------------
24.6 M    Trainable params
0         Non-trainable params
24.6 M    Total params
98.243    Total estimated model params size (MB)


Epoch 11: 100%|██████████| 710/710 [02:35<00:00,  4.55it/s, loss=0.54, v_num=1.0, val_loss=0.870, val_accuracy=0.690, top_2_val_accuracy=0.850, top_3_val_accuracy=0.904, train_accuracy=0.781]


In [10]:
# Visualize losses/metrics

log_dir = project_path + '/notebooks/logs/ResNet/1.0/'
log = pd.read_csv(os.path.join(log_dir, 'metrics.csv'))

train_log = log[['step', 'train_loss']].dropna().rename(columns={'train_loss': 'loss'})
train_log['set'] = 'train'
val_log = log[['step', 'val_loss']].dropna().rename(columns={'val_loss': 'loss'})
val_log['set'] = 'val'

log = train_log.append(val_log)
log = log.sort_values(by=['step'])

fig = px.line(log, x='step', y='loss', color='set')
fig.show()

log = pd.read_csv(os.path.join(log_dir, 'metrics.csv'))

train_log = log[['step', 'train_accuracy']].dropna().rename(columns={'train_accuracy': 'accuracy'})
train_log['set'] = 'train'
val_log = log[['step', 'val_accuracy']].dropna().rename(columns={'val_accuracy': 'accuracy'})
val_log['set'] = 'val'
val_log2 = log[['step', 'top_2_val_accuracy']].dropna().rename(columns={'top_2_val_accuracy': 'accuracy'})
val_log2['set'] = 'top 2 val'
val_log3 = log[['step', 'top_3_val_accuracy']].dropna().rename(columns={'top_3_val_accuracy': 'accuracy'})
val_log3['set'] = 'top 3 val'

log = train_log.append(val_log).append(val_log2).append(val_log3)
log = log.sort_values(by=['step'])

fig = px.line(log, x='step', y='accuracy', color='set')
fig.show()

In [None]:
# Load a model we want to check out.

path = project_path + '/models/ResNet.pth'
#model.load_state_dict(torch.load(path))

In [None]:
# Plot a confusion matrix

y_pred = []
y_true = []

# iterate over validation data
model.eval()
with torch.no_grad():
        for inputs, labels in val_loader:
                logits = model(inputs) # Feed Network

                output = (torch.max(torch.exp(logits), 1))[1].cpu().numpy()
                y_pred.extend(output) # Save Prediction
                
                labels = labels.data.cpu().numpy()
                y_true.extend(labels) # Save Truth

# constant for classes
classes = ('akiec', 'bcc', 'bkl', 'df', 'mel', 'nv', 'vasc')

# Build confusion matrix
cf_matrix = confusion_matrix(y_true, y_pred)
norm_cf_matrix = np.zeros((7, 7))
for idx, row in enumerate(cf_matrix):
    row = row/row.sum()*100.0
    norm_cf_matrix[idx] = row
df_cm = pd.DataFrame(norm_cf_matrix, index = [i for i in classes], columns = [i for i in classes])
sns.heatmap(df_cm, annot=True, square=True, vmin=0.0, vmax=100.0)

In [None]:
# Save the model if we like it.

path = project_path + '/models/ResNet.pth'
#torch.save(model.state_dict(), path)