# Hosting models on Grid

Grid offers both Machine Learning as a Service and Encrypted Machine Learning as a Service. This is a series of notebooks showing how you can serve your models on Grid.

## 1. Train a model

First of all, we need to train a model. Here we replicate the tutorial by [Nyla Pirani](https://towardsdatascience.com/@nyla.pirani) that shows [how to train a PyTorch model for skin cancer classification](https://towardsdatascience.com/skin-cancer-classification-with-machine-learning-c9d3445b2163).

### Use case: Skin cancer prediction

Here we'll implement a model for detecting types of skin cancer on images.

### Dataset

We'll use this [Kaggle dataset](https://www.kaggle.com/kmader/skin-cancer-mnist-ham10000/). You need to download this dataset and extract it into the folder referenced by `DATASET_PATH` (see the next cell) before running this example.

In [None]:
from glob import glob
from PIL import Image
import os

import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import numpy as np
import pandas as pd
import torch
import torch.utils.data

# Folder that holds the downloaded dataset. The cells below read
# `HAM10000_metadata.csv` directly from it and look for the `*.jpg` images
# one directory level deeper.
DATASET_PATH = "./skin-cancer-mnist-ham10000"

In [None]:
# Map every image id (file name without extension) to the path of its JPEG.
# The images are expected one directory level below DATASET_PATH.
imageid_path_dict = {os.path.splitext(os.path.basename(x))[0]: x
                     for x in glob(os.path.join(DATASET_PATH, '*', '*.jpg'))}

# Diagnosis code from the metadata column `dx` -> human readable lesion name.
# 'mel' stands for melanoma; it was previously mislabelled as 'dermatofibroma',
# which is the 'df' class.
lesion_type_dict = {
    'nv': 'Melanocytic nevi',
    'mel': 'Melanoma',
    'bkl': 'Benign keratosis-like lesions ',
    'bcc': 'Basal cell carcinoma',
    'akiec': 'Actinic keratoses',
    'vasc': 'Vascular lesions',
    'df': 'Dermatofibroma'
}
tile_df = pd.read_csv(os.path.join(DATASET_PATH, 'HAM10000_metadata.csv'))
tile_df['path'] = tile_df['image_id'].map(imageid_path_dict.get)
tile_df['cell_type'] = tile_df['dx'].map(lesion_type_dict.get)
# Integer class label; the codes follow the sorted order of the lesion names.
tile_df['cell_type_idx'] = pd.Categorical(tile_df['cell_type']).codes

# Fail early with a clear message instead of crashing later inside a
# DataLoader worker when an image path is missing.
missing_images = int(tile_df['path'].isna().sum())
if missing_images:
    raise FileNotFoundError(
        "{} of {} metadata rows have no matching image below {!r}".format(
            missing_images, len(tile_df), DATASET_PATH))

# Display the label <-> name mapping.
tile_df[['cell_type_idx', 'cell_type']].sort_values('cell_type_idx').drop_duplicates()

In [None]:
# Count the rows per lesion type to see how the classes are distributed.
tile_df['cell_type'].value_counts()

In [None]:
# Draw one random row per lesion type; used below to preview every class.
samples = tile_df.groupby(by='cell_type').apply(lambda group: group.sample(n=1))

In [None]:
# Preview one example image per lesion class.
columns = 3
# Derive the number of rows from the number of samples so that every class is
# shown; a fixed 3x2 grid silently dropped the seventh class.
rows = max(1, (len(samples) + columns - 1) // columns)
fig = plt.figure(figsize=(12, 4 * rows))
sample_rows = zip(samples["path"], samples["cell_type_idx"], samples["cell_type"])
for i, (path, cell_type_idx, cell_type) in enumerate(sample_rows):
    image = mpimg.imread(path)
    fig.add_subplot(rows, columns, i + 1)
    plt.imshow(image)
    title = "{} ({})".format(cell_type_idx, cell_type)
    plt.title(title)
plt.show()

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the rows, then cut that hold-out in half to obtain a
# validation set and a true test set.
train_df, test_df = train_test_split(tile_df, test_size=0.1)
validation_df, test_df = train_test_split(test_df, test_size=0.5)
# Renumber the rows of every split from zero; the previous row labels are
# kept in a new `index` column.
train_df, validation_df, test_df = (
    split.reset_index() for split in (train_df, validation_df, test_df)
)

In [None]:
class Dataset(torch.utils.data.Dataset):
    """Image classification dataset backed by a pandas DataFrame.

    Every row of the frame describes one sample: the ``path`` column holds
    the location of the image file and the ``cell_type_idx`` column holds the
    integer class label.
    """

    def __init__(self, df, transform=None):
        """Store the sample table and the optional image transform.

        Args:
            df: DataFrame with the columns ``path`` and ``cell_type_idx``.
            transform: Optional callable applied to the loaded PIL image.
        """
        self.df = df
        self.transform = transform

    def __len__(self):
        """Return the number of samples (rows of the frame)."""
        return len(self.df)

    def __getitem__(self, index):
        """Load the sample at position ``index``.

        Returns:
            A tuple ``(X, y)`` with the (optionally transformed) image and the
            class label as a scalar tensor.
        """
        # Samplers hand out positions 0..len-1, so look rows up by position.
        # Label-based access only works when the frame has a fresh
        # RangeIndex.
        path = self.df['path'].iloc[index]
        label = self.df['cell_type_idx'].iloc[index]

        # The context manager closes the file handle right away; convert()
        # forces the pixel data to be read and guarantees three channels.
        with Image.open(path) as image:
            X = image.convert('RGB')
        y = torch.tensor(int(label))

        if self.transform is not None:
            X = self.transform(X)

        return X, y
    
# Keyword arguments that are forwarded to the DataLoader constructors.
params = dict(batch_size=4, shuffle=True, num_workers=6)

In [None]:
# Define the image transformations.
import torchvision.transforms as trf

# Per-channel statistics used to standardise the image tensors.
normalize = trf.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])

# Training pipeline: the random flips and the random crop act as data
# augmentation.
composed = trf.Compose([
    trf.RandomHorizontalFlip(),
    trf.RandomVerticalFlip(),
    trf.CenterCrop(256),
    trf.RandomCrop(224),
    trf.ToTensor(),
    normalize,
])

# Validation pipeline: same output size and normalisation, but without any
# randomness so that the validation loss is comparable between epochs.
validation_composed = trf.Compose([
    trf.CenterCrop(224),
    trf.ToTensor(),
    normalize,
])

# Define the training set using the table train_df and the training
# transformations (composed).
training_set = Dataset(train_df, transform=composed)
training_generator = torch.utils.data.DataLoader(training_set, **params)

# Same for the validation set. Shuffling is switched off so that the training
# loop, which only consumes the first few validation batches, scores the same
# images every epoch.
validation_set = Dataset(validation_df, transform=validation_composed)
validation_generator = torch.utils.data.DataLoader(
    validation_set, **{**params, 'shuffle': False})

### Model

In [None]:
import torchvision.models as models

def make_model(num_classes: int, pretrained: bool = True) -> torch.nn.Module:
    """Build a DenseNet-121 with a new classification head.

    Args:
        num_classes: Number of output classes of the new final linear layer.
        pretrained: Forwarded to ``torchvision.models.densenet121``. Pass
            ``False`` to skip loading the pretrained weights, e.g. when a
            saved state dict is loaded afterwards.

    Returns:
        The DenseNet-121 module whose ``classifier`` has been replaced by a
        freshly initialised ``torch.nn.Linear`` with ``num_classes`` outputs.

    Raises:
        ValueError: If ``num_classes`` is smaller than one.
    """
    if num_classes < 1:
        raise ValueError(
            "num_classes must be a positive integer, got {!r}".format(num_classes))
    model = models.densenet121(pretrained=pretrained)
    # Keep the feature extractor and swap only the final layer.
    num_ftrs = model.classifier.in_features
    model.classifier = torch.nn.Linear(num_ftrs, num_classes)
    return model

In [None]:
# One output per lesion type in the label dictionary, rather than a
# hard-coded class count.
model = make_model(len(lesion_type_dict))

In [None]:
# Put the network into evaluation mode. `eval()` returns the module itself,
# so as the last expression of the cell it also displays the architecture.
# The training loop below switches back with `model.train()`.
model.eval()

In [None]:
# Loss function used for both training and validation.
criterion = torch.nn.CrossEntropyLoss()
# Adam updates every parameter of the model with a small learning rate.
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-6)

In [None]:
# Device that every input batch is moved to before the forward pass.
# NOTE(review): the visible cells never call `model.to(device)`, so switching
# this to a GPU device would also require moving the model.
device = torch.device("cpu")

### Train Model

In [None]:
max_epochs = 10
# To keep the demo fast, every epoch only visits a limited number of
# mini-batches of the training and the validation data.
train_batches_per_epoch = 100
validation_batches_per_epoch = 10
trainings_error = []
validation_error = []
for epoch in range(max_epochs):
    print('epoch:', epoch)

    # ---- training ----
    trainings_error_tmp = []
    model.train()
    for count_train, (data_sample, y) in enumerate(training_generator, start=1):
        data_gpu = data_sample.to(device)
        y_gpu = y.to(device)
        # backward() adds to the stored gradients, so they have to be cleared
        # before every step; otherwise each update uses the sum of all
        # gradients seen so far.
        optimizer.zero_grad()
        output = model(data_gpu)
        err = criterion(output, y_gpu)
        err.backward()
        optimizer.step()
        trainings_error_tmp.append(err.item())
        if count_train >= train_batches_per_epoch:
            break
    # Record the epoch mean even when the loader yields fewer batches than
    # the limit, so both curves always get one point per epoch.
    if trainings_error_tmp:
        mean_trainings_error = np.mean(trainings_error_tmp)
        trainings_error.append(mean_trainings_error)
        print('trainings error:', mean_trainings_error)

    # ---- validation ----
    validation_error_tmp = []
    model.eval()
    with torch.no_grad():
        for count_val, (data_sample, y) in enumerate(validation_generator, start=1):
            data_gpu = data_sample.to(device)
            y_gpu = y.to(device)
            output = model(data_gpu)
            err = criterion(output, y_gpu)
            validation_error_tmp.append(err.item())
            if count_val >= validation_batches_per_epoch:
                break
    if validation_error_tmp:
        mean_val_error = np.mean(validation_error_tmp)
        validation_error.append(mean_val_error)
        print('validation error:', mean_val_error)

# Learning curves: one point per epoch.
plt.plot(trainings_error, label = 'training error')
plt.plot(validation_error, label = 'validation error')
plt.legend()
plt.show()

### Evaluate model

In [None]:
model.eval()

# Deterministic pre-processing for the final score: a centre crop of the
# size the network was trained on, without random flips or random crops.
test_transform = trf.Compose([
    trf.CenterCrop(224),
    trf.ToTensor(),
    trf.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

# Evaluate on the held-out test split. Previously `test_set` was built from
# validation_df and then ignored in favour of validation_set.
test_set = Dataset(test_df, transform=test_transform)
test_generator = torch.utils.data.DataLoader(
    test_set, batch_size=params['batch_size'], shuffle=False)

result_array = []
gt_array = []
# No gradients are needed for inference; this avoids building autograd graphs.
with torch.no_grad():
    for data_sample, y in test_generator:
        output = model(data_sample.to(device))
        # Predicted class = index of the largest logit of every sample.
        result_array.extend(torch.argmax(output, dim=1).tolist())
        gt_array.extend(y.tolist())

correct_results = np.array(result_array) == np.array(gt_array)
sum_correct = np.sum(correct_results)
accuracy = sum_correct / len(test_set)
print(accuracy)

In [None]:
# Persist only the learned weights (the state dict), not the whole module.
# NOTE(review): the file name says "resnet" although make_model builds a
# DenseNet-121. The name is left untouched because other notebooks of this
# series may load the file under this name; confirm before renaming.
torch.save(model.state_dict(), "resnet-skin-cancer-detection")