# Get the data (Recap)

First we need access to data.
- You can use this link to add the data to your drive: https://drive.google.com/drive/folders/1pHNxZVrlcKh5usWoNC_V7gR2WdeDutjv
- If you have not done this yet, right click on the **CS4MS_Data** folder and click on the **Add shortcut to Drive** option.
- Inside the folder **CS4MS_Data** you will see the folder **HAM10000** - this is the dataset (set of images) we will be working with.

Now you can run the next cell

In [None]:
# Imports
!pip install --upgrade -q gspread
from google.colab import auth
auth.authenticate_user()
import gspread
from oauth2client.client import GoogleCredentials

import matplotlib.pyplot as plt
import torchvision.transforms as transforms
import torchvision
import numpy as np
import random

import datetime
import pytz

# Time zone used by write_result() when it timestamps submissions.
tz = pytz.timezone('Europe/Berlin')

# gspread client authorized with the application-default credentials
# (auth.authenticate_user() is called earlier in this cell).
gc = gspread.authorize(GoogleCredentials.get_application_default())



In [None]:
# Mount Google Drive at /content/drive; data_dir points into this mount.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Root folder of the HAM10000 images inside the mounted Google Drive.
data_dir = "/content/drive/My Drive/CS4MS_Data/HAM10000"

# Lesion names; a numeric label is used as an index into this list.
classes = [
    'actinic keratoses',
    'basal cell carcinoma',
    'benign keratosis-like lesions',
    'dermatofibroma',
    'melanoma',
    'melanocytic nevi',
    'vascular lesions',
]

In [None]:
### Change your Name start ###
# Name written into every row submitted through write_result().
student_name = "Tobias"
### Change your Name end ###

### Check if name changed ###
# Stops the cell with an AssertionError while the placeholder "yourName" is still set.
assert student_name != "yourName"

In [None]:
#@title Result Form
gsheet = gc.open_by_url("https://docs.google.com/spreadsheets/d/1i1lLpburGOuTb5wGx88cuQi0ItA7JOjoNIRbBhEyaH0/edit#gid=1494605435")

def write_result(task_number, result=None):
  """Append a submission row for a task to the shared results spreadsheet.

  The row holds the student name, the current time and the current date, plus
  the result when one is given. A confirmation message is printed afterwards.

  Args:
    task_number: Number of the task; selects the worksheet ``task<task_number>``.
    result: Optional result stored in a fourth column. Only ``None`` means
      "no result", so falsy values such as ``0`` are still recorded.
  """
  worksheet = gsheet.worksheet(f"task{task_number}")
  # Use one timezone-aware timestamp for both fields: a date taken from the
  # machine's own clock can differ from the time in `tz` around midnight.
  now = datetime.datetime.now(tz)
  current_time = now.strftime("%X")
  current_date = str(now.date())
  row = [student_name, current_time, current_date]
  if result is None:
    worksheet.append_row(row)
    print(f"Task {task_number} successfully solved by {student_name} at {current_time}")
  else:
    worksheet.append_row(row + [result])
    print(f"Task {task_number} successfully solved by {student_name} at {current_time} with result: {result}")

print("Reporting enabled - write_result(number_of_task, result='your result') ")


In [None]:
# quick example for object oriented programming: working with paths (folders and files)
from pathlib import Path
# Path object for the dataset folder (despite the name it is a path, not a boolean).
gdrive_connection_success = Path(data_dir) 

# If the dataset folder is reachable, report task 0 as solved; otherwise print a hint.
if gdrive_connection_success.is_dir():
    write_result(0)
else:
    print("your folder is not mounted correclty - contact the tutors!")

# Data Augmentation

Medical data is often scarce, but a network needs a lot of data to learn a good model. To tackle this problem we perform data augmentation.

Data augmentation is a strategy that enables practitioners to significantly increase the diversity of data available for training models, without actually collecting new data. 

Data augmentation techniques such as cropping, padding, and horizontal flipping are commonly used to train large neural networks.

![Data Augmentation](https://cdn-images-1.medium.com/max/1000/1*C8hNiOqur4OJyEZmC7OnzQ.png)
[Source](https://cdn-images-1.medium.com/max/1000/1*C8hNiOqur4OJyEZmC7OnzQ.png) 



In [None]:
# Downloading cat img
!wget https://raw.githubusercontent.com/IFL-CAMP/AI4MDs_21/main/images/cat.jpg
from PIL import Image
# Opening Cat img (`cat` is reused by later cells as input for the augmentation examples)
cat = Image.open("cat.jpg")

#Visualizing cat img
# hide the axis ticks and frame so that only the picture is shown
plt.axis('off')
plt.imshow(cat)

In [None]:
def imshow(img):
    """Show an image in a new 30x30 inch figure with the axes hidden.

    Args:
        img: Object whose ``numpy()`` method returns a 3-D array. Its first
            axis is moved to the end for plotting (assumed to be a
            channels-first image tensor such as the output of ``make_grid``).
    """
    pixels = img.numpy()
    _, axes = plt.subplots(figsize=(30, 30))
    axes.axis('off')
    # move axis 0 (assumed: channels) to the end before plotting
    axes.imshow(pixels.transpose((1, 2, 0)))



In [None]:
# Demo pipeline: the transforms are applied in the listed order; the random ones
# draw new parameters on every call, so each call yields a different image.
augmentation = transforms.Compose([
                                  # resize image to the network input size
                                  transforms.Resize((224,224)),
                                  # mirror the image left-right at random
                                  transforms.RandomHorizontalFlip(),
                                  # rotate by a random angle between -60 and +60 degrees
                                  transforms.RandomRotation(degrees=60),
                                  # cut a random 180x180 patch out of the resized image
                                  transforms.RandomCrop(180),
                                  # convert the PIL image into a tensor
                                  transforms.ToTensor(),
                                   ])
# Complete list: https://pytorch.org/docs/stable/torchvision/transforms.html
# Examples: 
# torchvision.transforms.RandomErasing()
# torchvision.transforms.RandomAffine(degrees, translate=None, scale=None, shear=None, resample=False, fillcolor=0) --> transforms.RandomAffine(degrees=20, shear=[0,50]),
# NOTE(review): newer torchvision releases name these parameters `interpolation` and `fill`
# instead of `resample` and `fillcolor` - check the installed version.
# torchvision.transforms.ColorJitter(brightness=0, contrast=0, saturation=0, hue=0) --> transforms.ColorJitter(brightness=0.5, contrast=0.5, saturation=0.5, hue=0.5),
# transforms.GaussianBlur(kernel_size, sigma=(0.1, 2.0))
# transforms.RandomCrop(180),

In [None]:
# Run the random pipeline 16 times on the same picture to see how much it varies.
images = [augmentation(cat) for _ in range(16)]

# show images
imshow(torchvision.utils.make_grid(images))

In [None]:
# Try out your own transformations 
task1_done=False
own_aug =  transforms.Compose([
                              transforms.Resize((224,224)),
                              # `fill` is the current name of the former `fillcolor` keyword, which
                              # recent torchvision releases no longer accept; it sets the colour
                              # (here red) of the area outside the transformed image
                              torchvision.transforms.RandomAffine(degrees=20, shear=20, fill=(255,0,0)),
                              transforms.ToTensor(),
                              # applied after ToTensor, i.e. on the tensor image
                              torchvision.transforms.ColorJitter(hue=0.4)
                              ])

# apply the random pipeline 16 times to the same picture
images = [own_aug(cat) for _ in range(16)]
# show images
imshow(torchvision.utils.make_grid(images))


# the task counts as solved once the pipeline holds more than two transformations
if len(own_aug.transforms) <= 2:
  print("You have to apply more than 2 transformations")
else:
  task1_done = True
  print("task1 done!")


# Done with Task1

Once you are happy with your augmentation submit your results



In [None]:
# Short description of what the chosen transformations do to the image.
effect = "rotation, shear, colorjitter"


# Submit only when task 1 succeeded and the placeholder text was replaced.
if not task1_done or effect == "write what the effect of your transformations is - if you want you can give it a score between 0-10":
  print("you didnt solve task1 yet.")
else:
  # append the printed form of the pipeline so the submission records what was used
  effect = effect + "\n" + str(own_aug)
  write_result(1, effect)

## Normalization
Data normalization is an important step which ensures that each input parameter (pixel, in this case) has a similar data distribution. This makes convergence faster while training the network. 

Data normalization is done by subtracting the dataset mean from each image and then dividing the result by the dataset standard deviation. The distribution of such data would resemble a Gaussian curve centered at zero. 

Since, skin lesion images are natural images, we use the normalization values (mean and standard deviation) from [Imagenet dataset.](http://www.image-net.org/)
*norm_mean = (0.4914, 0.4822, 0.4465)*

*norm_std = (0.2023, 0.1994, 0.2010)*

This denotes mean and standard deviation for each channel(RGB) of an image.


We perform following data augmentation:
- Resize the image.
- Flipping the image horizontally.
- Randomly rotating image.
- Normalizing the image.

In [None]:

# Imagenet values
# NOTE(review): these numbers look like the commonly quoted CIFAR-10 statistics; the
# ImageNet statistics given in the torchvision docs are mean (0.485, 0.456, 0.406) and
# std (0.229, 0.224, 0.225) - confirm which values are intended.
norm_mean = (0.4914, 0.4822, 0.4465)
norm_std = (0.2023, 0.1994, 0.2010)

# define the transformations the images go through each time they are used for training
# includes augmentation AND normalization as described above
augmentation_train = transforms.Compose([
                                  # resize image to the network input size
                                  transforms.Resize((224,224)),
                                  # randomly perform a horizontal flip of the image
                                  transforms.RandomHorizontalFlip(),
                                  # rotate the image by an angle chosen randomly between -60 and +60 degrees
                                  transforms.RandomRotation(degrees=60),
                                  # convert the image into a tensor so it can be processed by the GPU
                                  transforms.ToTensor(),
                                  # normalize each channel: subtract norm_mean, then divide by norm_std
                                  transforms.Normalize(norm_mean, norm_std),
                                   ])

In [None]:
# Run the training pipeline (augmentation + normalization) 16 times on the same picture.
images = [augmentation_train(cat) for _ in range(16)]
# show images
imshow(torchvision.utils.make_grid(images))


# Loading the data

Use the **torchvision.datasets.ImageFolder** dataset class. This class requires the dataset to be arranged into folders of their respective class or labels. We already provide the dataset in suitable preprocessed format.

Here we also apply the augmentation that we defined above.

You can check here : https://pytorch.org/docs/stable/torchvision/datasets.html#imagefolder

In [None]:
import torchvision

# create an instance of the image folder class to load images by classes defined with the folders given
# (augmentation_train is passed as the transform that is applied to the loaded images)
dataset = torchvision.datasets.ImageFolder(root= data_dir, transform= augmentation_train)

Let's try to use the `__getitem__` method of the ImageFolder class.

In [None]:
# Check the dimension of the image at index 1000 (counting from 0, i.e. the 1001st sample) and its corresponding label

# Indexing the dataset calls its __getitem__ method, which yields an (image, label) pair here.
image, label = dataset[1000]
print("Image Shape: {} \n Label: {} \n Lesion Type: {}".format(image.shape, label, classes[label]))

## Task2

Sample 16 different random images from the dataset.
Hint: check out `random.sample(range(0,10), 5)`.

Save each image in the `images` list and print its index and label before adding it. Also add the label to `label_list`.
Hint: do it in a for loop.

Finally, look at what you just created.

In [None]:
# Draw 16 distinct indices from the dataset (random.sample picks without repetition).
randomlist = random.sample(range(0, len(dataset)), 16)
print(randomlist)
label_list = [] #This has to be filled in the for loop
images = [] #This has to be filled in the for loop
for i in randomlist:
    # indexing the dataset returns the image together with its numeric label
    temp_im, label = dataset[i]
    print(f"adding index: {i} with label {label}")
    images.append(temp_im)
    label_list.append(label)

# show images
imshow(torchvision.utils.make_grid(images))

## Submit Task2

In [None]:
# Placeholder text - replace it with your observation. It has to be identical to the
# string the submission cell compares against, otherwise an unedited placeholder
# would be accepted as an answer.
result2 = "Write here what you noticed when you looked at the visualized images and labels"

Once you have noted what you observed in `result2`, simply run the next cell.

In [None]:
# Submit only if 16 labels and 16 images were collected and the observation text
# differs from the expected placeholder sentence.
if len(label_list) != 16 or len(images) != 16 or result2 == "Write here what you noticed when you looked at the visualized images and labels":
  print("something didnt go as expected, check if you solved the Task2!")
else:
  # attach the sampled labels and indices to the observation text
  result2 = result2 + "\n" + "labels: " + str(label_list) + "\n" + "indices" + str(randomlist)
  write_result(2, result2)


**Note**: An important aspect is that we only augment the images used for training. So for testing we don't use the geometric augmentations.

In [None]:
# no augmentation for the test data only resizing, conversion to tensor and normalization
# (same size and same norm_mean / norm_std as the training pipeline, but no random transforms)
augmentation_test = transforms.Compose([
                    transforms.Resize((224,224)),
                    transforms.ToTensor(),
                    transforms.Normalize(norm_mean, norm_std),
                    ])


# Train, Test and Validation Split
It is a best practice to split the entire dataset into 3 parts:
- Train: Used to train a network.
- Validation: Used to fine-tune the network.
- Test: Kept as unseen data to gauge the performance of our trained network.


The splitting should be done class wise so that we have equal representation of all classes in each subset of the data.

![](https://github.com/IFL-CAMP/ML4MS_2020/blob/master/images/class_dist.png?raw=true)

In [None]:
import torch
from sklearn.model_selection import train_test_split

# same class names as defined at the top of the notebook
classes = [ 'actinic keratoses', 'basal cell carcinoma', 'benign keratosis-like lesions', 
           'dermatofibroma','melanoma', 'melanocytic nevi', 'vascular lesions']

# get the total amount of images in the dataset
# (despite its name, num_train counts the whole dataset, not only the training part)
num_train = len(dataset)

# create a list of indices for the whole dataset
indices = list(range(num_train))

# get the class labels from the dataset object (0-6)
class_labels = dataset.targets

# fraction that is split off in each of the two splits below (20%)
split_size = 0.2

# first split: call a function of sklearn that divides the whole dataset into
# training+validation (80%) and testing (20%); stratify keeps the class proportions
train_indices, test_indices, class_labels_train, class_labels_test = train_test_split(indices,
                                                                                       class_labels,
                                                                                       test_size=split_size,
                                                                                       shuffle=True,
                                                                                       stratify= class_labels,
                                                                                       random_state=42)

# second split: divide the remaining training+validation part into training (80% of it)
# and validation (20% of it), i.e. roughly 64% / 16% / 20% train / val / test overall
train_indices, val_indices = train_test_split(train_indices,
                                               test_size=split_size,
                                               shuffle=True,
                                               stratify= class_labels_train,
                                               random_state=42)

# Creating data samplers and loaders using the indices:
SubsetRandomSampler = torch.utils.data.sampler.SubsetRandomSampler

# create instances of a torch class for picking random samples from our dataset
# (each sampler is restricted to the indices of its own subset)
train_samples = SubsetRandomSampler(train_indices)
val_samples = SubsetRandomSampler(val_indices)
test_samples = SubsetRandomSampler(test_indices)

# Dataloader

We will now use the dataloader to load the entire dataset in small batches.

**Epochs vs Iteration vs Batch size**

One **Epoch** is when an ENTIRE dataset is passed forward and backward through the neural network only ONCE.
Now, we have more than 10000 images in our dataset. It is not possible to feed the entire dataset at once to the computer. So, we divide the data into several smaller batches.

**Batch Size** is the number of training examples present in a single step.

**Iterations** are the number of batches needed to complete one epoch.

An Example:

If we have 10000 training images in our dataset and divide them into **batches of 500**, it takes **20 iterations** to complete **1 epoch**.


That's where a pytorch dataloader is useful: https://pytorch.org/docs/stable/data.html

In [None]:
# define the batch size for training, val and testing
batch_size, validation_batch_size, test_batch_size = 16, 16, 16

# create an instance of a dataloader for training
# (shuffle stays False because a sampler is passed; the sampler decides which indices are drawn)
train_data_loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=False,num_workers=1, sampler= train_samples)

# overwrite the dataset instance with the test augmentation (this is not nice code)
# NOTE: from here on `dataset` refers to the un-augmented version; train_data_loader
# above still holds the earlier instance that was passed to it.
dataset = torchvision.datasets.ImageFolder(root= data_dir, transform=augmentation_test)
# create instances of dataloaders for validation and testing
validation_data_loader = torch.utils.data.DataLoader(dataset, batch_size=validation_batch_size, shuffle=False, sampler=val_samples)
test_data_loader = torch.utils.data.DataLoader(dataset, batch_size=test_batch_size, shuffle=False, sampler=test_samples)

In [None]:
# number of training samples and number of batches the training loader yields per epoch
print(f"length train_indices train: {len(train_indices)}, number batches {len(train_data_loader)}")

In [None]:
# number of validation samples and number of batches the validation loader yields
print(f"length val_indices: {len(val_indices)}, number batches {len(validation_data_loader)}")

In [None]:
# number of test samples and number of batches the test loader yields
print(f"length test_indices: {len(test_indices)}, number batches {len(test_data_loader)}")

## Task 3 

The batch size determines how many samples are used within one step. We noticed that the total number of batches for one pass over the training data is pretty large (401). Our computer is able to process more than 16 images (the current batch size) at a time, so we decide to reduce the total number of batches to a maximum of 50.

Task: Reduce the total number of batches for the training from 401 to 50. 

In [None]:
# reduce number of batches from 401 to 50 
# hint: you can use the "round" command

batch_size = 129
# `dataset` was re-created with the test transform when the validation and test loaders
# were built, so a dedicated training dataset is created here; otherwise the training
# loader would silently lose the training augmentation
train_dataset = torchvision.datasets.ImageFolder(root=data_dir, transform=augmentation_train)
# create an instance of a dataloader for training
train_data_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=False, num_workers=1, sampler=train_samples)


print(f"length train_indices train: {len(train_indices)}, number batches {len(train_data_loader)} , batch_size: {batch_size}")

Come up with a formula to calculate the batch size given a fixed number of total batches (`total_batches`) and the number of data samples (`number_data`).

In [None]:
# hint: total_batches = number_of_data / batchsize 
import math


def calc_batchsize(number_data, total_batches):
  """Return the batch size for splitting the data into `total_batches` batches.

  The quotient is rounded up, so a last, partially filled batch is accounted
  for and the data never needs more than `total_batches` batches.

  Args:
    number_data: Number of samples to distribute.
    total_batches: Desired number of batches.

  Returns:
    The quotient ``number_data / total_batches`` rounded up to an integer.
  """
  return math.ceil(number_data / total_batches)


# example values: size of the training subset and the targeted number of batches
number_data = 6409
total_batches = 50

batch_size = calc_batchsize(number_data, total_batches)
print(f"length train_indices train: {number_data}, number batches {total_batches} , batch_size: {batch_size}")




Once your calc_batchsize function works you can submit your results - just execute the next cell

In [None]:
import inspect

# Spot-check the formula with known values; on success the source code of the
# function is submitted as the result.
if calc_batchsize(3209, 50) != 65:
  print("something didnt go as expected, check if you solved the Task3!")
else:
  write_result(3, inspect.getsource(calc_batchsize))


Let us display the loaded batched images:

In [None]:


# functions to show an image
# a large figure is opened first; the plt.* calls in imshow() draw on the current figure
fig = plt.figure(figsize=(30, 30))

def denorm(img):
    """Undo the per-channel normalization of an image.

    Each of the first three channels is multiplied by its entry of
    ``norm_std`` and then shifted by its entry of ``norm_mean``.

    Note:
        The channels are written back into ``img``, so the argument itself is
        modified; the same object is returned.
    """
    for channel in range(3):
        scale = np.asarray(norm_std[channel])
        shift = np.asarray(norm_mean[channel])
        img[channel,:,:] = (img[channel,:,:] * scale) + shift
    return img

def imshow(img):
    """Denormalize an image and draw it on the current figure with the axes hidden.

    Args:
        img: Normalized image passed through ``denorm`` first; its ``numpy()``
            result must be a 3-D array whose first axis is moved to the end
            for plotting (assumed to be a channels-first tensor).
    """
    restored = denorm(img).numpy()
    plt.axis('off')
    # move axis 0 (assumed: channels) to the end before plotting
    plt.imshow(restored.transpose((1, 2, 0)))


# get first batch of validation images
# (the built-in next() is used because the iterator's own .next() method is not
# available in recent PyTorch releases)
dataiter = iter(validation_data_loader)
images, labels = next(dataiter)

# show images
imshow(torchvision.utils.make_grid(images))
# print labels (class name of every image in the batch)
print(' '.join('%5s, ' % classes[label] for label in labels))


Now that we have our dataset loaded, let's try to do something cool with it.

For now, we will use a pre-trained network to do inference on the test set of our data. Let's see how it performs without training.

In [None]:
# use the GPU when one is available, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)
# load a pretrained model
from torch import nn
import torchvision

num_classes = len(classes)
# NOTE(review): newer torchvision releases prefer the `weights=` argument over
# `pretrained=True` - check the installed version.
net = torchvision.models.resnet18(pretrained = True)

# We replace last layer of resnet to match our number of classes which is 7
# more details next lecture
# (the new layer takes 512 input features and outputs one value per class)
net.fc = nn.Linear(512, num_classes)
# move the network to the selected device
net = net.to(device)

In [None]:
# counter for correct predictions
correct = 0
# counter for all predicted samples
total = 0

# set network to evaluation mode (next lecture)
net.eval()

# this is for next lecture..
with torch.no_grad():
  # take the batch from the test loader, since the accuracy is reported for the
  # test images; only one batch is evaluated here
  dataiter = iter(test_data_loader)
  # built-in next(): the iterator's own .next() method is not available in recent PyTorch
  images, labels = next(dataiter)
  images, labels = images.to(device), labels.to(device)
  outputs = net(images)
  # the index of the largest output per sample is the predicted class
  _, predicted = torch.max(outputs, 1)
  total += labels.size(0)
  correct += (predicted == labels).sum().item()

print("finished ...")

In [None]:
# compare the first (up to) ten true labels with the predicted ones
print(f"labels: \t{labels[:10]}")
print(f"predicted: \t{predicted[:10]}")

# a percent sign needs no escaping inside an f-string ("%%" would print two signs)
print(f'Accuracy of the network on the test images: {(100 * correct / total)} %')

# Next time

Now that we have the dataloaders and augmentations we can finally train our network so that it can actually learn to identify our melanomas.