###Necessary imports 

In [None]:
from __future__ import print_function
#%matplotlib inline
import argparse
import os
import random
import torch
import torch.nn as nn
import torch.nn.parallel
import torch.backends.cudnn as cudnn
import torch.optim as optim
import torch.utils.data
import torchvision.datasets as dset
import torchvision.transforms as transforms
import torchvision.utils as vutils
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.animation as animation
from IPython.display import HTML
import csv

# Set random seed for reproducibility
manualSeed = 999
#manualSeed = random.randint(1, 10000) # use if you want new results
print("Random Seed: ", manualSeed)
# Seed every random number generator this notebook imports (Python's
# `random`, NumPy and PyTorch) so that a re-run reproduces the same results.
random.seed(manualSeed)
np.random.seed(manualSeed)
# Kept as the last expression so the cell still displays the torch Generator.
torch.manual_seed(manualSeed)

Random Seed:  999


<torch._C.Generator at 0x7ff001480970>

###Select appropriate device (CUDA or CPU) as per availability

In [None]:
# Use the GPU only when it is both requested and actually available;
# otherwise fall back to the CPU.
use_cuda = True
if use_cuda and torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")


###To load data, google drive is mounted.

In [None]:
# Colab-only step: mount Google Drive at /content/gdrive so that the data
# folders and the saved model can be reached through ordinary file paths.
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


###The path for the train and test data is given.
####Please change accordingly.

In [None]:
# Root folder of the training images; it is handed to ImageFolder, and the
# class labels are read from its subfolder names.
train_dir = "/content/gdrive/MyDrive/Colab Notebooks/data/train/train"
# Root folder of the test images that predictions are generated for.
test_dir = "/content/gdrive/MyDrive/Colab Notebooks/data/test"

####The class labels are taken from the names of the subfolders inside the train folder

In [None]:
import pathlib
# Class labels are the names of the class subfolders of train_dir, sorted
# alphabetically. Stray files and hidden entries (names starting with '.')
# are filtered out explicitly instead of dropping whatever happens to sort
# first, so a clean folder no longer loses a real class.
# NOTE(review): predictions are mapped back to labels through this list, so it
# must be in the same order as ImageFolder's label indices — verify against
# train_dataset.classes.
classes = sorted(
    entry.name
    for entry in pathlib.Path(train_dir).iterdir()
    if entry.is_dir() and not entry.name.startswith('.')
)
classes

['Cancer', 'Connective', 'Immune', 'Normal']

##### Spatial size of training images. All images will be resized to this size using a transform. Our 32x32 images will be resized to 128x128. This magnifies the images to give the network more detail to work with.

In [None]:
# Target size (in pixels) passed to transforms.Resize for the train and test images.
image_size = 128

###While transforming the data we want to normalize it. This is done by first calculating the mean and standard deviation of the dataset. 
####The dataset is loaded in batches of 128 images at a time.

In [None]:
#train_dataset = dset.ImageFolder(root=train_dir,
#                           transform=transforms.Compose([
#                               transforms.Resize(image_size),
#                               transforms.ToTensor(),
#                           ]))
#train_loader = torch.utils.data.DataLoader(train_dataset, batch_size = 128, shuffle=True)

In [None]:
def mean_and_std(loader):
    """Estimate per-channel mean and standard deviation over a data loader.

    Every image is flattened to (channels, pixels). Its per-channel mean and
    standard deviation (``torch.std`` with default settings) are computed,
    and these per-image statistics are averaged over all images yielded by
    ``loader``. The returned std is therefore the average of the per-image
    standard deviations, not the standard deviation of the pooled pixels.

    Args:
        loader: iterable yielding ``(images, labels)`` batches, where
            ``images`` is a tensor laid out as (batch, channels, ...).

    Returns:
        Tuple ``(mean, std)`` of 1-D tensors with one entry per channel.

    Raises:
        ZeroDivisionError: if ``loader`` yields no batches.
    """
    mean_total = 0.0
    std_total = 0.0
    n_images = 0
    for batch, _ in loader:
        batch_len, n_channels = batch.size(0), batch.size(1)
        # Collapse the trailing dimensions so statistics are taken per channel.
        per_channel = batch.view(batch_len, n_channels, -1)
        mean_total = mean_total + per_channel.mean(dim=2).sum(dim=0)
        std_total = std_total + per_channel.std(dim=2).sum(dim=0)
        n_images += batch_len

    return mean_total / n_images, std_total / n_images


###When dataset is passed to function mean_and_std(), the following is output mean and standard deviation. 
####mean = [0.7203, 0.5801, 0.8061]
####std = [0.1581, 0.1728, 0.1076]


In [None]:
#mean, std = mean_and_std(train_loader)
#print(mean)
#print(std)

In [None]:
# Per-channel normalization statistics, hard-coded from an earlier run of
# mean_and_std() so the statistics pass does not have to be repeated.
mean = torch.tensor([0.7203, 0.5801, 0.8061])
std = torch.tensor([0.1581, 0.1728, 0.1076])

###The dataset is loaded from the train folder and following transformations are performed - 
####1. transforms.Resize - Resizing the images from input size to 128x128
####2. transforms.RandomHorizontalFlip - Random flipping of the images so that the train data is more variable and better trains the model.
####3. transforms.RandomRotation(10) - Randomly rotating train images by up to 10 degrees in either direction, again to introduce more variability to the train data
####4. transforms.ToTensor - All the values are converted to a Tensor, which is necessary as all PyTorch functionality works on tensors.
####5. transforms.Normalize(mean, std) - Normalization of the train data based on the calculated mean and standard deviation

In [None]:
from torchvision.transforms.transforms import RandomHorizontalFlip, RandomRotation
# Pipeline applied to every image read from train_dir: resize, random
# augmentation (horizontal flip, rotation of up to 10 degrees), conversion to
# a tensor, and per-channel normalization with the statistics defined above.
train_transform = transforms.Compose([
    transforms.Resize(image_size),
    transforms.RandomHorizontalFlip(),
    transforms.RandomRotation(10),
    transforms.ToTensor(),
    transforms.Normalize(mean, std),
])
train_dataset = dset.ImageFolder(root=train_dir, transform=train_transform)

In [None]:
# Number of images ImageFolder found under train_dir.
len(train_dataset)

1700

###The train data in the train folder is further divided into training set and validation set. The split is approximately 75% and 25% data respectively.

In [None]:
# Randomly hold out 450 of the 1700 images for validation. The two sizes must
# add up to len(train_dataset), otherwise random_split raises an error.
# NOTE(review): both subsets wrap the same train_dataset, so the validation
# images also pass through the random flip/rotation — confirm this is intended.
train_set, val_set = torch.utils.data.random_split(train_dataset, [1250, 450])

###The convolutional model is defined below having 8 layers -
####Input shape is [batch_size, no. of channels, height, width of image], thus [128, 3, 128, 128]
####1. Conv layer - input channels = 3 (RGB for coloured images), output channels = 12, followed by batch normalization and a ReLU activation
####Output shape = [128, 12, 128, 128]
####2. Max pooling layer - kernel size 2
####Output shape = [128, 12, 64, 64]
####3. Conv layer - input channels = 12, output channels = 20, followed by a ReLU activation
####Output shape = [128, 20, 64, 64]
####4. Max pooling layer - kernel size 2
####Output shape = [128, 20, 32, 32]
####5. Conv layer - input channels = 20, output channels = 32, followed by a ReLU activation
####Output shape = [128, 32, 32, 32]
####6. Conv layer - input channels = 32, output channels = 64, followed by batch normalization and a ReLU activation
####Output shape = [128, 64, 32, 32]
####7. Fully connected layer - in features = 64 * 32 * 32, out features = 1028
####8. Fully connected layer - in features = 1028, out features = 4 (no. of class labels)

In [None]:
import collections
# CNN classifier: four 3x3 convolutions (padding=1 keeps the spatial size) with
# two 2x2 max-pools, followed by two fully connected layers producing 4 logits.
# fc1 expects 64*32*32 inputs, i.e. it assumes 128x128 input images, which the
# two max-pools halve twice to 32x32.
#
# Every key in the OrderedDict must be unique. The activation after fc1 used
# to be registered as a second 'relu1'; a repeated key only overwrites the
# earlier entry in place, so no ReLU ended up between fc1 and fc2 and the two
# linear layers collapsed into a single linear map. It now has its own name.
# ReLU has no parameters, so the state_dict keys of saved checkpoints are
# unaffected by the rename.
model = nn.Sequential(collections.OrderedDict([
          ('conv1', nn.Conv2d(3,12,3,padding=1)),
          ('bn1',   nn.BatchNorm2d(num_features=12)),
          ('relu1', nn.ReLU()),
          ('mp1',   nn.MaxPool2d(2)),

          ('conv2', nn.Conv2d(12,20,3,padding=1)),
          ('relu2', nn.ReLU()),
          ('mp2',   nn.MaxPool2d(2)),

          ('conv3', nn.Conv2d(20,32,3,padding=1)),
          ('relu3', nn.ReLU()),

          ('conv4', nn.Conv2d(32,64,3,padding=1)),
          ('bn3',   nn.BatchNorm2d(num_features=64)),
          ('relu4', nn.ReLU()),

          # Classifier head: flatten the feature maps, then two linear layers
          # with a non-linearity in between.
          ('flatten', nn.Flatten()),
          ('fc1', nn.Linear(64*32*32,1028)),
          ('relu_fc1', nn.ReLU()),
          ('fc2', nn.Linear(1028,4)),
        ]))

###Functions training_loop and validation_loop are defined below for training and validation of the ConvNet model defined above.
####The model is set to train mode in the training loop and eval mode in the validation loop.
####For every epoch we load the train/validation data in batches.
####While training, the outputs of the model are calculated and passed to a loss function.
####The optimizer's gradients are reset to zero for every batch, and the optimizer then takes a step based on the learning rate and the gradient of the loss.
####The predictions are compared with the actual labels to count the correctly predicted labels, which gives the training accuracy of the model in the current epoch.
####In every epoch the model in its current state is also used to make predictions on the validation set using the validation loop function (logic similar to the training loop), and the accuracy on the validation set is used to keep track of the best accuracy.
####The model state with the best validation set accuracy is saved as the model with the best settings.

In [None]:
import datetime
# training_loop prints its progress report for epoch 1 and for every epoch
# whose number is a multiple of this value.
epoch_print_gap = 1

def training_loop(n_epochs, optimizer, model, device, loss_fn, train_loader, validation_loader,
                  save_path="/content/gdrive/MyDrive/Colab Notebooks/best_model.pt"):
    """Train ``model`` and checkpoint the weights with the best validation accuracy.

    For every epoch the model is optimised over all batches of
    ``train_loader`` and then evaluated with ``validation_loop`` on
    ``validation_loader``. Whenever the validation accuracy exceeds the best
    value seen so far, ``model.state_dict()`` is written to ``save_path``.

    Args:
        n_epochs: number of passes over ``train_loader``.
        optimizer: optimizer already bound to ``model``'s parameters.
        model: the network to train; it is moved to ``device``.
        device: torch device on which the batches and the model are placed.
        loss_fn: callable ``loss_fn(outputs, labels)`` returning a scalar loss.
        train_loader: loader yielding ``(images, labels)`` training batches.
        validation_loader: loader handed to ``validation_loop`` after each epoch.
        save_path: file the best ``state_dict`` is saved to. Defaults to the
            Google Drive location this notebook has always used.

    Returns:
        None. Progress is printed and the best weights are saved to disk.
    """
    model = model.to(device)
    best_acc = 0
    for epoch in range(1, n_epochs + 1):
        # validation_loop() switches the model to eval mode at the end of
        # every epoch, so training mode has to be re-enabled here. Setting it
        # only once before the loop left every epoch after the first running
        # in eval mode (BatchNorm using its running statistics).
        model.train()
        print('Epoch: ', epoch)
        loss_train = 0.0
        correct = 0
        for imgs, labels in train_loader:
            imgs = imgs.to(device)
            labels = labels.to(device)

            outputs = model(imgs)
            loss = loss_fn(outputs, labels)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            loss_train += loss.item()

            # Predicted class = index of the largest output per sample.
            pred = outputs.argmax(dim=1, keepdim=True)
            correct += pred.eq(labels.view_as(pred)).sum().item()

        # NOTE(review): this divides the sum of per-batch losses by the number
        # of samples. With a mean-reducing loss_fn that understates the
        # per-sample loss; kept as is so reported numbers stay comparable.
        loss_train /= len(train_loader.dataset)
        curr_acc = 100. * correct / len(train_loader.dataset)

        val_acc = validation_loop(model, device, validation_loader)

        # Keep only the weights that generalise best to the validation set.
        if val_acc > best_acc:
            torch.save(model.state_dict(), save_path)
            print('model saved')
            best_acc = val_acc

        if epoch == 1 or epoch % epoch_print_gap == 0:
            print('{} Epoch {}, Training loss {}'.format(
                datetime.datetime.now(), epoch, float(loss_train)))
            print('\nTrain set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
                loss_train, correct, len(train_loader.dataset),  curr_acc))

            print('Validation set: Accuracy: ({:.0f}%)\n'.format(val_acc))

def validation_loop(model, device, validation_loader):
  """Return the accuracy (in percent) of ``model`` on ``validation_loader``.

  The model is put into eval mode and moved to ``device``; it is left in eval
  mode when the function returns. Gradients are not tracked.

  Args:
    model: network whose outputs are per-class scores of shape (batch, classes).
    device: torch device on which the batches and the model are placed.
    validation_loader: loader yielding ``(images, labels)`` batches; its
      ``dataset`` length is used as the denominator.

  Returns:
    Percentage of samples whose highest-scoring class equals the label.
  """
  model.eval()
  model = model.to(device)

  n_hits = 0
  with torch.no_grad():
    for batch, targets in validation_loader:
      scores = model(batch.to(device))
      targets = targets.to(device)

      # Highest-scoring class for each sample in the batch.
      predicted = torch.argmax(scores, dim=1)
      n_hits += (predicted == targets.view_as(predicted)).sum().item()

  return 100. * n_hits / len(validation_loader.dataset)

In [None]:
# Step size used by the optimizer.
lr = 0.005

# Both loaders serve shuffled mini-batches of 128 images, one over the
# training subset and one over the validation subset.
loader_options = dict(batch_size=128, shuffle=True)
train_loader = torch.utils.data.DataLoader(train_set, **loader_options)
validation_loader = torch.utils.data.DataLoader(val_set, **loader_options)

# Stochastic Gradient Descent over all parameters of the model.
optimizer = optim.SGD(model.parameters(), lr=lr)

###To handle the imbalance in the classes in training data the weights parameter of the loss function is used. 
####Classes Cancer, Connective and Immune have 500 images each while Normal class has only 200 images.
####Thus weights are calculated as follows
###total number of images / number of images in the class -
###For Cancer,  Connective and Immune 1700/500 = 3.4
###For Normal 1700/200 = 8.5

In [None]:
# Per-class loss weights (total images / images in the class), created
# directly on the training device.
# NOTE(review): presumably ordered as Cancer, Connective, Immune, Normal —
# verify against train_dataset.class_to_idx.
weights = torch.tensor([3.4, 3.4, 3.4, 8.5], device=device)

# Weighted cross-entropy, so that errors on the rarer class count for more.
loss_fn = nn.CrossEntropyLoss(weight=weights)

###With 20 epochs the ConVnet is trained

In [None]:
# Number of passes over the training loader.
n_epochs = 20 


# train the CNN
# After every epoch training_loop also evaluates on validation_loader and
# saves the model's state_dict whenever the validation accuracy improves.
training_loop(
    n_epochs = n_epochs, 
    optimizer = optimizer,
    model = model, 
    device = device,
    loss_fn = loss_fn,
    train_loader = train_loader,
    validation_loader = validation_loader,
)

Epoch:  1
model saved
2022-03-25 14:27:05.641366 Epoch 1, Training loss 0.011520847606658936

Train set: Average loss: 0.0115, Accuracy: 538/1250 (43%)

\Validation set: Accuracy:(80%)

Epoch:  2
2022-03-25 14:27:35.431325 Epoch 2, Training loss 0.009432905387878419

Train set: Average loss: 0.0094, Accuracy: 979/1250 (78%)

\Validation set: Accuracy:(80%)

Epoch:  3
model saved
2022-03-25 14:28:09.447467 Epoch 3, Training loss 0.008464416694641113

Train set: Average loss: 0.0085, Accuracy: 981/1250 (78%)

\Validation set: Accuracy:(84%)

Epoch:  4
model saved
2022-03-25 14:28:40.485148 Epoch 4, Training loss 0.007263839340209961

Train set: Average loss: 0.0073, Accuracy: 1019/1250 (82%)

\Validation set: Accuracy:(86%)

Epoch:  5
model saved
2022-03-25 14:29:09.699663 Epoch 5, Training loss 0.005929233837127686

Train set: Average loss: 0.0059, Accuracy: 1049/1250 (84%)

\Validation set: Accuracy:(86%)

Epoch:  6
2022-03-25 14:29:36.900996 Epoch 6, Training loss 0.005051283740997315

###The model with the best settings, which was saved, is loaded for making predictions on the test data and submitting predictions to Kaggle.

In [None]:
model_save_name = 'best_model.pt'
path = F"/content/gdrive/MyDrive/Colab Notebooks/{model_save_name}"
# map_location remaps the stored tensors onto the current device, so a
# checkpoint written during a GPU session can also be loaded on a CPU-only
# runtime instead of failing while deserializing CUDA tensors.
model.load_state_dict(torch.load(path, map_location=device))

<All keys matched successfully>

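### We set up a loader for the test data, which yields one image per batch (the DataLoader default) - 

#### Transformations are similar to the train data, except that transforms.RandomHorizontalFlip and transforms.RandomRotation are left out and transforms.CenterCrop is added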



In [None]:
# Deterministic pipeline for the test images: resize, centre-crop to a
# square of image_size, convert to a tensor and normalize with the same
# statistics as the training data. No random augmentation is applied.
test_transform = transforms.Compose([
    transforms.Resize(image_size),
    transforms.CenterCrop(image_size),
    transforms.ToTensor(),
    transforms.Normalize(mean, std),
])
test_dataset = dset.ImageFolder(root=test_dir, transform=test_transform)

# One image per batch, in dataset order (the DataLoader defaults, spelled
# out): the prediction cells pair pred[i] with test_dataset.imgs[i].
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=1, shuffle=False)

In [None]:
# Display the dataset summary (number of datapoints, root and transforms).
test_dataset

Dataset ImageFolder
    Number of datapoints: 3200
    Root location: /content/gdrive/MyDrive/Colab Notebooks/data/test
    StandardTransform
Transform: Compose(
               Resize(size=128, interpolation=bilinear, max_size=None, antialias=None)
               CenterCrop(size=(128, 128))
               ToTensor()
               Normalize(mean=tensor([0.7203, 0.5801, 0.8061]), std=tensor([0.1581, 0.1728, 0.1076]))
           )

###To make the predictions on the test data we utilize our model with best settings and store the predictions as a csv file.  

In [None]:
# One entry per batch of test_loader: a tensor of shape (batch, 1) holding
# the index of the highest-scoring class.
pred =[]
# The inputs are sent to `device`, so make sure the model is there as well;
# this matters when the checkpoint is loaded in a fresh session in which
# training_loop() (which moves the model) was never run.
model.to(device)
model.eval()
# Inference only: do not build autograd graphs for the forward passes.
with torch.no_grad():
  for data, target in test_loader:
    output = model(data.to(device))
    pred.append(output.argmax(dim=1, keepdim=True))

The predictions are changed to their respective class label strings and stored in a dictionary

In [None]:
# Map each test image's file name (last component of its path) to the class
# label string of its prediction. This relies on pred[i] belonging to
# test_dataset.imgs[i], i.e. on the loader yielding one image at a time in
# dataset order.
preds = {
  img_path.split('/')[-1]: classes[class_idx]
  for (img_path, _), class_idx in zip(test_dataset.imgs, pred)
}
  


The dictionary is converted to a csv file.

In [None]:
# Write the submission file: a header row followed by one "Id,Type" row per
# test image. The csv module takes care of quoting should a file name ever
# contain a comma or a quote; newline='' together with lineterminator='\n'
# keeps the plain '\n' line endings the file has always had.
with open('/content/gdrive/My Drive/Colab Notebooks/predictions.csv', 'w', newline='') as f:
    writer = csv.writer(f, lineterminator='\n')
    writer.writerow(('Id', 'Type'))
    writer.writerows(preds.items())