# Boilerplate notebook

In [None]:
# Matplotlib
import matplotlib.pyplot as plt
# Numpy
import numpy as np
# Pillow
from PIL import Image
# Torch
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torchvision import transforms
from torchsummary import summary
# Misc
import time
from datetime import datetime

# 1. Download dataset

In [None]:
!git clone -b data https://github.com/Oxiang/50.039-Deep-Learning.git

In [None]:
!sudo apt-get install tree

In [None]:
cd 50.039-Deep-Learning

In [None]:
%%bash

(
tree dataset -d
) 

# 2. Dataset high-level info

The images stored in the **./dataset** folder and its subfolders are 150 by 150 pixel greyscale images, representing X-ray pictures of lungs.

The images consist of X-ray pictures of the following:

| Description                              | Class index | Tensor  | Class label        |
| ---------------------------------------- | ----------- | ------- | ------------------ |
| People with no infection diagnosis       | 0           | [1 0 0] | normal             |
| People with infected lungs and non-covid | 1           | [0 1 0] | infected_non_covid |
| People with infected lungs and covid     | 2           | [0 0 1] | infected_covid     |



In [None]:
# Class index -> class label, in the same order as the table above.
classes = dict(enumerate(['normal', 'infected_non_covid', 'infected_covid']))
groups = ['train', 'test', 'val']

# Number of images per '<group>_<class label>' key.
dataset_numbers = {
    '{}_{}'.format(group, label): count
    for group, counts in (
        ('train', (1341, 2530, 1345)),
        ('val', (8, 8, 8)),
        ('test', (234, 242, 138)),
    )
    for label, count in zip(classes.values(), counts)
}

# Folder holding the images of each '<group>_<class label>' key.
dataset_paths = {
    '{}_{}'.format(group, label): './dataset/{}/{}/'.format(group, subdir)
    for group in ('train', 'val', 'test')
    for label, subdir in zip(
        classes.values(),
        ('normal', 'infected/non-covid', 'infected/covid'),
    )
}

View one of the images and its properties. These images consist of a Numpy array, with values ranging between 0 and 255. These values will be normalized.

In [None]:
# Load one training image, display it and print its raw contents.
path_to_file = './dataset/train/normal/1.jpg'
# The with-block closes the file on exit, so no explicit close() is needed.
with open(path_to_file, 'rb') as f:
    # Convert inside the with-block, while the file is still open.
    im = np.asarray(Image.open(f))
    plt.imshow(im)
print('Image shape is: {}'.format(im.shape))
# Images are defined as a Numpy array of values between 0 and 255
print('Image as a numpy array is:\n {}'.format(im))

# 3. Creating a Dataset object

## 3.1 General Dataset object that is custom made for train, val, test to individually use

length method ( __ len __ )

> return the number of images present in the dataset

getitem method ( __ getitem __ )

> fetch an image and its label, using a single index value. Returns the image, along with a one-hot vector corresponding to the class of the object. Both returned parameters will be torch tensors.
- [1, 0, 0] for normal class
- [0, 1, 0] for infected_non_covid class
- [0, 0, 1] for infected_covid class

In [None]:
class Lung_Dataset(Dataset):
    """
    Generic Dataset class for one group (train, val or test) of lung X-ray images.

    Images are addressed with a single flat index that walks through the
    classes in the order of ``self.classes``: first every 'normal' image,
    then every 'infected_non_covid' image, then every 'infected_covid' image.
    """

    def __init__(self, groups, dataset_numbers, dataset_paths):
        """
        Constructor for generic Dataset class - assembles
        the important parameters in attributes.

        Parameters
        ----------
        groups : str
            Allowed values: train, val, test
        dataset_numbers : dict
            Count of each class within specified group, keyed by
            '<group>_<class label>' (e.g. 'train_normal').
        dataset_paths : dict
            Path to each class within specified group, using the same keys
            as ``dataset_numbers``.
        """

        self.img_size = (150, 150)
        self.classes = {
            0: 'normal',
            1: 'infected_non_covid',
            2: 'infected_covid'
        }
        self.groups = groups
        self.dataset_numbers = dataset_numbers
        self.dataset_paths = dataset_paths

    def _allowed_groups(self):
        """Return the group names accepted by this dataset, as a tuple."""
        # A bare string must be compared as a whole name: `x in 'train'`
        # would be a substring test and accept values such as 'rain'.
        if isinstance(self.groups, str):
            return (self.groups,)
        return tuple(self.groups)

    def _class_counts(self):
        """Return the image count of each class, in ``self.classes`` order."""
        return [
            int(self.dataset_numbers['{}_{}'.format(self.groups, class_val)])
            for class_val in self.classes.values()
        ]

    def describe(self):
        """
        Descriptor function.
        Will print details about the dataset when called.
        """

        # Generate description
        msg = "This is the {} dataset of the Lung Dataset".format(self.groups)
        msg += " used for the Small Project Demo in the 50.039 Deep Learning class"
        msg += " in March 2021. \n"
        msg += "It contains a total of {} images, ".format(sum(self.dataset_numbers.values()))
        msg += "of size {} by {}.\n".format(self.img_size[0], self.img_size[1])
        msg += "The images are stored in the following locations "
        msg += "and each one contains the following number of images:\n"
        for key, val in self.dataset_paths.items():
            msg += " - {}, in folder {}: {} images.\n".format(key, val, self.dataset_numbers[key])
        print(msg)

    def open_img(self, group_val, class_val, index_val):
        """
        Opens image with specified parameters.

        Parameters
        ----------
        group_val : str
            Should take values in 'train', 'test' or 'val', and must match
            the group(s) this dataset was built for.
        class_val : str
            'normal', 'infected_non_covid' or 'infected_covid'.
        index_val : int
            Index of the image within its class folder, between 0 and the
            number of images in that class minus 1.

        Returns
        -------
        Loaded image as a Numpy array with pixel values divided by 255.

        Raises
        ------
        AssertionError
            If one of the parameters is outside of its allowed values.
        """

        # Asserts checking for consistency in passed parameters
        err_msg = "Error - group_val variable should be set to 'train', 'test' or 'val'."
        assert group_val in self._allowed_groups(), err_msg

        err_msg = "Error - class_val variable should be set to 'normal' or 'infected_non_covid' or 'infected_covid'."
        assert class_val in self.classes.values(), err_msg

        key = '{}_{}'.format(group_val, class_val)
        max_val = self.dataset_numbers[key]
        err_msg = "Error - index_val variable should be an integer between 0 and the number of images minus 1."
        err_msg += "\n(In {}/{}, you have {} images.)".format(group_val, class_val, max_val)
        assert isinstance(index_val, int), err_msg
        # Files are numbered from 0, so the last valid index is max_val - 1.
        assert 0 <= index_val < max_val, err_msg

        # Open file; the with-block closes it, no explicit close() needed.
        path_to_file = '{}/{}.jpg'.format(self.dataset_paths[key], index_val)
        with open(path_to_file, 'rb') as f:
            # Convert to Numpy array and normalize pixel values by dividing by 255.
            im = np.asarray(Image.open(f))/255
        return im

    def show_img(self, group_val, class_val, index_val):
        """
        Opens, then displays image with specified parameters.

        Parameters:
        - group_val should take values in 'train', 'test' or 'val'.
        - class_val variable should be set to 'normal' or 'infected_non_covid' or 'infected_covid'.
        - index_val should be an integer between 0 and the number of images in that class minus 1.
        """

        # Open image
        im = self.open_img(group_val, class_val, index_val)

        # Display
        plt.imshow(im)

    def __len__(self):
        """
        Length special method, returns the number of images in dataset.
        """

        # Length function
        return sum(self.dataset_numbers.values())

    def __getitem__(self, index):
        """
        Getitem special method.

        Expects an integer value index, between 0 and len(self) - 1.

        Returns the image and its label as a one hot vector, both
        in torch tensor format in dataset.

        Raises
        ------
        IndexError
            If index falls outside of the images of this group.
        """

        counts = self._class_counts()
        if not 0 <= index < sum(counts):
            raise IndexError(
                "index {} is out of range for a dataset of {} images".format(index, sum(counts))
            )

        # Walk through the classes, turning the flat index into an index
        # local to the class folder that contains the image.
        local_index = index
        for class_index, count in zip(self.classes, counts):
            if local_index < count:
                break
            local_index -= count
        class_val = self.classes[class_index]

        # One-hot label with a 1 at the position of the class index.
        label = torch.zeros(len(self.classes))
        label[class_index] = 1

        im = self.open_img(self.groups, class_val, local_index)
        im = transforms.functional.to_tensor(im).float()
        return im, label

In [None]:
# Per-group image counts: group -> {'<group>_<class label>': count}.
# The class order inside each group is normal, infected_non_covid, infected_covid.
dataset_numbers = {
    group: {
        '{}_{}'.format(group, label): count
        for label, count in zip(
            ('normal', 'infected_non_covid', 'infected_covid'),
            counts,
        )
    }
    for group, counts in (
        ('train', (1341, 2530, 1345)),
        ('val', (8, 8, 8)),
        ('test', (234, 242, 138)),
    )
}

# Per-group image folders: group -> {'<group>_<class label>': folder}.
dataset_paths = {
    group: {
        '{}_{}'.format(group, label): './dataset/{}/{}/'.format(group, subdir)
        for label, subdir in (
            ('normal', 'normal'),
            ('infected_non_covid', 'infected/non-covid'),
            ('infected_covid', 'infected/covid'),
        )
    }
    for group in ('train', 'val', 'test')
}

In [None]:
def verify_dataset(group,dataset,image_overall_index=7,class_val='normal',
                   image_specific_dataset_index=1):
    """
    Print a quick sanity report for a dataset object.

    First exercises len() and indexing with image_overall_index, then
    opens and displays image image_specific_dataset_index of class
    class_val through the dataset's open_img and show_img methods.

    Parameters
    ----------
    group : str
        Group name passed on to open_img / show_img and used in the messages.
    dataset : object
        Object supporting len(), indexing, open_img() and show_img().
    image_overall_index : int
        Flat index used with dataset[...].
    class_val : str
        Class label passed to open_img / show_img.
    image_specific_dataset_index : int
        Per-class index passed to open_img / show_img.
    """
    # Part 1: the special methods
    print('Verify the special methods __len__ and __get_item__')
    print(f'Number of images in {group} dataset: {len(dataset)}')
    print(f'Details for image id {image_overall_index} from the {group} dataset')
    sample, one_hot = dataset[image_overall_index]
    for caption, value in (
        ('Sample image shape', sample.shape),
        ('Sample image', sample),
        ('Sample image class', one_hot),
    ):
        print(f'{caption}: {value}')

    # Part 2: the explicit open / show helpers
    print('\nVerify the open_img and show_img functions')
    print(
        f'Open and show image {image_specific_dataset_index} '
        f'from the {group}_{class_val} dataset'
    )
    target = (group, class_val, image_specific_dataset_index)
    opened = dataset.open_img(*target)
    print(f'Same sample image shape: {opened.shape}')
    print(f'Same sample image: {opened}')
    dataset.show_img(*target)

## 3.2 Train dataset

In [None]:
# Build the training dataset from the 'train' entries of the nested dicts.
train_group = 'train'
ld_train = Lung_Dataset(
    groups=train_group,
    dataset_numbers=dataset_numbers[train_group],
    dataset_paths=dataset_paths[train_group],
)
ld_train.describe()

In [None]:
# Sanity-check the training dataset using flat index 1.
verify_dataset(train_group, ld_train, image_overall_index=1)

## 3.3 Validation dataset

In [None]:
# Build the validation dataset from the 'val' entries of the nested dicts.
val_group = 'val'
ld_val = Lung_Dataset(
    groups=val_group,
    dataset_numbers=dataset_numbers[val_group],
    dataset_paths=dataset_paths[val_group],
)
ld_val.describe()

In [None]:
# Sanity-check the validation dataset using flat index 1.
verify_dataset(val_group, ld_val, image_overall_index=1)

## 3.4 Test dataset

In [None]:
# Build the test dataset from the 'test' entries of the nested dicts.
test_group = 'test'
ld_test = Lung_Dataset(
    groups=test_group,
    dataset_numbers=dataset_numbers[test_group],
    dataset_paths=dataset_paths[test_group],
)
ld_test.describe()

In [None]:
# Sanity-check the test dataset using flat index 1.
verify_dataset(test_group, ld_test, image_overall_index=1)

# 4. Data visualization

This requires a `grouped bar chart`. Refer to [matplotlib Grouped bar chart with labels](https://matplotlib.org/stable/gallery/lines_bars_and_markers/barchart.html) for starter code

<u>**Discuss whether or not the dataset is balanced between classes, uniformly distributed, etc.**</u>

**Training set**

The train data for the different classes are imbalanced. From the graph plotted below, the `infected_non_covid` class has significantly more data points than the other classes. Overall, the ratio `normal:infected_non_covid:infected_covid` is approximately `1:2:1`.

This could present more complications if the model is trained in a stacking manner by first training normal vs infected. The ratio of `normal:infected` would be a ratio of `1:3` which is more imbalanced.

**Testing set**

The test set is also slightly imbalanced with the ratio `normal:infected_non_covid:infected_covid` being approximately `2:2:1`. However, this is not as bad as an imbalanced training set because the test set will not affect the model's parameter tuning.

**Validation set**

The val set is uniformly distributed between the three classes. However, it is glaring that there are only 8 validation samples for each of the 3 classes. Considering the ratio of `train:val:test`, the number of validation samples is far too low. For example, with reference to the infected_non_covid class, the ratio of `train:val:test` is approximately `316:1:30`, which is quite far off from the recommended ratios like `80:10:10` or `8:1:1` as indicated by [Stanford's CS230](https://cs230.stanford.edu/blog/split/).

In [None]:
labels = ['normal', 'infected_non_covid', 'infected_covid']

# Image counts of each split, taken in the order the dataset dicts store them.
train_normal_inc_ic = [*ld_train.dataset_numbers.values()]
val_normal_inc_ic = [*ld_val.dataset_numbers.values()]
test_normal_inc_ic = [*ld_test.dataset_numbers.values()]

width = 0.25  # the width of the bars
x = np.arange(len(labels))  # the label locations

fig, ax = plt.subplots(figsize=(10,10))
# Three bars per label: train on the left, test in the middle, val on the right.
rects1 = ax.bar(x - width, train_normal_inc_ic, width, label='train')
rects2 = ax.bar(x + width, val_normal_inc_ic, width, label='val')
rects3 = ax.bar(x, test_normal_inc_ic, width, label='test')

ax.set_title('Number of datapoints with respect to each dataset and class')
ax.set_ylabel('Number of datapoints')
ax.set_xticks(x)
ax.set_xticklabels(labels)
ax.legend()

def autolabel(rects):
    """Write the height of every bar in *rects* just above that bar."""
    for bar in rects:
        bar_height = bar.get_height()
        bar_centre = bar.get_x() + bar.get_width() / 2
        ax.annotate(
            f'{bar_height}',
            xy=(bar_centre, bar_height),
            xytext=(0, 3),  # 3 points vertical offset
            textcoords="offset points",
            ha='center',
            va='bottom',
        )

for bar_group in (rects1, rects2, rects3):
    autolabel(bar_group)

fig.tight_layout()
plt.show()

# 4. Why normalize the data?

To recap, the normalization is performed in the `Lung_Dataset` class' `open_img` function, which does the following:

```python
# Convert to Numpy array and normalize pixel values by dividing by 255.
im = np.asarray(Image.open(f))/255
```

Pixel values range from 0 to 255. With an activation function like `sigmoid`, such a large range means that vastly different values like 100 and 255 become almost indistinguishable once passed through the activation: both produce an output that is very close to 1.

Taking the same values as reference, if we divide by 255, for a value of 100,  $\frac{100}{255}$ we get approximately 0.39. Then for a value of 255, $\frac{255}{255}$ we get 1. For the initial value of 100 that becomes 0.39 after the division, passing it into `sigmoid(0.39)` produces a value of 0.596. Meanwhile for the initial value of 255 that becomes 1 after division, passing it into `sigmoid(1)` produces a value of 0.731. This difference in value allows us to extract meaningful differences in the pixel values.


# 5. Other possible pre-processing

From the plot below, which uses normal images from the training set as reference, it is evident that there are several differences in the photo dimensions and photo environment.

For example, comparing image_index 1 and image_index 28, there is a clear difference in the lighting: image_index 28 is a lot brighter. One pre-processing step could be to use histogram normalization. There is a paper that recommends 14 possible normalization algorithms that can be performed (Leszczynski, 2010).

Another example is comparing "skinny" images like image_index 1 and image_index 31, where there is significantly more dark background at the sides, to images like image_index 12. Perhaps an edge detection algorithm could be applied to keep only the relevant parts of the image, which are the lungs.

In [None]:
# Show four 'normal' training images side by side in a 2 x 2 grid.
row = 2
col = 2

selected_indices = [1,12,28,31]
f, axarr = plt.subplots(row,col,figsize=(10,7))
# np.ndindex walks the grid cells row by row, so cell number `counter`
# receives the counter-th selected image.
for counter, (row_index, col_index) in enumerate(np.ndindex(row, col)):
    image_index = selected_indices[counter]
    panel = axarr[row_index, col_index]
    im = ld_train.open_img('train', 'normal', image_index)
    panel.set_title('Image index: {}'.format(image_index))
    panel.imshow(im)

# 6. Creating a data loader object

In [None]:
bs_val = 4
# One shuffled loader per split, all sharing the same batch size.
train_loader, test_loader, val_loader = (
    DataLoader(split, batch_size=bs_val, shuffle=True)
    for split in (ld_train, ld_test, ld_val)
)

# 7. Model



In [None]:
class Net(nn.Module):
    """
    Minimal classifier: one unpadded 3 by 3 convolution followed by a
    single linear layer, returning log-probabilities over the classes.

    Parameters
    ----------
    conv_channels : int
        Number of output channels of the convolution.
    img_size : tuple
        (height, width) of the single-channel input images.
    num_classes : int
        Number of output classes.

    With the default arguments the layers are identical to the original
    hard-coded ones: Conv2d(1, 4, 3, 1) and Linear(87616, 3).
    """

    def __init__(self, conv_channels=4, img_size=(150, 150), num_classes=3):
        super().__init__()
        kernel_size, stride = 3, 1
        # Conv2D: 1 input channel, conv_channels output channels,
        # 3 by 3 kernel, stride of 1, no padding.
        self.conv1 = nn.Conv2d(1, conv_channels, kernel_size, stride)
        # Without padding each spatial dimension shrinks to
        # (size - kernel_size) // stride + 1, i.e. 150 -> 148 by default.
        out_h = (img_size[0] - kernel_size) // stride + 1
        out_w = (img_size[1] - kernel_size) // stride + 1
        # Linear layer maps the flattened feature maps to the class scores
        # (4 * 148 * 148 = 87616 input features by default).
        self.fc1 = nn.Linear(conv_channels * out_h * out_w, num_classes)

    def forward(self, x):
        """Return the per-class log-probabilities for a batch of images."""
        x = self.conv1(x)
        # Keep the batch dimension, flatten everything else.
        x = torch.flatten(x, 1)
        x = self.fc1(x)
        output = F.log_softmax(x, dim = 1)
        return output

In [None]:
# Pick the first GPU when CUDA is available, otherwise stay on the CPU.
device = "cuda:0" if torch.cuda.is_available() else "cpu"
if device != "cpu":
    print('using GPU')
# Instantiate the network and move its parameters to the chosen device.
model = Net().to(torch.device(device))

In [None]:
# Print the layer-by-layer summary for a single-channel 150 by 150 input.
summary(model, input_size=(1, 150, 150))

# 8. Training the model

Reference material: [Towards data science: PyTorch [Tabular] — Multiclass Classification](https://towardsdatascience.com/pytorch-tabular-multiclass-classification-9f8211a123ab)

In [None]:
def multi_acc(y_pred, y_test):
    """
    Return the fraction of rows of y_pred whose highest-scoring class
    equals the class index given in y_test, as a torch scalar.
    """
    log_probs = torch.log_softmax(y_pred, dim = 1)
    # Index of the largest log-probability in every row.
    predicted = torch.max(log_probs, dim = 1).indices
    hits = torch.eq(predicted, y_test).float()
    return hits.sum() / len(hits)

In [None]:
# Define criterion and optimizer, epoch
# Number of passes over the training set and Adam step size.
epochs, lr = 1, 1e-4
# Loss function and optimizer over all model parameters.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=model.parameters(), lr=lr)

In [None]:
# NOTE(review): running_loss, print_every and start are assigned here but
# not read anywhere in this cell.
running_loss = 0
print_every = 40
steps = 0
start = time.time()

# Per-epoch averages, one entry appended per epoch.
accuracy_stats = {
    'train': [],
    "val": []
}
loss_stats = {
    'train': [],
    "val": []
}

for e in range(epochs):
    train_epoch_loss = 0
    train_epoch_acc = 0

    # Training pass
    steps = 0
    model.train()
    for X_train_batch, y_train_batch in train_loader:
        X_train_batch, y_train_batch = X_train_batch.to(device), y_train_batch.to(device)
        # Labels come as one-hot vectors; the criterion and multi_acc
        # both expect class indices.
        y_train_idx = torch.max(y_train_batch, 1)[1]

        steps += 1

        optimizer.zero_grad()

        # Call the module itself (not .forward) so registered hooks run.
        output = model(X_train_batch)
        train_loss = criterion(output, y_train_idx)
        train_acc = multi_acc(output, y_train_idx)
        train_loss.backward()
        optimizer.step()

        train_epoch_loss += train_loss.item()
        train_epoch_acc += train_acc.item()

    # Validation pass: no gradients, model in evaluation mode
    model.eval()
    with torch.no_grad():
        val_epoch_loss = 0
        val_epoch_acc = 0
        for X_val_batch, y_val_batch in val_loader:
            X_val_batch, y_val_batch = X_val_batch.to(device), y_val_batch.to(device)
            y_val_idx = torch.max(y_val_batch, 1)[1]

            y_val_pred = model(X_val_batch)

            val_loss = criterion(y_val_pred, y_val_idx)
            val_acc = multi_acc(y_val_pred, y_val_idx)

            val_epoch_loss += val_loss.item()
            val_epoch_acc += val_acc.item()

    # Epoch statistics are the mean of the per-batch values.
    loss_stats['train'].append(train_epoch_loss/len(train_loader))
    loss_stats['val'].append(val_epoch_loss/len(val_loader))
    accuracy_stats['train'].append(train_epoch_acc/len(train_loader))
    accuracy_stats['val'].append(val_epoch_acc/len(val_loader))
    now = datetime.now().strftime("%d/%m/%Y %H:%M:%S")
    # All five fields are arguments of the same print call, so the
    # training accuracy is reported too.
    print("Epoch: {}/{} - {} - ".format(e+1, epochs, now),
      "Training Loss: {:.4f} - ".format(train_epoch_loss/len(train_loader)),
      "Validation Loss: {:.4f} - ".format(val_epoch_loss/len(val_loader)),
      "Validation Accuracy: {:.4f} - ".format(val_epoch_acc/len(val_loader)),
      "Training Accuracy: {:.4f}".format(train_epoch_acc/len(train_loader)))

# autosave model
end_model_time = datetime.now().strftime("%d_%m_%Y_%H_%M_%S")
model_name = 'boilerplate_net'
checkpoint = {
    'c_lr': lr,
    'model_name': model_name,
    'c_epochs': epochs,
    # Store the learned weights and the optimizer state as well; without
    # them the checkpoint only holds hyper-parameters and the trained
    # model cannot be restored from it.
    'model_state_dict': model.state_dict(),
    'optimizer_state_dict': optimizer.state_dict(),
}
# File name encodes the epoch count, model name and save time.
path = './model_{}_{}_{}'.format(epochs, model_name, end_model_time)
torch.save(checkpoint, path)

# References

Leszczynski, M. (2010). Image Preprocessing for Illumination Invariant Face 
Verification. Journal of telecommunications and information technology, 19-25.