# Boilerplate notebook

In [None]:
# Matplotlib
import matplotlib.pyplot as plt
# Numpy
import numpy as np
# Pillow
from PIL import Image
# Torch
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torchvision import transforms

# 1. Download dataset

In [None]:
!git clone -b data https://github.com/Oxiang/50.039-Deep-Learning.git

In [None]:
!sudo apt-get install tree

In [None]:
cd 50.039-Deep-Learning

In [None]:
%%bash

(
tree dataset -d
) 

# 2. Dataset high-level info

The images stored in the **./dataset** folder and its subfolders are 150 by 150 pixel greyscale images, representing X-ray pictures of lungs.

The images consist of X-ray pictures of the following:

| Description                              | Class index | Tensor  | Class label        |
| ---------------------------------------- | ----------- | ------- | ------------------ |
| People with no infection diagnosis       | 0           | [1 0 0] | normal             |
| People with infected lungs and non-covid | 1           | [0 1 0] | infected_non_covid |
| People with infected lungs and covid     | 2           | [0 0 1] | infected_covid     |



In [None]:
# Class index -> class label (the index is also the one-hot position).
classes = dict(enumerate(['normal', 'infected_non_covid', 'infected_covid']))
groups = ['train', 'test', 'val']

# Number of images per split, listed in class-index order.
_counts_by_group = {
    'train': (1341, 2530, 1345),
    'val': (8, 8, 8),
    'test': (234, 242, 138),
}
# Sub-folder, relative to the split folder, that holds each class.
_class_subdirs = {
    'normal': 'normal',
    'infected_non_covid': 'infected/non-covid',
    'infected_covid': 'infected/covid',
}

# Both tables are keyed by '<group>_<class label>'.
dataset_numbers = {
    '{}_{}'.format(group, label): count
    for group, counts in _counts_by_group.items()
    for label, count in zip(classes.values(), counts)
}
dataset_paths = {
    '{}_{}'.format(group, label): './dataset/{}/{}/'.format(group, subdir)
    for group in _counts_by_group
    for label, subdir in _class_subdirs.items()
}

View one of the images and its properties. Each image is loaded as a Numpy array, with values ranging between 0 and 255. These values will be normalized later, when the images are loaded through the Dataset object.

In [None]:
path_to_file = './dataset/train/normal/1.jpg'
# Convert to a Numpy array inside the `with` block, while the file is still
# open; the context manager closes the file, so no explicit close() is needed.
with open(path_to_file, 'rb') as f:
    im = np.asarray(Image.open(f))
plt.imshow(im)
print('Image shape is: {}'.format(im.shape))
# Images are defined as a Numpy array of values between 0 and 255
print('Image as a numpy array is:\n {}'.format(im))

# 3. Creating a Dataset object

## 3.1 Generic Dataset object, instantiated separately for the train, val and test groups

length method (`__len__`)

> return the number of images present in the dataset

getitem method (`__getitem__`)

> fetch an image and its label, using a single index value. Returns the image, along with a one-hot vector corresponding to the class of the object. Both returned parameters will be torch tensors.
- [1, 0, 0] for normal class
- [0, 1, 0] for infected_non_covid class
- [0, 0, 1] for infected_covid class

In [None]:
class Lung_Dataset(Dataset):
    """
    Generic Dataset class for one group (train, val or test) of lung images.

    Indexing with an integer returns an ``(image, label)`` pair of torch
    tensors, where ``label`` is a one-hot vector over ``self.classes``.
    Images are addressed class by class, in class-index order: first all
    'normal' images, then 'infected_non_covid', then 'infected_covid'.
    """

    def __init__(self, groups, dataset_numbers, dataset_paths):
        """
        Constructor for generic Dataset class - assembles
        the important parameters in attributes.

        Parameters
        ----------
        groups : str
            Allowed values: train, val, test
        dataset_numbers : dict
            Count of each class within specified group, keyed by
            '<group>_<class label>' (e.g. 'train_normal').
        dataset_paths : dict
            Path to each class within specified group, with the same
            keys as dataset_numbers.
        """

        self.img_size = (150, 150)
        self.classes = {
            0: 'normal',
            1: 'infected_non_covid',
            2: 'infected_covid'
        }
        self.groups = groups
        self.dataset_numbers = dataset_numbers
        self.dataset_paths = dataset_paths

    def describe(self):
        """
        Descriptor function.
        Will print details about the dataset when called.
        """

        # Generate description
        msg = "This is the {} dataset of the Lung Dataset".format(self.groups)
        msg += " used for the Small Project Demo in the 50.039 Deep Learning class"
        msg += " in March 2021. \n"
        msg += "It contains a total of {} images, ".format(sum(self.dataset_numbers.values()))
        msg += "of size {} by {}.\n".format(self.img_size[0], self.img_size[1])
        msg += "The images are stored in the following locations "
        msg += "and each one contains the following number of images:\n"
        for key, val in self.dataset_paths.items():
            msg += " - {}, in folder {}: {} images.\n".format(key, val, self.dataset_numbers[key])
        print(msg)

    def _allowed_groups(self):
        """
        Returns the group names accepted by open_img, as a tuple.

        self.groups is normally a single string; wrapping it in a tuple makes
        the membership test an exact match instead of a substring test.
        """
        if isinstance(self.groups, str):
            return (self.groups,)
        return tuple(self.groups)

    def _locate(self, index):
        """
        Maps a dataset-wide index to (class index, index within that class).

        Per-class counts are looked up by key in class-index order, so the
        result does not depend on the insertion order of dataset_numbers.

        Raises IndexError when index is outside the images of this group.
        """
        class_ids = sorted(self.classes)
        counts = [
            self.dataset_numbers['{}_{}'.format(self.groups, self.classes[class_id])]
            for class_id in class_ids
        ]
        total = sum(counts)
        if not 0 <= index < total:
            raise IndexError(
                "Index {} is out of range for a dataset of {} images.".format(index, total)
            )

        offset = index
        for class_id, count in zip(class_ids, counts):
            if offset < count:
                return class_id, offset
            offset -= count

    def open_img(self, group_val, class_val, index_val):
        """
        Opens image with specified parameters.

        Parameters:
        - group_val should take values in 'train', 'test' or 'val', and must match the group of this dataset.
        - class_val variable should be set to 'normal' or 'infected_non_covid' or 'infected_covid'.
        - index_val should be an integer with values between 0 and the number of images in that class minus 1.

        Returns loaded image as a normalized Numpy array.

        Raises AssertionError if any of the parameters is invalid.
        """

        # Asserts checking for consistency in passed parameters
        err_msg = "Error - group_val variable should be set to 'train', 'test' or 'val'."
        assert group_val in self._allowed_groups(), err_msg

        err_msg = "Error - class_val variable should be set to 'normal' or 'infected_non_covid' or 'infected_covid."
        assert class_val in self.classes.values(), err_msg

        key = '{}_{}'.format(group_val, class_val)
        max_val = self.dataset_numbers[key]
        err_msg = "Error - index_val variable should be an integer between 0 and the number of images minus 1."
        err_msg += "\n(In {}/{}, you have {} images.)".format(group_val, class_val, max_val)
        assert isinstance(index_val, int), err_msg
        # Files are numbered from 0, so max_val itself is not a valid index.
        assert 0 <= index_val < max_val, err_msg

        # Open file as before
        path_to_file = '{}/{}.jpg'.format(self.dataset_paths[key], index_val)
        with open(path_to_file, 'rb') as f:
            # Convert to Numpy array and normalize pixel values by dividing by 255.
            im = np.asarray(Image.open(f))/255
        return im

    def show_img(self, group_val, class_val, index_val):
        """
        Opens, then displays image with specified parameters.

        Parameters:
        - group_val should take values in 'train', 'test' or 'val'.
        - class_val variable should be set to 'normal' or 'infected_non_covid' or 'infected_covid'.
        - index_val should be an integer with values between 0 and the number of images in that class minus 1.
        """

        # Open image
        im = self.open_img(group_val, class_val, index_val)

        # Display
        plt.imshow(im)

    def __len__(self):
        """
        Length special method, returns the number of images in dataset.
        """

        # Length function
        return sum(self.dataset_numbers.values())

    def __getitem__(self, index):
        """
        Getitem special method.

        Expects an integer value index, between 0 and len(self) - 1.

        Returns the image and its label as a one hot vector, both
        in torch tensor format in dataset.

        Raises IndexError for an out-of-range index, which also lets a plain
        ``for`` loop over the dataset stop after the last image.
        """

        class_id, local_index = self._locate(index)

        # One-hot label: 1 at the position of the class index, 0 elsewhere.
        label = torch.zeros(len(self.classes))
        label[class_id] = 1

        im = self.open_img(self.groups, self.classes[class_id], local_index)
        im = transforms.functional.to_tensor(im).float()
        return im, label

In [None]:
# Class labels and the matching sub-folder inside each split folder,
# both in class-index order.
_class_labels = ('normal', 'infected_non_covid', 'infected_covid')
_class_folders = ('normal/', 'infected/non-covid/', 'infected/covid/')
# Image counts per split, in the same class order.
_split_counts = (
    ('train', (1341, 2530, 1345)),
    ('val', (8, 8, 8)),
    ('test', (234, 242, 138)),
)

# Both tables map split name -> {'<split>_<class label>': value}.
dataset_numbers = {}
dataset_paths = {}
for _split, _counts in _split_counts:
    _keys = ['{}_{}'.format(_split, _label) for _label in _class_labels]
    _folders = ['./dataset/{}/{}'.format(_split, _folder) for _folder in _class_folders]
    dataset_numbers[_split] = dict(zip(_keys, _counts))
    dataset_paths[_split] = dict(zip(_keys, _folders))

In [None]:
def verify_dataset(group, dataset, image_overall_index=7, class_val='normal',
                   image_specific_dataset_index=1):
    """
    Print a quick sanity report for a dataset object.

    First exercises ``len(dataset)`` and ``dataset[image_overall_index]``,
    then calls ``dataset.open_img`` and ``dataset.show_img`` for the image
    identified by ``group``, ``class_val`` and
    ``image_specific_dataset_index``.

    Parameters
    ----------
    group : str
        Name of the group, used in the printed messages and passed to
        ``open_img`` / ``show_img``.
    dataset : object
        Dataset supporting ``len()``, integer indexing, ``open_img`` and
        ``show_img``.
    image_overall_index : int
        Index used with ``dataset[...]``.
    class_val : str
        Class label passed to ``open_img`` / ``show_img``.
    image_specific_dataset_index : int
        Per-class image index passed to ``open_img`` / ``show_img``.
    """
    # Part 1: special methods.
    print('Verify the special methods __len__ and __get_item__')
    total = len(dataset)
    print('Number of images in {} dataset: {}'.format(group, total))
    print('Details for image id {} from the {} dataset'.format(image_overall_index, group))
    sample, one_hot = dataset[image_overall_index]
    for template, value in (
        ('Sample image shape: {}', sample.shape),
        ('Sample image: {}', sample),
        ('Sample image class: {}', one_hot),
    ):
        print(template.format(value))

    # Part 2: direct image access helpers.
    target = (group, class_val, image_specific_dataset_index)
    print('\nVerify the open_img and show_img functions')
    print('Open and show image {} from the {}_{} dataset'.format(
        image_specific_dataset_index, group, class_val))
    opened = dataset.open_img(*target)
    print('Same sample image shape: {}'.format(opened.shape))
    print('Same sample image: {}'.format(opened))
    dataset.show_img(*target)

## 3.2 Train dataset

In [None]:
train_group = 'train'
# Build the training dataset from its per-class counts and folders.
ld_train = Lung_Dataset(
    groups=train_group,
    dataset_numbers=dataset_numbers[train_group],
    dataset_paths=dataset_paths[train_group],
)
ld_train.describe()

In [None]:
# Sanity-check the training dataset using overall image index 1.
verify_dataset(train_group, ld_train, image_overall_index=1)

## 3.3 Validation dataset

In [None]:
val_group = 'val'
# Build the validation dataset from its per-class counts and folders.
ld_val = Lung_Dataset(
    groups=val_group,
    dataset_numbers=dataset_numbers[val_group],
    dataset_paths=dataset_paths[val_group],
)
ld_val.describe()

In [None]:
# Sanity-check the validation dataset using overall image index 1.
verify_dataset(val_group, ld_val, image_overall_index=1)

## 3.4 Test dataset

In [None]:
test_group = 'test'
# Build the test dataset from its per-class counts and folders.
ld_test = Lung_Dataset(
    groups=test_group,
    dataset_numbers=dataset_numbers[test_group],
    dataset_paths=dataset_paths[test_group],
)
ld_test.describe()

In [None]:
# Sanity-check the test dataset using overall image index 1.
verify_dataset(test_group, ld_test, image_overall_index=1)