# CSE 415 Deep Learning Assignment (A6)
Please go through the entire assignment, and read all lines of code, including the comments. However, only changes to areas of the code marked "**TO DO**" will be graded.

# Part 0: Pytorch Setup Code
Write your name where designated. Format: First name followed by last name with a space in between, and initials capitalized. For example:

name = 'Bindita Chaudhuri'

In [0]:
# This shows how to connect your google drive account with a colab instance.

# Load the Drive helper and mount; this will prompt for authorization 
# (Login to google account; allow access, copy the code and paste it below and then press enter)
from google.colab import drive
drive.mount('/gdrive')

# create a folder named CSE415 in your Drive
import os 
BASE_PATH = '/gdrive/My Drive/CSE415/'
if not os.path.exists(BASE_PATH):
    os.makedirs(BASE_PATH)
    
# now let's test that Google Drive is up and running. 
!ls "/gdrive/My Drive/CSE415"

# The following line will create a text file "foo.txt" in the created folder and then remove it
!echo "Hello Google Drive" > "/gdrive/My Drive/CSE415/foo.txt"
!cat "/gdrive/My Drive/CSE415/foo.txt"
!rm "/gdrive/My Drive/CSE415/foo.txt"

import torch
print('Version', torch.__version__)
print('CUDA enabled:', torch.cuda.is_available())
  
# Running this should then print out:
# Version 1.3.1
# CUDA enabled: True

import torch.nn as nn
import numpy as np
import glob, re, pickle
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
%matplotlib inline

from torchvision import datasets, transforms
import numpy as np
import torch.nn.functional as F
import torch.optim as optim
import h5py, tqdm
import sys
sys.path.append(BASE_PATH)

# TO DO
# write your name here in the format mentioned above
name = 'Bindita Chaudhuri'


In [0]:
# Some useful save, restoring and conversion functions are provided below. No need to change them.

!pip3 install matplotlib-label-lines
from labellines import labelLines

class pt_util(object):
    """Namespace of static helpers for checkpoints, conversions, logs and plots."""

    # This does more than the simple Pytorch restore. It checks that the names
    # of variables match, and if they don't doesn't throw a fit. It is similar
    # to how Caffe acts. This is especially useful if you decide to change your
    # network architecture but don't want to retrain from scratch.
    @staticmethod
    def restore(net, save_file):
        """Copy every matching tensor of a saved state dict into ``net``.

        Entries whose name is unknown to ``net`` or whose shape differs are
        reported and skipped instead of raising.

        Args:
            net(torch.nn.Module): The net to restore
            save_file(str): The file path
        """
        net_state_dict = net.state_dict()
        # Load onto the CPU so that a checkpoint written on a GPU machine can
        # still be restored on a CPU-only one; copy_ below moves the values to
        # whatever device the net's own tensors live on.
        # NOTE: torch.load unpickles the file - only restore trusted checkpoints.
        restore_state_dict = torch.load(save_file, map_location='cpu')

        restored_var_names = set()

        print('Restoring:')
        for var_name in restore_state_dict.keys():
            if var_name not in net_state_dict:
                continue
            var_size = net_state_dict[var_name].size()
            restore_size = restore_state_dict[var_name].size()
            if var_size != restore_size:
                print('Shape mismatch for var', var_name, 'expected', var_size, 'got', restore_size)
                continue
            if isinstance(net_state_dict[var_name], torch.nn.Parameter):
                # backwards compatibility for serialized parameters
                net_state_dict[var_name] = restore_state_dict[var_name].data
            try:
                net_state_dict[var_name].copy_(restore_state_dict[var_name])
                print(str(var_name)+' -> \t'+str(var_size)+' = '+str(int(np.prod(var_size)*4 / 10**6)) + 'MB')
                restored_var_names.add(var_name)
            except Exception:
                # Add context about the offending variable, then propagate.
                print('While copying the parameter named {}, whose dimensions in the model are'
                      ' {} and whose dimensions in the checkpoint are {}, ...'.format(
                          var_name, var_size, restore_size))
                raise

        ignored_var_names = sorted(set(restore_state_dict.keys()) - restored_var_names)
        unset_var_names = sorted(set(net_state_dict.keys()) - restored_var_names)
        print('')
        if len(ignored_var_names) == 0:
            print('Restored all variables')
        else:
            print('Did not restore:\n\t' + '\n\t'.join(ignored_var_names))
        if len(unset_var_names) == 0:
            print('No new variables')
        else:
            print('Initialized but did not modify:\n\t' + '\n\t'.join(unset_var_names))

        print('Restored %s' % save_file)

    # Restores the last saved network in a folder using file write time.
    @staticmethod
    def restore_latest(net, folder):
        """
        Args:
          net(torch.nn.module): The net to restore
          folder(str): The folder path
        Returns:
          int: Attempts to parse the epoch from the checkpoint file name and
          returns it if possible. Otherwise returns 0.
        """
        checkpoints = sorted(glob.glob(os.path.join(folder, '*.pt')), key=os.path.getmtime)
        if not checkpoints:
            return 0
        latest = checkpoints[-1]
        pt_util.restore(net, latest)
        # Only look at the file name: digits in a parent folder (e.g. "run42/")
        # must not be mistaken for the epoch.
        numbers = re.findall(r'\d+', os.path.basename(latest))
        return int(numbers[-1]) if numbers else 0

    # Saves the network and optionally deletes old save files.
    # If num_to_keep is 0 (or negative), it won't remove any.
    @staticmethod
    def save(net, file_name, num_to_keep=1):
        """
        Args:
        net(torch.nn.module): The network to save
        file_name(str): the path to save the file.
        num_to_keep(int): Specifies how many saved states (including this one) to keep
            once this one has been saved. Defaults to 1. Specifying <= 0 will not
            remove any previous saves.
        """
        folder = os.path.dirname(file_name)
        if folder:
            # A bare file name has no folder part; os.makedirs('') would raise.
            os.makedirs(folder, exist_ok=True)
        torch.save(net.state_dict(), file_name)
        print('Saved %s\n' % file_name)

        extension = os.path.splitext(file_name)[1]
        # Without an extension the pattern would be '*' and pruning would
        # delete unrelated files, so pruning is skipped in that case.
        if num_to_keep > 0 and extension:
            checkpoints = sorted(glob.glob(os.path.join(folder, '*' + extension)), key=os.path.getmtime)
            # Never consider the file just written for deletion, even when
            # coarse timestamps make it tie with an older checkpoint.
            current = os.path.abspath(file_name)
            others = [ff for ff in checkpoints if os.path.abspath(ff) != current]
            num_stale = max(0, len(others) - (num_to_keep - 1))
            for ff in others[:num_stale]:
                os.remove(ff)

    @staticmethod
    def to_numpy(array):
        """Convert a tensor (or a dict of convertible values) to numpy.

        Tensors are detached and moved to the CPU first; anything else goes
        through ``np.asarray``.
        """
        if isinstance(array, torch.Tensor):
            return array.detach().cpu().numpy()
        elif isinstance(array, dict):
            return {key: pt_util.to_numpy(val) for key, val in array.items()}
        else:
            return np.asarray(array)

    @staticmethod
    def from_numpy(np_array):
        """Convert a numpy array (or a list / dict of them) to torch tensors.

        uint32 data is cast to int32; object and string arrays cannot be
        represented as tensors and are returned as numpy arrays.
        """
        if isinstance(np_array, list):
            try:
                np_array = np.stack(np_array, 0)
            except ValueError:
                # The recursion must go through the class: a bare
                # ``from_numpy`` is not a name in scope inside a static method.
                np_array = np.stack([pt_util.from_numpy(val) for val in np_array], 0)
        elif isinstance(np_array, dict):
            return {key: pt_util.from_numpy(val) for key, val in np_array.items()}
        np_array = np.asarray(np_array)
        if np_array.dtype == np.uint32:
            print("numpy -> torch dtype uint32 not supported, using int32")
            np_array = np_array.astype(np.int32)
        elif np_array.dtype == np.dtype("O"):
            print("numpy -> torch dtype Object not supported, returning numpy array")
            return np_array
        elif np_array.dtype.type == np.str_:
            print("numpy -> torch dtype numpy.str_ not supported, returning numpy array")
            return np_array
        return torch.from_numpy(np_array)

    @staticmethod
    def write_log(filename, data):
        """Pickles and writes data to a file
        Args:
            filename(str): File name
            data(pickleable object): Data to save
        """
        folder = os.path.dirname(filename)
        if folder:
            os.makedirs(folder, exist_ok=True)
        with open(filename, 'wb') as log_file:
            pickle.dump(data, log_file)

    @staticmethod
    def read_log(filename, default_value=None):
        """Reads pickled data or returns the default value if none found
        Args:
            filename(str): File name
            default_value(anything): Value to return if no file is found
        Returns:
            unpickled file
        """
        if os.path.exists(filename):
            # NOTE: unpickling executes code from the file - only read logs
            # that were written by write_log.
            with open(filename, 'rb') as log_file:
                return pickle.load(log_file)
        return default_value

    # Create plots
    @staticmethod
    def plot(x_values, y_values, title, xlabel, ylabel, plotlabel):
        """Plots a line graph
        Args:
            x_values(list or np.array): x values for the line
            y_values(list or np.array): y values for the line
            title(str): Title for the plot
            xlabel(str): Label for the x axis
            ylabel(str): label for the y axis
            plotlabel(str): Label drawn on the line itself
        """
        plt.figure(figsize=(20, 10))
        plt.plot(x_values, y_values, label=plotlabel, color='b')
        labelLines(plt.gca().get_lines(), color='b',fontsize=13)
        plt.title(title)
        plt.xlabel(xlabel)
        plt.ylabel(ylabel)
        plt.show()

    @staticmethod
    def to_scaled_uint8(array):
        """Returns a normalized uint8 scaled to 0-255. This is useful for showing images especially of floats.
        Args:
            array(np.array): The array to normalize
        Returns:
            np.array normalized and of type uint8
        """
        array = np.array(array, dtype=np.float32)
        array -= np.min(array)
        peak = np.max(array)
        # A constant input has peak 0 after the shift; skip the scaling
        # instead of dividing by zero (the result is then all zeros).
        if peak > 0:
            array *= (255. / peak)
        return array.astype(np.uint8)

# Part 1: Classification Network for CIFAR-10

# 1.1 Loading and transforming data

CIFAR-10 data consists of 60000 32x32 images, belonging to 10 classes. More details can be found here:
https://www.cs.toronto.edu/~kriz/cifar.html

Note: Each image is a matrix of shape width x height (W x H) with 3 channels (C).

The code for loading the dataset and transforming the data accordingly is provided. 
The transform *ToTensor* converts the output from the dataset to be a tensor in CxHxW format.
One type of data augmentation - data normalization, is shown in the transforms. You have to add at least 2 of the following data augmentations in 'transform_train' (try to understand which ones you need, and what parameters you should choose for them):

- RandomHorizontalFlip
- RandomCrop
- ColorJitter
- RandomRotation


# 1.2 Defining network

You need to define a convolutional neural network (CNN) to classify the data into its classes. Sample code for classification of MNIST (handwritten digits dataset) is given here: https://github.com/pytorch/examples/blob/master/mnist/main.py

You will define the network layers in the `__init__` function, and the forward call will pass the data through those layers. The network should be as follows:

- One 3x3 convolution layer with 32 filters.
- One 2x2 maxpooling layer
- One 3x3 convolution layer with 64 filters. 
- One 3x3 convolution layer with 64 filters. 
- One 2x2 maxpooling layer
- One 3x3 convolution layer with 128 filters. 
- One 3x3 convolution layer with 128 filters.
- One 2x2 maxpooling layer
- One fully connected layer with 512 outputs.
- Then the final classification layer with 10 outputs.

- Every convolution layer should be followed by ReLU nonlinearity followed by batch normalization. Use padding in every convolution layer to retain input image size.
        


# Helpful functions
- https://pytorch.org/docs/stable/tensors.html
- https://pytorch.org/docs/stable/nn.html
- You can call `pt_util.to_numpy(x)` to get a numpy array from a torch tensor x.
- `pt_util.from_numpy(x)` makes a torch Tensor from the numpy array x.

# Common Oopsies
- __Q__ It only runs for one iteration and says it's done: __A__ We provided code that automatically loads the most recent file. If you don't want to start from that checkpoint, simply find it in your google drive and delete it.
- __Q__ I want to save more than just the last checkpoint: __A__ You can change the save function to save any number of previous checkpoints. You can also tell it to save all of them (not delete anything), by passing in 0.
- __Q__ Pytorch is saying things are the wrong shape: __A__ You can easily reshape things using the `view` function (https://pytorch.org/docs/stable/tensors.html#torch.Tensor.view). It is like the Numpy `reshape` function.
- __Q__ Pytorch is saying things are on the wrong device: __A__ You can move data between devices with the `.to(device)` call. Generally, all arguments to a function will need to be on the same device.

In [0]:
# Data augmentation

transform_train = transforms.Compose([
    # TO DO
    transforms.ToTensor(),
    transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
])

transform_test = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
])

# Load the dataset

DATA_PATH = BASE_PATH + 'cifar10/'
data_train = datasets.CIFAR10(root=DATA_PATH, train=True, download=True, transform=transform_train)
data_test = datasets.CIFAR10(root=DATA_PATH, train=False, download=True, transform=transform_test)


# This is where you define your network architecture.
# You can use the MNIST example linked above as a guide, but make sure you understand what it all does.

class CifarNet(nn.Module):
    """CNN classifier for CIFAR-10.

    The layers and the forward pass are left as TO DO items; the loss and the
    checkpoint helpers are provided.
    """

    def __init__(self):
        super(CifarNet, self).__init__()
        # Best test accuracy seen so far; used by save_best_model.
        self.best_accuracy = -1
        # TO DO
        raise NotImplementedError('Define the layers here')

    def forward(self, x):
        # TO DO
        raise NotImplementedError('Define the forward pass')

    def loss(self, prediction, label, reduction='mean'):
        """Cross-entropy between class scores and integer class labels.

        Args:
            prediction: Scores passed to F.cross_entropy as its input.
            label: Class indices; flattened to one dimension before use.
            reduction(str): 'none', 'mean' or 'sum'. The legacy spelling
                'elementwise_mean' is still accepted and treated as 'mean'.
        Returns:
            The loss tensor produced by F.cross_entropy.
        """
        if reduction == 'elementwise_mean':
            # Deprecated alias kept for callers that still pass it.
            reduction = 'mean'
        # reshape(-1) rather than squeeze(): squeeze() would turn a batch of
        # one label into a 0-dim tensor and break the batch-size check.
        loss_val = F.cross_entropy(prediction, label.reshape(-1), reduction=reduction)
        return loss_val

    def save_model(self, file_path, num_to_keep=1):
        """Save this model's weights to file_path via pt_util.save."""
        pt_util.save(self, file_path, num_to_keep)

    def save_best_model(self, accuracy, file_path, num_to_keep=1):
        """Save the model only if accuracy beats the best value seen so far."""
        if accuracy > self.best_accuracy:
            self.best_accuracy = accuracy
            self.save_model(file_path, num_to_keep)

    def load_model(self, file_path):
        """Restore weights from the checkpoint at file_path."""
        pt_util.restore(self, file_path)

    def load_last_model(self, dir_path):
        """Restore the most recent checkpoint in dir_path; returns what pt_util.restore_latest returns."""
        return pt_util.restore_latest(self, dir_path)

# 1.3 Training and testing the network

The train and test functions are given below. Fill in the TO DO with the given instructions. At the end, training and testing loss and accuracy plots will be displayed. Right click on the plots and save them as png files. Put these images on a single page (titled **CIFAR plots**) in your report, which you can create as a doc file but you will need to convert it into a pdf for submission. You also need to submit the 'cifar_050.pt' file from the 'checkpoints' folder inside your 'cifar10' folder.

In [0]:
import time

def train(model, device, train_loader, optimizer, epoch, log_interval):
    """Run one training pass over train_loader and return the mean batch loss.

    Args:
        model: Network to train; must provide a ``loss(output, label)`` method.
        device: Device the batches are moved to before the forward pass.
        train_loader: Iterable of (data, label) batches.
        optimizer: Optimizer whose gradients are reset and stepped per batch.
        epoch: Epoch number, used only in the progress message.
        log_interval: Print progress every ``log_interval`` batches.
    Returns:
        The mean of the per-batch loss values.
    """
    model.train()
    losses = []
    for batch_idx, (data, label) in enumerate(train_loader):
        # send the data and labels to GPU
        data, label = data.to(device), label.to(device)
        # initialize the optimizer
        optimizer.zero_grad()
        # TO DO (call the 'model' with 'data' as input)
        output = None
        # TO DO (call the model.loss function with 'output' and 'label' as inputs) 
        # NOTE: until both placeholders are filled in, loss is None and the
        # loss.item() call below fails.
        loss = None
        
        # backpropagate the loss
        losses.append(loss.item())
        loss.backward()
        optimizer.step()
        
        # Log the training progress
        if batch_idx % log_interval == 0:
            print('{} Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                time.ctime(time.time()),
                epoch, batch_idx * len(data), len(train_loader.dataset),
                100. * batch_idx / len(train_loader), loss.item()))
    return np.mean(losses)

def test(model, device, test_loader, log_interval=None):
    """Evaluate the model on test_loader without computing gradients.

    Args:
        model: Network to evaluate; must provide a ``loss`` method.
        device: Device the batches are moved to before the forward pass.
        test_loader: Iterable of (data, label) batches.
        log_interval: If not None, print progress every ``log_interval`` batches.
    Returns:
        (test_loss, test_accuracy): the summed loss divided by the dataset
        size, and the percentage of correctly classified samples.
    """
    model.eval()
    test_loss = 0
    correct = 0

    with torch.no_grad():
        for batch_idx, (data, label) in enumerate(test_loader):
            # send data and label to GPU
            data, label = data.to(device), label.to(device)
            # TO DO (call the model with 'data' as input)
            output = None
            # TO DO (call model.loss function with 'output' and 'label' as inputs and reduction='sum')
            test_loss_on = None
            
            # Accumulate the per-batch sums; they are averaged over the dataset below.
            test_loss += test_loss_on.item()
            
            # Take the class with maximum probability as the output class.
            pred = output.max(1)[1]
            correct_mask = pred.eq(label.view_as(pred))
            num_correct = correct_mask.sum().item()
            correct += num_correct
            
            # Log the test progress
            if log_interval is not None and batch_idx % log_interval == 0:
                print('{} Test: [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                    time.ctime(time.time()),
                    batch_idx * len(data), len(test_loader.dataset),
                    100. * batch_idx / len(test_loader), test_loss_on))

    test_loss /= len(test_loader.dataset)
    test_accuracy = 100. * correct / len(test_loader.dataset)

    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
        test_loss, correct, len(test_loader.dataset), test_accuracy))
    return test_loss, test_accuracy


In [0]:
# Now the actual training and testing code

import multiprocessing
import traceback

# Play around with these constants, you may find a better setting.
BATCH_SIZE = 32
TEST_BATCH_SIZE = 32
EPOCHS = 50
LEARNING_RATE = 0.001
MOMENTUM = 0.9
USE_CUDA = True
SEED = 0
PRINT_INTERVAL = 100
WEIGHT_DECAY = 0.0005
LOG_PATH = DATA_PATH + 'log.pkl'

# handle GPU connection and multiprocessing
use_cuda = USE_CUDA and torch.cuda.is_available()
torch.manual_seed(SEED)

device = torch.device("cuda" if use_cuda else "cpu")
print('Using device', device)
import multiprocessing
print('num cpus:', multiprocessing.cpu_count())
kwargs = {'num_workers': 0, 'pin_memory': True} if use_cuda else {}

class_names = ['plane', 'car', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck']

# iterator over the data
train_loader = torch.utils.data.DataLoader(data_train, batch_size=BATCH_SIZE, shuffle=True, **kwargs)
test_loader = torch.utils.data.DataLoader(data_test, batch_size=TEST_BATCH_SIZE, shuffle=False, **kwargs)

# Initialize network and send it to GPU
model = CifarNet().to(device)
# Define optimizer
optimizer = optim.SGD(model.parameters(), lr=LEARNING_RATE, momentum=MOMENTUM, weight_decay=WEIGHT_DECAY)

# This will train from scratch
start_epoch = 0 
train_losses, test_losses, test_accuracies = [], [], []

# To resume training from last saved model, uncomment the following 2 lines
# start_epoch = model.load_last_model(DATA_PATH + 'checkpoints')
# train_losses, test_losses, test_accuracies = pt_util.read_log(LOG_PATH, ([], [], []))

# Get the initial test losses and accuracies
test_loss, test_accuracy = test(model, device, test_loader)
test_losses.append((start_epoch, test_loss))
test_accuracies.append((start_epoch, test_accuracy))

# Give `epoch` a value up front so the `finally` clause can still name the
# checkpoint when the loop body never runs (e.g. start_epoch > EPOCHS).
epoch = start_epoch
try:
    for epoch in range(start_epoch, EPOCHS + 1):
        # train the model for 1 epoch
        train_loss = train(model, device, train_loader, optimizer, epoch, PRINT_INTERVAL)
        train_losses.append((epoch, train_loss))
        # test the model after 1 epoch
        test_loss, test_accuracy = test(model, device, test_loader)
        test_losses.append((epoch, test_loss))
        test_accuracies.append((epoch, test_accuracy))
        # Log the losses and accuracy
        pt_util.write_log(LOG_PATH, (train_losses, test_losses, test_accuracies))
        # save the current model in the checkpoints folder
        model.save_best_model(test_accuracy, DATA_PATH + 'checkpoints/cifar_%03d.pt' % epoch)

except KeyboardInterrupt:
    print('Interrupted')
except Exception:
    # Report the failure but still fall through to the final save and plots.
    import traceback
    traceback.print_exc()
finally:
    # Always keep the final weights (num_to_keep=0 removes no older checkpoint).
    model.save_model(DATA_PATH + 'checkpoints/cifar_%03d.pt' % epoch, 0)
    # Plot the loss and accuracy values over epochs
    curves = (
        (train_losses, 'Train loss', 'Error'),
        (test_losses, 'Test loss', 'Error'),
        (test_accuracies, 'Test accuracy', 'Accuracy'),
    )
    for history, plot_title, y_label in curves:
        # A series is empty when training stopped before its first entry;
        # zip(*[]) cannot be unpacked into two names, so skip it.
        if not history:
            continue
        ep, val = zip(*history)
        pt_util.plot(ep, val, plot_title, 'Epoch', y_label, name)


# 1.4 Attempt to use your own test data

Download 5 images for each of the 4 classes given below. Choose the images randomly from Google, but make sure they look somewhat similar to the training data of CIFAR-10 in terms of content. For example, full body image of a dog with minimal background is recommended. Edit the text below to write down the links of the images (**NOTE**: The TAs will be evaluating YOUR network weights with YOUR chosen images, so do not collaborate with others regarding image choices.)

Plane:

- link 1
- link 2
- link 3
- link 4
- link 5

Car:

- link 1
- link 2
- link 3
- link 4
- link 5

Dog:

- link 1
- link 2
- link 3
- link 4
- link 5

Horse:

- link 1
- link 2
- link 3
- link 4
- link 5

Now upload these images in the 'cifar10' folder in your drive. Then write a function show_images() to plot the images in a 4 x 5 grid using matplotlib. Functions to use among others:

- plt.subplot() or fig.add_subplot()
- plt.imshow() or ax.imshow()
- plt.axis('off') or ax.set_axis_off()
- plt.show()
- Image.open() from PIL (convert to numpy array before use in imshow) 

Once done, right click on the grid and click 'Save Image As'. Put the image in your report on a page (titled **External Images**).

# EXTRA CREDIT (20 points)
Use these images as the test set and check the accuracy of your trained model on this custom test set. A useful article: https://towardsdatascience.com/building-efficient-custom-datasets-in-pytorch-2563b946fd9f


In [0]:
# TO DO : image visualization; you can modify anything here
from PIL import Image
def show_images():
    """Plot the downloaded images in a 4 x 5 grid with matplotlib (left as a TO DO)."""
    raise NotImplementedError('Create the grid to visualize images here:')

# EXTRA CREDIT CODE HERE (IF ANY)

# Part 2: Language generation using character-level RNNs

# 2.1 Processing the data

We'll be using the complete text of Harry Potter as our corpus. Place the harry_potter.txt file provided in the 'language' folder created in your drive.

1) Create a dictionary voc2ind to define the vocabulary. voc2ind contents will be like `{' ': 0, 'A': 1, 'B': 2, 'C': 3, ... '0': 35, '1': 36, ..., '$': 78, '#': 79, '(': 80, ...}`. The keys include all 26 letters of the alphabet (upper case and lower case as separate keys), the digits 0-9, and the special characters that may appear in a general text. The values corresponding to the keys can be in any order. You may have to handle single and double quotes carefully in the keys since they are generally used as string delimiters. ind2voc is the inverse of voc2ind.

2) Now using the vocabulary above, convert text into a list of tokens. For example, if the text is **`"ABA CDBE"`**, the token version will be a list with contents `[1, 2, 1, 0, 3, 4, 2, 5]`. Create this list where instructed below.

3) train_text and test_text should contain the first 80% and the last 20% of the 'token' list. This should be simple to write.


In [0]:
DATA_PATH = BASE_PATH + 'language/'
if not os.path.exists(DATA_PATH):
    os.makedirs(DATA_PATH)

def prepare_data(data_path):
    """Tokenize the corpus and pickle the train/test splits.

    Reads the text at ``data_path``, maps characters to indices through
    ``voc2ind`` and writes 'harry_potter_chars_train.pkl' and
    'harry_potter_chars_test.pkl' into DATA_PATH. Each pickle holds a dict with
    the keys 'tokens', 'ind2voc' and 'voc2ind'.

    Args:
        data_path(str): Path of the raw text file.
    """
    with open(data_path) as f:
        # This reads all the data from the file, but does not do any processing on it.
        data = f.read()

    # TO DO (create the voc2ind dictionary)
    voc2ind = {}

    # Compiled once outside the loop; a raw string keeps '\s' from being
    # read as an (invalid) string escape.
    whitespace = re.compile(r'\s+')

    # transform the data into an integer representation of the tokens.
    token = []
    for char in data:
        # replaces all weird spacing like tab, next line etc. with space
        if whitespace.match(char):
            char = ' '
        # TO DO (create the list of tokens; basically replace None with the correct expression)
        token.append(None)

    ind2voc = {val: key for key, val in voc2ind.items()}

    # TO DO 
    train_text = None
    test_text = None

    # Write both splits with the same layout; `with` closes each file so the
    # pickle is fully flushed before it is read back.
    for split_name, split_tokens in (('train', train_text), ('test', test_text)):
        with open(DATA_PATH + 'harry_potter_chars_%s.pkl' % split_name, 'wb') as out_file:
            pickle.dump({'tokens': split_tokens, 'ind2voc': ind2voc, 'voc2ind': voc2ind}, out_file)
    print('Data prepared!')
    
prepare_data(DATA_PATH + 'harry_potter.txt')

class Vocabulary(object):
    """Character <-> index lookup tables read from a pickled dataset file."""

    def __init__(self, data_file):
        # The pickle is expected to contain the 'ind2voc' and 'voc2ind' dicts.
        with open(data_file, 'rb') as handle:
            payload = pickle.load(handle)
        self.ind2voc = payload['ind2voc']
        self.voc2ind = payload['voc2ind']

    # Returns a string representation of the tokens.
    def array_to_words(self, arr):
        pieces = []
        for ind in arr:
            pieces.append(self.ind2voc[int(ind)])
        return ''.join(pieces)

    # Returns a torch tensor representing each token in words.
    def words_to_array(self, words):
        indices = []
        for word in words:
            indices.append(self.voc2ind[word])
        return torch.LongTensor(indices)

    # Returns the size of the vocabulary.
    def __len__(self):
        size = len(self.voc2ind)
        return size

Now we have to load the data. There's nothing to do here on your part, but this is an explanation of what is going on. First imagine splitting the dataset into N chunks, where N is the batch_size and the chunks are contiguous parts of the data. For each batch, we should return one sequence from each of the chunks. The batches should also be sequential; an example is described below.

The data is 20 characters long `[1, 2, 3, ...20]`. The batch size is 2 and the sequence length is 4
- The 1st batch should consist of  `(data =  [[1, 2, 3, 4]; [11, 12, 13, 14]], labels = [[2, 3, 4, 5]; [12, 13, 14, 15]])`
- The 2nd batch should consist of `(data =  [[5, 6, 7, 8]; [15, 16, 17, 18]], labels = [[6, 7, 8, 9]; [16, 17, 18, 19]])`
- The 3rd batch should consist of `(data =  [[9]; [19]], labels = [[10]; [20]])`
- There is no 4th batch.

Note:
- It is OK to have one batch be shorter than the others as long as all entries in that batch are the same length.
- The last label in one batch is the first data in the next batch.

In [0]:
class HarryPotterDataset(torch.utils.data.Dataset):
    """Serves (data, label) character sequences taken from contiguous chunks.

    The token stream is cut into ``batch_size`` equally long chunks. Item
    ``idx`` comes from chunk ``idx % batch_size`` at step ``idx // batch_size``,
    and its label is the same window shifted by one token.
    """

    def __init__(self, data_file, sequence_length, batch_size):
        super(HarryPotterDataset, self).__init__()

        self.sequence_length = sequence_length
        self.batch_size = batch_size
        self.vocab = Vocabulary(data_file)

        with open(data_file, 'rb') as data_pkl:
            all_tokens = pickle.load(data_pkl)['tokens']

        # Trim the tail so the length is a multiple of the batch size;
        # dropping the remainder mostly gives better results than padding with 0s.
        usable_length = (len(all_tokens) // self.batch_size) * self.batch_size
        self.tokens = all_tokens[:usable_length]

        self.chunk_size = int(usable_length / self.batch_size)

    def __len__(self):
        # Number of unique sequences, not the number of characters.
        steps_per_chunk = int(np.ceil((self.chunk_size - 1) / self.sequence_length))
        return steps_per_chunk * self.batch_size

    def __getitem__(self, idx):
        # Which chunk this entry belongs to, and how far into that chunk it starts.
        chunk_idx = idx % self.batch_size
        pos_idx = idx // self.batch_size

        chunk_start = self.chunk_size * chunk_idx
        offset = self.sequence_length * pos_idx
        # One extra token is needed for the shifted labels; never run past the chunk.
        stop = min(offset + self.sequence_length + 1, self.chunk_size)

        window = self.tokens[chunk_start + offset : chunk_start + stop]
        # The data and labels should be torch long tensors.
        inputs = torch.LongTensor(window[:-1])
        targets = torch.LongTensor(window[1:])
        return inputs, targets

    def vocab_size(self):
        return len(self.vocab)

# 2.2 Defining the network

The network, a generic RNN, is defined below for you. Just replace the GRU layer with an LSTM layer. You can look up the LSTM definition in the PyTorch documentation online. Then, implement the forward pass as instructed.

In [0]:
TEMPERATURE = 0.5

class HarryPotterNet(nn.Module):
    """Character-level recurrent language model (embedding -> RNN -> linear decoder).

    The forward pass is left as a TO DO; inference, loss and checkpoint helpers
    are provided.
    """

    def __init__(self, vocab_size, feature_size):
        super(HarryPotterNet, self).__init__()
        self.vocab_size = vocab_size
        self.feature_size = feature_size
        # Maps each token index to a feature_size-dimensional vector.
        self.encoder = nn.Embedding(self.vocab_size, self.feature_size)
        # TO DO (replace the following line with LSTM layer)
        self.gru = nn.GRU(self.feature_size, self.feature_size, batch_first=True)
        # Maps recurrent features back to one score per vocabulary entry.
        self.decoder = nn.Linear(self.feature_size, self.vocab_size)
        
        # This shares the encoder and decoder weights as described in lecture.
        self.decoder.weight = self.encoder.weight
        self.decoder.bias.data.zero_()
        
        # Best accuracy seen so far; used by save_best_model.
        self.best_accuracy = -1
    
    def forward(self, x, hidden_state=None):
        """Return (decoder output, hidden state) for a batch of token indices (TO DO)."""
        batch_size = x.shape[0]
        sequence_length = x.shape[1]
        
        # TO DO 
        # pass x through the encoder first, then through LSTM, then through the decoder.
        # return items from LSTM layer should be both the output and the hidden state.
        raise NotImplementedError 

        return x, hidden_state

    # This defines the function that gives a probability distribution and implements the temperature computation.
    def inference(self, x, hidden_state=None, temperature=1):
        """Run one forward step and return (softmax probabilities, hidden state).

        The scores are divided by ``temperature`` (clamped to at least 1e-20 to
        avoid dividing by zero) before the softmax.
        """
        x = x.view(-1, 1)
        x, hidden_state = self.forward(x, hidden_state)
        x = x.view(1, -1)
        x = x / max(temperature, 1e-20)
        x = F.softmax(x, dim=1)
        return x, hidden_state

    # Predefined loss function
    def loss(self, prediction, label, reduction='mean'):
        """Cross-entropy over all positions; prediction is flattened to (-1, vocab_size) and label to (-1)."""
        loss_val = F.cross_entropy(prediction.view(-1, self.vocab_size), label.view(-1), reduction=reduction)
        return loss_val

    # Saves the current model
    def save_model(self, file_path, num_to_keep=1):
        pt_util.save(self, file_path, num_to_keep)

    # Saves the best model so far
    def save_best_model(self, accuracy, file_path, num_to_keep=1):
        # Only save when the given accuracy beats the best value recorded so far.
        if accuracy > self.best_accuracy:
            self.save_model(file_path, num_to_keep)
            self.best_accuracy = accuracy

    def load_model(self, file_path):
        """Restore weights from the checkpoint at file_path."""
        pt_util.restore(self, file_path)

    def load_last_model(self, dir_path):
        """Restore the most recent checkpoint in dir_path; returns what pt_util.restore_latest returns."""
        return pt_util.restore_latest(self, dir_path)

# 2.3 Character generation and training

You don't have to write any code in this part. Just understand how the steps work. The training and testing functions and the main function are quite similar to the ones in Part 1. Here is an interesting article about sampling strategies (we have used max sampling strategy here for convenience):
https://towardsdatascience.com/how-to-sample-from-language-models-682bceb97277

Feel free to modify various portions of the code to understand better, and refer to online resources. Natural Language Processing details are beyond the scope of this course.

At the end, save the 3 plots (train loss, test loss and test accuracy) as before and put them in your report on a single page (titled **NLP plots**). In addition, write down (in your report) the generated sentence that will be displayed at the end. You will also need to submit 'language_020.pt' file from 'checkpoints' folder in your 'language' folder.


In [0]:
TEMPERATURE = 0.5
BEAM_WIDTH = 10
import tqdm

def repackage_hidden(h):
    """Detach hidden states from their autograd history.

    A tensor is returned detached; any other iterable is rebuilt as a tuple
    with each element processed recursively.
    """
    if not isinstance(h, torch.Tensor):
        return tuple(map(repackage_hidden, h))
    return h.detach()
    

def max_sampling_strategy(sequence_length, model, output, hidden, vocab):
    """Greedily generate ``sequence_length`` token indices.

    Args:
        sequence_length(int): Number of tokens to generate.
        model: Object with an ``inference(ind, hidden, temperature)`` method
            returning (scores, hidden).
        output: Scores for the first token to generate, i.e. the model output
            for the last seed token.
        hidden: Hidden state that goes with ``output``.
        vocab: Unused; kept for a uniform sampling-strategy signature.
    Returns:
        list: The chosen indices (argmax results), in generation order.
    """
    outputs = []
    for step in range(sequence_length):
        if step > 0:
            # Feed the previously chosen token back in to score the next one.
            output, hidden = model.inference(outputs[-1], hidden, TEMPERATURE)
        # The very first token comes from the scores handed in by the caller;
        # it must be emitted too, otherwise the text skips one character
        # right after the seed.
        outputs.append(torch.argmax(output))
    return outputs

def generate_language(model, device, seed_words, sequence_length, vocab, sampling_strategy='max', beam_width=BEAM_WIDTH):
    """Continue ``seed_words`` with ``sequence_length`` generated tokens.

    The seed is fed through the model one token at a time to build up the
    hidden state, then max_sampling_strategy produces the continuation.
    ``sampling_strategy`` and ``beam_width`` are accepted but not used here.

    Returns:
        str: The seed followed by the generated text.
    """
    model.eval()

    with torch.no_grad():
        seed_indices = vocab.words_to_array(seed_words)

        # Computes the initial hidden state from the prompt (seed words).
        state = None
        for seed_index in seed_indices:
            on_device = seed_index.to(device)
            output, state = model.inference(on_device, state)

        generated = max_sampling_strategy(sequence_length, model, output, state, vocab)
        full_sequence = seed_indices.tolist() + generated
        return vocab.array_to_words(full_sequence)
    

def train(model, device, optimizer, train_loader, lr, epoch, log_interval):
    """Train ``model`` for one pass over ``train_loader``.

    Args:
        model: network called as ``model(data, hidden)`` returning
            ``(output, hidden)``, and exposing ``loss(output, label)``.
        device: device the batches are moved to.
        optimizer: optimizer that updates the model parameters.
        train_loader: iterable of ``(data, label)`` batches with a
            ``dataset`` attribute.
        lr: learning rate to use for this epoch; ``None`` leaves the
            optimizer's current rate untouched.
        epoch: epoch number, used only in the progress message.
        log_interval: print progress every ``log_interval`` batches.

    Returns:
        The mean of the per-batch loss values.
    """
    model.train()

    # The caller computes a per-epoch learning rate; it only takes effect
    # once it is written into the optimizer's parameter groups.
    if lr is not None:
        for param_group in optimizer.param_groups:
            param_group['lr'] = float(lr)

    losses = []
    hidden = None
    for batch_idx, (data, label) in enumerate(tqdm.tqdm(train_loader)):
        # send data and label to the target device
        data, label = data.to(device), label.to(device)
        # Separates the hidden state across batches.
        # Otherwise the backward would try to go all the way to the beginning every time.
        if hidden is not None:
            hidden = repackage_hidden(hidden)

        optimizer.zero_grad()
        # Run the model, carrying the (detached) hidden state over from the
        # previous batch, the same way test() does.
        output, hidden = model(data, hidden)

        # compute and backpropagate the loss
        loss = model.loss(output, label)
        losses.append(loss.item())
        loss.backward()

        optimizer.step()

        if batch_idx % log_interval == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, batch_idx * len(data), len(train_loader.dataset),
                100. * batch_idx / len(train_loader), loss.item()))
    return np.mean(losses)


def test(model, device, test_loader):
    """Evaluate ``model`` on every batch of ``test_loader``.

    Runs in eval mode under ``torch.no_grad()``, threading the hidden state
    returned by the model from one batch into the next. Every tenth batch
    the first input / label / prediction of the batch is printed via the
    dataset's vocabulary.

    Returns:
        ``(test_loss, test_accuracy)``: the per-batch mean loss averaged over
        the number of batches, and the percentage of correct predictions out
        of ``len(dataset) * dataset.sequence_length``.
    """
    model.eval()
    dataset = test_loader.dataset
    loss_sum = 0
    n_correct = 0

    with torch.no_grad():
        state = None
        for step, (data, label) in enumerate(test_loader):
            # move the batch to the target device
            data = data.to(device)
            label = label.to(device)

            # forward pass and per-position prediction
            output, state = model(data, state)
            _, pred = output.max(-1)
            n_correct += pred.eq(label.view_as(pred)).sum().item()

            # accumulate the batch loss
            loss_sum += model.loss(output, label, reduction='mean').item()

            # Comment this out to avoid printing test results
            if step % 10 == 0:
                to_words = dataset.vocab.array_to_words
                print('Input\t%s\nGT\t%s\npred\t%s\n\n' % (
                    to_words(data[0]), to_words(label[0]), to_words(pred[0])))

    test_loss = loss_sum / len(test_loader)
    n_total = len(dataset) * dataset.sequence_length
    test_accuracy = 100. * n_correct / n_total

    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
        test_loss, n_correct, n_total, test_accuracy))
    return test_loss, test_accuracy


In [0]:
def _plot_history(history, title, ylabel):
    """Plot a list of ``(epoch, value)`` pairs; do nothing if it is empty."""
    if not history:
        # Nothing was recorded (e.g. interrupted before the first epoch
        # finished); unpacking zip(*[]) would raise.
        return
    ep, val = zip(*history)
    pt_util.plot(ep, val, title, 'Epoch', ylabel, name)


def main():
    """Train and evaluate the character-level language model.

    Builds the train/test datasets and loaders, the network and an Adam
    optimizer, evaluates once before training, then for every epoch trains,
    evaluates, logs the metrics, saves the best model and prints a generated
    sentence. Whatever happens in the loop, the final model is saved and the
    recorded metrics are plotted.

    Returns:
        ``(model, vocab, device)``.
    """
    SEQUENCE_LENGTH = 100
    BATCH_SIZE = 256
    FEATURE_SIZE = 512
    TEST_BATCH_SIZE = 256
    EPOCHS = 20
    LEARNING_RATE = 0.002
    WEIGHT_DECAY = 0.0005
    USE_CUDA = True
    PRINT_INTERVAL = 10
    LOG_PATH = DATA_PATH + 'logs/log.pkl'

    # setup GPU and multiprocessing
    use_cuda = USE_CUDA and torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    print('Using device', device)
    import multiprocessing
    num_workers = multiprocessing.cpu_count()
    print('num workers:', num_workers)
    kwargs = {'num_workers': num_workers, 'pin_memory': True} if use_cuda else {}

    # get train and test data
    data_train = HarryPotterDataset(DATA_PATH + 'harry_potter_chars_train.pkl', SEQUENCE_LENGTH, BATCH_SIZE)
    data_test = HarryPotterDataset(DATA_PATH + 'harry_potter_chars_test.pkl', SEQUENCE_LENGTH, TEST_BATCH_SIZE)
    vocab = data_train.vocab
    train_loader = torch.utils.data.DataLoader(data_train, batch_size=BATCH_SIZE, shuffle=False, **kwargs)
    test_loader = torch.utils.data.DataLoader(data_test, batch_size=TEST_BATCH_SIZE, shuffle=False, **kwargs)

    # initialize model
    model = HarryPotterNet(data_train.vocab_size(), FEATURE_SIZE).to(device)

    # Adam is an optimizer like SGD but a bit fancier. It tends to work faster and better than SGD.
    optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)

    # This will train from scratch
    start_epoch = 0
    train_losses, test_losses, test_accuracies = [], [], []

    # To resume training from last saved model, uncomment the following 2 lines
    # start_epoch = model.load_last_model(DATA_PATH + 'checkpoints')
    # train_losses, test_losses, test_accuracies = pt_util.read_log(LOG_PATH, ([], [], []))

    # Get initial test loss and accuracy
    test_loss, test_accuracy = test(model, device, test_loader)
    test_losses.append((start_epoch, test_loss))
    test_accuracies.append((start_epoch, test_accuracy))

    # ``epoch`` is read in the ``finally`` clause, so it needs a value even
    # when the loop body never runs (e.g. resuming past the last epoch).
    epoch = start_epoch
    try:
        for epoch in range(start_epoch, EPOCHS + 1):
            # learning rate is multiplied by 0.25 every 6 epochs
            lr = LEARNING_RATE * np.power(0.25, (int(epoch / 6)))

            # train for 1 epoch
            train_loss = train(model, device, optimizer, train_loader, lr, epoch, PRINT_INTERVAL)
            train_losses.append((epoch, train_loss))
            # test after 1 epoch
            test_loss, test_accuracy = test(model, device, test_loader)
            test_losses.append((epoch, test_loss))
            test_accuracies.append((epoch, test_accuracy))
            # Log the losses and accuracy
            pt_util.write_log(LOG_PATH, (train_losses, test_losses, test_accuracies))
            # save best model
            model.save_best_model(test_accuracy, DATA_PATH + 'checkpoints/language_%03d.pt' % epoch)

            # Test with a sentence starting with the given seed words. See what is generated next.
            seed_words = 'Harry Potter, Voldemort, and Dumbledore walk into a bar. '
            generated_sentence = generate_language(model, device, seed_words, 200, vocab, 'max')
            print('generated sentence\t\t', generated_sentence)
            print('')

    except KeyboardInterrupt:
        print('Interrupted')
    except Exception:
        # Report training errors but still save and plot what was achieved.
        # Exceptions such as SystemExit are deliberately not caught here.
        import traceback
        traceback.print_exc()
    finally:
        print('Saving final model')
        model.save_model(DATA_PATH + 'checkpoints/language_%03d.pt' % epoch, 0)
        # plot the loss and accuracy values
        _plot_history(train_losses, 'Train loss', 'Error')
        _plot_history(test_losses, 'Test loss', 'Error')
        _plot_history(test_accuracies, 'Test accuracy', 'Accuracy')

    # Returning after (not inside) ``finally`` keeps unhandled exceptions
    # from being silently discarded.
    return model, vocab, device

final_model, vocab, device = main()

# Submission.
Download a copy of this Python notebook using File -> Download .ipynb. Upload the following files to Canvas INDIVIDUALLY (**Do Not ZIP**):

- cse415A6.ipynb
- report.pdf
- cifar_050.pt
- language_020.pt

P.S. This codebase has been written following the codebase for the homework assignments of the CSE490G1 / CSE599G1 (Introduction to Deep Learning) course during Fall 2019.