### NOTE: Search for "### FILL IN ###" to find areas of problem that you are expected to work on.

In [None]:
import cv2
import numpy as np
import urllib.request
from google.colab.patches import cv2_imshow

import warnings
warnings.filterwarnings('ignore')

In [None]:
import torch
import random
import os
import torch.nn as nn
import torch.optim as optim
import torchvision.datasets as datasets
import torchvision.transforms as transforms

# Seed every random number generator in use, for reproducibility
def seed_everything(seed=1234):
    """Seed Python, NumPy and PyTorch RNGs and request deterministic cuDNN.

    Args:
        seed: integer seed applied to every generator (default 1234).
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    # Exported for any Python process launched after this point; the hash
    # seed of the already-running interpreter cannot be changed from here.
    os.environ['PYTHONHASHSEED'] = str(seed)
    # Deterministic cuDNN kernels alone are not enough: the benchmark
    # auto-tuner may pick a different algorithm from run to run.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
seed_everything()


# **PART A: CNN [70 POINTS]**

**Q1: CNN basics [30 POINTS]**


The convolution function takes in an image, a kernel, a stride, and a padding size, and returns the output of the convolution operation. The image and kernel are numpy arrays, where image is of shape (height, width, channels) and kernel is of shape (kernel_height, kernel_width, channels, num_filters). The stride and padding are integers that specify the stride size and padding size, respectively.

<br>

Note that this implementation assumes the input image has `channels` channels (its depth) and that the kernel has the same depth.

<br>

Apply convolution operation on the input image with the specified kernel, stride, and padding.

It takes arguments

    - image: numpy array of shape (height, width, channels)
    - kernel: numpy array of shape (kernel_height, kernel_width, channels, num_filters)
    - stride: integer, the stride size
    - padding: integer, the padding size

and returns

    - numpy array of shape (output_height, output_width, num_filters)

In [None]:
def convolution(image, kernel, stride, padding):
    """Apply a strided, zero-padded 2-D convolution to a multi-channel image.

    Each filter is slid over the (padded) image without flipping, i.e. the
    cross-correlation used by CNN layers, and the products are summed over
    the kernel window and over all input channels.

    Args:
        image: numpy array of shape (height, width, channels).
        kernel: numpy array of shape
            (kernel_height, kernel_width, channels, num_filters).
        stride: integer step between consecutive window positions.
        padding: integer number of zero rows/columns added on every side.

    Returns:
        float64 numpy array of shape (output_height, output_width, num_filters).
    """
    height, width, channels = image.shape
    kernel_height, kernel_width, _, num_filters = kernel.shape

    # Calculate output shape: floor((size + 2*padding - kernel_size) / stride) + 1
    output_height = (height + 2 * padding - kernel_height) // stride + 1
    output_width = (width + 2 * padding - kernel_width) // stride + 1

    # Add padding to the image if any (zeros on the two spatial axes only,
    # the channel axis is left untouched)
    if padding > 0:
        image_padded = np.pad(
            image,
            ((padding, padding), (padding, padding), (0, 0)),
            mode='constant',
        )
    else:
        image_padded = image
    # Work in float so that integer images (e.g. uint8) cannot wrap around
    image_padded = image_padded.astype(np.float64)

    # Initialize output array
    output = np.zeros((output_height, output_width, num_filters))
    # Apply convolution: one window per output pixel, contracted against all
    # filters at once over (kernel row, kernel column, channel)
    for row in range(output_height):
        top = row * stride
        for col in range(output_width):
            left = col * stride
            window = image_padded[top:top + kernel_height,
                                  left:left + kernel_width, :]
            output[row, col, :] = np.tensordot(
                window, kernel, axes=([0, 1, 2], [0, 1, 2])
            )

    return output


In [None]:
# Download sample image from OpenCV repository
# (needs network access; the file is written to the current working directory)
url = 'https://raw.githubusercontent.com/opencv/opencv/master/samples/data/board.jpg'
filename = 'sample_image.jpg'
# urlretrieve returns a (local_filename, headers) tuple, shown as the cell output
urllib.request.urlretrieve(url, filename)

In [None]:
### Sanity check: sample_image.jpg should appear in the directory listing below

! ls

In [None]:
# Load sample image
image = cv2.imread('sample_image.jpg')
# cv2.imread reports a missing or unreadable file by returning None instead
# of raising, so fail here with a clear message rather than further down.
if image is None:
    raise FileNotFoundError(
        "sample_image.jpg could not be read; re-run the download cell above"
    )
cv2_imshow(image)

In [None]:
# Inspect the array shape; convolution() expects (height, width, channels)
image.shape

In [None]:
# Build a (3, 3, 3, 1) kernel: a single filter whose 3x3 pattern is repeated
# unchanged for each of the three input channels.
kernel_1 = np.repeat(
    np.array([[1, 0, -1],
              [1, 0, -1],
              [1, 0, -1]], dtype=np.float64)[:, :, np.newaxis, np.newaxis],
    3,
    axis=2,
)

In [None]:
# Apply convolution with stride=1 and padding=1
# NOTE(review): per the convolution() description the result has shape
# (output_height, output_width, num_filters) with num_filters == 1 here;
# confirm cv2_imshow accepts a single-channel float array in this layout.
output_1 = convolution(image, kernel_1, stride=1, padding=1)
cv2_imshow(output_1)

In [None]:
# Apply convolution with stride=3 and padding=1
# (same image and kernel as the stride=1 run; only the stride differs)
output_1b = convolution(image, kernel_1, stride=3, padding=1)
cv2_imshow(output_1b)

In [None]:
### restoring the size to see differences
# cv2.resize takes the target size as (width, height), hence shape[1] before shape[0]
output_1c = cv2.resize(output_1b, (image.shape[1], image.shape[0])) 
cv2_imshow(output_1c)

In [None]:
# Build a (3, 3, 3, 1) kernel: a single filter whose 3x3 pattern is repeated
# unchanged for each of the three input channels.
kernel_2 = np.repeat(
    np.array([[1, 1, 1],
              [0, 0, 0],
              [-1, -1, -1]], dtype=np.float64)[:, :, np.newaxis, np.newaxis],
    3,
    axis=2,
)

In [None]:
# Apply convolution with stride=1 and padding=1
# (same settings as the first kernel_1 run, so the two outputs can be compared directly)
output_2 = convolution(image, kernel_2, stride=1, padding=1)
cv2_imshow(output_2)

"### FILL IN ###" [3 POINTS]

What is the operation kernel_1 responsible for?

Answer: 

<hr>

"### FILL IN ###" [3 POINTS]

What is the operation kernel_2 responsible for?

Answer: 

<hr>

"### FILL IN ###" [4 POINTS]

As seen in output_1b and output_1c, what happens when we increase the stride value?

Answer: 

<hr>


**Q2: CNN vs ANN [40 POINTS]**

Let's consider the MNIST dataset, which contains images of handwritten digits. Each image is 28 pixels wide and 28 pixels tall, for a total of 784 pixels. The task is to classify each image into one of ten possible classes (0-9).

<br>

In the Menu, change runtime type to GPU for faster computations

In [None]:
# Set device: use the GPU when CUDA is available, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

In [None]:
# Load MNIST dataset
# ToTensor converts each image to a float tensor scaled to [0, 1].
# NOTE(review): only the training split passes download=True; presumably that
# download also provides the test files under ./data -- confirm on a fresh runtime.
train_dataset = datasets.MNIST(root='./data', train=True, transform=transforms.ToTensor(), download=True)
test_dataset = datasets.MNIST(root='./data', train=False, transform=transforms.ToTensor())

# Define data loaders
# Training batches are reshuffled every epoch; the test order is kept fixed.
train_loader = torch.utils.data.DataLoader(dataset=train_dataset, batch_size=128, shuffle=True)
test_loader = torch.utils.data.DataLoader(dataset=test_dataset, batch_size=128, shuffle=False)

In [None]:
import matplotlib.pyplot as plt

# Pull one batch from the training loader
images, labels = next(iter(train_loader))

# Show the first 32 images of that batch on a 4x8 grid, in row-major order
fig, axs = plt.subplots(4, 8, figsize=(10, 5))

for idx, ax in enumerate(axs.flat):
    ax.imshow(np.squeeze(images[idx]), cmap='gray')
    ax.axis('off')

plt.show()

In [None]:
# Define the neural network model
# Use single hidden layer of 128 nodes
# Use ReLU activation
class NeuralNet(nn.Module):
    """Fully connected MNIST classifier: 784 inputs -> 128 (ReLU) -> 10 logits."""

    def __init__(self):
        super(NeuralNet, self).__init__()
        # 28x28 single-channel images are flattened to 784 features
        self.flatten = nn.Flatten()
        self.hidden = nn.Linear(28 * 28, 128)
        self.relu = nn.ReLU()
        # One output per digit class (0-9)
        self.output = nn.Linear(128, 10)

    def forward(self, x):
        """Map a batch of shape (N, 1, 28, 28) to raw class scores of shape (N, 10).

        No softmax is applied: nn.CrossEntropyLoss expects unnormalised logits.
        """
        x = self.flatten(x)
        x = self.relu(self.hidden(x))
        x = self.output(x)
        return x

In [None]:
# Define the convolutional neural network model
# Use first convolution layer of 32 filters of size of 3x3 and a stride of 1
# Use second convolution layer of 64 filters of size of 3x3 and a stride of 1
# Use max pooling layer with kernel size of 2x2 and a stride of 2.
# Use single hidden layer of 128 nodes
# Use ReLU activation
class ConvNet(nn.Module):
    """MNIST classifier: conv(32) -> conv(64) -> max-pool -> 128 (ReLU) -> 10 logits."""

    def __init__(self):
        super(ConvNet, self).__init__()
        # No padding is requested, so each 3x3 convolution trims 2 pixels:
        # 28 -> 26 -> 24, and the 2x2 pool halves that to 12.
        self.conv1 = nn.Conv2d(1, 32, kernel_size=3, stride=1)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, stride=1)
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2)
        self.relu = nn.ReLU()
        self.flatten = nn.Flatten()
        self.hidden = nn.Linear(64 * 12 * 12, 128)
        # One output per digit class (0-9)
        self.output = nn.Linear(128, 10)

    def forward(self, x):
        """Map a batch of shape (N, 1, 28, 28) to raw class scores of shape (N, 10).

        No softmax is applied: nn.CrossEntropyLoss expects unnormalised logits.
        """
        x = self.relu(self.conv1(x))
        x = self.relu(self.conv2(x))
        x = self.pool(x)
        x = self.flatten(x)
        x = self.relu(self.hidden(x))
        x = self.output(x)
        return x

In [None]:
# Initialize models and optimizer
# Both models are moved to the selected device before their optimizers are built.
neural_net = NeuralNet().to(device)
conv_net = ConvNet().to(device)
# One shared loss function for both models
criterion = nn.CrossEntropyLoss()
# Each model gets its own Adam optimizer with the same learning rate
neural_net_optimizer = optim.Adam(neural_net.parameters(), lr=0.001)
conv_net_optimizer = optim.Adam(conv_net.parameters(), lr=0.001)

In [None]:
# Train the fully connected network for 5 epochs and report the accuracy on
# the test loader after every epoch
for epoch in range(5):
    print("epoch = ",epoch)
    neural_net.train()
    for batch_idx, (data, targets) in enumerate(train_loader):
        data, targets = data.to(device), targets.to(device)

        # Clear the previous step's gradients, then forward / backward / update
        neural_net_optimizer.zero_grad()
        neural_net_predictions = neural_net(data)
        neural_net_loss = criterion(neural_net_predictions, targets)
        neural_net_loss.backward()
        neural_net_optimizer.step()

    # Evaluation pass: no gradients, count matches of the top-scoring class
    neural_net.eval()
    correct = total = 0
    with torch.no_grad():
        for data, targets in test_loader:
            data, targets = data.to(device), targets.to(device)
            neural_net_predictions = neural_net(data)
            predicted = torch.max(neural_net_predictions.data, 1).indices
            correct += (predicted == targets).sum().item()
            total += len(targets)
    print("test accuracy = ",correct/total)

In [None]:
# Train the convolutional network for 5 epochs and report the accuracy on
# the test loader after every epoch
for epoch in range(5):
    print("epoch = ",epoch)
    conv_net.train()
    for batch_idx, (data, targets) in enumerate(train_loader):
        data, targets = data.to(device), targets.to(device)

        # Clear the previous step's gradients, then forward / backward / update
        conv_net_optimizer.zero_grad()
        conv_net_predictions = conv_net(data)
        conv_net_loss = criterion(conv_net_predictions, targets)
        conv_net_loss.backward()
        conv_net_optimizer.step()

    # Evaluation pass: no gradients, count matches of the top-scoring class
    conv_net.eval()
    correct = total = 0
    with torch.no_grad():
        for data, targets in test_loader:
            data, targets = data.to(device), targets.to(device)
            conv_net_predictions = conv_net(data)
            predicted = torch.max(conv_net_predictions.data, 1).indices
            correct += (predicted == targets).sum().item()
            total += len(targets)

    print("test accuracy = ",correct/total)

Next, we apply lateral shifts to these images and observe how the performance of each model changes.

In [None]:
import matplotlib.pyplot as plt

# Pull the first batch from the (unshuffled) test loader
images, labels = next(iter(test_loader))

# Show nine samples on a 3x3 grid
fig, axs = plt.subplots(3, 3, figsize=(5, 5))

for i, axes_row in enumerate(axs):
    for j, ax in enumerate(axes_row):
        # Row i shows batch items 8*i .. 8*i+2; every 3x3 preview in this
        # notebook uses this indexing, so the same samples are shown each time.
        ax.imshow(np.squeeze(images[i*8+j]), cmap='gray')
        ax.axis('off')

plt.show()

In [None]:
# Define the transformation
# degrees=0 disables rotation; translate=(0.2, 0.2) allows a random shift of up
# to 20% of the image width and height. The shift is applied to the raw image
# before it is converted to a tensor.
# NOTE(review): the shift is re-drawn every time a sample is fetched, so the
# preview and each evaluation pass below see different shifted images.
shifted_transform = transforms.Compose([
                    transforms.RandomAffine(degrees=0, translate=(0.2, 0.2)),
                    transforms.ToTensor(),
                ])

# Same MNIST test split as before, now read through the shifting transform
test_shifted_dataset = datasets.MNIST(root='./data', train=False, transform=shifted_transform)
test_shifted_loader = torch.utils.data.DataLoader(test_shifted_dataset, batch_size=128, shuffle=False)

In [None]:
# Pull the first batch from the shifted test loader
images, labels = next(iter(test_shifted_loader))

# Show nine samples on a 3x3 grid
fig, axs = plt.subplots(3, 3, figsize=(5, 5))

for i, axes_row in enumerate(axs):
    for j, ax in enumerate(axes_row):
        # Row i shows batch items 8*i .. 8*i+2; every 3x3 preview in this
        # notebook uses this indexing, so the same samples are shown each time.
        ax.imshow(np.squeeze(images[i*8+j]), cmap='gray')
        ax.axis('off')

plt.show()

In [None]:
# Accuracy of the fully connected network on the shifted test images
neural_net.eval()
correct = total = 0
with torch.no_grad():
    for data, targets in test_shifted_loader:
        data, targets = data.to(device), targets.to(device)
        neural_net_predictions = neural_net(data)
        # Predicted class = index of the highest score in each row
        predicted = torch.max(neural_net_predictions.data, 1).indices
        correct += (predicted == targets).sum().item()
        total += len(targets)

print("test accuracy = ",correct/total)

In [None]:
# Accuracy of the convolutional network on the shifted test images
conv_net.eval()
correct = total = 0
with torch.no_grad():
    for data, targets in test_shifted_loader:
        data, targets = data.to(device), targets.to(device)
        conv_net_predictions = conv_net(data)
        # Predicted class = index of the highest score in each row
        predicted = torch.max(conv_net_predictions.data, 1).indices
        correct += (predicted == targets).sum().item()
        total += len(targets)

print("test accuracy = ",correct/total)

Applying shifts of higher magnitude 

In [None]:
# Define the transformation
# Same as the earlier shifted transform, but with the maximum random shift
# raised to 40% of the image width and height (degrees=0 still disables rotation).
# NOTE: this cell rebinds shifted_transform, test_shifted_dataset and
# test_shifted_loader, so the cells that follow read the 0.4 variant.
shifted_transform = transforms.Compose([
                    transforms.RandomAffine(degrees=0, translate=(0.4, 0.4)),
                    transforms.ToTensor(),
                ])

test_shifted_dataset = datasets.MNIST(root='./data', train=False, transform=shifted_transform)
test_shifted_loader = torch.utils.data.DataLoader(test_shifted_dataset, batch_size=128, shuffle=False)

In [None]:
# Pull the first batch from the shifted test loader
images, labels = next(iter(test_shifted_loader))

# Show nine samples on a 3x3 grid
fig, axs = plt.subplots(3, 3, figsize=(5, 5))

for i, axes_row in enumerate(axs):
    for j, ax in enumerate(axes_row):
        # Row i shows batch items 8*i .. 8*i+2; every 3x3 preview in this
        # notebook uses this indexing, so the same samples are shown each time.
        ax.imshow(np.squeeze(images[i*8+j]), cmap='gray')
        ax.axis('off')

plt.show()

In [None]:
# Accuracy of the fully connected network on the shifted test images
neural_net.eval()
correct = total = 0
with torch.no_grad():
    for data, targets in test_shifted_loader:
        data, targets = data.to(device), targets.to(device)
        neural_net_predictions = neural_net(data)
        # Predicted class = index of the highest score in each row
        predicted = torch.max(neural_net_predictions.data, 1).indices
        correct += (predicted == targets).sum().item()
        total += len(targets)

print("test accuracy = ",correct/total)

In [None]:
# Accuracy of the convolutional network on the shifted test images
conv_net.eval()
correct = total = 0
with torch.no_grad():
    for data, targets in test_shifted_loader:
        data, targets = data.to(device), targets.to(device)
        conv_net_predictions = conv_net(data)
        # Predicted class = index of the highest score in each row
        predicted = torch.max(conv_net_predictions.data, 1).indices
        correct += (predicted == targets).sum().item()
        total += len(targets)

print("test accuracy = ",correct/total)

"### FILL IN ###" [5 POINTS]

Report final performances of NN and CNN on basic MNIST test data. 

Answer: 

<hr>

"### FILL IN ###" [5 POINTS]

Which one of the two models is better on shifted MNIST test data [2 POINTS] and why [3 POINTS] ?


Answer: 

<hr>

"### FILL IN ###" [5 POINTS]

We noted that the performance dropped when the images are shifted. In order to make the model more robust can we train the model on both the original and shifted images ?

Answer: 

<hr>


# **PART B: RNN [30 POINTS]**

**Q3: RNN vs LSTM [30 POINTS]**


In this example, we first load the IMDB movie-review sentiment dataset from Keras. We then pad the sequences to a fixed length of 100 and build two models, one with a SimpleRNN layer and the other with an LSTM layer.

We then train both models for 10 epochs using the RMSprop optimizer and binary cross-entropy loss function. During training, 20% of the training data is held out (`validation_split=0.2`), and we compare the two models using the validation accuracy that `fit` reports.


In [None]:
import tensorflow as tf
from tensorflow.keras.layers import LSTM, SimpleRNN, Dense, Embedding
from tensorflow.keras.models import Sequential
# Global TensorFlow seed. tf.random.set_seed is the TF 2.x API; the TF 1.x
# name tf.set_random_seed no longer exists and raises AttributeError.
tf.random.set_seed(1234)
# Load the IMDB movie-review sentiment dataset, keeping only the 10,000 most
# frequent words
(x_train, y_train), (x_test, y_test) = tf.keras.datasets.imdb.load_data(num_words=10000)

In [None]:
# Fetch the word -> integer index dictionary for the IMDB dataset
word_index = tf.keras.datasets.imdb.get_word_index()

# Invert it so that an integer index maps back to its word
reverse_word_index = {index: word for word, index in word_index.items()}

Having a look at some samples

In [None]:
### record at index 4 (i.e. the 5th review) in training data (integers are word indices)

print(x_train[4])

In [None]:
# Look up the word stored under index 10 in the reversed dictionary
reverse_word_index[10]

In [None]:
# Decode the review text for this record.
# Each token is looked up at (token - 3); tokens with no matching entry in
# the reversed dictionary are rendered as '?'.
review_text = ' '.join(
    reverse_word_index.get(token - 3, '?')
    for token in x_train[4]
)
review_text

In [None]:
# Sanity check: the decoded text should contain one word per token index
len(review_text.split()), len(x_train[4])

In [None]:
### corresponding label for the record at index 4 in training data (0=negative,1=positive for movie reviews)
y_train[4]

In [None]:
### looking at a positive movie review (record at index 400)
y_train[400]

In [None]:
# Decode record 400 the same way: look each token up at (token - 3) and
# render tokens with no matching entry as '?'.
review_text = ' '.join(
    reverse_word_index.get(token - 3, '?')
    for token in x_train[400]
)
review_text

In [None]:
# Pad the sequences to a fixed length
# Every review becomes exactly max_len tokens: shorter ones are zero-padded and
# longer ones are truncated (with pad_sequences' default settings both happen
# at the start of the sequence).
# NOTE: x_train / x_test are overwritten here, so the variable-length originals
# are only recoverable by re-running the data-loading cell.
max_len = 100
x_train = tf.keras.preprocessing.sequence.pad_sequences(x_train, maxlen=max_len)
x_test = tf.keras.preprocessing.sequence.pad_sequences(x_test, maxlen=max_len)

In [None]:
# Build a SimpleRNN model
# Use an embedding layer with an output dimension of 128
# Use a single output layer with a sigmoid activation function
# Add optimizer and loss as mentioned above

model_rnn = Sequential([
    # input_dim matches num_words=10000 used when the dataset was loaded
    Embedding(input_dim=10000, output_dim=128),
    # The recurrent width is not fixed by the instructions above; 128 units
    # is a choice made here.
    SimpleRNN(128),
    # Single sigmoid unit: probability that the review is positive
    Dense(1, activation='sigmoid'),
])
model_rnn.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['accuracy'])

# Train the SimpleRNN model (20% of the training data is held out for validation)
model_rnn.fit(x_train, y_train, epochs=10, batch_size=128, validation_split=0.2)

In [None]:
# Build an LSTM model
# Use an embedding layer with an output dimension of 128
# Use a single output layer with a sigmoid activation function
# Add optimizer and loss as mentioned above

model_lstm = Sequential([
    # input_dim matches num_words=10000 used when the dataset was loaded
    Embedding(input_dim=10000, output_dim=128),
    # The recurrent width is not fixed by the instructions above; 128 units
    # is a choice made here.
    LSTM(128),
    # Single sigmoid unit: probability that the review is positive
    Dense(1, activation='sigmoid'),
])
model_lstm.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['accuracy'])

# Train the LSTM model (20% of the training data is held out for validation)
model_lstm.fit(x_train, y_train, epochs=10, batch_size=128, validation_split=0.2)

"### FILL IN ###" [5 POINTS]

Report final performances of RNN and LSTM on IMDB validation data. 

Answer: 

<hr>

"### FILL IN ###" [5 POINTS]

Which one of the two models is better on the IMDB validation data? [2 POINTS] State one disadvantage of using an LSTM over a traditional RNN. [3 POINTS]

Answer: 

<hr>

"### FILL IN ###" [5 POINTS]

What is a Long Short-Term Memory (LSTM) network? [2 POINTS] How does it differ from a traditional RNN? [3 POINTS] 



Answer: 

<hr>

"### FILL IN ###" [5 POINTS]

What are Seq2Vec [2 POINTS], Vec2Seq [2 POINTS] and Seq2Seq [1 POINTS] models?

Answer: 