In [11]:
import datetime
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchvision import datasets, transforms
import torch.optim as optim
from matplotlib import pyplot as plt

In [12]:
# Download CIFAR 10 dataset for training and validation purposes and apply the following changes on each image:
data_path = '../data-unversioned/p1ch7/'
# 1) make it a tensor
cifar10 = datasets.CIFAR10(data_path, train=True, download=True)
cifar10_val = datasets.CIFAR10(data_path, train=False, download=True)

# 2) normalize it based on the mean and standard deviation among all pixels in each channel (RGB).
transforms.Normalize([0.4914, 0.4822, 0.4465], [0.2470, 0.2435, 0.2616])

cifar10 = datasets.CIFAR10(
    data_path, 
    train=True, 
    download=False,
    transform=transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize([0.4914, 0.4822, 0.4465],
                             [0.2470, 0.2435, 0.2616])
]))

cifar10_val = datasets.CIFAR10(
    data_path, 
    train=False, 
    download=False,
    transform=transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize([0.4914, 0.4822, 0.4465],
                             [0.2470, 0.2435, 0.2616])
]))

# Print the size of training and validation datasets
print("Size of cifar10 dataset", len(cifar10))
print("Size of cifar10 validation dataset", len(cifar10_val))

Files already downloaded and verified
Files already downloaded and verified
Size of cifar10 dataset 50000
Size of cifar10 validation dataset 10000


In [13]:
# We want to make a tertiary classifier that distinguishes between deers, dogs, and horses, labeled as 4, 5, and 7, resp.
label_map = {4: 0, 5: 1, 7: 2}
class_names = ['deers', 'dogs', 'horses']

# Create the subset training and validation datasets for this purpose.
ddh = [(img, label_map[label])
          for img, label in cifar10 if label in label_map.keys()]

ddh_val = [(img, label_map[label])
             for img, label in cifar10_val if label in label_map.keys()]

# Print the size of these datasets.
print("Size of cifar2 training set", len(ddh))
print("Size of cifar2 validation set", len(ddh_val))

Size of cifar2 training set 15000
Size of cifar2 validation set 3000


In [14]:
# Create a parameterized CNN with the following details.
# The parameter is the number of output channels n after the first convolution.

# All kernels are of size 3 by 3.
# Convolutions must not change the height and width.
# Each convolution is followed by hyperbolic tangent as the activation function, and max pooling of size 2 by 2.
# Convolution ayers:
# 1) First convolution layer works on the input RGB input. Let's assume there are n kernels in this layer.
# 2) Second convolution layer works on the result of the preceding max pooling layer. 
#    Let's assume there are n/2 kernels in this layer.
# 3) Third convolution layer works on the result of the preceding max pooling layer. 
#    Let's assume there are n/2 kernels in this layer. 

# Fully connected layers:
# 1) First fully connected layer works on the result of the preceding max pooling layer. 
#    This layer is followed by hyperbolic tangent as its activation function.
# 2) Second fully connected layer works on the result of the preceding activation function, and emits numbers associated
#    with each class.
# We will use negative log likelihood to compute the loss. So you may add additional layer(s) to your network.
# Note: Since the network is parameterized (n), you'd rather define the CNN as a subclass of nn.Module.

class Net(nn.Module):
    def __init__(self, n):
        super().__init__()
        self.n = n
        self.conv1 = nn.Conv2d(
                        3,     # Input Features
                        n,     # Output Features
                        kernel_size = 3, padding=1)
        self.conv2 = nn.Conv2d(n, n//2, kernel_size=3, padding=1)
        self.conv3 = nn.Conv2d(n//2, n//2, kernel_size=3, padding=1)
        
        self.fc1 = nn.Linear(4 * 4 * n // 2, 32)
        self.fc2 = nn.Linear(32, 3)
    
    def forward(self, x):
        out = F.max_pool2d(torch.tanh(self.conv1(x)), 2)
        out = F.max_pool2d(torch.tanh(self.conv2(out)), 2)
        out = F.max_pool2d(torch.tanh(self.conv3(out)), 2)
        out = out.view(-1, 4 * 4 * (self.n // 2))
        out = torch.tanh(self.fc1(out))
        out = self.fc2(out)
        out = F.log_softmax(out, dim=1)
        return out

In [15]:
# Create two networks as instances of the CNN you defined above, with n = 16 and n = 32 respectively.
# Print the total number of parameters in each of these instances.
net16 = Net(16)
numel_list1 = [p.numel() for p in net16.parameters()]

net32 = Net(32)
numel_list2 = [p.numel() for p in net32.parameters()]

print(f"Num parameters n=16:", sum(numel_list1), numel_list1)
print(f"Num parameters n=32:", sum(numel_list2), numel_list2)

Num parameters n=16: 6419 [432, 16, 1152, 8, 576, 8, 4096, 32, 96, 3]
Num parameters n=32: 16163 [864, 32, 4608, 16, 2304, 16, 8192, 32, 96, 3]


In [16]:
# Our training functionality is supposed to compute gradient on batches of training data, randomly selected each time.
# To this end, create a training data loader with batch size 32 that randomizes access to each batch.
train_loader = torch.utils.data.DataLoader(
    ddh,
    batch_size=32,
    shuffle=True
)

# Also, create a validation data loader with the same batch size that does not randomize access to each batch (no need!)
val_loader = torch.utils.data.DataLoader(
    ddh_val,
    batch_size=32,
    shuffle=False
)

# Print the number of batches in training and validation data loaders
print("Num batches in train_loader:", len(train_loader))
print("Num batches in val_loader:", len(val_loader))

Num batches in train_loader: 469
Num batches in val_loader: 94


In [17]:
# Define your training function that receives the training loader, model, loss function, optimizer, the device (cpu/gpu), and 
# number of epochs.
# In each epoch, you should go through each training data batch, and:
# 1) move data to device
# 1) compute the output batch, and accordingly the loss
# 2) compute the gradient of loss wrt parameters, and update the parameters
#After covering all epochs, your training function must report the training accuracy

device = (torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu'))

def training_loop(n_epochs, optimizer, model, loss_fn, train_loader):
    for epoch in range(1, n_epochs + 1):
        loss_train = 0.0
        for imgs, labels in train_loader:
            imgs = imgs.to(device=device)
            labels = labels.to(device=device)
            outputs = model(imgs)
            loss = loss_fn(outputs, labels)
            
            optimizer.zero_grad()
            loss.backward()            
            optimizer.step()
            
            loss_train += loss.item()
            
        if epoch == 1 or epoch % 10 == 0:
            print('{} Epoch {}, Training loss {}'.format(
                datetime.datetime.now(), epoch,
                loss_train / len(train_loader)))


In [18]:
#Define a separate function that receives the validation data loader as well as the model and computes the validation 
# accuracy of the model.

def validate(model, train_loader, val_loader):
    for name, loader in [("train", train_loader), ("val", val_loader)]:
        correct = 0
        total = 0
        with torch.no_grad():
            for imgs, labels in loader:
                outputs = model(imgs)
                _, predicted = torch.max(outputs, dim=1)
                total += labels.shape[0]
                correct += int((predicted == labels).sum())
        print("Accuracy {}: {:.6f}".format(name, correct / total))                

In [15]:
#Define device dynamically based on whether CUDA is available or not.
#Call the training function on the created training data loader, the created CNN  with n = 16, 
# negative log likelihood loss function, stochastic gradient descent optimizer,
training_loop(
    n_epochs=100,
    optimizer=optim.SGD(net16.parameters(), lr=1e-2),
    model=net16.to(device),
    loss_fn=nn.NLLLoss(),
    train_loader=train_loader
)

# the device you defined, and 100 epochs. Next, call validation accuracy function.
validate(net16, train_loader, val_loader)

# Is the model overfit? (Yes/No) Why?
""" 
Yes the model is overfitting. This is because the training accuracy is higher than
the validation accuracy. This is the case as overfitting occurs when the model 'memorizes'
the training data, so it can't account for new data as well as it should.
"""

2022-11-01 11:41:41.845144 Epoch 1, Training loss 0.654270358939669
2022-11-01 11:42:33.092243 Epoch 10, Training loss 0.5569176901060381
2022-11-01 11:43:30.546001 Epoch 20, Training loss 0.49230227748086963
2022-11-01 11:44:27.927335 Epoch 30, Training loss 0.44867011641007243
2022-11-01 11:45:25.950282 Epoch 40, Training loss 0.41881244697931735
2022-11-01 11:46:23.704218 Epoch 50, Training loss 0.3925727288415437
2022-11-01 11:47:21.887784 Epoch 60, Training loss 0.37409245538940306
2022-11-01 11:48:19.446432 Epoch 70, Training loss 0.3541500512629684
2022-11-01 11:49:17.856360 Epoch 80, Training loss 0.3369169139595174
2022-11-01 11:50:15.528003 Epoch 90, Training loss 0.3231539800445408
2022-11-01 11:51:13.214216 Epoch 100, Training loss 0.30514395502266856
Accuracy train: 0.88
Accuracy val: 0.79


In [11]:
#Call the training function on the created training data loader, the created CNN  with n = 32, 
# negative log likelihood loss function, stochastic gradient descent optimizer,
# the device you defined, and 100 epochs. Next, call validation accuracy function.
training_loop(
    n_epochs=100,
    optimizer=optim.SGD(net32.parameters(), lr=1e-2),
    model=net32.to(device=device),
    loss_fn=nn.NLLLoss(),
    train_loader=train_loader
)
validate(net32, train_loader, val_loader)
#Is the model overfit? (Yes/No) Why? 
# (This can be compared to the fully connected network we created in the last set of exercises.)
""" Yes the model is overfitting since training accuracy > validation accuracy """

2022-11-02 21:56:52.520799 Epoch 1, Training loss 0.9502991778509957
2022-11-02 21:58:14.746871 Epoch 10, Training loss 0.5868856079542815
2022-11-02 21:59:47.706872 Epoch 20, Training loss 0.43791962750176633
2022-11-02 22:01:20.563949 Epoch 30, Training loss 0.35799328086854043
2022-11-02 22:02:51.652702 Epoch 40, Training loss 0.29481443406136304
2022-11-02 22:04:22.759750 Epoch 50, Training loss 0.24518273811318728
2022-11-02 22:05:54.407728 Epoch 60, Training loss 0.20082846441979346
2022-11-02 22:07:25.540392 Epoch 70, Training loss 0.1589053879255679
2022-11-02 22:08:49.253886 Epoch 80, Training loss 0.12469892978850904
2022-11-02 22:10:12.940467 Epoch 90, Training loss 0.09698326696655643
2022-11-02 22:11:36.563118 Epoch 100, Training loss 0.06857202563887593
Accuracy train: 0.979067
Accuracy val: 0.791333


In [12]:
#Next, let's consider L2 regularization with weight decay 0.002 for CNN with n = 32. 
net32_2 = Net(32)
training_loop(
    n_epochs=100,
    optimizer=optim.SGD(net32_2.parameters(), lr=1e-2, weight_decay=0.002),
    model=net32_2.to(device),
    loss_fn=nn.NLLLoss(),
    train_loader=train_loader
)
validate(net32_2, train_loader, val_loader)
# Is the model overfit? (Yes/No) Why?
""" Yes the model is overfitting since training accuracy > validation accuracy """

2022-11-02 22:11:48.869234 Epoch 1, Training loss 1.0569783700808788
2022-11-02 22:13:04.833227 Epoch 10, Training loss 0.6066836116791788
2022-11-02 22:14:28.948842 Epoch 20, Training loss 0.45806187238774576
2022-11-02 22:15:55.198303 Epoch 30, Training loss 0.381322596341308
2022-11-02 22:17:21.793784 Epoch 40, Training loss 0.3320629875034666
2022-11-02 22:18:48.430035 Epoch 50, Training loss 0.2860628360115897
2022-11-02 22:20:15.688400 Epoch 60, Training loss 0.24858584910281686
2022-11-02 22:21:42.692607 Epoch 70, Training loss 0.21554245650450557
2022-11-02 22:23:10.367880 Epoch 80, Training loss 0.18598151354313786
2022-11-02 22:24:38.396354 Epoch 90, Training loss 0.1636426716105643
2022-11-02 22:26:06.282527 Epoch 100, Training loss 0.14222606266739526
Accuracy train: 0.951467
Accuracy val: 0.797000


' Yes the model is overfitting since training accuracy > validation accuracy '

In [13]:
#Add a skip connection in your CNN from the output of second max pooling to the input of 3rd max pooling.

class ResNet(nn.Module): # Book did resnet
    def __init__(self, n):
        super().__init__()
        self.n = n
        self.conv1 = nn.Conv2d(
                        3,     # Input Features
                        n,     # Output Features
                        kernel_size=3, padding=1)
        self.conv2 = nn.Conv2d(n, n//2, kernel_size=3, padding=1)
        self.conv3 = nn.Conv2d(n//2, n//2, kernel_size=3, padding=1)
        
        self.fc1 = nn.Linear(4 * 4 * n // 2, 32)
        self.fc2 = nn.Linear(32, 3)
    
    def forward(self, x):
        out = F.max_pool2d(torch.tanh(self.conv1(x)), 2)
        out = F.max_pool2d(torch.tanh(self.conv2(out)), 2)
        skip_connection = out
        out = F.max_pool2d(torch.tanh(self.conv3(out)) + skip_connection, 2)
        out = out.view(-1, 4 * 4 * (self.n // 2))
        out = torch.tanh(self.fc1(out))
        out = self.fc2(out)
        out = F.log_softmax(out, dim=1)
        return out
    
#Train the updated CNN with the same parameters including (n = 32).
net32_skip_connection = ResNet(32)
training_loop(
    n_epochs=100,
    optimizer=optim.SGD(net32_skip_connection.parameters(), lr=1e-2, weight_decay=0.002),
    model=net32_skip_connection.to(device),
    loss_fn=nn.NLLLoss(),
    train_loader=train_loader
)
validate(net32_skip_connection, train_loader, val_loader)

#Is the model overfit? (Yes/No) Why?
""" Yes the model is overfitting since training accuracy > validation accuracy """

2022-11-02 22:27:19.979009 Epoch 1, Training loss 1.0131968244560745
2022-11-02 22:28:38.544993 Epoch 10, Training loss 0.5390585403579639
2022-11-02 22:30:07.462849 Epoch 20, Training loss 0.41773005338238756
2022-11-02 22:31:36.063490 Epoch 30, Training loss 0.34915744592703735
2022-11-02 22:33:04.987188 Epoch 40, Training loss 0.29912021371728575
2022-11-02 22:34:33.860877 Epoch 50, Training loss 0.2578638298615718
2022-11-02 22:36:03.176691 Epoch 60, Training loss 0.22210906704923492
2022-11-02 22:37:32.796424 Epoch 70, Training loss 0.19374022058554805
2022-11-02 22:39:02.834254 Epoch 80, Training loss 0.16686274939730986
2022-11-02 22:40:32.633224 Epoch 90, Training loss 0.14496466477336026
2022-11-02 22:42:02.775554 Epoch 100, Training loss 0.12289860671453638
Accuracy train: 0.970533
Accuracy val: 0.827333


In [22]:
#Consider dropout layers after each max pooling in the original CNN, where the probability of zeroing output features is 30%.
class NetDropout(nn.Module): # Book did NetDropout
    def __init__(self, n):
        super().__init__()
        self.n = n
        self.conv1 = nn.Conv2d(
                        3,     # Input Features
                        n,     # Output Features
                        kernel_size=3, padding=1)
        self.conv1_dropout = nn.Dropout2d(p=0.3)
        self.conv2 = nn.Conv2d(n, n//2, kernel_size=3, padding=1)
        self.conv2_dropout = nn.Dropout2d(p=0.3)
        self.conv3 = nn.Conv2d(n//2, n//2, kernel_size=3, padding=1)
        self.conv3_dropout = nn.Dropout2d(p=0.3)
        
        self.fc1 = nn.Linear(4 * 4 * n // 2, 32)
        self.fc2 = nn.Linear(32, 3)
    
    def forward(self, x):
        out = F.max_pool2d(torch.tanh(self.conv1(x)), 2)
        out = self.conv1_dropout(out)
        
        out = F.max_pool2d(torch.tanh(self.conv2(out)), 2)
        out = self.conv2_dropout(out)
        
        out = F.max_pool2d(torch.tanh(self.conv3(out)), 2)
        out = self.conv3_dropout(out)
        
        out = out.view(-1, 4 * 4 * (self.n // 2))
        out = torch.tanh(self.fc1(out))
        out = self.fc2(out)
        out = F.log_softmax(out, dim=1)
        
#Train the updated CNN with the same parameters including (n = 32).
net32_drop_out = NetDropout(32)
training_loop(
    n_epochs=100,
    optimizer=optim.SGD(net32_drop_out.parameters(), lr=1e-2),
    model=net32_drop_out.to(device),
    loss_fn=nn.NLLLoss(),
    train_loader=train_loader
)
validate(net32_drop_out, train_loader, val_loader)
#Is the model overfit? (Yes/No) Why?
""" I literally don't know how to fix the error this function is throwing, its an internal error """

TypeError: nll_loss_nd(): argument 'input' (position 1) must be Tensor, not NoneType

In [23]:
#Considering all the modifications which one works better? Plain CNN, CNN+L2, CNN+Skip, CNN+Dropout?
""" 
Since I can only compare the first three, because theres an internal error happening in DropOut..
I'm going to guess that a CNN with Skip Layer is the best amongst the three tested
This is because it has the highest trianing & validation accuracies, with relatively lower
overfitting.
"""