In [13]:
!pip install git+https://github.com/yfw/starter-code

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting git+https://github.com/yfw/starter-code
  Cloning https://github.com/yfw/starter-code to /tmp/pip-req-build-ku5jwji1
  Running command git clone --filter=blob:none --quiet https://github.com/yfw/starter-code /tmp/pip-req-build-ku5jwji1
  Resolved https://github.com/yfw/starter-code to commit 9a6269738ba0094d0bd09aea4cc21747b965c6d1
  Preparing metadata (setup.py) ... [?25l[?25hdone


In [14]:
import coutils
from coutils import fix_random_seed

from collections import OrderedDict
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils.data import sampler

import torchvision.datasets as dset
import torchvision.transforms as T

# for plotting
import matplotlib.pyplot as plt
%matplotlib inline
# Notebook-wide matplotlib defaults, applied in a single update.
plt.rcParams.update({
    'figure.figsize': (10.0, 8.0),     # default size of plots
    'image.interpolation': 'nearest',  # do not smooth pixels when showing images
    'image.cmap': 'gray',              # default colormap for images
})

In [15]:
NUM_TRAIN = 49000          # first NUM_TRAIN training images are used for training
BATCH_SIZE = 64            # minibatch size shared by all three loaders
DATA_ROOT = './datasets'   # directory torchvision downloads CIFAR-10 into / reads it from

# The torchvision.transforms package provides tools for preprocessing data
# and for performing data augmentation; here we set up a transform to
# preprocess the data by subtracting the mean RGB value and dividing by the
# standard deviation of each RGB value; we've hardcoded the mean and std.
transform = T.Compose([
                T.ToTensor(),
                T.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010))
            ])

# We set up a Dataset object for each split (train / val / test); Datasets load
# training examples one at a time, so we wrap each Dataset in a DataLoader which
# iterates through the Dataset and forms minibatches. We divide the CIFAR-10
# training set into train and val sets by passing a Sampler object to the
# DataLoader telling how it should sample from the underlying Dataset.
cifar10_train = dset.CIFAR10(DATA_ROOT, train=True, download=True,
                             transform=transform)
loader_train = DataLoader(cifar10_train, batch_size=BATCH_SIZE,
                          sampler=sampler.SubsetRandomSampler(range(NUM_TRAIN)))

# The validation split is the tail of the *training* set (train=True), i.e.
# every index from NUM_TRAIN up to the end of the dataset.
cifar10_val = dset.CIFAR10(DATA_ROOT, train=True, download=True,
                           transform=transform)
# Guard against a NUM_TRAIN that would leave the validation split empty.
assert 0 < NUM_TRAIN < len(cifar10_val), 'NUM_TRAIN must leave room for a validation split'
loader_val = DataLoader(cifar10_val, batch_size=BATCH_SIZE,
                        sampler=sampler.SubsetRandomSampler(range(NUM_TRAIN, len(cifar10_val))))

# The official CIFAR-10 test split, iterated in order (no sampler).
cifar10_test = dset.CIFAR10(DATA_ROOT, train=False, download=True,
                            transform=transform)
loader_test = DataLoader(cifar10_test, batch_size=BATCH_SIZE)

Files already downloaded and verified
Files already downloaded and verified
Files already downloaded and verified


In [16]:
# Tensor dtypes used throughout the notebook: floating point for images,
# integer (long) for class labels.
dtype, ltype = torch.float, torch.long

# Run on the first GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

# Number of training iterations between two progress reports.
print_every = 100

print('using device:', device)

using device: cuda:0


In [17]:
def flatten(x, start_dim=1, end_dim=-1):
  """Collapse dimensions start_dim..end_dim (inclusive) of tensor x into one.

  With the defaults every dimension except the first is merged, so a tensor
  of shape (N, d1, d2, ...) comes back with shape (N, d1*d2*...).
  """
  return torch.flatten(x, start_dim=start_dim, end_dim=end_dim)

In [18]:
class Flatten(nn.Module):
  """Module wrapper around `flatten` so it can be used as a layer.

  Merges every dimension after the first into a single one.
  """

  def forward(self, x):
    # Spell out flatten's defaults: keep dim 0, merge dims 1..-1.
    return flatten(x, start_dim=1, end_dim=-1)

In [19]:
def check_accuracy_part34(loader, model):
  """Measure and print the classification accuracy of `model` on `loader`.

  Inputs:
  - loader: An iterable of (x, y) minibatches that also exposes
    `loader.dataset.train`; that flag only selects which banner is printed
    ("validation set" when truthy, "test set" otherwise).
  - model: A callable PyTorch module mapping x to per-class scores; the
    predicted class is the index of the largest score along dim 1.

  Returns: the fraction of correctly classified samples as a Python float in
  [0, 1]. An empty loader yields 0.0 instead of raising ZeroDivisionError.

  Side effects: switches `model` to evaluation mode (and leaves it there) and
  prints a summary line. Reads the module-level `device`, `dtype` and `ltype`.
  """
  if loader.dataset.train:
    print('Checking accuracy on validation set')
  else:
    print('Checking accuracy on test set')
  num_correct = 0
  num_samples = 0
  model.eval()  # set model to evaluation mode
  with torch.no_grad():  # no gradients are needed for evaluation
    for x, y in loader:
      x = x.to(device=device, dtype=dtype)  # move to device, e.g. GPU
      y = y.to(device=device, dtype=ltype)
      scores = model(x)
      _, preds = scores.max(1)
      # .item() keeps the running count a plain Python int rather than a tensor.
      num_correct += (preds == y).sum().item()
      num_samples += preds.size(0)
  # Guard the division so that an empty loader does not crash.
  acc = num_correct / num_samples if num_samples > 0 else 0.0
  print('Got %d / %d correct (%.2f)' % (num_correct, num_samples, 100 * acc))
  return acc

In [20]:
def train_part(model, optimizer, epochs=1):
    """
    Fit `model` on the CIFAR-10 training loader with the PyTorch Module API.

    Inputs:
    - model: A PyTorch Module giving the model to train.
    - optimizer: An Optimizer object that updates the model's parameters.
    - epochs: (Optional) A Python integer giving the number of passes over
      `loader_train`.

    Returns: the validation accuracy from the most recent progress report
    (0 if no report was made). Every `print_every` iterations the current
    loss is printed and the accuracy on `loader_val` is checked.
    """
    model = model.to(device=device)  # parameters must live on the CPU/GPU in use
    latest_val_acc = 0
    for _epoch in range(epochs):
        for step, batch in enumerate(loader_train):
            # Re-enter training mode on every step, because the periodic
            # accuracy check below may have left the model in another mode.
            model.train()
            images, labels = batch
            images = images.to(device=device, dtype=dtype)
            labels = labels.to(device=device, dtype=torch.long)

            # Clear gradients left over from the previous step.
            optimizer.zero_grad()

            # Forward pass and loss, then backward pass to fill the gradients
            # of every parameter, then one optimizer update.
            loss = F.cross_entropy(model(images), labels)
            loss.backward()
            optimizer.step()

            if step % print_every != 0:
                continue
            print('Iteration %d, loss = %.4f' % (step, loss.item()))
            latest_val_acc = check_accuracy_part34(loader_val, model)
            print()
    return latest_val_acc

Here we train a model using plain SGD, i.e. without any momentum (and therefore without Nesterov momentum).

In [21]:
fix_random_seed(0)

# Input image dimensions (channels, height, width) and number of classes.
C, H, W = 3, 32, 32
num_classes = 10

# Output channels, square kernel size and padding of conv stages 1-3.
channel_1, channel_2, channel_3 = 16, 32, 64
kernel_size_1, kernel_size_2, kernel_size_3 = 3, 3, 6
pad_size_1, pad_size_2, pad_size_3 = 2, 1, 1

# Optimizer hyperparameters.
learning_rate = 1e-2
momentum = 0.5  # defined but not used in this cell: SGD below is built with momentum=0
weight_decay = 1e-4

# Window size of every max-pool layer.
pool_kernel_size = 2

# Spatial size after each layer. The convs below are built without stride or
# dilation arguments and the pools without padding or an explicit stride, so:
#   conv: out = in + 2*pad - (kernel - 1)
#   pool: out = (in - window) // window + 1
H_after_conv1 = H + 2 * pad_size_1 - (kernel_size_1 - 1)
W_after_conv1 = W + 2 * pad_size_1 - (kernel_size_1 - 1)
H_after_pool1 = (H_after_conv1 - pool_kernel_size) // pool_kernel_size + 1
W_after_pool1 = (W_after_conv1 - pool_kernel_size) // pool_kernel_size + 1

H_after_conv2 = H_after_pool1 + 2 * pad_size_2 - (kernel_size_2 - 1)
W_after_conv2 = W_after_pool1 + 2 * pad_size_2 - (kernel_size_2 - 1)
H_after_pool2 = (H_after_conv2 - pool_kernel_size) // pool_kernel_size + 1
W_after_pool2 = (W_after_conv2 - pool_kernel_size) // pool_kernel_size + 1

H_after_conv3 = H_after_pool2 + 2 * pad_size_3 - (kernel_size_3 - 1)
W_after_conv3 = W_after_pool2 + 2 * pad_size_3 - (kernel_size_3 - 1)
H_after_pool3 = (H_after_conv3 - pool_kernel_size) // pool_kernel_size + 1
W_after_pool3 = (W_after_conv3 - pool_kernel_size) // pool_kernel_size + 1

# Three conv -> batchnorm -> ReLU -> max-pool stages followed by a linear
# classifier on the flattened feature map. Dropout2d after each ReLU and a
# global-average-pool head (AvgPool2d over the final map, then
# Linear(channel_3, num_classes)) are alternatives that are left disabled.
model1 = nn.Sequential(OrderedDict([
  ('conv1', nn.Conv2d(in_channels=C, out_channels=channel_1,
                      kernel_size=kernel_size_1, padding=pad_size_1)),
  ('batchnorm1', nn.BatchNorm2d(channel_1)),
  ('relu1', nn.ReLU()),
  ('maxpool1', nn.MaxPool2d(pool_kernel_size)),

  ('conv2', nn.Conv2d(in_channels=channel_1, out_channels=channel_2,
                      kernel_size=kernel_size_2, padding=pad_size_2)),
  ('batchnorm2', nn.BatchNorm2d(channel_2)),
  ('relu2', nn.ReLU()),
  ('maxpool2', nn.MaxPool2d(pool_kernel_size)),

  ('conv3', nn.Conv2d(in_channels=channel_2, out_channels=channel_3,
                      kernel_size=kernel_size_3, padding=pad_size_3)),
  ('batchnorm3', nn.BatchNorm2d(channel_3)),
  ('relu3', nn.ReLU()),
  ('maxpool3', nn.MaxPool2d(pool_kernel_size)),

  ('flatten', Flatten()),
  ('fc', nn.Linear(in_features=channel_3 * H_after_pool3 * W_after_pool3,
                   out_features=num_classes)),
]))

# Plain SGD: no momentum, no Nesterov acceleration.
optimizer1 = optim.SGD(model1.parameters(),
                       lr=learning_rate,
                       momentum=0,
                       weight_decay=weight_decay,
                       nesterov=False)

# Target from the assignment: at least 70% accuracy.
train_part(model1, optimizer1, epochs=10)

Iteration 0, loss = 2.3975
Checking accuracy on validation set
Got 98 / 1000 correct (9.80)

Iteration 100, loss = 1.8358
Checking accuracy on validation set
Got 420 / 1000 correct (42.00)

Iteration 200, loss = 1.5136
Checking accuracy on validation set
Got 484 / 1000 correct (48.40)

Iteration 300, loss = 1.2554
Checking accuracy on validation set
Got 506 / 1000 correct (50.60)

Iteration 400, loss = 1.3212
Checking accuracy on validation set
Got 522 / 1000 correct (52.20)

Iteration 500, loss = 1.1832
Checking accuracy on validation set
Got 533 / 1000 correct (53.30)

Iteration 600, loss = 1.0599
Checking accuracy on validation set
Got 553 / 1000 correct (55.30)

Iteration 700, loss = 1.4052
Checking accuracy on validation set
Got 568 / 1000 correct (56.80)

Iteration 0, loss = 0.9499
Checking accuracy on validation set
Got 602 / 1000 correct (60.20)

Iteration 100, loss = 1.3063
Checking accuracy on validation set
Got 602 / 1000 correct (60.20)

Iteration 200, loss = 1.2288
Checkin

0.742

Next we train the same architecture using SGD with Nesterov momentum (momentum = 0.5).

In [22]:
fix_random_seed(0)

# Input image dimensions (channels, height, width) and number of classes.
C, H, W = 3, 32, 32
num_classes = 10

# Output channels, square kernel size and padding of conv stages 1-3.
channel_1, channel_2, channel_3 = 16, 32, 64
kernel_size_1, kernel_size_2, kernel_size_3 = 3, 3, 6
pad_size_1, pad_size_2, pad_size_3 = 2, 1, 1

# Optimizer hyperparameters.
learning_rate = 1e-2
momentum = 0.5
weight_decay = 1e-4

# Window size of every max-pool layer.
pool_kernel_size = 2

# Spatial size after each layer. The convs below are built without stride or
# dilation arguments and the pools without padding or an explicit stride, so:
#   conv: out = in + 2*pad - (kernel - 1)
#   pool: out = (in - window) // window + 1
H_after_conv1 = H + 2 * pad_size_1 - (kernel_size_1 - 1)
W_after_conv1 = W + 2 * pad_size_1 - (kernel_size_1 - 1)
H_after_pool1 = (H_after_conv1 - pool_kernel_size) // pool_kernel_size + 1
W_after_pool1 = (W_after_conv1 - pool_kernel_size) // pool_kernel_size + 1

H_after_conv2 = H_after_pool1 + 2 * pad_size_2 - (kernel_size_2 - 1)
W_after_conv2 = W_after_pool1 + 2 * pad_size_2 - (kernel_size_2 - 1)
H_after_pool2 = (H_after_conv2 - pool_kernel_size) // pool_kernel_size + 1
W_after_pool2 = (W_after_conv2 - pool_kernel_size) // pool_kernel_size + 1

H_after_conv3 = H_after_pool2 + 2 * pad_size_3 - (kernel_size_3 - 1)
W_after_conv3 = W_after_pool2 + 2 * pad_size_3 - (kernel_size_3 - 1)
H_after_pool3 = (H_after_conv3 - pool_kernel_size) // pool_kernel_size + 1
W_after_pool3 = (W_after_conv3 - pool_kernel_size) // pool_kernel_size + 1

# Three conv -> batchnorm -> ReLU -> max-pool stages followed by a linear
# classifier on the flattened feature map. Dropout2d after each ReLU and a
# global-average-pool head (AvgPool2d over the final map, then
# Linear(channel_3, num_classes)) are alternatives that are left disabled.
model2 = nn.Sequential(OrderedDict([
  ('conv1', nn.Conv2d(in_channels=C, out_channels=channel_1,
                      kernel_size=kernel_size_1, padding=pad_size_1)),
  ('batchnorm1', nn.BatchNorm2d(channel_1)),
  ('relu1', nn.ReLU()),
  ('maxpool1', nn.MaxPool2d(pool_kernel_size)),

  ('conv2', nn.Conv2d(in_channels=channel_1, out_channels=channel_2,
                      kernel_size=kernel_size_2, padding=pad_size_2)),
  ('batchnorm2', nn.BatchNorm2d(channel_2)),
  ('relu2', nn.ReLU()),
  ('maxpool2', nn.MaxPool2d(pool_kernel_size)),

  ('conv3', nn.Conv2d(in_channels=channel_2, out_channels=channel_3,
                      kernel_size=kernel_size_3, padding=pad_size_3)),
  ('batchnorm3', nn.BatchNorm2d(channel_3)),
  ('relu3', nn.ReLU()),
  ('maxpool3', nn.MaxPool2d(pool_kernel_size)),

  ('flatten', Flatten()),
  ('fc', nn.Linear(in_features=channel_3 * H_after_pool3 * W_after_pool3,
                   out_features=num_classes)),
]))

# SGD with Nesterov momentum.
optimizer2 = optim.SGD(model2.parameters(),
                       lr=learning_rate,
                       momentum=momentum,
                       weight_decay=weight_decay,
                       nesterov=True)

# Target from the assignment: at least 70% accuracy.
train_part(model2, optimizer2, epochs=10)


Iteration 0, loss = 2.3975
Checking accuracy on validation set
Got 104 / 1000 correct (10.40)

Iteration 100, loss = 1.7141
Checking accuracy on validation set
Got 490 / 1000 correct (49.00)

Iteration 200, loss = 1.3997
Checking accuracy on validation set
Got 540 / 1000 correct (54.00)

Iteration 300, loss = 1.2386
Checking accuracy on validation set
Got 523 / 1000 correct (52.30)

Iteration 400, loss = 1.2174
Checking accuracy on validation set
Got 540 / 1000 correct (54.00)

Iteration 500, loss = 1.0554
Checking accuracy on validation set
Got 553 / 1000 correct (55.30)

Iteration 600, loss = 0.9361
Checking accuracy on validation set
Got 569 / 1000 correct (56.90)

Iteration 700, loss = 1.2736
Checking accuracy on validation set
Got 594 / 1000 correct (59.40)

Iteration 0, loss = 0.8884
Checking accuracy on validation set
Got 618 / 1000 correct (61.80)

Iteration 100, loss = 1.1403
Checking accuracy on validation set
Got 609 / 1000 correct (60.90)

Iteration 200, loss = 1.1661
Check

0.775

Model with the Adagrad optimizer:

In [28]:
fix_random_seed(0)

# Input image dimensions (channels, height, width) and number of classes.
C, H, W = 3, 32, 32
num_classes = 10

# Output channels, square kernel size and padding of conv stages 1-3.
channel_1, channel_2, channel_3 = 16, 32, 64
kernel_size_1, kernel_size_2, kernel_size_3 = 3, 3, 6
pad_size_1, pad_size_2, pad_size_3 = 2, 1, 1

# Optimizer hyperparameters.
learning_rate = 1e-2
momentum = 0.5  # defined but not used in this cell: Adagrad below takes no momentum
weight_decay = 1e-4

# Window size of every max-pool layer.
pool_kernel_size = 2

# Spatial size after each layer. The convs below are built without stride or
# dilation arguments and the pools without padding or an explicit stride, so:
#   conv: out = in + 2*pad - (kernel - 1)
#   pool: out = (in - window) // window + 1
H_after_conv1 = H + 2 * pad_size_1 - (kernel_size_1 - 1)
W_after_conv1 = W + 2 * pad_size_1 - (kernel_size_1 - 1)
H_after_pool1 = (H_after_conv1 - pool_kernel_size) // pool_kernel_size + 1
W_after_pool1 = (W_after_conv1 - pool_kernel_size) // pool_kernel_size + 1

H_after_conv2 = H_after_pool1 + 2 * pad_size_2 - (kernel_size_2 - 1)
W_after_conv2 = W_after_pool1 + 2 * pad_size_2 - (kernel_size_2 - 1)
H_after_pool2 = (H_after_conv2 - pool_kernel_size) // pool_kernel_size + 1
W_after_pool2 = (W_after_conv2 - pool_kernel_size) // pool_kernel_size + 1

H_after_conv3 = H_after_pool2 + 2 * pad_size_3 - (kernel_size_3 - 1)
W_after_conv3 = W_after_pool2 + 2 * pad_size_3 - (kernel_size_3 - 1)
H_after_pool3 = (H_after_conv3 - pool_kernel_size) // pool_kernel_size + 1
W_after_pool3 = (W_after_conv3 - pool_kernel_size) // pool_kernel_size + 1

# Three conv -> batchnorm -> ReLU -> max-pool stages followed by a linear
# classifier on the flattened feature map. Dropout2d after each ReLU and a
# global-average-pool head (AvgPool2d over the final map, then
# Linear(channel_3, num_classes)) are alternatives that are left disabled.
#
# The model is moved to `device` BEFORE the optimizer is built: the torch.optim
# documentation advises doing so, and it matters for Adagrad, which (in current
# PyTorch releases -- confirm for your version) allocates its per-parameter
# accumulator state in the constructor. Using `device` rather than a
# hard-coded `.cuda()` keeps this cell runnable on CPU-only machines.
model3 = nn.Sequential(OrderedDict([
  ('conv1', nn.Conv2d(in_channels=C, out_channels=channel_1,
                      kernel_size=kernel_size_1, padding=pad_size_1)),
  ('batchnorm1', nn.BatchNorm2d(channel_1)),
  ('relu1', nn.ReLU()),
  ('maxpool1', nn.MaxPool2d(pool_kernel_size)),

  ('conv2', nn.Conv2d(in_channels=channel_1, out_channels=channel_2,
                      kernel_size=kernel_size_2, padding=pad_size_2)),
  ('batchnorm2', nn.BatchNorm2d(channel_2)),
  ('relu2', nn.ReLU()),
  ('maxpool2', nn.MaxPool2d(pool_kernel_size)),

  ('conv3', nn.Conv2d(in_channels=channel_2, out_channels=channel_3,
                      kernel_size=kernel_size_3, padding=pad_size_3)),
  ('batchnorm3', nn.BatchNorm2d(channel_3)),
  ('relu3', nn.ReLU()),
  ('maxpool3', nn.MaxPool2d(pool_kernel_size)),

  ('flatten', Flatten()),
  ('fc', nn.Linear(in_features=channel_3 * H_after_pool3 * W_after_pool3,
                   out_features=num_classes)),
])).to(device=device)

# Adagrad with L2 weight decay.
optimizer3 = optim.Adagrad(model3.parameters(),
                           lr=learning_rate,
                           weight_decay=weight_decay)

# Target from the assignment: at least 70% accuracy.
train_part(model3, optimizer3, epochs=10)

Iteration 0, loss = 2.3975
Checking accuracy on validation set
Got 113 / 1000 correct (11.30)

Iteration 100, loss = 1.5706
Checking accuracy on validation set
Got 517 / 1000 correct (51.70)

Iteration 200, loss = 1.2788
Checking accuracy on validation set
Got 585 / 1000 correct (58.50)

Iteration 300, loss = 0.9900
Checking accuracy on validation set
Got 610 / 1000 correct (61.00)

Iteration 400, loss = 1.0535
Checking accuracy on validation set
Got 619 / 1000 correct (61.90)

Iteration 500, loss = 0.9415
Checking accuracy on validation set
Got 604 / 1000 correct (60.40)

Iteration 600, loss = 0.8224
Checking accuracy on validation set
Got 613 / 1000 correct (61.30)

Iteration 700, loss = 1.1606
Checking accuracy on validation set
Got 640 / 1000 correct (64.00)

Iteration 0, loss = 0.8358
Checking accuracy on validation set
Got 654 / 1000 correct (65.40)

Iteration 100, loss = 1.0648
Checking accuracy on validation set
Got 664 / 1000 correct (66.40)

Iteration 200, loss = 0.9610
Check

0.731

Model with the RMSprop optimizer (momentum = 0.5):

In [25]:
fix_random_seed(0)

# Input image dimensions (channels, height, width) and number of classes.
C, H, W = 3, 32, 32
num_classes = 10

# Output channels, square kernel size and padding of conv stages 1-3.
channel_1, channel_2, channel_3 = 16, 32, 64
kernel_size_1, kernel_size_2, kernel_size_3 = 3, 3, 6
pad_size_1, pad_size_2, pad_size_3 = 2, 1, 1

# Optimizer hyperparameters.
learning_rate = 1e-2
momentum = 0.5
weight_decay = 1e-4

# Window size of every max-pool layer.
pool_kernel_size = 2

# Spatial size after each layer. The convs below are built without stride or
# dilation arguments and the pools without padding or an explicit stride, so:
#   conv: out = in + 2*pad - (kernel - 1)
#   pool: out = (in - window) // window + 1
H_after_conv1 = H + 2 * pad_size_1 - (kernel_size_1 - 1)
W_after_conv1 = W + 2 * pad_size_1 - (kernel_size_1 - 1)
H_after_pool1 = (H_after_conv1 - pool_kernel_size) // pool_kernel_size + 1
W_after_pool1 = (W_after_conv1 - pool_kernel_size) // pool_kernel_size + 1

H_after_conv2 = H_after_pool1 + 2 * pad_size_2 - (kernel_size_2 - 1)
W_after_conv2 = W_after_pool1 + 2 * pad_size_2 - (kernel_size_2 - 1)
H_after_pool2 = (H_after_conv2 - pool_kernel_size) // pool_kernel_size + 1
W_after_pool2 = (W_after_conv2 - pool_kernel_size) // pool_kernel_size + 1

H_after_conv3 = H_after_pool2 + 2 * pad_size_3 - (kernel_size_3 - 1)
W_after_conv3 = W_after_pool2 + 2 * pad_size_3 - (kernel_size_3 - 1)
H_after_pool3 = (H_after_conv3 - pool_kernel_size) // pool_kernel_size + 1
W_after_pool3 = (W_after_conv3 - pool_kernel_size) // pool_kernel_size + 1

# Three conv -> batchnorm -> ReLU -> max-pool stages followed by a linear
# classifier on the flattened feature map. Dropout2d after each ReLU and a
# global-average-pool head (AvgPool2d over the final map, then
# Linear(channel_3, num_classes)) are alternatives that are left disabled.
model4 = nn.Sequential(OrderedDict([
  ('conv1', nn.Conv2d(in_channels=C, out_channels=channel_1,
                      kernel_size=kernel_size_1, padding=pad_size_1)),
  ('batchnorm1', nn.BatchNorm2d(channel_1)),
  ('relu1', nn.ReLU()),
  ('maxpool1', nn.MaxPool2d(pool_kernel_size)),

  ('conv2', nn.Conv2d(in_channels=channel_1, out_channels=channel_2,
                      kernel_size=kernel_size_2, padding=pad_size_2)),
  ('batchnorm2', nn.BatchNorm2d(channel_2)),
  ('relu2', nn.ReLU()),
  ('maxpool2', nn.MaxPool2d(pool_kernel_size)),

  ('conv3', nn.Conv2d(in_channels=channel_2, out_channels=channel_3,
                      kernel_size=kernel_size_3, padding=pad_size_3)),
  ('batchnorm3', nn.BatchNorm2d(channel_3)),
  ('relu3', nn.ReLU()),
  ('maxpool3', nn.MaxPool2d(pool_kernel_size)),

  ('flatten', Flatten()),
  ('fc', nn.Linear(in_features=channel_3 * H_after_pool3 * W_after_pool3,
                   out_features=num_classes)),
]))

# RMSprop with momentum and L2 weight decay.
optimizer4 = optim.RMSprop(model4.parameters(),
                           lr=learning_rate,
                           momentum=momentum,
                           weight_decay=weight_decay)

# Target from the assignment: at least 70% accuracy.
train_part(model4, optimizer4, epochs=10)

Iteration 0, loss = 2.3975
Checking accuracy on validation set
Got 106 / 1000 correct (10.60)

Iteration 100, loss = 1.8706
Checking accuracy on validation set
Got 401 / 1000 correct (40.10)

Iteration 200, loss = 1.5201
Checking accuracy on validation set
Got 469 / 1000 correct (46.90)

Iteration 300, loss = 1.3443
Checking accuracy on validation set
Got 466 / 1000 correct (46.60)

Iteration 400, loss = 1.4646
Checking accuracy on validation set
Got 530 / 1000 correct (53.00)

Iteration 500, loss = 1.2190
Checking accuracy on validation set
Got 522 / 1000 correct (52.20)

Iteration 600, loss = 1.0290
Checking accuracy on validation set
Got 494 / 1000 correct (49.40)

Iteration 700, loss = 1.4220
Checking accuracy on validation set
Got 571 / 1000 correct (57.10)

Iteration 0, loss = 1.0794
Checking accuracy on validation set
Got 582 / 1000 correct (58.20)

Iteration 100, loss = 1.3459
Checking accuracy on validation set
Got 586 / 1000 correct (58.60)

Iteration 200, loss = 1.2448
Check

0.666

Model with the Adam optimizer:

In [27]:
fix_random_seed(0)

# Input image dimensions (channels, height, width) and number of classes.
C, H, W = 3, 32, 32
num_classes = 10

# Output channels, square kernel size and padding of conv stages 1-3.
channel_1, channel_2, channel_3 = 16, 32, 64
kernel_size_1, kernel_size_2, kernel_size_3 = 3, 3, 6
pad_size_1, pad_size_2, pad_size_3 = 2, 1, 1

# Optimizer hyperparameters.
learning_rate = 1e-2
momentum = 0.5  # defined but not used in this cell: Adam below is not given it
weight_decay = 1e-4

# Window size of every max-pool layer.
pool_kernel_size = 2

# Spatial size after each layer. The convs below are built without stride or
# dilation arguments and the pools without padding or an explicit stride, so:
#   conv: out = in + 2*pad - (kernel - 1)
#   pool: out = (in - window) // window + 1
H_after_conv1 = H + 2 * pad_size_1 - (kernel_size_1 - 1)
W_after_conv1 = W + 2 * pad_size_1 - (kernel_size_1 - 1)
H_after_pool1 = (H_after_conv1 - pool_kernel_size) // pool_kernel_size + 1
W_after_pool1 = (W_after_conv1 - pool_kernel_size) // pool_kernel_size + 1

H_after_conv2 = H_after_pool1 + 2 * pad_size_2 - (kernel_size_2 - 1)
W_after_conv2 = W_after_pool1 + 2 * pad_size_2 - (kernel_size_2 - 1)
H_after_pool2 = (H_after_conv2 - pool_kernel_size) // pool_kernel_size + 1
W_after_pool2 = (W_after_conv2 - pool_kernel_size) // pool_kernel_size + 1

H_after_conv3 = H_after_pool2 + 2 * pad_size_3 - (kernel_size_3 - 1)
W_after_conv3 = W_after_pool2 + 2 * pad_size_3 - (kernel_size_3 - 1)
H_after_pool3 = (H_after_conv3 - pool_kernel_size) // pool_kernel_size + 1
W_after_pool3 = (W_after_conv3 - pool_kernel_size) // pool_kernel_size + 1

# Three conv -> batchnorm -> ReLU -> max-pool stages followed by a linear
# classifier on the flattened feature map. Dropout2d after each ReLU and a
# global-average-pool head (AvgPool2d over the final map, then
# Linear(channel_3, num_classes)) are alternatives that are left disabled.
model5 = nn.Sequential(OrderedDict([
  ('conv1', nn.Conv2d(in_channels=C, out_channels=channel_1,
                      kernel_size=kernel_size_1, padding=pad_size_1)),
  ('batchnorm1', nn.BatchNorm2d(channel_1)),
  ('relu1', nn.ReLU()),
  ('maxpool1', nn.MaxPool2d(pool_kernel_size)),

  ('conv2', nn.Conv2d(in_channels=channel_1, out_channels=channel_2,
                      kernel_size=kernel_size_2, padding=pad_size_2)),
  ('batchnorm2', nn.BatchNorm2d(channel_2)),
  ('relu2', nn.ReLU()),
  ('maxpool2', nn.MaxPool2d(pool_kernel_size)),

  ('conv3', nn.Conv2d(in_channels=channel_2, out_channels=channel_3,
                      kernel_size=kernel_size_3, padding=pad_size_3)),
  ('batchnorm3', nn.BatchNorm2d(channel_3)),
  ('relu3', nn.ReLU()),
  ('maxpool3', nn.MaxPool2d(pool_kernel_size)),

  ('flatten', Flatten()),
  ('fc', nn.Linear(in_features=channel_3 * H_after_pool3 * W_after_pool3,
                   out_features=num_classes)),
]))

# Adam with L2 weight decay.
optimizer5 = optim.Adam(model5.parameters(),
                        lr=learning_rate,
                        weight_decay=weight_decay)

# Target from the assignment: at least 70% accuracy.
train_part(model5, optimizer5, epochs=10)

Iteration 0, loss = 2.3975
Checking accuracy on validation set
Got 113 / 1000 correct (11.30)

Iteration 100, loss = 1.8487
Checking accuracy on validation set
Got 465 / 1000 correct (46.50)

Iteration 200, loss = 1.4225
Checking accuracy on validation set
Got 522 / 1000 correct (52.20)

Iteration 300, loss = 1.0528
Checking accuracy on validation set
Got 520 / 1000 correct (52.00)

Iteration 400, loss = 1.1470
Checking accuracy on validation set
Got 501 / 1000 correct (50.10)

Iteration 500, loss = 1.1194
Checking accuracy on validation set
Got 583 / 1000 correct (58.30)

Iteration 600, loss = 0.8535
Checking accuracy on validation set
Got 588 / 1000 correct (58.80)

Iteration 700, loss = 1.3584
Checking accuracy on validation set
Got 635 / 1000 correct (63.50)

Iteration 0, loss = 0.8737
Checking accuracy on validation set
Got 652 / 1000 correct (65.20)

Iteration 100, loss = 0.8278
Checking accuracy on validation set
Got 653 / 1000 correct (65.30)

Iteration 200, loss = 1.0600
Check

0.736

Final results on the test set

In [29]:
# Test-set accuracy (a fraction, not a percentage) of model1, the model trained with plain SGD.
acc_final_model1 = check_accuracy_part34(loader_test, model1)

Checking accuracy on test set
Got 7070 / 10000 correct (70.70)


In [30]:
# Test-set accuracy (a fraction, not a percentage) of model2, the model trained with Nesterov-momentum SGD.
acc_final_model2 = check_accuracy_part34(loader_test, model2)

Checking accuracy on test set
Got 7253 / 10000 correct (72.53)


In [31]:
# Test-set accuracy (a fraction, not a percentage) of model3, the model trained with Adagrad.
acc_final_model3 = check_accuracy_part34(loader_test, model3)

Checking accuracy on test set
Got 7327 / 10000 correct (73.27)


In [32]:
# Test-set accuracy (a fraction, not a percentage) of model4, the model trained with RMSprop.
acc_final_model4 = check_accuracy_part34(loader_test, model4)

Checking accuracy on test set
Got 6798 / 10000 correct (67.98)


In [33]:
# Test-set accuracy (a fraction, not a percentage) of model5, the model trained with Adam.
acc_final_model5 = check_accuracy_part34(loader_test, model5)

Checking accuracy on test set
Got 7302 / 10000 correct (73.02)


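For this particular architecture, the Adagrad optimizer performs best on the test data (73.27%), narrowly ahead of Adam (73.02%) and SGD with Nesterov momentum (72.53%); plain SGD reaches 70.70% and RMSprop 67.98%.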