In [1]:
# from google.colab import drive
# drive.mount('/content/drive')
# FOLDERNAME = 'CS231n_project/'
# assert FOLDERNAME is not None, "[!] Enter the foldername."

# import sys
# sys.path.append('/content/drive/My Drive/{}'.format(FOLDERNAME))

# %cd /content/drive/My\ Drive/$FOLDERNAME

# %load_ext autoreload
# %autoreload 2

In [2]:
# !pip install torch==1.7 torchvision==0.8

In [3]:
# %cd approx/src/pytorch/cpp
# !python setup.py install
# %cd ../../../..

In [4]:
import torch
import torchvision
import torch.nn as nn
import torch.optim as optim
from conv_norm import PreConv

import torchvision.datasets as dset
import torchvision.transforms as T
import torch.nn.functional as F

import numpy as np
from timeit import default_timer as timer
from utils import ImportanceSampler

USE_GPU = True
dtype = torch.float32 # We will be using float throughout this tutorial.

if USE_GPU and torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

print('using device:', device)

using device: cuda


In [5]:
from utils import get_accuracy, load_dataset
from models import get_model
check_accuracy = lambda loader, model: get_accuracy(loader, model, device, dtype)

In [6]:
def train_model(model_name, dataset_name, model_params={}, hyperparams={}):

  learning_rate = hyperparams.get('lr', 1e-3)
  num_epochs = hyperparams.get('num_epochs', 10)
  weight_decay = hyperparams.get('weight_decay', 0)
  train_ratio = hyperparams.get('train_ratio', 0.8)
  batch_size = hyperparams.get('batch_size', 64)
  seed = hyperparams.get('seed', 0)
  imp_sampling = model_params.get('importance_sampling', False)
  gamma = model_params.get('gamma', 0.9)

  torch.manual_seed(seed)
  np.random.seed(seed)

  loader_train, loader_val, loader_test, num_train, num_channels = load_dataset(dataset_name, train_ratio, batch_size)
  model = get_model(model_name, model_params, learning_rate, loader_train, num_channels, device)

  print("Model architecture:")
  print(model)

  print(f'INFO: Training {model_name} on {dataset_name} with lr {learning_rate}, num_epochs={num_epochs}, weight_decay={weight_decay}')

  optimizer = optim.Adam(model.parameters(), lr=learning_rate, weight_decay=0)

  epoch_vals = []
  
  weight = torch.tensor([1.0]*num_train)

  t_acc, t_loss = check_accuracy(loader_train, model)
  val_acc, val_loss = check_accuracy(loader_val, model)
  
  start = timer()
  c_time = timer()-start

  print(f'Plot: Train, {0}, {t_loss:.3f}, {t_acc:.2f}, {c_time:.1f}')
  print(f'Plot: Val, {0}, {val_loss:.3f}, {val_acc:.2f}, {c_time:.1f}')

  for e in range(num_epochs):
    model.train()
    doUniform = (e == 0) or (imp_sampling == False)
    loader_train_sampled = loader_train
    if not doUniform:
      train_sampler = ImportanceSampler(num_train, weight, batch_size)
      loader_train_sampled, _, _, _, _ = load_dataset(dataset_name, train_ratio, batch_size, train_sampler)
    
    for t, tpl in enumerate(loader_train_sampled):
        torch.cuda.empty_cache()
        model.train()  # put model to training mode
        x = tpl[0].to(device=device, dtype=dtype)  # move to device, e.g. GPU
        y = tpl[1].to(device=device, dtype=torch.long)

        scores = model(x)
        loss = F.cross_entropy(scores, y)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if not doUniform:
          idx = tpl[2]
          weight[idx] = gamma * weight[idx] + (1 - gamma) * float(loss)

    t_acc, t_loss = check_accuracy(loader_train, model)
    model.eval()
    val_acc, val_loss = check_accuracy(loader_val, model)
    c_time = timer()-start

    print(f'Plot: Train, {e+1}, {t_loss:.3f}, {t_acc:.2f}, {c_time:.1f}')
    print(f'Plot: Val, {e+1}, {val_loss:.3f}, {val_acc:.2f}, {c_time:.1f}')

  test_acc, test_loss = check_accuracy(loader_test, model)
  print(f'Plot: Test, {val_loss:.3f}, {val_acc:.2f}, {c_time:.1f}')

  return model

In [None]:
gradinit_params = {
    "gradinit_iters": 200,
    "gradinit_alg": "adam", #sgd
    "gradinit_lr": 1e-2,
    "gradinit_grad_clip": 1,
}
model_params = {
    "gradinit": gradinit_params,
    # "convnorm" : True,
    # "approx_mult" : 0.2,
    # "importance_sampling" : True,
    # "gamma" : 0.9
}
hyperparams = {
    "lr" : 3e-3,
    "num_epochs" : 25,
    "weight_decay" : 0,
    "train_ratio" : 0.8,
    "batch_size" : 1024,
}

def test_setup():
  for lr in [1e-5, 1e-4, 1e-3, 1e-2, 1e-1]:
    gradinit_params['gradinit_lr'] = lr
    model_params = {
        "gradinit": gradinit_params,
    }
    train_model('Resnet18', 'CIFAR100', model_params, hyperparams)

test_setup()

In [None]:
gradinit_params = {
    "gradinit_iters": 200,
    "gradinit_alg": "adam", #sgd
    "gradinit_lr": 1e-4, 
    "gradinit_grad_clip": 1,
}
model_params = {
    "gradinit": gradinit_params,
    "convnorm" : {"mode_conv": [('first_frac', 0.25)], "mode_bn": [('first_frac', 0.25)]},
    "approx_mult" : {'mult_val' : 0.8, 'mode_linear' : [('last_num', 1)], 'mode_conv' : [('last_num', 1)]},
}
hyperparams = {
    "lr" : 3e-3,
    "num_epochs" : 25,
    "weight_decay" : 0,
    "train_ratio" : 0.8,
    "batch_size" : 512,
}

def test_setup():
    train_model('Resnet18', 'CIFAR100', model_params, hyperparams)

test_setup()

In [None]:
#TODO

In [8]:
gradinit_params = {
    "gradinit_iters": 200,
    "gradinit_alg": "adam", #sgd
    "gradinit_lr": 1e-4, 
    "gradinit_grad_clip": 1,
}
model_params = {
#     "gradinit": gradinit_params,
#     "convnorm" : {"mode_conv": [('first_frac', 0.25)], "mode_bn": [('first_frac', 0.25)]},
#     "approx_mult" : {'mult_val' : 0.8, 'mode_linear' : [('last_num', 1)], 'mode_conv' : [('last_num', 1)]},
}
hyperparams = {
    "lr" : 3e-3,
    "num_epochs" : 25,
    "weight_decay" : 0,
    "train_ratio" : 0.8,
    "batch_size" : 1024,
}

def test_setup():
    train_model('Resnet18', 'CIFAR100', model_params, hyperparams)

test_setup()

Files already downloaded and verified
CIFAR100 Train dataset raw mean: 0.4783550798892975, raw std dev: 0.2678655982017517
Files already downloaded and verified
Files already downloaded and verified
Files already downloaded and verified
INFO: Size of dataset: Training 40000, Validation 10000, Test 10000
Model architecture:
ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): Bat

In [9]:
gradinit_params = {
    "gradinit_iters": 200,
    "gradinit_alg": "adam", #sgd
    "gradinit_lr": 1e-4, 
    "gradinit_grad_clip": 1,
}
model_params = {
#     "gradinit": gradinit_params,
#     "convnorm" : {"mode_conv": [('first_frac', 0.25)], "mode_bn": [('first_frac', 0.25)]},
    "approx_mult" : {'mult_val' : 0.8, 'mode_linear' : [('last_num', 1)], 'mode_conv' : [('last_num', 1)]},
}
hyperparams = {
    "lr" : 3e-3,
    "num_epochs" : 25,
    "weight_decay" : 0,
    "train_ratio" : 0.8,
    "batch_size" : 1024,
}

def test_setup():
    for mult_val in [0.2, 0.4, 0.6, 0.8, 1.0][::-1]:
        model_params['approx_mult']['mult_val'] = mult_val
        train_model('Resnet18', 'CIFAR100', model_params, hyperparams)

test_setup()

Files already downloaded and verified
CIFAR100 Train dataset raw mean: 0.4783550798892975, raw std dev: 0.2678655982017517
Files already downloaded and verified
Files already downloaded and verified
Files already downloaded and verified
INFO: Size of dataset: Training 40000, Validation 10000, Test 10000
bn_pres is True
Originally num layers were 1
Replace layers list is: [0]
Number of og layers are 0, number of new layers are 1
bn_pres is True
Originally num layers were 20
Replace layers list is: [19]
Number of og layers are 19, number of new layers are 1
Model architecture:
ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), paddin

Plot: Train, 0, 6.941, 0.00, 0.0
Plot: Val, 0, 6.940, 0.00, 0.0
Plot: Train, 1, 4.339, 14.35, 36.6
Plot: Val, 1, 3.967, 12.74, 36.6
Plot: Train, 2, 3.238, 22.61, 73.0
Plot: Val, 2, 3.377, 19.31, 73.0
Plot: Train, 3, 3.534, 22.38, 109.4
Plot: Val, 3, 3.533, 19.84, 109.4
Plot: Train, 4, 2.443, 34.80, 145.8
Plot: Val, 4, 2.993, 27.70, 145.8
Plot: Train, 5, 2.610, 35.98, 182.0
Plot: Val, 5, 3.085, 28.12, 182.0
Plot: Train, 6, 1.748, 46.93, 218.2
Plot: Val, 6, 2.702, 33.71, 218.2
Plot: Train, 7, 2.348, 43.50, 254.4
Plot: Val, 7, 3.228, 30.88, 254.4
Plot: Train, 8, 1.441, 56.43, 291.0
Plot: Val, 8, 2.623, 35.00, 291.0
Plot: Train, 9, 1.196, 61.78, 327.9
Plot: Val, 9, 2.746, 36.40, 327.9
Plot: Train, 10, 1.223, 62.86, 365.2
Plot: Val, 10, 2.960, 34.89, 365.2
Plot: Train, 11, 1.360, 62.82, 402.9
Plot: Val, 11, 3.204, 32.62, 402.9
Plot: Train, 12, 0.807, 78.17, 441.6
Plot: Val, 12, 2.843, 37.38, 441.6
Plot: Train, 13, 0.502, 84.14, 481.4
Plot: Val, 13, 2.995, 38.75, 481.4
Plot: Train, 14, 0.534

Plot: Train, 15, 0.362, 87.33, 569.9
Plot: Val, 15, 3.325, 38.11, 569.9
Plot: Train, 16, 0.204, 93.70, 608.3
Plot: Val, 16, 3.603, 38.76, 608.3
Plot: Train, 17, 0.141, 91.76, 646.7
Plot: Val, 17, 3.739, 37.34, 646.7
Plot: Train, 18, 0.232, 93.55, 685.1
Plot: Val, 18, 4.276, 38.74, 685.1
Plot: Train, 19, 0.260, 92.75, 723.4
Plot: Val, 19, 4.014, 38.51, 723.4
Plot: Train, 20, 0.189, 94.26, 761.6
Plot: Val, 20, 4.071, 39.16, 761.6
Plot: Train, 21, 0.157, 96.11, 799.8
Plot: Val, 21, 4.001, 39.13, 799.8
Plot: Train, 22, 0.069, 94.69, 837.9
Plot: Val, 22, 3.852, 39.14, 837.9
Plot: Train, 23, 0.181, 95.81, 876.1
Plot: Val, 23, 4.241, 38.97, 876.1
Plot: Train, 24, 0.015, 98.28, 914.2
Plot: Val, 24, 4.199, 40.29, 914.2
Plot: Train, 25, 0.071, 98.16, 952.4
Plot: Val, 25, 3.938, 40.25, 952.4
Plot: Test, 3.938, 40.25, 952.4
Files already downloaded and verified
CIFAR100 Train dataset raw mean: 0.4783550798892975, raw std dev: 0.2678655982017517
Files already downloaded and verified
Files already d

Files already downloaded and verified
INFO: Size of dataset: Training 40000, Validation 10000, Test 10000
bn_pres is True
Originally num layers were 1
Replace layers list is: [0]
Number of og layers are 0, number of new layers are 1
bn_pres is True
Originally num layers were 20
Replace layers list is: [19]
Number of og layers are 19, number of new layers are 1
Model architecture:
ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), s

In [11]:
gradinit_params = {
    "gradinit_iters": 200,
    "gradinit_alg": "adam", #sgd
    "gradinit_lr": 1e-4, 
    "gradinit_grad_clip": 1,
}
model_params = {
#     "gradinit": gradinit_params,
#     "convnorm" : {"mode_conv": [('first_frac', 0.25)], "mode_bn": [('first_frac', 0.25)]},
#     "approx_mult" : {'mult_val' : 0.8, 'mode_linear' : [('last_num', 1)], 'mode_conv' : [('last_num', 1)]},
}
hyperparams = {
    "lr" : 3e-3,
    "num_epochs" : 25,
    "weight_decay" : 0,
    "train_ratio" : 0.8,
    "batch_size" : 512,
}

def test_setup():
    train_model('Resnet18', 'CIFAR100', model_params, hyperparams)

test_setup()

Files already downloaded and verified
CIFAR100 Train dataset raw mean: 0.4783550798892975, raw std dev: 0.2678655982017517
Files already downloaded and verified
Files already downloaded and verified
Files already downloaded and verified
INFO: Size of dataset: Training 40000, Validation 10000, Test 10000
Model architecture:
ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): Bat