In [1]:
# from google.colab import drive
# drive.mount('/content/drive')
# FOLDERNAME = 'CS231n_project/'
# assert FOLDERNAME is not None, "[!] Enter the foldername."

# import sys
# sys.path.append('/content/drive/My Drive/{}'.format(FOLDERNAME))

# %cd /content/drive/My\ Drive/$FOLDERNAME

# %load_ext autoreload
# %autoreload 2

In [2]:
# !pip install torch==1.7 torchvision==0.8

In [3]:
# %cd approx/src/pytorch/cpp
# !python setup.py install
# %cd ../../../..

In [4]:
import torch
import torchvision
import torch.nn as nn
import torch.optim as optim
from conv_norm import PreConv

import torchvision.datasets as dset
import torchvision.transforms as T
import torch.nn.functional as F

import numpy as np
from timeit import default_timer as timer
from utils import ImportanceSampler

USE_GPU = True
dtype = torch.float32  # Floating-point dtype used for model inputs throughout the notebook.

# Select CUDA only when it is both requested (USE_GPU) and actually available;
# otherwise fall back to the CPU.
device = torch.device('cuda' if (USE_GPU and torch.cuda.is_available()) else 'cpu')

print('using device:', device)

using device: cuda


In [5]:
from utils import get_accuracy, load_dataset
from models import get_model
def check_accuracy(loader, model):
  """Evaluate `model` on `loader` with the notebook-wide `device` and `dtype`.

  Thin wrapper that forwards to `get_accuracy(loader, model, device, dtype)`
  and returns its result unchanged.
  """
  return get_accuracy(loader, model, device, dtype)

In [6]:
def train_model(model_name, dataset_name, model_params=None, hyperparams=None):
  """Train a model on a dataset and print per-epoch "Plot:" metric lines.

  Args:
    model_name: Name forwarded to `get_model`.
    dataset_name: Name forwarded to `load_dataset`.
    model_params: Optional dict forwarded to `get_model`. This function itself
      reads 'importance_sampling' (default False) and 'gamma' (default 0.9).
    hyperparams: Optional dict. Keys read here, with defaults: 'lr' (1e-3),
      'num_epochs' (10), 'weight_decay' (0), 'train_ratio' (0.8),
      'batch_size' (64), 'seed' (0).

  Returns:
    The trained model.

  Relies on the notebook-level names `device`, `dtype`, `check_accuracy`,
  `load_dataset`, `get_model` and `ImportanceSampler`.
  """
  # None sentinels instead of mutable `{}` defaults, so no dict is shared
  # between calls (the dicts are handed on to get_model).
  if model_params is None:
    model_params = {}
  if hyperparams is None:
    hyperparams = {}

  learning_rate = hyperparams.get('lr', 1e-3)
  num_epochs = hyperparams.get('num_epochs', 10)
  weight_decay = hyperparams.get('weight_decay', 0)
  train_ratio = hyperparams.get('train_ratio', 0.8)
  batch_size = hyperparams.get('batch_size', 64)
  seed = hyperparams.get('seed', 0)
  imp_sampling = model_params.get('importance_sampling', False)
  gamma = model_params.get('gamma', 0.9)

  # Seed both RNGs before the data split and model construction.
  torch.manual_seed(seed)
  np.random.seed(seed)

  loader_train, loader_val, loader_test, num_train, num_channels = load_dataset(dataset_name, train_ratio, batch_size)
  model = get_model(model_name, model_params, learning_rate, loader_train, num_channels, device)

  print("Model architecture:")
  print(model)

  print(f'INFO: Training {model_name} on {dataset_name} with lr {learning_rate}, num_epochs={num_epochs}, weight_decay={weight_decay}')

  # Pass the configured weight decay through; it used to be hard-coded to 0,
  # silently ignoring the hyperparameter that the INFO line reports.
  optimizer = optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay)

  # One importance weight per training sample, all starting at 1.
  weight = torch.ones(num_train)

  # Report metrics in eval mode so they are not computed with the model in
  # training mode (harmless if get_accuracy already switches modes itself).
  model.eval()
  t_acc, t_loss = check_accuracy(loader_train, model)
  val_acc, val_loss = check_accuracy(loader_val, model)

  # The clock starts after the initial evaluation, so epoch 0 reports ~0 s.
  start = timer()
  c_time = timer() - start

  print(f'Plot: Train, {0}, {t_loss:.3f}, {t_acc:.2f}, {c_time:.1f}')
  print(f'Plot: Val, {0}, {val_loss:.3f}, {val_acc:.2f}, {c_time:.1f}')

  for e in range(num_epochs):
    model.train()

    # The first epoch is always sampled uniformly; later epochs use the
    # importance sampler only when it is enabled.
    do_uniform = (e == 0) or not imp_sampling
    loader_train_sampled = loader_train
    if not do_uniform:
      train_sampler = ImportanceSampler(num_train, weight, batch_size)
      loader_train_sampled, _, _, _, _ = load_dataset(dataset_name, train_ratio, batch_size, train_sampler)

    for tpl in loader_train_sampled:
      torch.cuda.empty_cache()
      x = tpl[0].to(device=device, dtype=dtype)  # move to device, e.g. GPU
      y = tpl[1].to(device=device, dtype=torch.long)

      scores = model(x)
      loss = F.cross_entropy(scores, y)

      optimizer.zero_grad()
      loss.backward()
      optimizer.step()

      if not do_uniform:
        # tpl[2] presumably holds the dataset indices of this batch (verify
        # against load_dataset). Each touched weight becomes an exponential
        # moving average of the batch-mean loss.
        idx = tpl[2]
        weight[idx] = gamma * weight[idx] + (1 - gamma) * float(loss)

    # Switch to eval mode before *both* evaluations (train and val).
    model.eval()
    t_acc, t_loss = check_accuracy(loader_train, model)
    val_acc, val_loss = check_accuracy(loader_val, model)
    c_time = timer() - start

    print(f'Plot: Train, {e+1}, {t_loss:.3f}, {t_acc:.2f}, {c_time:.1f}')
    print(f'Plot: Val, {e+1}, {val_loss:.3f}, {val_acc:.2f}, {c_time:.1f}')

  model.eval()
  test_acc, test_loss = check_accuracy(loader_test, model)
  # Print the test metrics; the original line re-printed the validation ones.
  print(f'Plot: Test, {test_loss:.3f}, {test_acc:.2f}, {c_time:.1f}')

  return model

In [7]:
# gradinit_params = {
#     "gradinit_iters": 50,
#     "gradinit_alg": "adam", #sgd
#     "gradinit_lr": 1e-2,
#     "gradinit_grad_clip": 100,
# }
# Cell-7 experiment configuration. `model_params` is passed on to get_model by
# train_model; only the "convnorm" entry is active here.
# Switched-off options kept for reference:
#     "gradinit": gradinit_params,
#     "approx_mult": 0.2,
#     "importance_sampling": True,
#     "gamma": 0.9
model_params = dict(
    convnorm=dict(
        mode_conv=[('first_frac', 0.25)],
        mode_bn=[('first_frac', 0.25)],
    ),
)

# Optimisation settings read by train_model.
hyperparams = dict(
    lr=3e-5,
    num_epochs=75,
    weight_decay=0,
    train_ratio=0.8,
    batch_size=1024,
)

In [8]:
# def test_setup():
#     modeList = [[('last_num', 1)], [('last_frac', 0.25)], [('last_frac', 0.5)], [('last_frac', 0.75)]][1:]
#     for mode in modeList:
#         model_params['convnorm']["mode_conv"] = mode
#         model_params['convnorm']["mode_bn"] = mode
#         train_model('VGG16', 'CIFAR100', model_params, hyperparams)

# test_setup()

In [9]:
# def test_setup():
#     modeList = [[('first_frac', 0.25)]]
#     for mode in modeList:
#         model_params['convnorm']["mode_conv"] = mode
#         model_params['convnorm']["mode_bn"] = mode
#         train_model('VGG16', 'CIFAR100', model_params, hyperparams)

# test_setup()

In [10]:
# GradInit settings for the cell-10 experiments.
gradinit_params = dict(
    gradinit_iters=200,
    gradinit_alg="adam",  # alternative noted by the author: "sgd"
    gradinit_lr=1e-2,
    gradinit_grad_clip=1,
)

# Model options passed on to get_model by train_model; only GradInit is enabled.
# Switched-off options kept for reference:
#     "convnorm": True,
#     "approx_mult": 0.2,
#     "importance_sampling": True,
#     "gamma": 0.9
model_params = dict(gradinit=gradinit_params)

# Optimisation settings read by train_model.
hyperparams = dict(
    lr=3e-5,
    num_epochs=60,
    weight_decay=0,
    train_ratio=0.8,
    batch_size=512,
)


def test_setup():
  for lr in [1e-5, 1e-4, 1e-3, 1e-2, 1e-1]:
    gradinit_params['gradinit_lr'] = lr
    model_params = {
        "gradinit": gradinit_params,
    }
    train_model('VGG16', 'CIFAR100', model_params, hyperparams)

# Launch the GradInit learning-rate sweep; each swept value triggers a full
# train_model run, so this cell is long-running.
test_setup()

Files already downloaded and verified
CIFAR100 Train dataset raw mean: 0.4783550798892975, raw std dev: 0.2678655982017517
Files already downloaded and verified
Files already downloaded and verified
Files already downloaded and verified
INFO: Size of dataset: Training 40000, Validation 10000, Test 10000
GradInit Args: adam, Iters 200, lr 1e-05
GradInit: Iter 10, obj iters 10, eta 1.000e-01, constraint count 0 loss: 2.369e+25 (2.569e+25), init loss: 6.910e+00 (6.910e+00), update loss 2.369e+24 (2.569e+24), total gnorm: 1.498e+03 (1.496e+03)	s_max: 1.00e+00	s_min: 1.00e+00	s_mean: 1.00e+00	s_weight_max: 1.00e+00	s_weight_min: 1.00e+00	s_weight_mean: 1.00e+00	
GradInit: Iter 20, obj iters 20, eta 1.000e-01, constraint count 0 loss: 1.532e+25 (2.359e+25), init loss: 6.911e+00 (6.911e+00), update loss 1.532e+24 (2.359e+24), total gnorm: 1.471e+03 (1.493e+03)	s_max: 1.00e+00	s_min: 1.00e+00	s_mean: 1.00e+00	s_weight_max: 1.00e+00	s_weight_min: 1.00e+00	s_weight_mean: 1.00e+00	
GradInit: Iter

Plot: Train, 0, 6.907, 0.00, 0.0
Plot: Val, 0, 6.909, 0.00, 0.0
Plot: Train, 1, 4.691, 0.97, 37.6
Plot: Val, 1, 4.684, 1.14, 37.6
Plot: Train, 2, 4.319, 2.59, 75.8
Plot: Val, 2, 4.433, 2.48, 75.8
Plot: Train, 3, 4.065, 6.79, 114.1
Plot: Val, 3, 4.070, 6.42, 114.1
Plot: Train, 4, 3.719, 9.35, 152.3
Plot: Val, 4, 3.934, 8.95, 152.3
Plot: Train, 5, 3.609, 12.16, 188.9
Plot: Val, 5, 3.611, 11.73, 188.9
Plot: Train, 6, 3.518, 14.88, 224.9
Plot: Val, 6, 3.581, 14.10, 224.9
Plot: Train, 7, 3.569, 17.48, 260.6
Plot: Val, 7, 3.592, 16.35, 260.6
Plot: Train, 8, 3.162, 19.11, 296.4
Plot: Val, 8, 3.499, 17.80, 296.4
Plot: Train, 9, 3.114, 21.90, 332.4
Plot: Val, 9, 3.371, 20.34, 332.4
Plot: Train, 10, 3.105, 23.21, 368.7
Plot: Val, 10, 3.111, 21.00, 368.7
Plot: Train, 11, 2.991, 21.70, 405.2
Plot: Val, 11, 3.278, 19.26, 405.2
Plot: Train, 12, 2.674, 26.87, 442.4
Plot: Val, 12, 3.248, 23.32, 442.4
Plot: Train, 13, 2.855, 28.15, 480.0
Plot: Val, 13, 2.967, 24.28, 480.0
Plot: Train, 14, 2.488, 29.73,

GradInit: Iter 120, obj iters 120, eta 1.000e-01, constraint count 0 loss: 2.218e+25 (2.511e+25), init loss: 6.910e+00 (6.910e+00), update loss 2.218e+24 (2.511e+24), total gnorm: 1.305e+03 (1.396e+03)	s_max: 1.01e+00	s_min: 9.88e-01	s_mean: 9.95e-01	s_weight_max: 1.01e+00	s_weight_min: 9.88e-01	s_weight_mean: 9.90e-01	
GradInit: Iter 130, obj iters 130, eta 1.000e-01, constraint count 0 loss: 2.653e+25 (2.523e+25), init loss: 6.910e+00 (6.910e+00), update loss 2.653e+24 (2.523e+24), total gnorm: 1.289e+03 (1.389e+03)	s_max: 1.01e+00	s_min: 9.87e-01	s_mean: 9.95e-01	s_weight_max: 1.01e+00	s_weight_min: 9.87e-01	s_weight_mean: 9.90e-01	
GradInit: Iter 140, obj iters 140, eta 1.000e-01, constraint count 0 loss: 8.839e+25 (2.532e+25), init loss: 6.909e+00 (6.910e+00), update loss 8.839e+24 (2.532e+24), total gnorm: 1.274e+03 (1.381e+03)	s_max: 1.01e+00	s_min: 9.86e-01	s_mean: 9.94e-01	s_weight_max: 1.01e+00	s_weight_min: 9.86e-01	s_weight_mean: 9.89e-01	
GradInit: Iter 150, obj iters 150,

Plot: Train, 45, 0.468, 83.50, 1682.4
Plot: Val, 45, 4.255, 28.05, 1682.4
Plot: Train, 46, 0.680, 75.77, 1720.7
Plot: Val, 46, 5.302, 25.92, 1720.7
Plot: Train, 47, 0.669, 84.06, 1758.7
Plot: Val, 47, 5.288, 26.94, 1758.7
Plot: Train, 48, 0.358, 82.45, 1796.7
Plot: Val, 48, 5.139, 26.76, 1796.7
Plot: Train, 49, 0.283, 91.38, 1834.6
Plot: Val, 49, 6.066, 26.98, 1834.6
Plot: Train, 50, 0.245, 90.07, 1872.6
Plot: Val, 50, 5.691, 26.11, 1872.6
Plot: Train, 51, 0.359, 84.14, 1911.0
Plot: Val, 51, 5.513, 24.67, 1911.0
Plot: Train, 52, 0.327, 93.68, 1949.5
Plot: Val, 52, 5.595, 27.07, 1949.5
Plot: Train, 53, 0.161, 92.97, 1988.0
Plot: Val, 53, 6.467, 26.05, 1988.0
Plot: Train, 54, 0.283, 93.48, 2026.4
Plot: Val, 54, 6.356, 26.40, 2026.4
Plot: Train, 55, 0.109, 96.49, 2064.7
Plot: Val, 55, 6.971, 27.36, 2064.7
Plot: Train, 56, 0.263, 94.22, 2103.1
Plot: Val, 56, 6.196, 25.76, 2103.1
Plot: Train, 57, 0.074, 98.04, 2141.4
Plot: Val, 57, 6.695, 27.02, 2141.4
Plot: Train, 58, 0.145, 94.64, 2179.8


Plot: Train, 0, 6.908, 0.00, 0.0
Plot: Val, 0, 6.908, 0.00, 0.0
Plot: Train, 1, 4.671, 1.04, 38.4
Plot: Val, 1, 4.667, 0.85, 38.4
Plot: Train, 2, 4.676, 1.00, 77.2
Plot: Val, 2, 4.630, 1.01, 77.2
Plot: Train, 3, 4.598, 1.00, 116.1
Plot: Val, 3, 4.617, 0.99, 116.1
Plot: Train, 4, 4.595, 1.04, 154.7
Plot: Val, 4, 4.628, 0.85, 154.7
Plot: Train, 5, 4.464, 2.15, 193.4
Plot: Val, 5, 4.378, 2.09, 193.4
Plot: Train, 6, 4.155, 3.40, 231.9
Plot: Val, 6, 4.275, 3.20, 231.9
Plot: Train, 7, 4.225, 5.78, 270.5
Plot: Val, 7, 4.063, 5.12, 270.5
Plot: Train, 8, 3.791, 8.45, 309.1
Plot: Val, 8, 3.926, 7.92, 309.1
Plot: Train, 9, 3.723, 9.76, 347.8
Plot: Val, 9, 3.884, 9.25, 347.8
Plot: Train, 10, 3.699, 11.55, 386.2
Plot: Val, 10, 3.681, 10.94, 386.2
Plot: Train, 11, 3.702, 9.75, 423.2
Plot: Val, 11, 3.733, 8.95, 423.2
Plot: Train, 12, 3.258, 12.47, 459.7
Plot: Val, 12, 3.803, 11.54, 459.7
Plot: Train, 13, 3.441, 13.75, 496.3
Plot: Val, 13, 3.583, 12.81, 496.3
Plot: Train, 14, 3.366, 14.71, 533.0
Plot:

GradInit: Iter 120, obj iters 120, eta 1.000e-01, constraint count 0 loss: 1.953e+25 (2.740e+25), init loss: 6.908e+00 (6.908e+00), update loss 1.953e+24 (2.740e+24), total gnorm: 1.824e+00 (9.986e+01)	s_max: 1.93e+00	s_min: 1.00e-02	s_mean: 5.98e-01	s_weight_max: 1.93e+00	s_weight_min: 1.00e-02	s_weight_mean: 1.96e-01	
GradInit: Iter 130, obj iters 130, eta 1.000e-01, constraint count 0 loss: 2.600e+25 (2.727e+25), init loss: 6.908e+00 (6.908e+00), update loss 2.600e+24 (2.727e+24), total gnorm: 1.824e+00 (9.232e+01)	s_max: 1.99e+00	s_min: 1.00e-02	s_mean: 6.00e-01	s_weight_max: 1.99e+00	s_weight_min: 1.00e-02	s_weight_mean: 2.01e-01	
GradInit: Iter 140, obj iters 140, eta 1.000e-01, constraint count 0 loss: 8.974e+25 (2.717e+25), init loss: 6.908e+00 (6.908e+00), update loss 8.974e+24 (2.717e+24), total gnorm: 1.824e+00 (8.585e+01)	s_max: 2.07e+00	s_min: 1.00e-02	s_mean: 6.02e-01	s_weight_max: 2.07e+00	s_weight_min: 1.00e-02	s_weight_mean: 2.05e-01	
GradInit: Iter 150, obj iters 150,

Plot: Train, 43, 4.607, 1.03, 1536.5
Plot: Val, 43, 4.608, 0.88, 1536.5
Plot: Train, 44, 4.527, 1.03, 1572.1
Plot: Val, 44, 4.572, 0.87, 1572.1
Plot: Train, 45, 4.584, 1.03, 1607.1
Plot: Val, 45, 4.550, 0.87, 1607.1
Plot: Train, 46, 4.541, 1.15, 1642.1
Plot: Val, 46, 4.586, 1.10, 1642.1
Plot: Train, 47, 4.582, 1.03, 1677.0
Plot: Val, 47, 4.550, 0.87, 1677.0
Plot: Train, 48, 4.491, 1.93, 1712.1
Plot: Val, 48, 4.492, 1.52, 1712.1
Plot: Train, 49, 4.324, 2.00, 1747.4
Plot: Val, 49, 4.456, 1.77, 1747.4
Plot: Train, 50, 4.408, 2.10, 1782.9
Plot: Val, 50, 4.348, 2.09, 1782.9
Plot: Train, 51, 4.412, 2.74, 1818.9
Plot: Val, 51, 4.381, 2.58, 1818.9
Plot: Train, 52, 4.378, 2.42, 1855.4
Plot: Val, 52, 4.416, 2.47, 1855.4
Plot: Train, 53, 4.429, 2.80, 1892.1
Plot: Val, 53, 4.369, 2.47, 1892.1
Plot: Train, 54, 4.147, 3.14, 1929.2
Plot: Val, 54, 4.312, 2.75, 1929.2
Plot: Train, 55, 4.322, 3.45, 1966.3
Plot: Val, 55, 4.313, 3.15, 1966.3
Plot: Train, 56, 4.489, 3.28, 2003.3
Plot: Val, 56, 4.209, 3.00,

Plot: Train, 0, 6.908, 0.00, 0.0
Plot: Val, 0, 6.908, 0.00, 0.0
Plot: Train, 1, 4.629, 1.03, 34.8
Plot: Val, 1, 4.624, 0.88, 34.8
Plot: Train, 2, 4.617, 1.01, 69.6
Plot: Val, 2, 4.607, 0.95, 69.6
Plot: Train, 3, 4.597, 1.04, 104.6
Plot: Val, 3, 4.607, 0.84, 104.6
Plot: Train, 4, 4.605, 1.01, 139.9
Plot: Val, 4, 4.623, 0.95, 139.9
Plot: Train, 5, 4.610, 1.03, 175.8
Plot: Val, 5, 4.617, 0.89, 175.8
Plot: Train, 6, 4.600, 1.04, 212.1
Plot: Val, 6, 4.610, 0.83, 212.1
Plot: Train, 7, 4.611, 1.03, 247.4
Plot: Val, 7, 4.615, 0.87, 247.4
Plot: Train, 8, 4.608, 0.99, 282.2
Plot: Val, 8, 4.611, 1.04, 282.2
Plot: Train, 9, 4.607, 1.00, 317.1
Plot: Val, 9, 4.607, 0.98, 317.1
Plot: Train, 10, 4.613, 0.99, 351.9
Plot: Val, 10, 4.611, 1.04, 351.9
Plot: Train, 11, 4.606, 1.03, 387.0
Plot: Val, 11, 4.616, 0.87, 387.0
Plot: Train, 12, 4.580, 1.03, 422.2
Plot: Val, 12, 4.614, 0.87, 422.2
Plot: Train, 13, 4.604, 1.01, 457.6
Plot: Val, 13, 4.607, 0.95, 457.6
Plot: Train, 14, 4.613, 1.03, 493.5
Plot: Val, 1

In [None]:
# gradinit_params = {
#     "gradinit_iters": 200,
#     "gradinit_alg": "adam", #sgd
#     "gradinit_lr": 1e-5,
#     "gradinit_grad_clip": 1,
# }
# model_params = {
#     "gradinit": gradinit_params,
#     # "convnorm" : True,
#     # "approx_mult" : 0.2,
#     # "importance_sampling" : True,
#     # "gamma" : 0.9
# }
# hyperparams = {
#     "lr" : 3e-5,
#     "num_epochs" : 60,
#     "weight_decay" : 0,
#     "train_ratio" : 0.8,
#     "batch_size" : 512,
# }


# def test_setup():
#   for iters in [50, 100, 200, 350, 500]:
#     gradinit_params['gradinit_iters'] = iters
#     model_params = {
#         "gradinit": gradinit_params,
#     }
#     train_model('VGG16', 'CIFAR100', model_params, hyperparams)

# test_setup()

In [13]:
1

1