In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from math import ceil
import torch
from torch.utils.data import DataLoader
from torch.autograd import Variable
import torch.optim as optim

import matplotlib.pyplot as plt
%matplotlib inline

import sys
sys.path.append('..')
from utils.input_pipeline import get_image_folders
from utils.training import train
from utils.quantization import optimization_step, quantize, initial_scales

# Later cells move tensors and the model to the GPU with .cuda(),
# so confirm up front that CUDA is usable.
torch.cuda.is_available()

True

In [3]:
# Switch on cuDNN's benchmark mode (cuDNN then auto-tunes the convolution algorithms it uses).
# NOTE(review): this usually pays off only when input shapes stay the same from
# batch to batch — confirm that the input pipeline yields fixed-size images.
torch.backends.cudnn.benchmark = True

In [4]:
# Passed to get_model() and used as `lr` for both Adam optimizers created below.
LEARNING_RATE = 1e-4  # learning rate for all possible weights
# Passed as `t` to quantize() and to optimization_step().
# NOTE(review): presumably the threshold factor that decides which weights are
# zeroed — confirm against utils.quantization.
HYPERPARAMETER_T = 0.15  # hyperparameter for quantization

# Create data iterators

In [5]:
# Mini-batch size of the training DataLoader; also used to compute n_batches.
# The validation DataLoader uses its own, larger batch size.
batch_size = 64

In [6]:
train_folder, val_folder = get_image_folders()

# training batches: drawn in a new random order on every pass over the data
train_iterator = DataLoader(
    train_folder,
    batch_size=batch_size,
    shuffle=True,
    num_workers=4,
    pin_memory=True,
)

# validation batches: fixed order and larger batches than for training
val_iterator = DataLoader(
    val_folder,
    batch_size=256,
    shuffle=False,
    num_workers=4,
    pin_memory=True,
)

# number of training samples
train_size = len(train_folder.imgs)
train_size

100000

# Model

In [7]:
from get_densenet import get_model

In [8]:
# get_model hands back the network together with its loss and its optimizer;
# it is called with the notebook-wide learning rate.
model, loss, optimizer = get_model(learning_rate=LEARNING_RATE)

# load pretrained model, accuracy ~73%
# (checkpoint from the vanilla_densenet_big experiment; these weights are the
# full-precision starting point that the cells below copy and then quantize)
model.load_state_dict(torch.load('../vanilla_densenet_big/model_step5.pytorch_state'))

#### keep copy of full precision kernels

In [9]:
# Full-precision copies of almost all kernels of the model.
# Parameter group 1 of the optimizer holds the kernel tensors of all
# convolutional layers (with the exception of the first conv layer);
# each of them is cloned into a new Variable that tracks gradients.
all_fp_kernels = [
    Variable(conv_kernel.data.clone(), requires_grad=True)
    for conv_kernel in optimizer.param_groups[1]['params']
]

#### initial quantization 

In [10]:
# scaling factors for each quantized layer
# (the quantization loop stores one (w_p, w_n) tuple per kernel in this list;
# optimizer_sf is later built from these tuples)
initial_scaling_factors = []

In [11]:
# these kernels will be quantized
# (a new list that holds the parameter objects of optimizer group 1 themselves,
# not copies of them)
all_kernels = list(optimizer.param_groups[1]['params'])

In [12]:
# Start from an empty list on every execution of this cell. Appending to the
# list left over from a previous run would store every layer's scaling factors
# twice, and optimizer_sf (built with one parameter per list entry) would then
# receive duplicate, unused parameters.
initial_scaling_factors = []

for k, k_fp in zip(all_kernels, all_fp_kernels):

    # choose initial scaling factors from the full-precision copy
    w_p_initial, w_n_initial = initial_scales(k_fp.data)
    initial_scaling_factors.append((w_p_initial, w_n_initial))

    # do quantization: the model's kernel is overwritten with the quantized
    # version of its full-precision copy (the copy itself stays untouched)
    k.data = quantize(k_fp.data, w_p_initial, w_n_initial, t=HYPERPARAMETER_T)

#### parameter updaters

In [13]:
# optimizer for updating only all_fp_kernels
# (the full-precision copies; the quantized kernels inside the model are
# parameters of `optimizer`)
optimizer_fp = optim.Adam(all_fp_kernels, lr=LEARNING_RATE)

In [14]:
# optimizer for updating only scaling factors:
# each quantized layer contributes one trainable two-element GPU tensor
# holding its pair [w_p, w_n]
optimizer_sf = optim.Adam(
    [
        Variable(
            torch.FloatTensor([positive_scale, negative_scale]).cuda(),
            requires_grad=True
        )
        for positive_scale, negative_scale in initial_scaling_factors
    ],
    lr=LEARNING_RATE
)

# Train

In [15]:
from torch.optim.lr_scheduler import ReduceLROnPlateau

class lr_scheduler_list:
    """Bundle of ReduceLROnPlateau schedulers, one per optimizer.

    Every wrapped scheduler is configured the same way: it watches a metric
    that should increase (mode='max') and uses factor=0.1, patience=3 and an
    absolute improvement threshold of 0.01.
    """

    def __init__(self, optimizer_list):
        """Create one scheduler for each optimizer in `optimizer_list`."""
        plateau_schedulers = []
        for single_optimizer in optimizer_list:
            plateau_schedulers.append(
                ReduceLROnPlateau(
                    single_optimizer, mode='max', factor=0.1, patience=3,
                    verbose=True, threshold=0.01, threshold_mode='abs'
                )
            )
        self.lr_scheduler_list = plateau_schedulers

    def step(self, test_accuracy):
        """Pass the latest `test_accuracy` on to every wrapped scheduler."""
        for plateau_scheduler in self.lr_scheduler_list:
            plateau_scheduler.step(test_accuracy)

# number of epochs of the first training phase (n_epochs is reassigned before
# the continuation run further down)
n_epochs = 15
# batches per epoch; ceil() also counts the last, possibly smaller batch
n_batches = ceil(train_size/batch_size)

# total number of batches in the train set
n_batches

1563

In [16]:
%%time
# the three optimizers that are used together: the one returned by get_model,
# the one for the full-precision kernel copies and the one for the scaling factors
optimizer_list = [optimizer, optimizer_fp, optimizer_sf]


def optimization_step_fn(model, loss, x_batch, y_batch):
    """Delegate one step to `optimization_step` with the notebook's settings.

    The four positional arguments are forwarded unchanged; the optimizer list
    and the hyperparameter `t` are filled in from the notebook's globals.
    Returns whatever `optimization_step` returns.
    """
    notebook_settings = dict(optimizer_list=optimizer_list, t=HYPERPARAMETER_T)
    return optimization_step(model, loss, x_batch, y_batch, **notebook_settings)


# train() receives the step function above plus a bundle of
# ReduceLROnPlateau schedulers covering all three optimizers
all_losses = train(
    model, loss, optimization_step_fn,
    train_iterator, val_iterator, n_epochs,
    lr_scheduler=lr_scheduler_list(optimizer_list)
)
# epoch logloss  accuracy    top5_accuracy time  (first value: train, second value: val)

0  3.605 2.637  0.213 0.369  0.454 0.656  674.036
1  2.571 2.173  0.384 0.473  0.670 0.744  668.391
2  2.276 2.121  0.444 0.487  0.724 0.751  667.756
3  2.116 1.922  0.478 0.520  0.751 0.784  668.168
4  1.995 1.934  0.505 0.522  0.772 0.783  668.347
5  1.914 1.824  0.522 0.546  0.786 0.802  668.592
6  1.843 1.749  0.538 0.567  0.797 0.812  668.261
7  1.788 1.845  0.549 0.541  0.806 0.795  668.585
8  1.737 1.718  0.561 0.571  0.814 0.812  668.550
9  1.693 1.861  0.571 0.547  0.819 0.792  668.989
10  1.655 1.689  0.579 0.583  0.825 0.819  668.097
11  1.619 1.647  0.587 0.590  0.831 0.824  668.579
12  1.586 1.545  0.594 0.616  0.836 0.844  668.487
13  1.552 1.699  0.602 0.575  0.842 0.814  668.539
14  1.521 1.600  0.609 0.601  0.846 0.833  668.464
CPU times: user 2h 56min 43s, sys: 27min 18s, total: 3h 24min 2s
Wall time: 2h 47min 11s


In [17]:
# backup: checkpoint the model after the first training phase.
# The model is moved to the CPU first, so the saved state holds CPU tensors;
# it has to be moved back to the GPU before training continues.
model.cpu()
torch.save(
    model.state_dict(),
    'model_ternary_quantization.pytorch_state'
)

# Continue training

In [18]:
# reduce learning rate of every parameter group of every optimizer.
# The loop variable is deliberately not called `optimizer`: at notebook level a
# `for` target stays bound after the loop ends, so that name would end up
# pointing at the last list entry (optimizer_sf) instead of the optimizer
# returned by get_model, and re-running the cells that read
# optimizer.param_groups[1] would then fail.
for current_optimizer in optimizer_list:
    for group in current_optimizer.param_groups:
        group['lr'] = 1e-5

In [22]:
# the continuation run uses 5 epochs
n_epochs = 5
# the backup cell moved the model to the CPU; put it back on the GPU before training
model.cuda();

In [23]:
%%time
# optimization_step_fn from the first training cell is reused as is instead of
# being defined a second time with an identical body. It reads optimizer_list,
# whose learning rates were lowered in the cell above.
# No lr_scheduler is passed for this run, so train() falls back to its default.
# NOTE(review): this rebinds all_losses, so the losses returned by the first
# training run are no longer reachable under that name.
all_losses = train(
    model, loss, optimization_step_fn,
    train_iterator, val_iterator, n_epochs
)
# epoch logloss  accuracy    top5_accuracy time  (first value: train, second value: val)

0  1.312 1.362  0.660 0.654  0.875 0.866  649.648
1  1.243 1.341  0.675 0.661  0.885 0.868  651.109
2  1.222 1.366  0.678 0.657  0.889 0.866  651.324
3  1.207 1.334  0.682 0.664  0.890 0.870  651.090
4  1.195 1.350  0.685 0.659  0.891 0.869  651.020
CPU times: user 57min 55s, sys: 8min 45s, total: 1h 6min 41s
Wall time: 54min 14s


# Final save

In [24]:
# Final checkpoint after the continuation run. It is written to the same file
# as the earlier backup and therefore replaces it.
# The model is moved to the CPU first, so the saved state holds CPU tensors.
model.cpu()
torch.save(
    model.state_dict(),
    'model_ternary_quantization.pytorch_state'
)