# Reset training to reach state-of-art results

In [1]:
import os
import glob
import pickle
import multiprocessing
from beautifultable import BeautifulTable as BT

# Switching multiprocessing to avoid "Too many files opened"
import torch.multiprocessing
torch.multiprocessing.set_sharing_strategy('file_system')

import torch
import torch.nn as nn
import torch.optim as optim
import torch.backends.cudnn as cudnn
from torch.utils.data import DataLoader
from torch.utils.data.sampler import SubsetRandomSampler


import sys
sys.path.append('..')
sys.path.append('ResNets')
from utils import load_dataset, count_parameters


import warnings
warnings.filterwarnings('always')
warnings.filterwarnings('ignore')
warnings.filterwarnings('ignore', 'ImportWarning')
warnings.filterwarnings('ignore', 'DeprecationWarning')

In [3]:
''' 
CONFIGURATION 
-------------

Catch from the parser all the parameters to define the training
'''
print('\n\nCONFIGURATION')
print('-------------')

dataset = 'CIFAR10'
testing = False             # Activate test to run few iterations per epoch       
comments = True             # Activate printing comments
ensemble_type = 'Big'       # Single model big 
#ensemble_type = 'Huge'     # Single model huge
batch_size = 128
n_epochs = 300
n_iters = 64000
learning_rate = 0.001

momentum = 0.9
weight_decay = 1e-4

# GPU if CUDA is available
cuda = torch.cuda.is_available()
n_workers = multiprocessing.cpu_count()
device = 'cuda' if torch.cuda.is_available() else 'cpu'
gpus = True if torch.cuda.device_count() > 1 else False
mem = False if device == 'cpu' else True

table = BT()
table.append_row(['Python Version', sys.version[:5]])
table.append_row(['PyTorch Version', torch.__version__])
table.append_row(['Cuda', str(cuda)])
table.append_row(['Device', str(device)])
table.append_row(['Cores', str(n_workers)])
table.append_row(['GPUs', str(torch.cuda.device_count())])
table.append_row(['CUDNN Enabled', str(torch.backends.cudnn.enabled)])
print(table)





CONFIGURATION
-------------
+-----------------+-------+
| Python Version  | 3.6.5 |
+-----------------+-------+
| PyTorch Version | 1.0.0 |
+-----------------+-------+
|      Cuda       | True  |
+-----------------+-------+
|     Device      | cuda  |
+-----------------+-------+
|      Cores      |   4   |
+-----------------+-------+
|      GPUs       |   1   |
+-----------------+-------+
|  CUDNN Enabled  | True  |
+-----------------+-------+


In [4]:
!nvidia-smi

Tue Jan  8 07:53:44 2019       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 410.79       Driver Version: 410.79       CUDA Version: 10.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  Tesla K80           On   | 00000000:00:1E.0 Off |                    0 |
| N/A   37C    P8    25W / 149W |     11MiB / 11441MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage    

In [5]:
'''
DEFININTION OF PATHS 
--------------------
Define all the paths to load / save files
Ensure all those paths are correctly defined before moving on
'''

print('DEFINITION OF PATHS')
print('-------------------')
scripts = os.getcwd()
root = os.path.abspath(os.path.join(scripts, '../'))
results = os.path.abspath(os.path.join(root, 'results'))
data_path = os.path.abspath(os.path.join(root, '../datasets'))

path_to_logs = os.path.join(results, 'logs', 'resnets')
path_to_models = os.path.join(results, 'models', 'resnets')
path_to_figures = os.path.join(results, 'figures', 'resnets')
path_to_definitives = os.path.join(path_to_models, 'definitives')

train_log = os.path.join(path_to_logs, 'train')
test_log = os.path.join(path_to_logs, 'test')

assert os.path.exists(root), 'Root folder not found'
assert os.path.exists(scripts), 'Scripts folder not found'
assert os.path.exists(results), 'Results folder not found'
assert os.path.exists(data_path), 'Data folder not found'
assert os.path.exists(path_to_logs), 'Logs folder not found'
assert os.path.exists(path_to_models), 'Models folder not found'
assert os.path.exists(path_to_figures), 'Figure folder not found'
assert os.path.exists(path_to_definitives), 'Def. models folder not found'

print('Paths Validated')
print('---------------')
print('Root path: ', root)
print('Script path: ', scripts)
print('Results path: ', results)
print('DataFolder path: ', data_path)
print('Models to save path: ', path_to_models)
print('Models to load path: ', path_to_definitives)

paths = {
    'root': root, 
    'script': scripts,
    'data': data_path,
    'resulsts': results,
    'logs': {'train': train_log, 'test': test_log}, 
    'models': path_to_models,
    'definitives': path_to_definitives,
    'figures': path_to_figures
}

DEFINITION OF PATHS
-------------------
Paths Validated
---------------
Root path:  /home/ec2-user/Single_vs_Ensemble_of_NNs
Script path:  /home/ec2-user/Single_vs_Ensemble_of_NNs/ResNets
Results path:  /home/ec2-user/Single_vs_Ensemble_of_NNs/results
DataFolder path:  /home/ec2-user/datasets
Models to save path:  /home/ec2-user/Single_vs_Ensemble_of_NNs/results/models/resnets
Models to load path:  /home/ec2-user/Single_vs_Ensemble_of_NNs/results/models/resnets/definitives


In [6]:
# 1 - Import the Dataset
# ----------------------

print('IMPORTING DATA')
print('--------------')

dataset = 'CIFAR10'
comments=True
train_set, valid_set, test_set = load_dataset(data_path, dataset, comments=comments)

train_loader = DataLoader(dataset = train_set.dataset, 
                          sampler=SubsetRandomSampler(train_set.indices),
                          batch_size = batch_size, num_workers=n_workers,
                          pin_memory = mem)

valid_loader = DataLoader(dataset = valid_set.dataset, 
                          sampler=SubsetRandomSampler(valid_set.indices),
                          batch_size = batch_size, num_workers=n_workers,
                          pin_memory = mem)

test_loader = DataLoader(dataset = test_set, batch_size = 1,
                         shuffle = False, num_workers=n_workers, pin_memory = mem)


batches = len(train_loader)
samples = len(train_loader.sampler.indices) 

IMPORTING DATA
--------------
Files already downloaded and verified
Loading dataset:  CIFAR10
+--------------+-------+
| Train Images | 45000 |
+--------------+-------+
| Valid Images | 5000  |
+--------------+-------+
| Test Images  | 10000 |
+--------------+-------+
|   Classes    |  10   |
+--------------+-------+


In [7]:
# 2 - Import the ResNet
# ---------------------

print('\n\nIMPORTING MODELS')
print('----------------')

from resnets_Paper import ResNet20, ResNet32, ResNet44, ResNet56, ResNet110

resnet20 = ResNet20()
resnet32 = ResNet32()
resnet44 = ResNet44()
resnet56 = ResNet56()
resnet110 = ResNet110()

def parameters(model, typ=None):
    def compare_to_simplest(model, typ):
        simplest = count_parameters(resnet20)
        if typ is None: return count_parameters(model) / simplest
    return count_parameters(model)*1e-6, compare_to_simplest(model, typ)


table = BT()
table.append_row(['Model', 'M. Paramars', '% over ResNet20'])
table.append_row(['ResNet 20', *parameters(resnet20)])
table.append_row(['ResNet 32', *parameters(resnet32)])
table.append_row(['ResNet 44', *parameters(resnet44)])
table.append_row(['ResNet 56', *parameters(resnet56)])
table.append_row(['ResNet 110', *parameters(resnet110)])
if comments: print(table)



IMPORTING MODELS
----------------
+------------+-------------+-----------------+
|   Model    | M. Paramars | % over ResNet20 |
+------------+-------------+-----------------+
| ResNet 20  |    0.272    |       1.0       |
+------------+-------------+-----------------+
| ResNet 32  |    0.467    |      1.714      |
+------------+-------------+-----------------+
| ResNet 44  |    0.661    |      2.427      |
+------------+-------------+-----------------+
| ResNet 56  |    0.856    |      3.141      |
+------------+-------------+-----------------+
| ResNet 110 |    1.731    |      6.352      |
+------------+-------------+-----------------+


In [8]:
# Apply constraint - Parameters constant

small = count_parameters(ResNet20())  # 3:1 vs 6:1
singleModel = ResNet56() if ensemble_type == 'Big' else ResNet110() 
ensemble_size = round(count_parameters(singleModel) / small)


# Construct the single model

singleModel = ResNet56() if ensemble_type == 'Big' else ResNet110() # 3:1 vs 6:1
title = singleModel.name

name = singleModel.name
singleModel.to(device)
if gpus: singleModel = nn.DataParallel(singleModel)
optimizer = optim.SGD(singleModel.parameters(), learning_rate, momentum, weight_decay)


# Construct the ensemble

names = []
ensemble = []
optimizers = []
for i in range(ensemble_size):
    
    model = ResNet20()
    names.append(model.name + '_' + str(i+1))
    params = optim.SGD(model.parameters(), learning_rate, momentum, weight_decay)
    optimizers.append(params)
    
    model.to(device)
    if gpus: model = nn.DataParallel(model)
    ensemble.append(model)

In [17]:
# Load Best Models
# ----------------

from collections import OrderedDict

def get_epoch(pth):
    pth = pth.split('/')[-1] # remove all the path
    pth = pth[:-4] # remove .pkl
    epoch = pth.split('-')[1] # get just the epoch
    print('Epoch to restart training: ', epoch)
    return epoch

def load_weights(path, verbose=0):
    global device
    print('\n\nLoading Weights from: ', path)
    print('-------')
    state_dict = torch.load(path, map_location=device)
    if verbose == 1: print('\nCurrent dict: ', state_dict.keys())
    new_state_dict = OrderedDict()
    for k,v in state_dict.items():
        name = k[7:] # remove module.
        new_state_dict[name] = v
    if verbose == 1: print('\nNew dict: ', new_state_dict.keys())
    return new_state_dict

## LOAD TRAINED MODELS      -->         args.pretrained = -P = True
print('Loading trained models... ')

# Load saved models
if ensemble_type == 'Big':
    pth = os.path.join(path_to_definitives, 'resnet56')
else:
    pth = os.path.join(path_to_definitives, 'resnet110')
    e_epoch = get_epoch(pth)

assert os.path.exists(pth), 'Model to load not found'

ps = glob.glob(os.path.join(pth, '*.pkl'))
print('Files to load:')
for p in ps:
    print('...', p[-25:])

Loading trained models... 
Files to load:
... snet56/ResNet20_3-149.pkl
... snet56/ResNet20_2-149.pkl
... resnet56/ResNet56-161.pkl
... snet56/ResNet20_1-149.pkl


In [29]:
single_ps = [p for p in ps if 'ResNet56' in p]
ensemble_ps = [p for p in ps if 'ResNet56' not in p]
print('Single: ', single_ps[0][-25:])

Single:  resnet56/ResNet56-161.pkl


In [30]:
# Single Model
print('Getting ready Single Model : ', singleModel)
s_epoch = int(get_epoch(single_ps[0]))
singleModel.load_state_dict(load_weights(single_ps[0], verbose=0))
print('[OK] Single model loaded on epoch ', s_epoch)

Getting ready Single Model :  ResNet(
  (conv): Conv2d(3, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
  (bn): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace)
  (layer1): Sequential(
    (0): BasicBlock(
      (conv1): Conv2d(16, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (conv2): Conv2d(16, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (bn2): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace)
    )
    (1): BasicBlock(
      (conv1): Conv2d(16, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (conv2): Conv2d(16, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (bn2):

In [31]:
# Ensemble Members
e_epoch = int(get_epoch(ensemble_ps[0]))
for i,p in enumerate(ensemble_ps):                
    ensemble[i].load_state_dict(load_weights(p, verbose=0))  
print('[OK] Ensemble loaded on epoch ', e_epoch)

Epoch to restart training:  149


Loading Weights from:  /home/ec2-user/Single_vs_Ensemble_of_NNs/results/models/resnets/definitives/resnet56/ResNet20_3-149.pkl
-------


Loading Weights from:  /home/ec2-user/Single_vs_Ensemble_of_NNs/results/models/resnets/definitives/resnet56/ResNet20_2-149.pkl
-------


Loading Weights from:  /home/ec2-user/Single_vs_Ensemble_of_NNs/results/models/resnets/definitives/resnet56/ResNet20_1-149.pkl
-------
[OK] Ensemble loaded on epoch  149


### Reset training of Single Deep Model

In [34]:
# Reset Models from saved Epoch
# -----------------------------

print('\n\nTRAINING')
print('--------')

save = False
criterion = nn.CrossEntropyLoss().cuda() if cuda else nn.CrossEntropyLoss()

# Big Single Model
     
cudnn.benchmark = False    
cudnn.benchmark = True

from train_reset import train as tr
print('Starting Single Model Training...' )
params = [dataset, name, singleModel, optimizer, criterion, device, train_loader,
          valid_loader, s_epoch, n_epochs, n_iters, save, paths, testing]

results = tr(*params)
with open('Results_Loaded_Single_Models.pkl', 'wb') as object_result:
    pickle.dump(results, object_result, pickle.HIGHEST_PROTOCOL)

results.show()



TRAINING
--------
Starting Single Model Training...


OSError: [Errno 24] Too many open files

## Reset training of Ensemble Model

In [None]:
# Ensemble Model
    
cudnn.benchmark = False    
cudnn.benchmark = True
from train_reset_ensemble import train as tre
print('Starting Ensemble Training...')

params = [dataset, names, ensemble, optimizers, criterion, device, train_loader,
          valid_loader, e_epoch, n_epochs, n_iters, save, paths, testing]

ens_results = tre(*params)
with open('Results_Loaded_Ensemble_Models.pkl', 'wb') as object_result:
    pickle.dump(ens_results, object_result, pickle.HIGHEST_PROTOCOL)

ens_results.show()