In [1]:
import os
import sys
import pickle
import argparse

import torch
import torch.nn as nn
import torch.optim as optim
import torch.backends.cudnn as cudnn

import torch.multiprocessing
torch.multiprocessing.set_sharing_strategy('file_system')

from utils import avoidWarnings
from beautifultable import BeautifulTable as BT

avoidWarnings()

''' OPTIMIZER PARAMETERS - Analysis on those '''

test = False
best_acc = 0  
start_epoch = 0  
num_epochs = 200  ## TODO: set to args.epochs
batch_size = 128  ## TODO: set to args.barch
milestones = [100, 150]

L = 16
M = 32
E = 7
 
comments = True
n_workers = torch.multiprocessing.cpu_count()
device = 'cuda' if torch.cuda.is_available() else 'cpu'
gpus = True if torch.cuda.device_count() > 1 else False
device_name = torch.cuda.get_device_name(0) if device == 'cuda' else 'CPUs'
    
table = BT()
table.append_row(['Python Version', sys.version[:5]])
table.append_row(['PyTorch Version', torch.__version__])
table.append_row(['Device', str(device_name)])
table.append_row(['Cores', str(n_workers)])
table.append_row(['GPUs', str(torch.cuda.device_count())])
table.append_row(['CUDNN Enabled', str(torch.backends.cudnn.enabled)])
table.append_row(['Architecture', 'DenseNet x7'])
table.append_row(['Dataset', 'CIFAR10'])
table.append_row(['Epochs', str(num_epochs)])
table.append_row(['Batch Size', str(batch_size)])

print(table)

+-----------------+----------------------+
| Python Version  |        3.6.5         |
+-----------------+----------------------+
| PyTorch Version |        1.0.0         |
+-----------------+----------------------+
|     Device      | Tesla V100-SXM2-16GB |
+-----------------+----------------------+
|      Cores      |          8           |
+-----------------+----------------------+
|      GPUs       |          1           |
+-----------------+----------------------+
|  CUDNN Enabled  |         True         |
+-----------------+----------------------+
|  Architecture   |     DenseNet x7      |
+-----------------+----------------------+
|     Dataset     |       CIFAR10        |
+-----------------+----------------------+
|     Epochs      |         200          |
+-----------------+----------------------+
|   Batch Size    |         128          |
+-----------------+----------------------+


In [2]:
# Data
# ----

avoidWarnings()
dataset = 'MNIST'
dataset = 'CIFAR'
from data import dataloaders
trainloader, testloader, classes = dataloaders(dataset, batch_size)

==> Preparing data..
Files already downloaded and verified
Files already downloaded and verified


In [3]:
# Models 
# ------
    
# For now, NO SHARING of any layers withing the ensemble

avoidWarnings()
from models import Conv_Net
from utils import count_parameters
net = Conv_Net('net', layers=L, filters=M)

print('Regular net')
print('\n\n\t\tParameters: {}M'.format(count_parameters(net)/1e6))

criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), lr=1e-3, momentum=0.9, weight_decay=1e-5)

Regular net


		Parameters: 0.174634M


In [4]:
# Training
# --------
    
#from utils import timeit
from results import TrainResults as Results
## Note: the paper doesn't mention about trainining iterations

def train(epoch):
    
    net.train()
    print('\nEpoch: %d' % epoch)

    total = 0
    correct = 0
    global results
    
    for batch_idx, (inputs, targets) in enumerate(trainloader):
        
        optimizer.zero_grad()

        inputs, targets = inputs.to(device), targets.to(device)
        outputs = net(inputs)
        loss = criterion(outputs, targets)
        
        loss.backward()
        optimizer.step()
    
        _, predicted = outputs.max(1)
        total += targets.size(0)
        correct += predicted.eq(targets).sum().item()
    
    accuracy = 100.*correct/total    
    results.append_loss(loss.item(), 'train')
    results.append_accy(accuracy, 'train')    
    print('Train :: Loss: {} | Accy: {}'.format(round(loss.item(),2), round(accuracy,2)))

        
def test(epoch):
    
    net.eval()

    total = 0
    correct = 0
    global results
    global best_acc

    with torch.no_grad():
        
        for batch_idx, (inputs, targets) in enumerate(testloader):
            
            inputs, targets = inputs.to(device), targets.to(device)
            outputs = net(inputs)
            loss = criterion(outputs, targets)

            _, predicted = outputs.max(1)
            total += targets.size(0)
            correct += predicted.eq(targets).sum().item()
            
    # Save checkpoint.
    acc = 100.*correct/total
    results.append_loss(loss.item(), 'valid')
    results.append_accy(acc, 'valid')
    print('Valid :: Loss: {} | Accy: {}'.format(round(loss.item(),2), round(acc,2)))
    
    if acc > best_acc:
        print('Saving..')
        state = {
            'net': net.state_dict(),
            'acc': acc,
            'epoch': epoch,
        }
        if not os.path.isdir('checkpoint'):
            os.mkdir('checkpoint')
        torch.save(state, './checkpoint/ckpt.t7')
        best_acc = acc

In [5]:
import time
start = time.time()

results = Results([net])
net.to(device)
if device == 'cuda':
    net = torch.nn.DataParallel(net)
    cudnn.benchmark = True

print('[OK]: Starting Training of Single Model')
for epoch in range(start_epoch, num_epochs):
    
    # LR Scheduler
    if epoch == milestones[0] or epoch == milestones[1]:
        for p in optimizer.param_groups:  p['lr'] = p['lr'] / 10
        print('\n** Changing LR to {} \n'.format(p['lr'])) 
        
    # Run epoch
    train(epoch)
    test(epoch)
    
    # Timer
    print('Time: {}'.format(round((time.time() - start),2)))
    start = time.time()
    
    # Results backup
    if epoch % 20 == 0:
        with open('Results_Singe.pkl', 'wb') as object_result:
                pickle.dump(results, object_result, pickle.HIGHEST_PROTOCOL) 

    
results.show()

[OK]: Starting Training of Single Model

Epoch: 0
Train :: Loss: 2.3 | Accy: 9.74
Valid :: Loss: 2.3 | Accy: 10.0
Saving..
Time: 16.92

Epoch: 1
Train :: Loss: 2.3 | Accy: 9.79
Valid :: Loss: 2.3 | Accy: 10.0
Time: 8.71

Epoch: 2
Train :: Loss: 2.3 | Accy: 9.63
Valid :: Loss: 2.3 | Accy: 10.0
Time: 8.68

Epoch: 3
Train :: Loss: 2.3 | Accy: 9.57
Valid :: Loss: 2.3 | Accy: 10.0
Time: 9.06

Epoch: 4
Train :: Loss: 2.3 | Accy: 9.88
Valid :: Loss: 2.3 | Accy: 10.0
Time: 9.06

Epoch: 5
Train :: Loss: 2.3 | Accy: 9.68
Valid :: Loss: 2.3 | Accy: 10.0
Time: 8.68

Epoch: 6
Train :: Loss: 2.3 | Accy: 9.71
Valid :: Loss: 2.3 | Accy: 10.0
Time: 9.03

Epoch: 7
Train :: Loss: 2.3 | Accy: 9.7
Valid :: Loss: 2.3 | Accy: 10.0
Time: 9.16

Epoch: 8
Train :: Loss: 2.3 | Accy: 9.86
Valid :: Loss: 2.3 | Accy: 10.0
Time: 9.36

Epoch: 9
Train :: Loss: 2.3 | Accy: 9.77
Valid :: Loss: 2.3 | Accy: 10.0
Time: 8.7

Epoch: 10
Train :: Loss: 2.3 | Accy: 9.77
Valid :: Loss: 2.3 | Accy: 10.0
Time: 9.01

Epoch: 11
Train

KeyboardInterrupt: 

In [6]:
import matplotlib.pyplot as plt

plt.figure()
plt.plot(range(len(results.train_loss)), results.train_loss, label='Train')
plt.plot(range(len(results.valid_loss)), results.valid_loss, label='Valid')
plt.title('Loss')
plt.show()

plt.figure()
plt.plot(range(len(results.train_accy)), results.train_accy, label='Train')
plt.plot(range(num_epochs), results.valid_accy, label='Valid')

plt.title('Accuracy')
plt.show()

ValueError: x and y must have same first dimension, but have shapes (200,) and (31,)