***COMP 691 - Deep Learning project - Challenge 1***

*Team: DL_DT_Explore*

*Members*:
* Trong-Tuan Tran
* Hussein Abdallah
* Manh-Quoc-Dat Le

This Jupyter Notebook implements the approach: ResNet9 model with Cossine Loss function.

Here the goal is to train on 100 samples. In this preliminary testbed the evaluation will be done on a 2000 sample validation set. Note in the end the final evaluation will be done on the full CIFAR-10 test set as well as potentially a separate dataset. The validation samples here should not be used for training in any way, the final evaluation will provide only random samples of 100 from a datasource that is not the CIFAR-10 training data. 

Initial configurations & hyperparameters used for grid search (params defined in lists [ ] will be used for grid search):

In [23]:
import time
from numpy.random import RandomState
import torchvision
import numpy as np
import torch
import torch.optim as optim
from torch.utils.data import Subset
from torchvision import datasets, transforms

# Epochs: 300 - lr: - 0.001 - dropout: 0 - Weight_decay: 1e-05 - Grad_clip: 0.005 
# Scenario 17/300 - Epochs: 700 - lr: - 0.0001 - dropout: 0 - Weight_decay: 0.00016051911333587627 - Grad_clip: 0.015119336467640998
# Scenario 19/300 - Epochs: 700 - lr: - 0.0001 - dropout: 0 - Weight_decay: 0.00016051911333587627 - Grad_clip: 0.02576638574613588

epochs_list = [700]
grad_clips = [0.02576638574613588] # Gradient clipping
weight_decays = [0.00016051911333587627] # Weight decay for Adam Optimizer
lrs = [0.0001] # Learning rates
drop_outs = [0] # Value for drop out layers
batch_size = 128
runs = 5 # Number of instances to run the train and test to evaluate mean and std dev of test accuracy
epoch_display_range = 100
search_plot = True # Define whether to run evaluation after each epochs to plot accuracy trend or not
log_enabled = False # Define if log down progress to support resume from previous completed run or not (for grid search)
save_image = True # If search_plot is True, enable this will save the plotted image to the img_path define below  
google_drive_mount = True # If running in Google Colab and enable this, progress and image files will save to Google Drive instead of local file
final_eval = True # Whether to run using train set or test set for final evalutation & submission

eval_str = 'DEV PHASE - ' if not final_eval else 'FINAL EVAL - '
comment = eval_str + 'ResNet9 + Cosine Loss + modified train transforms + random search best config' # comment string to put in acc trend images
if final_eval:
    search_plot = False
    save_image = False
    google_drive_mount = False
    log_enabled = False

use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")
#device = torch.device('cpu')

The section below detects which environment is running (Colab, Kaggle or local computer). The output folders will be determine accordingly. If running in final_eval mode, no output file will be generated.

In [24]:
import os
output_path = 'output_txt/'
img_path = 'img/'
Colab = False
Kaggle = 'kaggle' in os.getcwd()
root = '.' # Root to download dataset
if 'google.colab' in str(get_ipython()):
    print('Running on CoLab')
    Colab = True  
    from google.colab import drive
    if not os.path.exists('/content/drive/MyDrive/') and google_drive_mount:
        drive.mount('/content/drive', force_remount=False)

    else:
        if google_drive_mount:
            print('Drive already mounted at at /content/drive')

    Google_path = '/content/drive/MyDrive/Colab Notebooks/COMP691_project/' if google_drive_mount else '/'
    if not os.path.exists(Google_path):
        os.mkdir(Google_path)
    img_path = Google_path + img_path
    if not os.path.exists(img_path):
        os.mkdir(img_path)
    output_path = Google_path + output_path  
else:
    if Kaggle:
        root = '../input/cifar10'
        output_path = ''
        img_path = ''
        print('Running in Kaggle')
    else:
        print('Not running on CoLab or Kaggle')
output_file_name = 'report_ADAM_cosine_improve_FINAL.txt'
output_file_path = output_path + output_file_name
progress_file = output_path + 'grid_search_progress_FINAL.txt'
img_file_name_prefix = output_file_name.replace('.txt', '')
img_file_path = img_path + img_file_name_prefix + '/'

if not final_eval:
    if not os.path.exists(img_path):
        os.mkdir(img_path)

    if not os.path.exists(img_file_path):
        os.mkdir(img_file_path)

    if not os.path.exists(output_path):
        os.mkdir(output_path)

Running on CoLab


Setup training/testing and other helper functions

In [25]:
import gc
from matplotlib import pyplot as plt 

def train(model, device, train_loader, optimizer, epoch, grad_clip=None, sched=None, display=True):
    model.train()
    loss_function = nn.CosineEmbeddingLoss()
    for batch_idx, (data, target) in enumerate(train_loader):
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        output = model(data).to(device)

        GT=torch.zeros((len(target),10))
        for idx in range(len(target)):
            GT[idx][target[idx]]=1

        GT=GT.to(device)
        
        loss = loss_function(output, GT, torch.Tensor(output.size(0)).to(device).fill_(1.0))
        #loss = F.cross_entropy(output, target)
        loss.backward()
        if grad_clip:
            nn.utils.clip_grad_value_(model.parameters(), grad_clip)
        optimizer.step()
        if sched:
            sched.step()
        if display and (batch_idx == 0 or batch_idx + 1 == len(train_loader)):
          print('   Train Epoch: {} [step {}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
              epoch + 1, batch_idx + 1, len(train_loader),
              100. * batch_idx / len(train_loader), loss.detach().item()))
        if device == torch.device('cuda'):
            del loss, output
            gc.collect()
            torch.cuda.empty_cache()

def test(model, device, test_loader, display=True):
    model.eval()
    test_loss = 0
    correct = 0
    loss_function = nn.CosineEmbeddingLoss()
    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.to(device), target.to(device)
            output = model(data)
            GT=torch.zeros((len(target),10))
            for idx in range(len(target)):
                GT[idx][target[idx]]=1

            GT=GT.to(device)
            
            test_loss += loss_function(output, GT, torch.Tensor(output.size(0)).to(device).fill_(1.0)).item() # sum up batch loss
            #test_loss += F.cross_entropy(output, target, size_average=False).item()
            pred = output.max(1, keepdim=True)[1] # get the index of the max log-probability
            correct += pred.eq(target.view_as(pred)).sum().item()

    test_loss /= len(test_loader.dataset)
    if display:
        print('   Test set: Average loss: {:.4f}, Accuracy: {}/{} ({:.2f}%)'.format(
            test_loss, correct, len(test_loader.dataset),
            100. * correct / len(test_loader.dataset)))
    return 100. * correct / len(test_loader.dataset)

def plot_accs(accs_accross_runs, display_str, comment='N/A', save_img=True):
    plt.figure();
    #epochs = list(range(1, len(accs_accross_runs[0]) + 1))
    max_acc_display = ''
    for i, run_accs in enumerate(accs_accross_runs):
        max_acc = max(run_accs)
        max_epochs = [index + 1 for index, acc in enumerate(run_accs) if acc == max_acc]
        if len(max_epochs) > 3:
            not_display_count = len(max_epochs) - 3
            max_epochs = str(max_epochs[:3]) + f'... + {not_display_count} more'
            
        max_acc_display = f' - max acc: {max_acc}% at epochs {max_epochs}'
        plt.plot(run_accs, label=f'Run #{i + 1}' + max_acc_display)
    plt.xlabel('epochs')
    plt.ylabel('Test accuracy (%)')
    plt.legend();
    plt.title(f'Test accuracies for \n{display_str}Note: {comment}');

    if save_img:
        scenario = display_str[display_str.index(' ') + 1: display_str.index('/')]
        img_name = img_file_path + f'{scenario}' + generate_image_suffix()
        plt.savefig(img_name, bbox_inches='tight')

def generate_image_suffix():
    return f'_{time.time()%10000000:.0f}' + '.png'

Definition of ResNet9 model:

In [26]:
import torch.nn as nn 
import torch.nn.functional as F
num_classes = 10
in_channels = 3

def conv_block(in_channels, out_channels, drop_out=0, pool=False):
    layers = [nn.Conv2d(in_channels, out_channels, kernel_size=3, padding=1), 
              nn.BatchNorm2d(out_channels), 
              nn.ReLU(inplace=True), nn.Dropout(drop_out)
              ]
    if pool: layers.append(nn.MaxPool2d(2))
    return nn.Sequential(*layers)

class NET(nn.Module):
    def __init__(self, in_channels, num_classes, drop_out):
        super().__init__()
        
        self.conv1 = conv_block(in_channels, 64, drop_out)
        self.conv2 = conv_block(64, 128, drop_out, pool=True)
        self.res1 = nn.Sequential(conv_block(128, 128, drop_out), conv_block(128, 128, drop_out))
        self.dropout = nn.Dropout(drop_out)
        self.conv3 = conv_block(128, 256, drop_out, pool=True)
        self.conv4 = conv_block(256, 512, drop_out, pool=True)
        self.res2 = nn.Sequential(conv_block(512, 512, drop_out), conv_block(512, 512, drop_out))
        self.conv5 = conv_block(512, 1028, drop_out, pool=True)
        self.res3 = nn.Sequential(conv_block(1028, 1028, drop_out), conv_block(1028, 1028, drop_out))
        
        self.classifier = nn.Sequential(nn.MaxPool2d(2), 
                                        nn.Flatten(), 
                                        nn.Linear(1028, num_classes))

    def forward(self, xb):
        out = self.conv1(xb)
        out = self.conv2(out)
        out = self.res1(out) + out
        out = self.conv3(out)
        out = self.dropout(out)
        out = self.conv4(out)
        out = self.dropout(out)
        out = self.res2(out) + out
        out = self.conv5(out)
        out = self.res3(out) + out
        out = self.classifier(out)
        return out



The below tries a numbers of random problem instances defined in `runs` variable at the beginning 

In [27]:
%%time

device_name = torch.cuda.get_device_name(0) if device == torch.device('cuda') else 'cpu'

scenario_count = len(epochs_list) * len(weight_decays) * len(lrs) * len(drop_outs) * len(grad_clips)

# Statistic for CIFAR-10 datasets
normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                  std=[0.229, 0.224, 0.225])
# Data augmentation during training:
transform_train = transforms.Compose([
                                    transforms.RandomCrop(32, padding=4, padding_mode='reflect'),
                                    transforms.RandomGrayscale(),
                                    transforms.RandomHorizontalFlip(),
                                    torchvision.transforms.RandomAffine(degrees=30),
                                    transforms.ColorJitter(brightness=0.4, contrast=0.4, saturation=0.4, hue=0.2), 
                                    transforms.ToTensor(), 
                                    normalize]) #careful to keep this one same
transform_val = transforms.Compose([transforms.ToTensor(), normalize]) 


print('Running on {}'.format(device_name))

##### Cifar Data
run_on_train_set = not final_eval
dataset = 'train' if run_on_train_set else 'test'
print(f'Using CIFAR-10 {dataset} set')
print(comment)
cifar_data = datasets.CIFAR10(root='.', train=run_on_train_set, transform=transform_train, download=True)
    
#We need two copies of this due to weird dataset api 
cifar_data_val = datasets.CIFAR10(root='.', train=run_on_train_set, transform=transform_val, download=True)



training_done = False
count = 1
scenario = 1
next_run = 1
previous_runs_accs = []
previous_train_times = []
previous_eval_times = []
previous_exec_times = []
ran_in_middle = False

if log_enabled:
    if os.path.exists(progress_file):
        with open(progress_file, 'r') as file_read:
            progress_content = file_read.readlines()
        # print(progress_content)
        if progress_content[0].replace('\n', '') == output_file_name:
            previous_scenario = progress_content[1].replace('\n', '')
            previous_runs_accs = eval(progress_content[2].replace('\n', ''))
            previous_runs = len(previous_runs_accs)
            print(f'Previous progress on {output_file_name} stopped at scenario {previous_scenario}/{scenario_count}' +\
                 f', run {previous_runs}/{runs}')
            if previous_runs == runs: # Already complete the previous scenario
                scenario = int(previous_scenario) + 1

            else: # Previous scenario just completed partially, resume in the next run
                ran_in_middle = True
                scenario = int(previous_scenario)
                next_run = previous_runs + 1
                previous_execution_times = eval(progress_content[3].replace('\n', ''))

                for i, previous_execution_time in enumerate(previous_execution_times):
                    previous_train_times.append(previous_execution_time[0]) 
                    previous_eval_times.append(previous_execution_time[1]) 
                    previous_exec_times.append(previous_execution_time[2])  
            if scenario > scenario_count:
                training_done = True
                print('Training was already done!')
            else:
                print(f'Will resume training at scenario: {scenario}, run# {next_run}')

if not training_done:
    for epochs in epochs_list:
        for lr in lrs:
            for drop_out in drop_outs:
                for weight_decay in weight_decays:
                    for grad_clip in grad_clips:

                        if not ran_in_middle: 

                            accs = []
                            train_times = []
                            evaluation_times = []
                            total_times = []
                            run_execution_times = []
                        else:
                            if count < scenario:
                                count += 1
                                continue #skip until reaching the scenario to run
                            accs = previous_runs_accs
                            train_times = previous_train_times
                            evaluation_times = previous_eval_times
                            total_times = previous_exec_times
                            run_execution_times = previous_execution_times
                        #scenario += 1

                        print('\nScenario %d/%d - Epochs: %d - lr: - %s - dropout: %s - Weight_decay: %s - Grad clip: %s'%(
                            scenario, scenario_count, epochs, lr, drop_out, weight_decay, grad_clip
                        ))
                        accs_accross_runs_plot = []
                        for seed in range(next_run, runs + 1):
                            start_time = time.time()
                            # Extract a subset of 100 (class balanced) samples per class for training and 2000 samples for validation
                            permute_range = 5000 if run_on_train_set else 1000
                            prng = RandomState(seed)
                            random_permute = prng.permutation(np.arange(0, permute_range))
                            indx_train = np.concatenate([np.where(np.array(cifar_data.targets) == classe)[0][random_permute[0:10]] for classe in range(0, 10)])
                            indx_val = np.concatenate([np.where(np.array(cifar_data_val.targets) == classe)[0][random_permute[10:210]] for classe in range(0, 10)])


                            train_data = Subset(cifar_data, indx_train)
                            val_data = Subset(cifar_data_val, indx_val)

                            print('  Run# [%d/%d] - Num Samples For Training %d - Num Samples For Val %d'%(seed, runs, train_data.indices.shape[0],val_data.indices.shape[0]))

                            train_loader = torch.utils.data.DataLoader(train_data,
                                                                        batch_size=batch_size, 
                                                                        shuffle=True)

                            val_loader = torch.utils.data.DataLoader(val_data,
                                                                    batch_size=batch_size, 
                                                                    shuffle=False)

                            model = NET(in_channels, num_classes, drop_out)
                            model.to(device)
                            optimizer = torch.optim.Adam(model.parameters(), 
                                                        lr=lr, 
                                                        #momentum=0.9,
                                                        weight_decay=weight_decay)
                            sched = torch.optim.lr_scheduler.OneCycleLR(optimizer, lr, epochs=epochs, 
                                                                            steps_per_epoch=len(train_loader))
                            test_accs = []
                            eval_time = 0
                            for epoch in range(epochs):
                                print_condition = epoch%epoch_display_range==0 or epoch==epochs-1
                                train(model, device, train_loader, optimizer, epoch, grad_clip=grad_clip,
                                    sched=sched, display=print_condition)
                                if search_plot:
                                    eval_start = time.time()
                                    test_acc = test(model, device, val_loader, display=print_condition)
                                    eval_time = time.time() - eval_start
                                    test_accs.append(test_acc)

                                    
                            train_time = time.time() - start_time    
                            train_times.append(train_time)
                            final_eval_start = time.time()
                            final_acc = test_accs[-1] if search_plot else test(model, device, val_loader)
                            accs.append(final_acc)
                            final_eval_time = eval_time if search_plot else time.time() - final_eval_start
                            evaluation_times.append(final_eval_time)
                            if search_plot:
                                accs_accross_runs_plot.append(test_accs)
                            total_time = time.time() - start_time
                            total_times.append(total_time)
                            run_execution_times.append((train_time, final_eval_time, total_time))
                            if log_enabled:
                                progress_str = f'{output_file_name}\n{scenario}\n{accs}\n{run_execution_times}'
                                with open(progress_file, 'w') as progress_write:
                                    progress_write.write(progress_str)
                            if device == torch.device('cuda'):
                                del optimizer
                                gc.collect()
                                torch.cuda.empty_cache()
                            print('  Run execution time: train: %.3f (s) - eval: %.3f (s)- total: %.3f (s)'%\
                                  (train_time, final_eval_time, total_time))
                        accs = np.array(accs)
                        train_times = np.array(train_times)
                        evaluation_times = np.array(evaluation_times)
                        total_times = np.array(total_times)
                        scenario_description = 'Scenario %d/%d - Epochs: %d - lr: - %s - dropout: %s - Weight_decay: %s - Grad_clip: %s'%\
                        (scenario, scenario_count, epochs, lr, drop_out, weight_decay, grad_clip)
                        accuracy_description = '\n  Final acc over %d instances: %.2f +- %.2f%%\n'%(runs, accs.mean(), accs.std())
                        # print(train_times.mean(), evaluation_times.mean(), total_times.mean())
                        display_str = '  %s'%(scenario_description) +\
                        '\n  Avg execution time: train: %.3f +- %.3f (s) - eval: %.3f +- %.3f (s) - total: %.3f +- %.3f (s) on %s'%\
                        (train_times.mean(), train_times.std(), evaluation_times.mean(), evaluation_times.std(),
                             total_times.mean(), total_times.std(), device_name) + accuracy_description
                        
                        #progress_str = f'{output_file_name}\n{scenario}\n{accs}'
                        print(display_str)
                        plot_str = display_str # scenario_description + accuracy_description
                        if search_plot:
                            plot_accs(accs_accross_runs_plot, plot_str, comment, save_image)
                        if log_enabled:
                            mode = 'a' if os.path.exists(output_file_path) else 'w'

                            with open(output_file_path, mode) as output_write:
                                output_write.write(display_str)
                        ran_in_middle = False
                        next_run = 1
                        scenario += 1

Running on Tesla P100-PCIE-16GB
Using CIFAR-10 test set
FINAL EVAL - ResNet9 + Cosine Loss + modified train transforms + random search best config
Files already downloaded and verified
Files already downloaded and verified

Scenario 1/1 - Epochs: 700 - lr: - 0.0001 - dropout: 0 - Weight_decay: 0.00016051911333587627 - Grad clip: 0.02576638574613588
  Run# [1/5] - Num Samples For Training 100 - Num Samples For Val 2000
   Test set: Average loss: 0.0044, Accuracy: 729/2000 (36.45%)
  Run execution time: train: 155.332 (s) - eval: 0.764 (s)- total: 156.096 (s)
  Run# [2/5] - Num Samples For Training 100 - Num Samples For Val 2000
   Test set: Average loss: 0.0042, Accuracy: 759/2000 (37.95%)
  Run execution time: train: 154.445 (s) - eval: 0.759 (s)- total: 155.204 (s)
  Run# [3/5] - Num Samples For Training 100 - Num Samples For Val 2000
   Test set: Average loss: 0.0043, Accuracy: 780/2000 (39.00%)
  Run execution time: train: 154.273 (s) - eval: 0.771 (s)- total: 155.044 (s)
  Run# [4/