In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# List every file found under the Kaggle input directory.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# Create the output folders used later for checkpoints and training curves.
# makedirs(exist_ok=True) is idempotent, so re-running this cell is safe and there is no
# window between "does it exist?" and "create it" in which the call can fail.
for folder_name in ('Best_trained_models', 'Plot_curves'):
    path = os.path.join(os.getcwd(), folder_name)
    os.makedirs(path, exist_ok=True)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# **optimizer.py**

In [None]:
import torch
from torch.optim import Optimizer


class AdaBelief(Optimizer):
    """AdaBelief optimizer.

    Keeps an exponential moving average of the gradient (``exp_avg``) and of the squared
    deviation of the gradient from that average (``exp_avg_sq``), and scales each update by
    the inverse square root of the latter.

    Args:
        params: iterable of parameters or parameter-group dicts.
        lr: learning rate (must be >= 0).
        betas: ``(beta_1, beta_2)`` decay rates of the two moving averages, each in ``[0, 1)``.
        eps: small constant added to the variance estimate and to the denominator (>= 0).
        weight_decay: weight-decay coefficient.
        weight_decouple: if True, shrink the weights directly (decoupled decay); otherwise
            add ``weight_decay * param`` to the gradient (L2 penalty).
        fixed_decay: with decoupled decay, shrink by ``weight_decay`` instead of
            ``lr * weight_decay``.
        correct_bias: apply the ``1 - beta ** step`` bias correction to both averages.

    Raises:
        ValueError: if ``lr``, ``betas`` or ``eps`` is outside its valid range.
    """

    def __init__(self, params, lr=0.001, betas=(0.9, 0.999), eps=1e-8, weight_decay=5e-4,
                 weight_decouple=True, fixed_decay=False, correct_bias=True):

        if lr < 0:
            raise ValueError("Invalid learning rate")
        # A beta of exactly 1 makes the bias correction divide by zero, so it is rejected.
        if not 0 <= betas[0] < 1:
            raise ValueError("Invalid beta 0")
        if not 0 <= betas[1] < 1:
            raise ValueError("Invalid beta 1")
        if not 0 <= eps:
            raise ValueError("Invalid epsilon")
        defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay,
                        correct_bias=correct_bias)

        super(AdaBelief, self).__init__(params, defaults)
        self.weight_decouple = weight_decouple
        self.fixed_decay = fixed_decay

    def reset(self):
        """Reset the step counter and zero both moving averages of every parameter."""
        for group in self.param_groups:
            for param in group['params']:
                state = self.state[param]
                state['step'] = 0
                state['exp_avg'] = torch.zeros_like(param.data, memory_format=torch.preserve_format)
                state['exp_avg_sq'] = torch.zeros_like(param.data, memory_format=torch.preserve_format)

    def step(self, closure=None):
        """Perform a single optimization step.

        Args:
            closure: optional callable that re-evaluates the model and returns the loss.

        Returns:
            The value returned by ``closure``, or ``None`` when no closure is given.

        Raises:
            RuntimeError: if a parameter has a sparse gradient.
        """
        loss = None
        if closure is not None:
            loss = closure()

        for group in self.param_groups:
            beta_1, beta_2 = group['betas']
            lr = group['lr']
            eps = group['eps']
            weight_decay = group['weight_decay']

            for param in group['params']:
                if param.grad is None:
                    continue
                grad = param.grad.data

                if grad.is_sparse:
                    raise RuntimeError('Grads are sparse')

                state = self.state[param]

                # Lazy state initialization on the first update of this parameter.
                if len(state) == 0:
                    state['step'] = 0
                    state['exp_avg'] = torch.zeros_like(param.data)
                    state['exp_avg_sq'] = torch.zeros_like(param.data)

                # Weight decay
                if self.weight_decouple:
                    decay = weight_decay if self.fixed_decay else lr * weight_decay
                    param.data.mul_(1.0 - decay)
                elif weight_decay != 0:
                    # Out-of-place on purpose: param.grad itself must not be modified.
                    grad = grad.add(param.data, alpha=weight_decay)

                exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
                state['step'] += 1

                # Update the moving averages IN PLACE so they persist in `state`
                # across calls (otherwise both would stay at zero forever).
                exp_avg.mul_(beta_1).add_(grad, alpha=1.0 - beta_1)
                residual = grad - exp_avg
                exp_avg_sq.mul_(beta_2).addcmul_(residual, residual, value=1.0 - beta_2).add_(eps)

                if group['correct_bias']:
                    m_t_hat = exp_avg / (1.0 - beta_1 ** state['step'])
                    s_t_hat = exp_avg_sq / (1.0 - beta_2 ** state['step'])
                else:
                    m_t_hat, s_t_hat = exp_avg, exp_avg_sq

                # sqrt() returns a new tensor, so the in-place add never touches the state.
                denom = s_t_hat.sqrt().add_(eps)
                param.data.addcdiv_(m_t_hat, denom, value=-lr)

        return loss

# **models.py**

In [None]:
import torch
import torch.nn as nn
import operator
import functools
import math

class VGG(nn.Module):
    '''
    VGG-style network: a stack of 3x3 conv + BatchNorm + ReLU layers interleaved with
    2x2 max-pooling, followed by a three-layer fully connected classifier.

    Args:
        architecture: sequence describing the convolutional stack. An ``int`` entry adds
            a conv layer with that many output channels; any other entry (e.g. ``"MP"``)
            adds a 2x2 max-pool with stride 2.
        num_classes: size of the final linear layer's output.
        input_dims: ``[channels, height, width]`` of a single input image; used to size
            the first fully connected layer.
    '''

    def __init__(self, architecture, num_classes=100, input_dims=[3, 32, 32]):
        super(VGG, self).__init__()
        self.architecture = architecture
        self.num_classes = num_classes
        # Copy so neither the shared default list nor the caller's list is aliased.
        self.input_dims = list(input_dims)
        self.convs = self.conv()
        self.fcs = self.fc()

    def conv(self):
        '''Build the convolutional feature extractor described by ``self.architecture``.'''
        layers = []
        input_channels = self.input_dims[0]

        for value in self.architecture:
            if isinstance(value, int):
                layers += [nn.Conv2d(in_channels=input_channels, out_channels=value, kernel_size=3, padding=1),
                           nn.BatchNorm2d(value), nn.ReLU(inplace=True)]
                input_channels = value
            else:
                layers += [nn.MaxPool2d(kernel_size=2, stride=2)]

        return nn.Sequential(*layers)

    def fc(self):
        '''Build the classifier head, sizing its input from a dry run of ``self.convs``.'''
        # Probe the conv stack in eval mode and without autograd: a training-mode pass
        # would update the BatchNorm running statistics with meaningless data and would
        # fail outright when a 1x1 feature map reaches a BatchNorm layer with batch size 1.
        was_training = self.convs.training
        self.convs.eval()
        with torch.no_grad():
            probe = torch.zeros(1, *self.input_dims)
            features_size = self.convs(probe).numel()
        self.convs.train(was_training)

        return nn.Sequential(nn.Dropout(p=0.5),
                             nn.Linear(features_size, 512),
                             nn.ReLU(inplace=True),
                             nn.Dropout(p=0.5),
                             nn.Linear(512, 512),
                             nn.ReLU(inplace=True),
                             nn.Linear(512, self.num_classes)
                             )

    def forward(self, x):
        '''Run the conv stack, flatten per sample, and apply the classifier head.'''
        x = self.convs(x)
        x = x.view(x.size(0), -1)
        x = self.fcs(x)
        return x

    
# Used inside ResNet
class BasicBlock(nn.Module):
    """Residual block made of two 3x3 conv + BatchNorm layers and a skip connection.

    Args:
        inplanes: number of input channels.
        planes: number of output channels of both convolutions.
        stride: stride of the first convolution.
        downsample: optional module applied to the input before it is added to the
            residual branch; when ``None`` the input is added unchanged.
    """
    expansion = 1

    def __init__(self, inplanes, planes, stride=1, downsample=None):
        super().__init__()
        # Settings shared by both 3x3 convolutions.
        conv3x3 = dict(kernel_size=3, padding=1, bias=False)

        self.conv1 = nn.Conv2d(inplanes, planes, stride=stride, **conv3x3)
        self.bn1 = nn.BatchNorm2d(planes)
        self.relu = nn.ReLU(inplace=True)
        self.conv2 = nn.Conv2d(planes, planes, stride=1, **conv3x3)
        self.bn2 = nn.BatchNorm2d(planes)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        """Return ``relu(residual(x) + shortcut(x))``."""
        out = self.relu(self.bn1(self.conv1(x)))
        out = self.bn2(self.conv2(out))

        shortcut = x if self.downsample is None else self.downsample(x)

        out += shortcut
        return self.relu(out)

class ResNet(nn.Module):
    """ResNet with a 3x3 stem convolution followed by four residual stages.

    Args:
        block: residual block class, called as ``block(inplanes, planes, stride, downsample)``.
            Its optional class attribute ``expansion`` (default 1) is the ratio between the
            block's output channels and ``planes``.
        layers: four integers, the number of blocks in each stage.
        num_classes: size of the final linear layer's output.
    """

    def __init__(self, block, layers, num_classes=100):
        super().__init__()

        self.inplanes = 64

        self.conv1 = nn.Conv2d(3, self.inplanes, kernel_size=3, stride=1, padding=1,
                               bias=False)
        self.bn1 = nn.BatchNorm2d(self.inplanes)
        self.relu = nn.ReLU(inplace=True)
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)

        self.layer1 = self._make_layer(block, 64, layers[0])
        self.layer2 = self._make_layer(block, 128, layers[1], stride=2)
        self.layer3 = self._make_layer(block, 256, layers[2], stride=2)
        self.layer4 = self._make_layer(block, 512, layers[3], stride=2)

        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
        # The last stage emits 512 * expansion channels (512 for BasicBlock).
        self.fc = nn.Linear(512 * self._expansion(block), num_classes)

    @staticmethod
    def _expansion(block):
        """Return the block's channel expansion factor, defaulting to 1 when undeclared."""
        return getattr(block, 'expansion', 1)

    def _make_layer(self, block, planes, blocks, stride=1):
        """Build one stage of ``blocks`` residual blocks.

        Only the first block uses ``stride``; it also receives a 1x1 conv + BatchNorm
        shortcut whenever the stride or the channel count changes across the block.
        Updates ``self.inplanes`` to the stage's output channel count.
        """
        out_planes = planes * self._expansion(block)
        downsample = None

        if stride != 1 or self.inplanes != out_planes:
            downsample = nn.Sequential(
                nn.Conv2d(self.inplanes, out_planes, kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(out_planes),
            )

        layers = [block(self.inplanes, planes, stride, downsample)]

        # Every following block consumes what the first block of the stage produced.
        self.inplanes = out_planes

        for _ in range(1, blocks):
            layers.append(block(self.inplanes, planes))

        return nn.Sequential(*layers)

    def forward(self, x):
        """Stem -> four residual stages -> global average pool -> linear classifier."""
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)
        x = self.maxpool(x)

        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)

        x = self.avgpool(x)
        x = torch.flatten(x, 1)
        x = self.fc(x)

        return x
# model = ResNet(BasicBlock,[3,4,6,3])

# main.py

In [None]:
import os
import pickle
from torch import optim, nn
# from optimizer import AdaBelief
# from models import VGG
import torch
import torchvision.transforms as transforms
import torchvision
from torch.utils.data import DataLoader

# Run on CUDA when torch reports it as available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def _zero_optimizer_state(optimizer):
    """Zero, in place, every per-parameter buffer the optimizer has already allocated."""
    for param_state in optimizer.state.values():
        for key, value in param_state.items():
            if torch.is_tensor(value):
                # Keeps the dtype/shape/device the optimizer chose (including a tensor `step`).
                value.zero_()
            elif key == 'step':
                param_state[key] = 0


def adjust_learning_rate(optimizer, gamma=0.1, reset=True):
    """Scale the learning rate of every parameter group and optionally reset the state.

    Args:
        optimizer: the optimizer to modify in place.
        gamma: factor each group's ``lr`` is multiplied by.
        reset: when True, also reset the optimizer's accumulated statistics.
            ``AdaBelief`` is reset through its own ``reset()`` method; for ``Adam`` and
            ``SGD`` every existing state buffer (moment estimates, momentum buffer, step
            counter) is zeroed. Other optimizer classes are left untouched.
    """
    for param_group in optimizer.param_groups:
        param_group['lr'] *= gamma

    if not reset:
        return

    optimizer_name = optimizer.__class__.__name__
    if optimizer_name == 'AdaBelief':
        optimizer.reset()
    elif optimizer_name in ('Adam', 'SGD'):
        # Only existing buffers are zeroed. Parameters without state are left alone so
        # the optimizer can still lazily initialise them on their first step.
        _zero_optimizer_state(optimizer)
                
        
def initialize_optimizer(inp_model, optimizer='SGD'):
    """Create the optimizer named by ``optimizer`` for all parameters of ``inp_model``.

    Args:
        inp_model: module whose ``parameters()`` will be optimized.
        optimizer: one of ``'Adam'``, ``'SGD'`` or ``'AdaBelief'``.

    Returns:
        The configured optimizer instance.

    Raises:
        ValueError: if ``optimizer`` is not one of the supported names (previously this
            returned ``None`` and failed later at the first use of the optimizer).
    """
    if optimizer == 'Adam':
        return optim.Adam(inp_model.parameters(), lr=0.001, weight_decay=5e-4)
    if optimizer == 'SGD':
        return optim.SGD(inp_model.parameters(), lr=0.001, momentum=0.9, weight_decay=5e-4)
    if optimizer == 'AdaBelief':
        return AdaBelief(inp_model.parameters(), lr=0.0001)
    raise ValueError(f"Unknown optimizer {optimizer!r}; expected 'Adam', 'SGD' or 'AdaBelief'")


def build_model(model_type):
    """Build the requested network on the module-level ``device``.

    Args:
        model_type: ``"VGG"`` (the 11-layer configuration below) or ``'ResNet'``.

    Returns:
        The network, wrapped in ``torch.nn.DataParallel`` when running on CUDA. Note that
        a wrapped model's ``state_dict`` keys carry a ``module.`` prefix.

    Raises:
        ValueError: if ``model_type`` is not a supported name (previously ``None`` was
            returned and the failure surfaced later).
    """
    if model_type == "VGG":
        VGG11 = [64, "MP", 128, "MP", 256, 256, "MP", 512, 512, "MP", 512, 512, "MP"]
        network = VGG(VGG11).to(device)
    elif model_type == 'ResNet':
        # NOTE(review): the usual ResNet-34 layout is [3, 4, 6, 3]; confirm that the
        # trailing 4 is intentional before changing it (it alters checkpoint shapes).
        layers = [3, 4, 6, 4]
        network = ResNet(BasicBlock, layers).to(device)
    else:
        raise ValueError(f"Unknown model type {model_type!r}; expected 'VGG' or 'ResNet'")

    # Compare the device *type*: the original `device == 'cuda'` compared a torch.device
    # with a str, so the DataParallel wrapping was not reliably applied.
    if torch.device(device).type == 'cuda':
        network = torch.nn.DataParallel(network)

    return network


def cross_entropy_loss_function():
    """Return a fresh ``nn.CrossEntropyLoss`` criterion with default settings."""
    criterion = nn.CrossEntropyLoss()
    return criterion


def get_data(batch_size=128):
    """Download CIFAR-100 into ``./data/`` and return ``(train_loader, test_loader)``.

    Training images get a padded random crop and a random horizontal flip before being
    converted to tensors and normalized; test images are only converted and normalized.
    The training loader shuffles, the test loader does not.

    Args:
        batch_size: batch size used by both loaders.
    """
    # NOTE(review): these per-channel mean/std values look like the commonly quoted
    # CIFAR-10 statistics — confirm they are the ones intended for CIFAR-100.
    normalize = transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010))

    augment_and_normalize = transforms.Compose([
        transforms.RandomCrop(32, padding=4),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        normalize,
    ])
    normalize_only = transforms.Compose([transforms.ToTensor(), normalize])

    train_set = torchvision.datasets.CIFAR100(
        root='./data/', train=True, download=True, transform=augment_and_normalize)
    test_set = torchvision.datasets.CIFAR100(
        root='./data/', train=False, download=True, transform=normalize_only)

    train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(test_set, batch_size=batch_size, shuffle=False)
    return train_loader, test_loader


def test(net, test_data, criterion):
    """Evaluate ``net`` on ``test_data`` without tracking gradients.

    Args:
        net: model to evaluate; it is switched to eval mode.
        test_data: iterable yielding ``(images, labels)`` batches.
        criterion: loss function called as ``criterion(outputs, labels)``.

    Returns:
        ``(accuracy, test_loss)``: accuracy in percent over all samples, and the sum of
        the per-batch loss values.
    """
    net.eval()

    n_correct, n_seen = 0, 0
    test_loss = 0

    with torch.no_grad():
        for images, labels in test_data:
            images = images.to(device)
            labels = labels.to(device)

            outputs = net(images)
            test_loss += criterion(outputs, labels).item()

            # Index of the largest logit along the class dimension.
            predicted = torch.max(outputs.data, 1)[1]
            n_seen += labels.size(0)
            n_correct += (predicted == labels).sum().item()

    accuracy = 100 * n_correct / n_seen
    print(f"Test accuracy {accuracy}%")

    return accuracy, test_loss


def train(net, epoch, train_data, optimizer, criterion):
    """Train ``net`` for one pass over ``train_data``.

    Args:
        net: model to train; it is switched to train mode.
        epoch: epoch number, used only for the progress message.
        train_data: iterable yielding ``(inputs, labels)`` batches.
        optimizer: optimizer stepped once per batch.
        criterion: loss function called as ``criterion(outputs, labels)``.

    Returns:
        ``(accuracy, train_loss)``: accuracy in percent over all samples seen, and the
        sum of the per-batch loss values.
    """
    net.train()
    n_correct, n_seen = 0, 0
    train_loss = 0.0

    print('\nEpoch: %d' % epoch)

    for inputs, labels in train_data:
        inputs = inputs.to(device)
        labels = labels.to(device)

        # Standard update: clear old gradients, forward, backward, step.
        optimizer.zero_grad()
        outputs = net(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        train_loss += loss.item()
        predicted = torch.max(outputs.data, 1)[1]
        n_seen += labels.size(0)
        n_correct += (predicted == labels).sum().item()

    accuracy = 100 * n_correct / n_seen
    print(f"Training accuracy {accuracy}%")

    return accuracy, train_loss


def main(dataset, model_architecture, init_optimizer, num_epochs=200, lr_decay_epoch=150):
    """Train a model, checkpoint the best epoch, and pickle the training curves.

    Args:
        dataset: label used only to name the output files.
        model_architecture: model name passed to ``build_model``.
        init_optimizer: optimizer name passed to ``initialize_optimizer``.
        num_epochs: number of epochs to run (epochs are numbered from 1).
        lr_decay_epoch: epoch at whose start the learning rate is multiplied by the
            default ``gamma`` of ``adjust_learning_rate`` (optimizer state is kept).

    Side effects:
        Writes ``Best_trained_models/<run>.pt`` whenever test accuracy improves and
        ``Plot_curves/<run>.p`` at the end, both under the current working directory,
        where ``<run>`` is ``{dataset}_{model_architecture}_{init_optimizer}``.
    """
    train_loader, test_loader = get_data()

    net = build_model(model_architecture)
    criterion = cross_entropy_loss_function()
    optimizer = initialize_optimizer(net, init_optimizer)

    run_name = f"{dataset}_{model_architecture}_{init_optimizer}"
    models_dir = os.path.join(os.getcwd(), "Best_trained_models")
    curves_dir = os.path.join(os.getcwd(), "Plot_curves")
    # Do not rely on an earlier notebook cell having created the output folders.
    os.makedirs(models_dir, exist_ok=True)
    os.makedirs(curves_dir, exist_ok=True)
    checkpoint_path = os.path.join(models_dir, run_name + ".pt")
    curves_path = os.path.join(curves_dir, run_name + ".p")

    best_acc = 0
    train_accuracies = []
    test_accuracies = []
    train_loss_trends = []
    test_loss_trends = []

    for epoch in range(1, num_epochs + 1):

        if epoch == lr_decay_epoch:
            adjust_learning_rate(optimizer, reset=False)
        train_acc, train_loss = train(net, epoch, train_loader, optimizer, criterion)
        test_acc, test_loss = test(net, test_loader, criterion)

        # Keep only the checkpoint with the best test accuracy seen so far.
        if test_acc > best_acc:
            state = {
                'net': net.state_dict(),
                'acc': test_acc,
                'epoch': epoch,
            }
            torch.save(state, checkpoint_path)
            best_acc = test_acc

        train_accuracies.append(train_acc)
        test_accuracies.append(test_acc)
        train_loss_trends.append(train_loss)
        test_loss_trends.append(test_loss)

    curves = {'train_acc': train_accuracies, 'test_acc': test_accuracies,
              'train_loss': train_loss_trends, 'test_loss': test_loss_trends}
    # Context manager guarantees the file is flushed and closed.
    with open(curves_path, "wb") as curves_file:
        pickle.dump(curves, curves_file)





In [None]:
# NOTE(review): get_data() loads torchvision's CIFAR100 dataset, and main() uses its first
# argument only to name the saved checkpoint/curve files — "CIFAR-10" may be a mislabel; confirm.
main("CIFAR-10", "ResNet", "SGD")

In [None]:
# Copy the AdaBelief training curves from ./Plot_curves into ./Pickle3.
# The curves are read straight from disk so this cell no longer depends on the `files`
# dict, which is only built by a cell further down (NameError on Restart & Run All).
os.makedirs('./Pickle3', exist_ok=True)
with open('./Plot_curves/CIFAR-10_ResNet_AdaBelief.p', 'rb') as curves_in:
    # pickle.load can execute arbitrary code; only load files written by this notebook.
    adabelief_curves = pickle.load(curves_in)
with open('./Pickle3/CIFAR-10_ResNet_AdaBelief.p', 'wb') as curves_out:
    pickle.dump(adabelief_curves, curves_out)

In [None]:
import pickle
import os
import matplotlib.pyplot as plt

# Load every pickled training-curve file under ./Plot_curves, keyed by file name without
# its extension.
curves_dir = './Plot_curves'
# Skip sub-directories (e.g. notebook checkpoint folders); sort for a stable order.
file_names = sorted(name for name in os.listdir(curves_dir)
                    if os.path.isfile(os.path.join(curves_dir, name)))
print(file_names)
files = {}
for names in file_names:
    # pickle.load can execute arbitrary code; only load files written by this notebook.
    with open(os.path.join(curves_dir, names), "rb") as curve_file:
        files[os.path.splitext(names)[0]] = pickle.load(curve_file)
files['CIFAR-10_ResNet_SGD']