In [None]:
!pip install utils

Collecting utils
  Downloading utils-1.0.2.tar.gz (13 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: utils
  Building wheel for utils (setup.py) ... [?25l[?25hdone
  Created wheel for utils: filename=utils-1.0.2-py2.py3-none-any.whl size=13905 sha256=cc6ffe52789905371da71be531d759bb81f9a3cdce1318f8abe05c744ebb753c
  Stored in directory: /root/.cache/pip/wheels/b8/39/f5/9d0ca31dba85773ececf0a7f5469f18810e1c8a8ed9da28ca7
Successfully built utils
Installing collected packages: utils
Successfully installed utils-1.0.2


In [None]:
from torch.utils.data import DataLoader
import numpy as np
import matplotlib.pyplot as plt
import torchvision
from tqdm import tqdm
import utils
import torch
from torchvision import transforms
import torch.nn as nn
import torch.nn.functional as F

In [None]:
class CNN(nn.Module):
    """Convolutional classifier made of a conv feature extractor and an MLP head.

    The feature extractor has three stages, each ending in a 2x2 max-pool, and
    finishes with 256 channels. The head's first linear layer takes 4096
    features, which corresponds to 256 x 4 x 4 feature maps (i.e. 3 x 32 x 32
    inputs after three halvings).

    Args:
        num_classes: size of the final linear layer's output.
    """

    def __init__(self, num_classes=10):
        super().__init__()

        feature_layers = [
            # BEGIN Solution (do not delete this comment!)

            # Stage 1: 3 -> 64 channels, spatial size halved
            nn.Conv2d(3, 64, kernel_size=3, padding=1),
            nn.BatchNorm2d(64),
            nn.Conv2d(64, 64, kernel_size=5, padding=2),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.BatchNorm2d(64),

            # Stage 2: 64 -> 128 channels, spatial size halved
            nn.Conv2d(64, 128, kernel_size=3, padding=1),
            nn.BatchNorm2d(128),
            nn.Conv2d(128, 128, kernel_size=5, padding=2),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.BatchNorm2d(128),

            # Stage 3: 128 -> 256 channels, spatial size halved
            nn.Conv2d(128, 256, kernel_size=3, padding=1),
            nn.BatchNorm2d(256),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),

            # END Solution (do not delete this comment!)
        ]
        self.features = nn.Sequential(*feature_layers)

        head_layers = [
            # BEGIN Solution (do not delete this comment!)

            nn.Linear(4096, 512),
            nn.BatchNorm1d(512),
            nn.ReLU(),
            nn.Dropout(0.2),

            nn.Linear(512, 256),
            nn.BatchNorm1d(256),
            nn.ReLU(),
            nn.Dropout(0.2),

            nn.Linear(256, num_classes),

            # END Solution (do not delete this comment!)
        ]
        self.classifier = nn.Sequential(*head_layers)

    def forward(self, x):
        """Return the classifier head's output for a batch of images ``x``."""
        # BEGIN Solution (do not delete this comment!)

        feature_maps = self.features(x)
        # Collapse everything except the batch dimension before the linear head
        flattened = feature_maps.view(feature_maps.size(0), -1)
        return self.classifier(flattened)

In [None]:
def epoch_train(loader, clf, criterion, opt):
    """Train ``clf`` for one pass over ``loader``.

    Args:
        loader: iterable of ``(inputs, targets)`` batches; must support ``len``.
        clf: model to train; moved to CUDA when available, otherwise CPU.
        criterion: callable ``(outputs, targets) -> loss tensor``.
        opt: optimizer exposing ``zero_grad()`` and ``step()``.

    Returns:
        Tuple ``(mean loss per batch, fraction of correctly classified samples)``.
    """
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    clf = clf.to(device)
    clf.train(True)

    running_loss, n_correct, n_seen = 0, 0, 0
    for batch_inputs, batch_targets in loader:
        batch_inputs = batch_inputs.to(device)
        batch_targets = batch_targets.to(device)

        outputs = clf(batch_inputs)
        batch_loss = criterion(outputs, batch_targets)

        opt.zero_grad()
        batch_loss.backward()
        opt.step()

        # Bookkeeping uses the outputs computed before this step's update
        running_loss += batch_loss.item()
        predicted = outputs.max(1)[1]
        n_seen += batch_targets.size(0)
        n_correct += predicted.eq(batch_targets).sum().item()

    return running_loss / len(loader), n_correct / n_seen

def epoch_test(loader, clf, criterion):
    """Evaluate ``clf`` on every batch of ``loader`` without updating it.

    Args:
        loader: iterable of ``(inputs, targets)`` batches; must support ``len``.
        clf: model to evaluate; moved to CUDA when available, otherwise CPU,
            and switched to eval mode.
        criterion: callable ``(outputs, targets) -> loss tensor``.

    Returns:
        Tuple ``(mean loss per batch, fraction of correctly classified samples)``.
    """
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    clf = clf.to(device)
    clf.eval()

    total_loss = 0
    correct = 0
    total = 0
    # Evaluation needs no gradients; disabling autograd avoids building a graph
    # for every batch (less memory, faster) and leaves the numbers unchanged.
    with torch.no_grad():
        for model_input, target in loader:
            model_input = model_input.to(device)
            target = target.to(device)

            model_output = clf(model_input)
            loss = criterion(model_output, target)

            total_loss += loss.item()
            _, predicted = model_output.max(1)
            total += target.size(0)
            correct += predicted.eq(target).sum().item()

    avg_loss = total_loss / len(loader)
    avg_accuracy = correct / total
    return avg_loss, avg_accuracy

def train(train_loader, test_loader, clf, criterion, opt, n_epochs=50):
    """Alternate one training pass and one evaluation pass for ``n_epochs`` epochs.

    Args:
        train_loader: batches passed to ``epoch_train``.
        test_loader: batches passed to ``epoch_test``.
        clf: model being trained.
        criterion: loss callable forwarded to both passes.
        opt: optimizer forwarded to ``epoch_train``.
        n_epochs: number of train/eval rounds.

    Returns:
        Dict with keys ``'train_loss'``, ``'train_acc'``, ``'test_loss'`` and
        ``'test_acc'``, each a list with one entry per epoch. (Previously the
        metrics were computed and then discarded.)
    """
    history = {'train_loss': [], 'train_acc': [], 'test_loss': [], 'test_acc': []}
    for _ in tqdm(range(n_epochs)):
        train_loss, train_acc = epoch_train(train_loader, clf, criterion, opt)
        test_loss, test_acc = epoch_test(test_loader, clf, criterion)

        history['train_loss'].append(train_loss)
        history['train_acc'].append(train_acc)
        history['test_loss'].append(test_loss)
        history['test_acc'].append(test_acc)
    return history


# WideResNet Translate

In [None]:
class _WideResidualUnit(nn.Module):
    """Pre-activation residual unit used by ``WideResNet``.

    Main path: BN -> ReLU -> 3x3 conv (carries ``stride``) -> optional dropout
    -> BN -> ReLU -> 3x3 conv. The input is added back through an identity
    shortcut, or through a strided 1x1 conv when the channel count or the
    spatial size changes.
    """

    def __init__(self, in_channels, out_channels, stride, drop_rate):
        super().__init__()
        self.body = nn.Sequential(
            # The first norm acts on the *incoming* tensor, so it must be sized
            # by in_channels (sizing it by the output width caused the
            # "running_mean should contain 16 elements not 64" failure).
            nn.BatchNorm2d(in_channels),
            nn.ReLU(),
            nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=stride, padding=1, bias=False),
            nn.Dropout(drop_rate) if drop_rate > 0 else nn.Identity(),
            nn.BatchNorm2d(out_channels),
            nn.ReLU(),
            nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=1, padding=1, bias=False),
        )
        if stride != 1 or in_channels != out_channels:
            # Registered as a module so the projection is trained with the rest
            self.shortcut = nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=stride, padding=0, bias=False)
        else:
            self.shortcut = nn.Identity()

    def forward(self, x):
        return self.body(x) + self.shortcut(x)


class WideResNet(nn.Module):
    """Wide residual network: a stem conv followed by three groups of residual units.

    Args:
        depth: network depth; every group holds ``(depth - 4) // 6`` residual units.
        widen_factor: multiplier applied to the base group widths 16 / 32 / 64.
        input_shape: ``(channels, height, width)`` of the input; only the
            channel count is used.
        output_size: number of output classes.
        drop_rate: dropout probability inside each residual unit (0 disables it).

    Raises:
        ValueError: if ``depth`` is too small to give each group at least one unit.

    Note:
        ``forward`` returns raw logits. This notebook trains the model with
        ``nn.CrossEntropyLoss``, which applies log-softmax itself; use
        ``F.softmax(model(x), dim=1)`` when probabilities are needed.
    """

    def __init__(self, depth, widen_factor, input_shape, output_size, drop_rate=0.0):
        super(WideResNet, self).__init__()

        n = (depth - 4) // 6
        if n < 1:
            raise ValueError("depth must be at least 10, got {}".format(depth))

        widths = [16, 16 * widen_factor, 32 * widen_factor, 64 * widen_factor]

        def group(in_channels, out_channels, stride):
            # Only the first unit of a group changes width / resolution
            units = [
                _WideResidualUnit(in_channels if i == 0 else out_channels,
                                  out_channels,
                                  stride if i == 0 else 1,
                                  drop_rate)
                for i in range(n)
            ]
            return nn.Sequential(*units)

        self.group0 = nn.Conv2d(input_shape[0], widths[0], kernel_size=3, stride=1, padding=1, bias=False)
        self.group1 = group(widths[0], widths[1], 1)
        self.group2 = group(widths[1], widths[2], 2)
        self.group3 = group(widths[2], widths[3], 2)

        self.bn = nn.BatchNorm2d(widths[3])
        self.pool = nn.AdaptiveAvgPool2d((1, 1))
        # The head must match the widened width of the last group, not a fixed 64
        self.fc = nn.Linear(widths[3], output_size)

    def forward(self, x):
        """Return class logits of shape ``(batch, output_size)``."""
        x = self.group0(x)
        x = self.group1(x)
        x = self.group2(x)
        x = self.group3(x)

        x = F.relu(self.bn(x))
        x = self.pool(x)
        x = x.view(x.size(0), -1)
        return self.fc(x)

# From scratch


In [None]:
class BasicBlock(nn.Module):
    """Residual block: two BN -> ReLU -> 3x3 conv steps, dropout, then a skip add.

    Args:
        in_channels: channels of the incoming tensor.
        out_channels: channels produced by both convolutions.
        stride: stride of the first convolution (and of the projection shortcut).
        drop_rate: dropout probability applied after the second convolution;
            values that are not greater than 0 disable dropout.
    """

    def __init__(self, in_channels, out_channels, stride, drop_rate):
        super().__init__()

        # First norm/conv pair carries the stride and the channel change
        self.bn1 = nn.BatchNorm2d(in_channels)
        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=stride, padding=1, bias=False)

        # Second pair keeps both resolution and width
        self.bn2 = nn.BatchNorm2d(out_channels)
        self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=1, padding=1, bias=False)

        if drop_rate > 0:
            self.dropout = nn.Dropout(p=drop_rate)
        else:
            self.dropout = nn.Identity()

        shape_is_preserved = stride == 1 and in_channels == out_channels
        if shape_is_preserved:
            self.shortcut = nn.Identity()
        else:
            # 1x1 projection so the skip path matches the main path's shape
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=stride, bias=False)
            )

    def forward(self, x):
        """Return ``main_path(x) + shortcut(x)``."""
        hidden = self.conv1(F.relu(self.bn1(x)))
        hidden = self.conv2(F.relu(self.bn2(hidden)))
        hidden = self.dropout(hidden)

        hidden += self.shortcut(x)
        return hidden

class WideResNet_or(nn.Module):
    """Wide residual network assembled from ``BasicBlock`` units.

    Args:
        depth: network depth; each of the three layers holds ``(depth - 4) // 6`` blocks.
        widen_factor: multiplier applied to the base layer widths 16 / 32 / 64.
        drop_rate: dropout probability forwarded to every block.
        num_classes: size of the output layer.
        in_channels: channels of the input images (defaults to 3, the value
            that used to be hard-coded).

    ``forward`` returns log-probabilities (``log_softmax`` over dim 1).
    """

    def __init__(self, depth, widen_factor, drop_rate, num_classes, in_channels=3):
        # Was ``super(WideResNet, self)``: that names a different class, so
        # construction failed before any layer was created.
        super().__init__()

        n = (depth - 4) // 6
        k = widen_factor

        n_stages = [16, 16*k, 32*k, 64*k]

        self.conv1 = nn.Conv2d(in_channels, n_stages[0], kernel_size=3, stride=1, padding=1, bias=False)

        self.layer1 = self._wide_layer(BasicBlock, n_stages[0], n_stages[1], n, 1, drop_rate)
        self.layer2 = self._wide_layer(BasicBlock, n_stages[1], n_stages[2], n, 2, drop_rate)
        self.layer3 = self._wide_layer(BasicBlock, n_stages[2], n_stages[3], n, 2, drop_rate)

        self.bn = nn.BatchNorm2d(n_stages[3])
        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
        self.fc = nn.Linear(n_stages[3], num_classes)

    def _wide_layer(self, block, in_channels, out_channels, num_blocks, stride, drop_rate):
        """Stack ``num_blocks`` blocks; only the first changes width and stride."""
        layers = []

        for i in range(num_blocks):
            layers.append(block(in_channels if i == 0 else out_channels,
                                out_channels,
                                stride if i == 0 else 1,
                                drop_rate))

        return nn.Sequential(*layers)

    def forward(self, x):
        """Return log-probabilities of shape ``(batch, num_classes)``."""
        x = self.conv1(x)

        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)

        x = F.relu(self.bn(x))

        x = self.avgpool(x)
        x = x.view(x.size(0), -1)

        x = self.fc(x)

        return F.log_softmax(x, dim=1)


# Sampler

In [None]:
from torch.optim import Optimizer
import copy

class SVRG_k(Optimizer):
    r"""Optimizer that applies the variance-reduced (SVRG) update to the live model.

    Each step moves a parameter along ``grad(p) - grad(q) + u`` where ``q`` is
    the matching snapshot parameter and ``u`` is the mean gradient stored by
    ``set_u``.

    Args:
        params (iterable): iterable of parameters to optimize or dicts defining
            parameter groups
        lr (float): learning rate
        weight_decay (float): coefficient of the L2 term added to the update

    Raises:
        ValueError: if ``lr`` or ``weight_decay`` is negative.
    """
    def __init__(self, params, lr, weight_decay=0):
        print("Using optimizer: SVRG")
        self.u = None
        if lr < 0.0:
            raise ValueError("Invalid learning rate: {}".format(lr))
        if weight_decay < 0.0:
            raise ValueError("Invalid weight decay: {}".format(weight_decay))
        defaults = dict(lr=lr, weight_decay=weight_decay)
        super(SVRG_k, self).__init__(params, defaults)

    def get_param_groups(self):
        """Return this optimizer's parameter groups."""
        return self.param_groups

    def set_u(self, new_u):
        """Set the mean gradient for the current epoch.

        ``new_u`` is a list of parameter groups whose parameters carry the
        mean gradient in ``.grad``. The groups are deep-copied on the first
        call; later calls only refresh the stored gradients.
        """
        if self.u is None:
            self.u = copy.deepcopy(new_u)
        for u_group, new_group in zip(self.u, new_u):
            for u_param, src_param in zip(u_group['params'], new_group['params']):
                # A parameter that received no gradient has nothing to clone
                u_param.grad = None if src_param.grad is None else src_param.grad.clone()

    def step(self, params):
        """Performs a single optimization step.

        Args:
            params: parameter groups of the snapshot model, holding its
                gradients for the current batch.

        Raises:
            RuntimeError: if ``set_u`` has not been called yet.
        """
        if self.u is None:
            raise RuntimeError("set_u() must be called before step()")

        with torch.no_grad():
            for group, new_group, u_group in zip(self.param_groups, params, self.u):
                weight_decay = group['weight_decay']
                lr = group['lr']

                for p, q, u in zip(group['params'], new_group['params'], u_group['params']):
                    if p.grad is None or q.grad is None or u.grad is None:
                        continue
                    # core SVRG gradient update
                    new_d = p.grad - q.grad + u.grad
                    if weight_decay != 0:
                        # keyword form; the positional (scalar, tensor) overload is deprecated
                        new_d.add_(p, alpha=weight_decay)
                    p.add_(new_d, alpha=-lr)


class SVRG_Snapshot(Optimizer):
    r"""Parameter holder for the snapshot model used to compute the mean gradient.

    It never updates weights by itself: it only offers ``zero_grad`` (inherited)
    and lets the snapshot's values be overwritten from another optimizer's
    parameter groups.

    Args:
        params (iterable): iterable of parameters to optimize or dicts defining
            parameter groups
    """
    def __init__(self, params):
        defaults = dict()
        super(SVRG_Snapshot, self).__init__(params, defaults)

    def get_param_groups(self):
        """Return this optimizer's parameter groups."""
        return self.param_groups

    def set_param_groups(self, new_params):
        """Copies the parameters from another optimizer.

        Values are copied in place, pairing groups and parameters by position.
        """
        for group, new_group in zip(self.param_groups, new_params):
            for p, q in zip(group['params'], new_group['params']):
                # copy_ also handles 0-dim parameters, which ``p.data[:]`` cannot slice
                p.data.copy_(q.data)


class AverageCalculator():
    """Running weighted mean.

    Attributes (all start at 0):
        count: total weight seen so far.
        sum: weighted sum of the values seen so far.
        avg: ``sum / count`` after the latest update.
    """

    def __init__(self):
        self.reset()

    def reset(self):
        """Forget everything accumulated so far."""
        self.count = self.sum = self.avg = 0

    def update(self, val, n=1):
        """Add ``val`` with weight ``n`` (``n`` must be positive) and refresh ``avg``."""
        assert n > 0
        weighted_value = val * n
        self.sum += weighted_value
        self.count += n
        self.avg = self.sum / float(self.count)

def train_epoch_SVRG(model_k,
                     model_snapshot,
                     optimizer_k,
                     optimizer_snapshot,
                     train_loader,
                     loss_fn,
                     device = "cuda",
                     data_transform = lambda x:x):
    """Run one SVRG epoch and return the mean per-batch loss of ``model_k``.

    The epoch has three phases:
      1. accumulate the mean gradient of ``model_snapshot`` over the whole loader;
      2. for every batch, take an ``optimizer_k`` step that combines the gradients
         of ``model_k`` and ``model_snapshot`` with that mean gradient;
      3. copy the parameters held by ``optimizer_k`` into ``optimizer_snapshot``.

    Args:
        model_k: model being trained.
        model_snapshot: snapshot model.
        optimizer_k: optimizer of ``model_k`` (needs ``set_u`` and ``step(groups)``).
        optimizer_snapshot: optimizer wrapping the snapshot's parameters.
        train_loader: iterable of ``(points, labels)`` batches supporting ``len``.
        loss_fn: callable ``(predictions, labels) -> loss tensor``.
        device: device every batch is moved to.
        data_transform: callable applied to the inputs after the device move.
    """
    model_k.train()
    model_snapshot.train()
    epoch_loss = AverageCalculator()

    # Phase 1: mean gradient at the snapshot weights. Gradients are cleared once
    # and accumulated across batches, each batch scaled by the number of batches.
    optimizer_snapshot.zero_grad()
    for batch, targets in train_loader:
        batch = data_transform(batch.to(device))

        snapshot_out = model_snapshot(batch)
        targets = targets.to(device)
        scaled_loss = loss_fn(snapshot_out, targets) / len(train_loader)
        scaled_loss.backward()

    # Hand the accumulated mean gradient to the main optimizer
    optimizer_k.set_u(optimizer_snapshot.get_param_groups())

    # Phase 2: one variance-reduced step per batch
    for batch, targets in train_loader:
        batch = data_transform(batch.to(device))

        current_out = model_k(batch)
        targets = targets.to(device)
        current_loss = loss_fn(current_out, targets)

        optimizer_k.zero_grad()
        current_loss.backward()

        # Gradient of the same batch at the snapshot weights
        snapshot_batch_loss = loss_fn(model_snapshot(batch), targets)
        optimizer_snapshot.zero_grad()
        snapshot_batch_loss.backward()

        optimizer_k.step(optimizer_snapshot.get_param_groups())
        epoch_loss.update(current_loss.data.item())

    # Phase 3: the snapshot takes over the freshly trained weights
    optimizer_snapshot.set_param_groups(optimizer_k.get_param_groups())

    return epoch_loss.avg

# Sampler 2

In [None]:

def accuracy(yhat, labels):
    """Return the fraction of rows of ``yhat`` whose arg-max along dim 1 equals ``labels``."""
    predicted = yhat.max(1)[1]
    n_hits = (predicted == labels).sum().data.item()
    return n_hits / float(len(labels))

class AverageCalculator():
    """Running weighted mean.

    Attributes (all start at 0):
        count: total weight seen so far.
        sum: weighted sum of the values seen so far.
        avg: ``sum / count`` after the latest update.
    """

    def __init__(self):
        self.reset()

    def reset(self):
        """Forget everything accumulated so far."""
        self.count = self.sum = self.avg = 0

    def update(self, val, n=1):
        """Add ``val`` with weight ``n`` (``n`` must be positive) and refresh ``avg``."""
        assert n > 0
        weighted_value = val * n
        self.sum += weighted_value
        self.count += n
        self.avg = self.sum / float(self.count)



class SVRG_k(Optimizer):
    r"""Optimizer that applies the variance-reduced (SVRG) update to the live model.

    Each step moves a parameter along ``grad(p) - grad(q) + u`` where ``q`` is
    the matching snapshot parameter and ``u`` is the mean gradient stored by
    ``set_u``.

    Args:
        params (iterable): iterable of parameters to optimize or dicts defining
            parameter groups
        lr (float): learning rate
        weight_decay (float): coefficient of the L2 term added to the update

    Raises:
        ValueError: if ``lr`` or ``weight_decay`` is negative.
    """
    def __init__(self, params, lr, weight_decay=0):
        print("Using optimizer: SVRG")
        self.u = None
        if lr < 0.0:
            raise ValueError("Invalid learning rate: {}".format(lr))
        if weight_decay < 0.0:
            raise ValueError("Invalid weight decay: {}".format(weight_decay))
        defaults = dict(lr=lr, weight_decay=weight_decay)
        super(SVRG_k, self).__init__(params, defaults)

    def get_param_groups(self):
        """Return this optimizer's parameter groups."""
        return self.param_groups

    def set_u(self, new_u):
        """Set the mean gradient for the current epoch.

        ``new_u`` is a list of parameter groups whose parameters carry the
        mean gradient in ``.grad``. The groups are deep-copied on the first
        call; later calls only refresh the stored gradients.
        """
        if self.u is None:
            self.u = copy.deepcopy(new_u)
        for u_group, new_group in zip(self.u, new_u):
            for u_param, src_param in zip(u_group['params'], new_group['params']):
                # A parameter that received no gradient has nothing to clone
                u_param.grad = None if src_param.grad is None else src_param.grad.clone()

    def step(self, params):
        """Performs a single optimization step.

        Args:
            params: parameter groups of the snapshot model, holding its
                gradients for the current batch.

        Raises:
            RuntimeError: if ``set_u`` has not been called yet.
        """
        if self.u is None:
            raise RuntimeError("set_u() must be called before step()")

        with torch.no_grad():
            for group, new_group, u_group in zip(self.param_groups, params, self.u):
                weight_decay = group['weight_decay']
                lr = group['lr']

                for p, q, u in zip(group['params'], new_group['params'], u_group['params']):
                    if p.grad is None or q.grad is None or u.grad is None:
                        continue
                    # core SVRG gradient update
                    new_d = p.grad - q.grad + u.grad
                    if weight_decay != 0:
                        # keyword form; the positional (scalar, tensor) overload is deprecated
                        new_d.add_(p, alpha=weight_decay)
                    p.add_(new_d, alpha=-lr)


class SVRG_Snapshot(Optimizer):
    r"""Parameter holder for the snapshot model used to compute the mean gradient.

    It never updates weights by itself: it only offers ``zero_grad`` (inherited)
    and lets the snapshot's values be overwritten from another optimizer's
    parameter groups.

    Args:
        params (iterable): iterable of parameters to optimize or dicts defining
            parameter groups
    """
    def __init__(self, params):
        defaults = dict()
        super(SVRG_Snapshot, self).__init__(params, defaults)

    def get_param_groups(self):
        """Return this optimizer's parameter groups."""
        return self.param_groups

    def set_param_groups(self, new_params):
        """Copies the parameters from another optimizer.

        Values are copied in place, pairing groups and parameters by position.
        """
        for group, new_group in zip(self.param_groups, new_params):
            for p, q in zip(group['params'], new_group['params']):
                # copy_ also handles 0-dim parameters, which ``p.data[:]`` cannot slice
                p.data.copy_(q.data)




def train_epoch_SVRG(model_k,
                     model_snapshot,
                     optimizer_k,
                     optimizer_snapshot,
                     train_loader,
                     loss_fn,
                     metric = accuracy,
                     device = "cuda",
                     data_transform = lambda x:x):
    """Run one SVRG epoch; return ``(mean batch loss, mean batch metric)`` of ``model_k``.

    The epoch has three phases:
      1. accumulate the mean gradient of ``model_snapshot`` over the whole loader;
      2. for every batch, take an ``optimizer_k`` step that combines the gradients
         of ``model_k`` and ``model_snapshot`` with that mean gradient;
      3. copy the parameters held by ``optimizer_k`` into ``optimizer_snapshot``.

    Args:
        model_k: model being trained.
        model_snapshot: snapshot model.
        optimizer_k: optimizer of ``model_k`` (needs ``set_u`` and ``step(groups)``).
        optimizer_snapshot: optimizer wrapping the snapshot's parameters.
        train_loader: iterable of ``(points, labels)`` batches supporting ``len``.
        loss_fn: callable ``(predictions, labels) -> loss tensor``.
        metric: callable ``(predictions, labels) -> number`` averaged over batches.
        device: device every batch is moved to.
        data_transform: callable applied to the inputs after the device move.
    """
    model_k.train()
    model_snapshot.train()
    loss_meter = AverageCalculator()
    metric_meter = AverageCalculator()

    # Phase 1: mean gradient at the snapshot weights. Gradients are cleared once
    # and accumulated across batches, each batch scaled by the number of batches.
    optimizer_snapshot.zero_grad()
    for batch, targets in train_loader:
        batch = data_transform(batch.to(device))

        snapshot_out = model_snapshot(batch)
        targets = targets.to(device)
        scaled_loss = loss_fn(snapshot_out, targets) / len(train_loader)
        scaled_loss.backward()

    # Hand the accumulated mean gradient to the main optimizer
    optimizer_k.set_u(optimizer_snapshot.get_param_groups())

    # Phase 2: one variance-reduced step per batch
    for batch, targets in train_loader:
        batch = data_transform(batch.to(device))

        current_out = model_k(batch)
        targets = targets.to(device)
        current_loss = loss_fn(current_out, targets)

        optimizer_k.zero_grad()
        current_loss.backward()

        # Gradient of the same batch at the snapshot weights
        snapshot_batch_loss = loss_fn(model_snapshot(batch), targets)
        optimizer_snapshot.zero_grad()
        snapshot_batch_loss.backward()

        optimizer_k.step(optimizer_snapshot.get_param_groups())

        # Logged values come from the predictions made before this step
        loss_meter.update(current_loss.data.item())
        metric_meter.update(metric(current_out, targets))

    # Phase 3: the snapshot takes over the freshly trained weights
    optimizer_snapshot.set_param_groups(optimizer_k.get_param_groups())

    return loss_meter.avg, metric_meter.avg



def test_epoch_SVRG(model, test_loader, loss_fn, metric = accuracy, device = "cuda", data_transform = lambda x:x):
    """Evaluate ``model`` on ``test_loader`` with gradients disabled.

    Args:
        model: model to evaluate; switched to eval mode.
        test_loader: iterable of ``(points, labels)`` batches.
        loss_fn: callable ``(predictions, labels) -> loss tensor``.
        metric: callable ``(predictions, labels) -> number`` averaged over batches.
        device: device every batch is moved to.
        data_transform: callable applied to the inputs after the device move.

    Returns:
        Tuple ``(mean batch loss, mean batch metric)``.
    """
    model.eval()
    loss_meter, metric_meter = AverageCalculator(), AverageCalculator()

    with torch.no_grad():
        for batch, targets in test_loader:
            batch = data_transform(batch.to(device))
            outputs = model(batch)
            targets = targets.to(device)

            batch_loss = loss_fn(outputs, targets)
            metric_meter.update(metric(outputs, targets))
            loss_meter.update(batch_loss.data.item())

    return loss_meter.avg, metric_meter.avg

# Training

In [None]:
import torchvision.transforms as transforms
from torchvision.datasets import CIFAR10
from torch.utils.data import DataLoader

In [None]:
input_shape = (3, 32, 32)  # (channels, height, width); assumes the CIFAR-10 dataset
output_size = 10  # one output per entry of the `classes` tuple defined below

# Create an instance of the WideResNet model with depth=16 and widen_factor=4
model = WideResNet(depth=16, widen_factor=4, drop_rate=0.0, input_shape=input_shape, output_size=output_size)

In [None]:
# Total number of scalar entries over all parameter tensors of the model
print('Number of weights:', np.sum([np.prod(p.shape) for p in model.parameters()]))

Number of weights: 3101242


In [None]:
# Per-channel statistics handed to Normalize for both splits
mean = [0.5, 0.5, 0.5]
std = [0.5, 0.5, 0.5]

# Training images get random flips and rotations before conversion/normalization
train_transform = transforms.Compose(
    [
        transforms.RandomHorizontalFlip(),
        transforms.RandomRotation(20),
        transforms.ToTensor(),
        transforms.Normalize(mean, std),
    ]
)
# Test images are only converted and normalized
test_transform = transforms.Compose(
    [
        transforms.ToTensor(),
        transforms.Normalize(mean, std),
    ]
)

train_set = CIFAR10(root='./data', train=True, download=True, transform=train_transform)
test_set = CIFAR10(root='./data', train=False, download=True, transform=test_transform)

# Only the training loader is shuffled
train_loader = DataLoader(train_set, batch_size=32, shuffle=True)
test_loader = DataLoader(test_set, batch_size=32, shuffle=False)

Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to ./data/cifar-10-python.tar.gz


100%|██████████| 170498071/170498071 [00:03<00:00, 49573637.57it/s]


Extracting ./data/cifar-10-python.tar.gz to ./data
Files already downloaded and verified


In [None]:
# Human-readable names for the ten target classes
classes = ('plane', 'car', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck')

print('Train size', len(train_set))
print('Test size', len(test_set))

# Number of passes over the training set made by the training loop below
n_epochs = 10

Train size 50000
Test size 10000


In [None]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
model = model.to(device)
# SVRG pairs every parameter of the live model with the matching snapshot
# parameter (gradient difference, weight copy), so the snapshot must be an exact
# copy of `model`; a different architecture such as CNN cannot be paired.
model_snapshot = copy.deepcopy(model).to(device)

loss_fn = nn.CrossEntropyLoss()

optimizer = SVRG_k(model.parameters(), lr=0.02, weight_decay = 0.02)
optimizer_snapshot = SVRG_Snapshot(model_snapshot.parameters())

Using optimizer: SVRG


In [None]:
# Batches must be moved to the device the models live on, so pass the same
# `device` to both passes (training used to force 'cpu' and evaluation fell back
# to the "cuda" default).
for epoch in tqdm(range(n_epochs)):
    train_loss, train_acc = train_epoch_SVRG(model, model_snapshot, optimizer, optimizer_snapshot, train_loader, loss_fn, device=device)
    test_loss, test_acc = test_epoch_SVRG(model, test_loader, loss_fn, device=device)
    print(f'[Epoch {epoch + 1}] train loss: {train_loss:.3f}; train acc: {train_acc:.2f}; ' +
          f'test loss: {test_loss:.3f}; test acc: {test_acc:.2f}')

  0%|          | 0/10 [25:26<?, ?it/s]


RuntimeError: running_mean should contain 16 elements not 64