In [62]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

from mlfunctions import MNISTNet, cache, do_fit,fit_optimizer, MNISTLoss, w, Optimizer
import numpy as np
import torch.optim as optim
import glob
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.autograd as autograd
import torch.optim as optim
from torch.autograd import Variable
import matplotlib.pyplot as plt
import random
from tqdm import tqdm
import multiprocessing
import os.path
import csv
import copy
import joblib
from torchvision import datasets
import torchvision
import seaborn as sns; sns.set(color_codes=True)
sns.set_style("white")
from pdb import set_trace as bp
from meta_module import MetaLinear, MetaModule,MetaConv2d,MetaConvTranspose2d, MetaBatchNorm2d
import functools

from torchsummary import summary

## CIFAR-10 training 

First, define the optimizee (the CIFAR-10 classifier) using `MetaConv2d` layers.

In [72]:
class CIFAR10Loss:
    """Mini-batch source for CIFAR-10, passed to the optimizee as its ``loss`` argument.

    The ``train=True`` CIFAR-10 split is shuffled with a fixed seed
    (``RandomState(10)``) and cut in half: ``training=True`` serves the first
    half of the shuffled indices, ``training=False`` the second half.
    """

    def __init__(self, training=True, root=None, batch_size=128):
        """
        Args:
            training: pick the first (True) or second (False) half of the
                shuffled index list.
            root: directory holding the already-downloaded dataset
                (``download=False``). Defaults to ``ourwork/data`` under the
                current working directory.
            batch_size: number of images per sampled batch.
        """
        if root is None:
            # Built with os.path.join rather than the literal '.\ourwork\data':
            # '\o' and '\d' are invalid escape sequences, and a backslash path
            # only resolves on Windows. On Windows this yields the same string.
            root = os.path.join('.', 'ourwork', 'data')

        dataset = datasets.CIFAR10(
            root, train=True, download=False,
            transform=torchvision.transforms.ToTensor()
        )
        indices = list(range(len(dataset)))
        # Fixed seed so the two halves are the same on every run.
        np.random.RandomState(10).shuffle(indices)
        half = len(indices) // 2
        indices = indices[:half] if training else indices[half:]

        self.loader = torch.utils.data.DataLoader(
            dataset, batch_size=batch_size,
            sampler=torch.utils.data.sampler.SubsetRandomSampler(indices))

        # Batches of the current pass over the loader, and the read cursor.
        self.batches = []
        self.cur_batch = 0

    def sample(self):
        """Return the next batch, re-drawing a full pass from the loader when exhausted.

        Raises:
            IndexError: if the loader yields no batches at all.
        """
        if self.cur_batch >= len(self.batches):
            # Materialise one whole pass so later calls are plain list reads.
            self.batches = list(self.loader)
            self.cur_batch = 0
            if not self.batches:
                raise IndexError('CIFAR10Loss: the data loader produced no batches')
        batch = self.batches[self.cur_batch]
        self.cur_batch += 1
        return batch

class CIFAR10Net(MetaModule):
    """Small convolutional optimizee for CIFAR-10.

    Architecture: ``n_layers`` blocks of
    BatchNorm2d -> MetaConv2d(kernel_size=3) -> MaxPool2d(2) -> ReLU,
    followed by one MetaLinear head and a log-softmax. ``forward`` draws a
    batch from the supplied loss object and returns the NLL loss on it.
    """

    # Side length of the square input images (forward reshapes to 3 x 32 x 32).
    INPUT_SIZE = 32

    def __init__(self, layer_size=32, n_layers=3, **kwargs):
        """
        Args:
            layer_size: width of the linear head's output. Its outputs are fed
                to NLLLoss as class scores, so it must exceed the largest label.
            n_layers: number of conv blocks, between 0 and 3.
            **kwargs: accepted and ignored.

        Raises:
            ValueError: if ``n_layers`` is outside the supported range.
        """
        super().__init__()

        # Channel widths: index 0 is the RGB input, index i the output of conv_i.
        self.hidden_channels = [3, 5, 8, 16]
        max_layers = len(self.hidden_channels) - 1
        if not 0 <= n_layers <= max_layers:
            raise ValueError(f'n_layers must be between 0 and {max_layers}, got {n_layers}')
        # Remembered so forward() walks exactly the blocks built here.
        self.n_layers = n_layers

        layers = {}
        in_channels = self.hidden_channels[0]
        spatial = self.INPUT_SIZE
        for i in range(1, n_layers + 1):
            out_channels = self.hidden_channels[i]
            layers[f'conv_{i}'] = MetaConv2d(in_channels, out_channels, kernel_size=3)
            # The norm runs before conv_i, so it sees conv_i's input channels.
            # affine=False keeps batch norm free of learnable parameters.
            layers[f'norm_{i}'] = nn.BatchNorm2d(in_channels, affine=False, track_running_stats=False)
            in_channels = out_channels
            # Unpadded 3x3 conv trims 2 pixels, then the 2x2 max-pool halves (floor).
            spatial = (spatial - 2) // 2

        self.activation = nn.ReLU()
        self.pool = nn.MaxPool2d(kernel_size=2,)

        # Linear head on the flattened feature map (16*2*2 = 64 inputs for n_layers=3).
        layers['linear'] = MetaLinear(in_channels * spatial * spatial, layer_size)

        self.layers = nn.ModuleDict(layers)

        self.loss = nn.NLLLoss()

    def all_named_parameters(self):
        """Return ``named_parameters()`` as a list of ``(name, parameter)`` pairs."""
        return [(k, v) for k, v in self.named_parameters()]

    def forward(self, loss):
        """Sample one batch from ``loss`` and return the scalar NLL loss on it.

        Args:
            loss: object whose ``sample()`` returns an ``(images, labels)`` pair.
        """
        inp, out = loss.sample()
        inp = w(Variable(inp.view(inp.size()[0], 3, self.INPUT_SIZE, self.INPUT_SIZE)))
        out = w(Variable(out))

        # Blocks are keyed from 1, matching how __init__ registered them.
        for i in range(1, self.n_layers + 1):
            inp = self.layers[f'norm_{i}'](inp)
            inp = self.layers[f'conv_{i}'](inp)
            inp = self.pool(inp)
            inp = self.activation(inp)

        inp = torch.flatten(inp, start_dim=1)
        inp = F.log_softmax(self.layers['linear'](inp), dim=1)

        return self.loss(inp, out)

After the optimizee is done, we rebuild the optimizer with the new architecture: 2 LSTMs in series, one for the conv2D and one for the linear (i.e., fully connected) layers.

In [60]:

class Optimizer(nn.Module):
    """Learned optimizer: two stacked LSTM cells followed by a linear read-out.

    Each input row is one gradient value; the network emits one update value
    per row and carries an LSTM (hidden, cell) state pair for each of the two
    cells.

    NOTE(review): this definition shadows the ``Optimizer`` imported from
    ``mlfunctions`` at the top of the notebook. Functions defined inside
    ``mlfunctions`` presumably keep using their own module-level class;
    confirm which one ``fit_optimizer`` trains.
    """

    def __init__(self, preproc=False, hidden_sz=20, preproc_factor=10.0):
        """
        Args:
            preproc: if True, gradients are expanded to two features per row
                (see ``_preprocess``) before entering the first LSTM cell.
            hidden_sz: hidden size of both LSTM cells.
            preproc_factor: scale ``p`` of the preprocessing; gradients with
                magnitude below ``exp(-p)`` are treated as "small".
        """
        super().__init__()
        self.hidden_sz = hidden_sz
        # Preprocessed gradients carry two features per row, raw ones a single value.
        self.recurs = nn.LSTMCell(2 if preproc else 1, hidden_sz)
        self.recurs2 = nn.LSTMCell(hidden_sz, hidden_sz)
        self.output = nn.Linear(hidden_sz, 1)
        self.preproc = preproc
        self.preproc_factor = preproc_factor
        self.preproc_threshold = float(np.exp(-preproc_factor))

    def _preprocess(self, inp):
        """Map each gradient ``g`` to two features (the "Appendix A" preprocessing).

        ``(log(|g| + 1e-8) / p, sign(g))`` when ``|g| >= exp(-p)``,
        ``(-1, exp(p) * g)`` otherwise.

        Works on a detached copy, so no gradient flows back through ``inp``;
        the training loop is expected to pass already-detached gradients.

        Returns:
            Tensor of shape ``(inp.numel(), 2)`` in the default float dtype.
        """
        grads = inp.detach().reshape(-1)
        magnitude = torch.abs(grads)
        is_large = magnitude >= self.preproc_threshold

        # Evaluated for every element; the 1e-8 keeps log() finite at zero, and
        # torch.where then selects per element. This avoids masked in-place
        # writes whose shapes depend on how many elements pass the threshold.
        log_magnitude = torch.log(magnitude + 1e-8) / self.preproc_factor
        scaled = float(np.exp(self.preproc_factor)) * grads

        first = torch.where(is_large, log_magnitude, torch.full_like(log_magnitude, -1.0))
        second = torch.where(is_large, torch.sign(grads), scaled)
        return torch.stack((first, second), dim=1).to(torch.get_default_dtype())

    def forward(self, inp, hidden, cell):
        """Run one optimizer step.

        Args:
            inp: gradient values, one per row.
            hidden: pair of hidden states, one per LSTM cell.
            cell: pair of cell states, one per LSTM cell.

        Returns:
            ``(update, (hidden0, hidden1), (cell0, cell1))`` where ``update`` is
            the linear read-out of the second cell's hidden state.
        """
        if self.preproc:
            inp = w(self._preprocess(inp))
        hidden0, cell0 = self.recurs(inp, (hidden[0], cell[0]))
        hidden1, cell1 = self.recurs2(hidden0, (hidden[1], cell[1]))

        return self.output(hidden1), (hidden0, hidden1), (cell0, cell1)


Now let's try to fit the CIFAR-10 classifier.

In [56]:
@cache.cache
def get_fit_dict_test(n_tests, opt_dict, *args, **kwargs):
    """Evaluate a trained optimizer by calling ``do_fit`` ``n_tests`` times.

    Args:
        n_tests: number of independent ``do_fit`` runs to perform.
        opt_dict: state dict loaded into a fresh ``Optimizer(preproc=True)``.
        *args, **kwargs: forwarded unchanged to ``do_fit`` after the optimizer.

    Returns:
        List with one ``do_fit`` result per run.
    """
    opt = w(Optimizer(preproc=True))
    opt.load_state_dict(opt_dict)
    # Only NumPy's global RNG is seeded here.
    np.random.seed(0)
    # Use the n_tests argument rather than a notebook-level N_TESTS global, so
    # the function runs on a fresh kernel and honours what the caller passed.
    return [do_fit(opt, *args, **kwargs) for _ in tqdm(range(n_tests), 'optimizer')]


@cache.cache
def fit_normal(target_cls, target_to_opt, opt_class, n_tests=100, n_epochs=100, **kwargs):
    """Baseline: train fresh optimizees with a hand-designed optimizer.

    Args:
        target_cls: loss/data class, instantiated as ``target_cls(training=False)``.
        target_to_opt: zero-argument factory for the optimizee module.
        opt_class: optimizer class, built as ``opt_class(params, **kwargs)``.
        n_tests: number of independent training runs.
        n_epochs: number of optimisation steps per run.
        **kwargs: extra keyword arguments for ``opt_class``.

    Returns:
        List of ``n_tests`` loss curves, each a list of ``n_epochs`` NumPy
        scalars recorded before the corresponding step.
    """
    def run_once():
        # A fresh problem, model and optimizer for every run.
        problem = target_cls(training=False)
        model = w(target_to_opt())
        stepper = opt_class(model.parameters(), **kwargs)

        curve = []
        for _ in range(n_epochs):
            current = model(problem)
            curve.append(current.data.cpu().numpy())

            stepper.zero_grad()
            current.backward()
            stepper.step()
        return curve

    return [run_once() for _ in tqdm(range(n_tests), 'tests')]

Optimizer training is done here:

In [71]:
# Disabled learning-rate sweep, kept for reference; the call below uses a single fixed lr.
# for lr in tqdm(sorted([1.0, 0.3, 0.1, 0.03, 0.01, 0.003, 0.001, 0.0003, 0.0001, 0.00003, 0.00001], key=lambda x: np.abs(x - 0.003)), 'all'):
#     print('Trying lr:', lr)

# Train the optimizer on the CIFAR-10 problem: CIFAR10Loss supplies the batches,
# CIFAR10Net is the optimizee. fit_optimizer comes from mlfunctions; its result is
# unpacked as a (loss, optimizer) pair - presumably the best loss and the matching
# optimizer state; verify against mlfunctions.
# NOTE(review): the output recorded for this cell is
#   TypeError: cannot assign 'torch.cuda.FloatTensor' as parameter 'weight'
# so the run did not complete. Some layer of the optimizee presumably still holds
# `weight` as an nn.Parameter when fit_optimizer assigns a plain tensor to it -
# check MetaConv2d / MetaLinear in meta_module.
loss, cifar10_optimizer = fit_optimizer(CIFAR10Loss, CIFAR10Net, lr=0.03,  n_epochs=20, n_tests=20, out_mul=0.1, preproc=True)



epochs:   0%|          | 0/20 [00:00<?, ?it/s]

iterations:   0%|          | 0/20 [00:00<?, ?it/s]

TypeError: cannot assign 'torch.cuda.FloatTensor' as parameter 'weight' (torch.nn.Parameter or None expected)

In [51]:
# NOTE(review): this cell re-defines get_fit_dict_test, which an earlier cell of
# this notebook already defines; whichever cell ran last is the one in effect.
@cache.cache
def get_fit_dict_test(n_tests, opt_dict, *args, **kwargs):
    """Evaluate a trained optimizer by calling ``do_fit`` ``n_tests`` times.

    Args:
        n_tests: number of independent ``do_fit`` runs to perform.
        opt_dict: state dict loaded into a fresh ``Optimizer(preproc=True)``.
        *args, **kwargs: forwarded unchanged to ``do_fit`` after the optimizer.

    Returns:
        List with one ``do_fit`` result per run.
    """
    opt = w(Optimizer(preproc=True))
    opt.load_state_dict(opt_dict)
    # Only NumPy's global RNG is seeded here.
    np.random.seed(0)
    # Use the n_tests argument rather than a notebook-level N_TESTS global, so
    # the function runs on a fresh kernel and honours what the caller passed.
    return [do_fit(opt, *args, **kwargs) for _ in tqdm(range(n_tests), 'optimizer')]


# Run the evaluation with the MNIST problem (MNISTLoss / MNISTNet) and store the
# results in fit_data at index len(OPT_NAMES) of the last axis.
# Everything after mnist_optimizer is forwarded to do_fit unchanged.
# NOTE(review): fit_data, OPT_NAMES, N_TESTS and mnist_optimizer are not defined in
# any cell visible here - presumably left over from earlier MNIST cells. As written
# this cell fails on a fresh kernel; define them above or drop the cell.
fit_data[:, :, len(OPT_NAMES)] = np.array(get_fit_dict_test(N_TESTS, mnist_optimizer, None, MNISTLoss, MNISTNet, 1, 200, 200, out_mul=0.1, should_train=False))

747.4585

326.70215