In [8]:
import torch
import torch.nn as nn
import torch.optim as optim
import configs
from data.qmul_loader import get_batch, train_people, test_people
from io_utils import parse_args_regression, get_resume_file
from methods.maml import MAML
from projection import create_random_projection_matrix, proj_sketch
import backbone
import os
import numpy as np
            
class parameters():
    """Plain attribute bag holding the experiment settings used by this notebook.

    Stands in for the command-line argument namespace: every setting is a
    regular instance attribute that later cells read (and extend).
    """

    def __init__(self):
        # Insertion order below is the order in which the attributes are created.
        defaults = {
            "seed": 0,
            "model": "Conv3",
            "method": "iMAML",
            "dataset": "QMUL",
            "start_epoch": 0,
            "stop_epoch": 100,
        }
        for attr_name, attr_value in defaults.items():
            setattr(self, attr_name, attr_value)
        
params = parameters()

# Seed NumPy and PyTorch and pin cuDNN to its deterministic, non-autotuned mode
# so repeated runs of the notebook are comparable.
np.random.seed(params.seed)
torch.manual_seed(params.seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

# Create the per-dataset checkpoint directory. `exist_ok=True` replaces the
# former isdir()-then-makedirs() pair, which could fail if the directory
# appeared between the check and the creation.
params.checkpoint_dir = '%scheckpoints/%s/' % (configs.save_dir, params.dataset)
os.makedirs(params.checkpoint_dir, exist_ok=True)
# From here on checkpoint_dir is the "<model>_<method>" path inside that
# directory; this second path is not created here.
params.checkpoint_dir = '%scheckpoints/%s/%s_%s' % (configs.save_dir, params.dataset, params.model, params.method)

# Feature extractor and the two heads, all moved to the GPU (CUDA is required).
bb               = backbone.Conv3().cuda()
simple_net       = backbone.simple_net().cuda()
simple_net_multi = backbone.simple_net_multi_output().cuda()

# Both combined networks are built around the same `bb` instance.
combined_network       = backbone.CombinedNetwork(bb, simple_net).cuda()
combined_network_multi = backbone.CombinedNetwork(bb, simple_net_multi).cuda()

print(f"This is {params.method}, with {params.stop_epoch} epochs")

This is iMAML, with 100 epochs, and kernel rbf


In [10]:
import os
import numpy as np
import scipy
import torch
import matplotlib as mpl
mpl.use('Agg')
import matplotlib.pyplot as plt
import pickle
import csv


def to_cuda(x):
    """Return ``x`` moved to the GPU.

    NumPy arrays are first converted to ``float32`` tensors. Any other input
    must provide a ``.cuda()`` method (e.g. a tensor or module) and is moved
    as-is, without a dtype change.

    Errors raised by ``.cuda()`` propagate unchanged. The previous bare
    ``except`` swallowed them and retried with ``torch.from_numpy``, which
    replaced the real failure with an unrelated ``TypeError``.
    """
    if isinstance(x, np.ndarray):
        x = torch.from_numpy(x).float()
    return x.cuda()


def to_tensor(x):
    """Return ``x`` as a torch tensor.

    Args:
        x: a NumPy array (converted to a ``float32`` tensor) or a torch tensor
           (returned unchanged, including subclasses such as ``nn.Parameter``).

    Raises:
        TypeError: if ``x`` is neither a NumPy array nor a torch tensor.
            Previously this case only printed a message and returned ``None``,
            so the failure surfaced later and far from its cause.
    """
    if isinstance(x, np.ndarray):
        return torch.from_numpy(x).float()
    if isinstance(x, torch.Tensor):
        return x
    raise TypeError("Type error. Input should be either numpy array or torch tensor")
    

def to_device(x, GPU=False):
    """Convert ``x`` with ``to_cuda`` when ``GPU`` is truthy, else with ``to_tensor``."""
    converter = to_cuda if GPU else to_tensor
    return converter(x)
    
    
def to_numpy(x):
    """Return ``x`` as a NumPy array.

    NumPy arrays (including subclasses) are returned unchanged. Anything else
    is treated as a torch tensor: it is detached from the autograd graph, moved
    to the CPU if necessary, and converted with ``.numpy()``.

    This replaces a nested bare ``except`` that tried ``x.data.numpy()`` first
    and fell back to ``x.cpu()`` on *any* error.
    """
    if isinstance(x, np.ndarray):
        return x
    # .cpu() is a no-op for tensors already on the CPU.
    return x.detach().cpu().numpy()
        

def cg_solve(f_Ax, b, cg_iters=10, callback=None, verbose=False, residual_tol=1e-10, x_init=None):
    """
    Goal: Solve Ax=b equivalent to minimizing f(x) = 1/2 x^T A x - x^T b
    Assumption: A is PSD, no damping term is used here (must be damped externally in f_Ax)
    Algorithm template from wikipedia

    Args:
        f_Ax: callable returning the product A @ v for a vector v.
        b: right-hand side, a 1-D torch tensor or NumPy array.
        cg_iters: maximum number of conjugate-gradient iterations.
        callback: optional callable invoked with the current iterate at the
            start of every iteration and once more before returning.
        verbose: print iteration number, squared residual norm, solution norm
            and objective value at each iteration.
        residual_tol: stop once the squared residual norm drops below this.
        x_init: optional starting point (defaults to zeros).

    Returns:
        The approximate solution, of the same kind (tensor / ndarray) as ``b``.

    Raises:
        TypeError: if ``b`` is neither a torch tensor nor a NumPy array.
    """
    if isinstance(b, torch.Tensor):
        # zeros_like keeps b's dtype and device, so float64/float16 systems work
        # without a dtype mismatch inside f_Ax.
        x = torch.zeros_like(b) if x_init is None else x_init.to(b.device)
        if b.dtype == torch.float16:
            x = x.half()
        r = b - f_Ax(x)
        p = r.clone()
    elif isinstance(b, np.ndarray):
        x = np.zeros_like(b) if x_init is None else x_init
        r = b - f_Ax(x)
        p = r.copy()
    else:
        raise TypeError("Type error in cg")

    fmtstr = "%10i %10.3g %10.3g %10.3g"
    titlestr = "%10s %10s %10s %10s"

    def _report(step, x_cur, r_cur):
        # Objective matches the docstring: f(x) = 1/2 x^T A x - x^T b.
        obj_fn = 0.5 * x_cur.dot(f_Ax(x_cur)) - b.dot(x_cur)
        norm_x = torch.norm(x_cur) if isinstance(x_cur, torch.Tensor) else np.linalg.norm(x_cur)
        # float() makes the format string work for tensors as well as NumPy scalars.
        print(fmtstr % (step, float(r_cur.dot(r_cur)), float(norm_x), float(obj_fn)))

    if verbose: print(titlestr % ("iter", "residual norm", "soln norm", "obj fn"))

    rdotr = r.dot(r)
    i = 0  # keeps the final verbose report valid when cg_iters == 0
    for i in range(cg_iters):
        if rdotr == 0:
            # x already solves the system exactly; stepping further would
            # compute 0/0 and fill the solution with NaNs.
            break
        if callback is not None:
            callback(x)
        if verbose:
            _report(i, x, r)

        Ap = f_Ax(p)
        alpha = rdotr/(p.dot(Ap))
        x = x + alpha * p
        r = r - alpha * Ap
        newrdotr = r.dot(r)
        beta = newrdotr/rdotr
        p = r + beta * p
        rdotr = newrdotr

        if newrdotr < residual_tol:
            # Early CG termination because the residual was small
            break

    if callback is not None:
        callback(x)
    if verbose:
        _report(i, x, r)
    return x


def smooth_vector(vec, window_size=25):
    """Smooth the rows of a 2-D array with a trailing moving average.

    Row ``i`` is replaced by the mean of the rows *before* it (row ``i`` itself
    is excluded):

    * fewer rows than ``window_size``: the mean of all earlier rows;
    * otherwise: the mean of the previous ``window_size`` rows, with the first
      ``window_size`` rows left as they are.

    Row 0 has no earlier rows and is always kept unchanged. Previously the
    short-input branch averaged an empty slice for it and produced NaN.

    Args:
        vec: 2-D NumPy array, one sample per row. It is not modified.
        window_size: number of preceding rows to average over.

    Returns:
        A new array of the same shape and dtype as ``vec``.
    """
    svec = vec.copy()
    num_rows = vec.shape[0]
    if num_rows < window_size:
        # Start at 1: there is nothing to average for the very first row.
        for i in range(1, num_rows):
            svec[i, :] = np.mean(vec[:i, :], axis=0)
    else:
        for i in range(window_size, num_rows):
            svec[i, :] = np.mean(vec[i - window_size:i, :], axis=0)
    return svec
    
    
def save_data(agent, train_curve, other_data, save_file, itr=None):
    """Pickle the run data and save a plot of the smoothed training curve.

    Writes two files next to each other:

    * ``save_file + '.pickle'`` - a dict with keys ``agent``, ``losses``
      (the unsmoothed ``train_curve``) and ``other_data``;
    * ``save_file + '.png'`` - the curve after ``smooth_vector``.

    Args:
        agent: object stored under the ``agent`` key (must be picklable).
        train_curve: 2-D array of losses, one row per outer iteration.
        other_data: arbitrary extra payload stored under ``other_data``.
        save_file: output path without extension.
        itr: if given, only the first ``itr`` rows are plotted.
    """
    data = dict(agent=agent,
                losses=train_curve,
                other_data=other_data,
               )
    pickle_file_name = save_file + '.pickle'
    # Context manager so the handle is flushed and closed even if dump() fails.
    with open(pickle_file_name, 'wb') as pickle_file:
        pickle.dump(data, pickle_file)

    plot_file_name = save_file + '.png'
    curve = train_curve if itr is None else train_curve[:itr]
    fig = plt.figure(figsize=(10,6))
    try:
        plt.plot(smooth_vector(curve), lw=2)
        plt.xlabel('Meta (outer) iterations')
        plt.ylabel('Loss')
        plt.ylim([0.0, 5.0])
        plt.legend(['Train pre-adapt', 'Test pre-adapt', 'Train post-adapt', 'Test post-adapt'], loc=1)
        plt.savefig(plot_file_name, dpi=100)
    finally:
        # Release the figure; otherwise every call leaves one more open figure.
        plt.close(fig)

    
def measure_accuracy(task, model, train=False):
    """Return the classification accuracy (in percent) of ``model`` on ``task``.

    Args:
        task: mapping with ``x_train``/``y_train`` and ``x_val``/``y_val``.
        model: object whose ``predict(x, return_numpy=True)`` returns an array
            of per-class scores with one row per sample.
        train: evaluate on the training split when truthy, otherwise on the
            validation split.

    Returns:
        Percentage of samples whose arg-max prediction equals the label.

    Labels may be a torch tensor on any device or a NumPy array. Previously
    NumPy labels failed in both the ``try`` and the bare-``except`` fallback.
    """
    if train:
        x, y = task['x_train'], task['y_train']
    else:
        x, y = task['x_val'], task['y_val']
    y_hat = model.predict(x, return_numpy = True)
    batch_size = y.shape[0]
    predict_label = np.argmax(y_hat, axis=1)
    if isinstance(y, torch.Tensor):
        labels = y.detach().cpu().numpy()
    else:
        labels = np.asarray(y)
    correct = np.sum(predict_label == labels)
    return correct * 100.0 / batch_size

    
class DataLog:
    """Column-oriented experiment log that can be saved to / loaded from disk.

    ``self.log`` maps each key to the list of values logged under it, and
    ``self.max_len`` is the length of the longest of those lists (the number
    of rows written to the CSV file).
    """

    def __init__(self):
        self.log = {}
        self.max_len = 0

    def log_exp_args(self, parsed_args):
        """Log every attribute of ``parsed_args`` (e.g. an argparse namespace)."""
        args = vars(parsed_args) # makes it a dictionary
        for k in args.keys():
            self.log_kv(k, args[k])

    def log_kv(self, key, value):
        """Append ``value`` to the column ``key``, creating the column if needed."""
        if key not in self.log:
            self.log[key] = []
        self.log[key].append(value)
        self.max_len = max(self.max_len, len(self.log[key]))

    def save_log(self, save_path=None):
        """Write ``log.pickle`` and ``log.csv`` into directory ``save_path``.

        When ``save_path`` is None the most recent value logged under the key
        ``'save_dir'`` is used (``KeyError`` if nothing was logged under it).
        Columns shorter than ``max_len`` leave their trailing CSV cells empty.
        """
        save_path = self.log['save_dir'][-1] if save_path is None else save_path
        with open(save_path+'/log.pickle', 'wb') as pickle_file:
            pickle.dump(self.log, pickle_file)
        # newline='' is what the csv module expects; without it extra blank
        # lines can appear on platforms that translate line endings.
        with open(save_path+'/log.csv', 'w', newline='') as csv_file:
            fieldnames = self.log.keys()
            writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
            writer.writeheader()
            for row in range(self.max_len):
                row_dict = {}
                for key in self.log.keys():
                    if row < len(self.log[key]):
                        row_dict[key] = self.log[key][row]
                writer.writerow(row_dict)

    def get_current_log(self):
        """Return a dict with the most recent value of every column."""
        row_dict = {}
        for key in self.log.keys():
            row_dict[key] = self.log[key][-1]
        return row_dict

    def read_log(self, log_path):
        """Replace the in-memory log with the contents of the CSV at ``log_path``.

        Every cell is parsed with ``eval``; cells that cannot be evaluated
        (for example the empty cells of short columns) are skipped.
        ``max_len`` is recomputed so a later ``save_log`` writes every row.
        """
        with open(log_path, newline='') as csv_file:
            reader = csv.DictReader(csv_file)
            listr = list(reader)
            keys = reader.fieldnames or []  # fieldnames is None for an empty file
            data = {}
            for key in keys:
                data[key] = []
            for row in listr:
                for key in keys:
                    try:
                        # SECURITY: eval() executes arbitrary expressions found
                        # in the file. Only read logs you wrote yourself.
                        data[key].append(eval(row[key]))
                    except Exception:
                        # Best effort: unparsable / empty cells are dropped.
                        pass
        self.log = data
        self.max_len = max((len(values) for values in data.values()), default=0)

In [None]:
import numpy as np
import torch
import torch.nn as nn
import matplotlib.pyplot as plt
from   torch.nn import functional as F

class Learner:
    """Wraps a model with an inner (task) and an outer (meta) optimizer.

    The inner optimizer is plain SGD and is used by ``learn_on_data`` /
    ``learn_task``. The outer optimizer (Adam or SGD) is used by
    ``move_toward_target`` and ``outer_step_with_grad``. Both optimizers hold
    the same ``model.parameters()``.
    """

    def __init__(self, model, loss_function, inner_lr=1e-3, outer_lr=1e-2, GPU=False, inner_alg='gradient', outer_alg='adam'):
        """
        Args:
            model: torch module to be trained.
            loss_function: callable ``loss_function(yhat, y)`` returning a scalar tensor.
            inner_lr: learning rate of the inner SGD optimizer.
            outer_lr: learning rate of the outer optimizer.
            GPU: move the model (and inputs passed through ``to_device``) to CUDA.
            inner_alg: only ``'gradient'`` is supported.
            outer_alg: ``'adam'`` or ``'sgd'``.
        """
        self.model = model
        self.use_gpu = GPU
        if GPU:
            self.model.cuda()
        # The former `outer_alg == 'sgd' or 'adam'` was always true, so any
        # misspelt value silently selected SGD.
        assert outer_alg in ('sgd', 'adam'), "outer_alg must be 'sgd' or 'adam'"
        self.inner_opt = torch.optim.SGD(self.model.parameters(), lr=inner_lr)
        if outer_alg == 'adam':
            self.outer_opt = torch.optim.Adam(self.model.parameters(), lr=outer_lr, eps=1e-3)
        else:
            self.outer_opt = torch.optim.SGD(self.model.parameters(), lr=outer_lr)
        self.loss_function = loss_function
        assert inner_alg == 'gradient' # sqp unsupported in this version
        self.inner_alg = inner_alg

    def get_params(self):
        """Return a detached copy of all model parameters as one flat vector."""
        return torch.cat([param.data.view(-1) for param in self.model.parameters()], 0).clone()

    def set_params(self, param_vals):
        """Overwrite the model parameters from the flat vector ``param_vals``.

        The vector is consumed in ``model.parameters()`` order, i.e. the layout
        produced by ``get_params``.
        """
        offset = 0
        for param in self.model.parameters():
            param.data.copy_(param_vals[offset:offset + param.nelement()].view(param.size()))
            offset += param.nelement()

    def set_outer_lr(self, lr):
        """Set the learning rate of every outer-optimizer parameter group."""
        for param_group in self.outer_opt.param_groups:
            param_group['lr'] = lr

    def set_inner_lr(self, lr):
        """Set the learning rate of every inner-optimizer parameter group."""
        for param_group in self.inner_opt.param_groups:
            param_group['lr'] = lr

    def regularization_loss(self, w_0, lam=0.0):
        """
        Add a regularization loss onto the weights
        The proximal term regularizes around the point w_0
        Strength of regularization is lambda
        lambda can either be a scalar or a vector with one entry per parameter
        (numpy.ndarray or torch tensor, laid out like ``get_params``)

        Scalar lambda:  sum over parameters of 0.5 * lam * ||param - w_0||^2
        Vector lambda:  sum over parameters of 0.5 * ||(param - w_0) * lam||^2

        The scalar/vector decision used to be `type(lam) == float or np.float64`,
        which is always truthy, so the vector branch could never run.
        """
        is_vector_lam = isinstance(lam, (np.ndarray, torch.Tensor)) and lam.ndim > 0
        if is_vector_lam:
            # Match dtype (incl. float16) and device of the reference point so the
            # element-wise product below is valid on CPU and GPU alike.
            regu_lam = torch.as_tensor(lam, dtype=w_0.dtype, device=w_0.device).view(-1)
        elif w_0.dtype == torch.float16:
            regu_lam = lam.half() if isinstance(lam, torch.Tensor) else np.float16(lam)
        else:
            regu_lam = lam

        regu_loss = 0.0
        offset = 0
        for param in self.model.parameters():
            num_el = param.nelement()
            delta = param.view(-1) - w_0[offset:offset + num_el].view(-1)
            if is_vector_lam:
                # NOTE(review): the product is squared, so each weight is
                # effectively penalised with lam**2 - confirm this is intended.
                param_delta = delta * regu_lam[offset:offset + num_el]
                regu_loss += 0.5 * torch.sum(param_delta ** 2)
            else:
                regu_loss += 0.5 * regu_lam * torch.sum(delta ** 2)
            offset += num_el
        return regu_loss

    def get_loss(self, x, y, return_numpy=False):
        """
        Assume that x and y are torch tensors -- either in CPU or GPU (controlled externally)
        Returns the loss tensor, or its first element as a NumPy scalar when
        ``return_numpy`` is set.
        """
        yhat = self.model.forward(x)
        loss = self.loss_function(yhat, y)
        if return_numpy:
            loss = to_numpy(loss).ravel()[0]
        return loss

    def predict(self, x, return_numpy=False):
        """Run the model on ``x`` (converted via ``to_device``) and return its output."""
        yhat = self.model.forward(to_device(x, self.use_gpu))
        if return_numpy:
            yhat = to_numpy(yhat)
        return yhat

    def learn_on_data(self, x, y, num_steps=10,
                      add_regularization=False,
                      w_0=None, lam=0.0):
        """Take ``num_steps`` inner-optimizer steps on ``(x, y)``.

        With ``add_regularization`` the proximal term
        ``regularization_loss(w_0, lam)`` is added to the task loss.

        Returns:
            List with the (unregularized) task loss before each step, as NumPy values.
        """
        assert self.inner_alg == 'gradient' # or 'sqp' or 'adam' # TODO(Aravind): support sqp and adam
        train_loss = []
        for _ in range(num_steps):
            self.inner_opt.zero_grad()
            tloss = self.get_loss(x, y)
            loss = tloss + self.regularization_loss(w_0, lam) if add_regularization else tloss
            loss.backward()
            self.inner_opt.step()
            train_loss.append(to_numpy(tloss))

        return train_loss

    def learn_task(self, task, num_steps=10, add_regularization=False, w_0=None, lam=0.0):
        """``learn_on_data`` on the ``x_train`` / ``y_train`` entries of ``task``."""
        xt, yt = task['x_train'], task['y_train']
        return self.learn_on_data(xt, yt, num_steps, add_regularization, w_0, lam)

    def move_toward_target(self, target, lam=2.0):
        """
        Move slowly towards the target parameter value
        Default value for lam assumes learning rate determined by optimizer
        Useful for implementing Reptile
        """
        # we can implement this with the regularization loss, but regularize around the target point
        # and with specific choice of lam=2.0 to preserve the learning rate of inner_opt
        self.outer_opt.zero_grad()
        loss = self.regularization_loss(target, lam=lam)
        loss.backward()
        self.outer_opt.step()

    def outer_step_with_grad(self, grad, flat_grad=False):
        """
        Given the gradient, step with the outer optimizer using the gradient.
        Assumed that the gradient is a tuple/list of size compatible with model.parameters()
        If flat_grad, then the gradient is a flattened vector
        """
        # Make sure every parameter has a .grad buffer to copy into. This used to
        # be done by back-propagating a dummy zero loss through the whole model.
        for p in self.model.parameters():
            if p.grad is None:
                p.grad = torch.zeros_like(p.data)
        if flat_grad:
            offset = 0
            grad = to_device(grad, self.use_gpu)
            for p in self.model.parameters():
                this_grad = grad[offset:offset + p.nelement()].view(p.size())
                p.grad.copy_(this_grad)
                offset += p.nelement()
        else:
            for i, p in enumerate(self.model.parameters()):
                p.grad = grad[i]
        self.outer_opt.step()

    def matrix_evaluator(self, task, lam, regu_coef=1.0, lam_damping=10.0, x=None, y=None):
        """
        Constructor function that can be given to CG optimizer
        Works for both type(lam) == float and type(lam) == np.ndarray

        The returned callable maps a vector ``v`` to
        ``(1 + regu_coef) * v + H v / (lam + lam_damping)``, where ``H v`` comes
        from ``hessian_vector_product``.
        """
        if type(lam) == np.ndarray:
            lam = to_device(lam, self.use_gpu)
        def evaluator(v):
            hvp = self.hessian_vector_product(task, v, x=x, y=y)
            Av = (1.0 + regu_coef) * v + hvp / (lam + lam_damping)
            return Av
        return evaluator

    def hessian_vector_product(self, task, vector, params=None, x=None, y=None):
        """
        Performs hessian vector product on the train set in task with the provided vector

        Uses ``(x, y)`` instead of the task's training data when both are given.
        If ``params`` is given the model is first set to those parameters.
        Returns the product as one flat tensor.
        """
        if x is not None and y is not None:
            xt, yt = x, y
        else:
            xt, yt = task['x_train'], task['y_train']
        if params is not None:
            self.set_params(params)
        tloss = self.get_loss(xt, yt)
        # create_graph=True keeps the graph of the gradient so that it can be
        # differentiated a second time below.
        grad_ft = torch.autograd.grad(tloss, self.model.parameters(), create_graph=True)
        flat_grad = torch.cat([g.contiguous().view(-1) for g in grad_ft])
        vec = to_device(vector, self.use_gpu)
        h = torch.sum(flat_grad * vec)
        hvp = torch.autograd.grad(h, self.model.parameters())
        hvp_flat = torch.cat([g.contiguous().view(-1) for g in hvp])
        return hvp_flat


def make_fc_network(in_dim=1, out_dim=1, hidden_sizes=(40,40), float16=False):
    """Build a fully connected ReLU network as an ``nn.Sequential``.

    Layers are named ``fc_0, nl_0, fc_1, nl_1, ..., fc_k`` where ``k`` is
    ``len(hidden_sizes)``; the last linear layer has no non-linearity after it.
    Any number of hidden layers is supported (previously exactly two were
    built regardless of ``hidden_sizes``).

    Args:
        in_dim: number of input features.
        out_dim: number of output features.
        hidden_sizes: width of each hidden layer, in order.
        float16: return the model converted with ``.half()``.
    """
    model = nn.Sequential()
    prev_dim = in_dim
    for idx, width in enumerate(hidden_sizes):
        model.add_module('fc_%d' % idx, nn.Linear(prev_dim, width))
        model.add_module('nl_%d' % idx, nn.ReLU())
        prev_dim = width
    model.add_module('fc_%d' % len(hidden_sizes), nn.Linear(prev_dim, out_dim))
    return model.half() if float16 else model

    
def make_conv_network(in_channels, out_dim, task='Omniglot', filter_size=32):
    """Build the convolutional classifier for ``task``.

    For ``'Omniglot'``: four ``Conv2d(3x3, stride 2, padding 1)`` /
    ``BatchNorm2d`` / ``ReLU`` stages with 64 filters (a ``ZeroPad2d`` follows
    the second stage), then ``Flatten`` and a linear layer that expects
    ``2*2*64`` features. For ``'MiniImageNet'`` the model comes from
    ``model_imagenet_arch``.

    All conv layers and the final linear layer are then re-initialised with
    Xavier-uniform weights (gain 1.73) and biases drawn from U(0, 0.05).

    Args:
        in_channels: number of input image channels.
        out_dim: number of outputs of the final linear layer.
        task: ``'Omniglot'`` or ``'MiniImageNet'``.
        filter_size: number of filters passed to ``model_imagenet_arch``
            (unused for Omniglot).
    """
    # The former `task == 'Omniglot' or 'MiniImageNet'` was always true, so an
    # unknown task only failed later with an AttributeError on `model.conv1`.
    assert task in ('Omniglot', 'MiniImageNet'), "task must be 'Omniglot' or 'MiniImageNet'"
    model = nn.Sequential()

    if task == 'MiniImageNet':
        model = model_imagenet_arch(in_channels, out_dim, filter_size)

    elif task == 'Omniglot':
        num_filters = 64
        conv_stride = 2

        for stage in range(1, 5):
            stage_in = in_channels if stage == 1 else num_filters
            model.add_module('conv%d' % stage, nn.Conv2d(in_channels=stage_in, out_channels=num_filters,
                                                         kernel_size=3, stride=conv_stride, padding=1))
            model.add_module('BN%d' % stage, nn.BatchNorm2d(num_filters, track_running_stats=False))
            model.add_module('relu%d' % stage, nn.ReLU())
            if stage == 2:
                # Pads one row/column (right and bottom) after the second stage.
                model.add_module('pad2', nn.ZeroPad2d((0, 1, 0, 1)))
        model.add_module('flatten', Flatten())
        model.add_module('fc1', nn.Linear(2*2*num_filters, out_dim))

    for layer in [model.conv1, model.conv2, model.conv3, model.conv4, model.fc1]:
        torch.nn.init.xavier_uniform_(layer.weight, gain=1.73)
        if layer.bias is not None:
            torch.nn.init.uniform_(layer.bias, a=0.0, b=0.05)
        else:
            print("Bias layer not detected for layer:", layer)

    return model


class Flatten(nn.Module):
    """Module that collapses every dimension after the first into one."""

    def forward(self, x):
        """Return a view of ``x`` with shape ``(x.size(0), -1)``."""
        leading_dim = x.size(0)
        return x.view(leading_dim, -1)

    
def model_imagenet_arch(in_channels, out_dim, num_filters=32, batch_norm=True, bias=True):
    """Placeholder for the MiniImageNet architecture.

    Not implemented in this notebook: every call raises ``NotImplementedError``.
    """
    raise NotImplementedError()

In [4]:
# Train a MAML regressor on top of the combined network.
#
# This cell depends on `bb`, `simple_net` and `params` created by the setup
# cell at the top of the notebook. The output recorded below it
# ("NameError: name 'bb' is not defined") comes from running it before that
# cell - run the notebook top to bottom.

# Set the class-level `maml` flag on the backbone building blocks.
# NOTE(review): presumably this switches them to their MAML-specific layers;
# the flag is read inside `backbone`, which is not visible here - verify.
backbone.ConvBlock.maml = True
backbone.SimpleBlock.maml = True
backbone.BottleneckBlock.maml = True
backbone.ResNet.maml = True
# Rebuilt here, reusing the existing `bb` and `simple_net` instances.
# NOTE(review): `bb` was constructed before the flags above were set, so the
# flags only affect it if `backbone` reads them at call time rather than at
# construction time - confirm.
combined_network       = backbone.CombinedNetwork(bb, simple_net).cuda()
# `approx` is True only when params.method == 'maml_approx'.
model = MAML(combined_network, n_support=9, approx=(params.method == 'maml_approx'), problem = "regression").cuda()
# NOTE(review): only the backbone's parameters are handed to Adam; the
# parameters of `simple_net` are not in this optimizer - confirm intended.
optimizer = torch.optim.Adam([{'params': bb.parameters(), 'lr': 0.001}])
for epoch in range(params.stop_epoch):
    model.train_loop_regression(epoch, optimizer, nb_batch_of_batches = 16)

NameError: name 'bb' is not defined