In [1]:
import math
import timeit
import warnings

import numpy as np
import torch
import torch.nn.functional as F
from scipy.special import softmax
from sklearn.exceptions import ConvergenceWarning
from sklearn.linear_model import LogisticRegression

warnings.simplefilter("always", ConvergenceWarning)

In [2]:
from maml.datasets.miniimagenet import MiniimagenetMetaDataset
from maml.models.gated_conv_net import ImpRegConvModel
from maml.models.conv_embedding_model import RegConvEmbeddingModel
from maml.logistic_regression_utils import logistic_regression_grad_with_respect_to_w, logistic_regression_hessian_with_respect_to_w, logistic_regression_mixed_derivatives_with_respect_to_w_then_to_X
from maml.algorithm import ImpRMAML_inner_algorithm

In [3]:
# Width hyperparameters; both are read again by the embedding-model cell.
modulation_mat_rank = 32
num_channels = 32

# Meta-dataset over the 'train' split: 5 classes per batch, 5 samples per class,
# 5 validation samples, 2 tasks per meta-batch, 5 batches in total.
dataset = MiniimagenetMetaDataset(
    root='data',
    img_side_len=84,
    num_classes_per_batch=5,
    num_samples_per_class=5,
    num_total_batches=5,
    num_val_samples=5,
    meta_batch_size=2,
    split='train',
    num_workers=4,
    device='cuda',
)

# Convolutional model whose input/output sizes are read off the dataset.
model = ImpRegConvModel(
    input_channels=dataset.input_size[0],
    output_size=dataset.output_size,
    num_channels=num_channels,
    modulation_mat_rank=modulation_mat_rank,
    img_side_len=dataset.input_size[1],
    use_max_pool=False,
    verbose=False,
)
# Kept as the last expression so the cell still displays the module repr.
model.to('cuda')

MiniImagenet train


ImpRegConvModel(
  (features): Sequential(
    (layer1_conv): Conv2d(3, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
    (layer1_bn): BatchNorm2d(32, eps=1e-05, momentum=0.001, affine=False, track_running_stats=True)
    (layer1_relu): ReLU(inplace=True)
    (layer2_conv): Conv2d(32, 64, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
    (layer2_bn): BatchNorm2d(64, eps=1e-05, momentum=0.001, affine=False, track_running_stats=True)
    (layer2_relu): ReLU(inplace=True)
    (layer3_conv): Conv2d(64, 128, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
    (layer3_bn): BatchNorm2d(128, eps=1e-05, momentum=0.001, affine=False, track_running_stats=True)
    (layer3_relu): ReLU(inplace=True)
    (layer4_conv): Conv2d(128, 256, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
    (layer4_bn): BatchNorm2d(256, eps=1e-05, momentum=0.001, affine=False, track_running_stats=True)
    (layer4_relu): ReLU(inplace=True)
  )
)

In [4]:
# Modulation matrix shape: (rank, num_channels * 8), i.e. (32, 256) with the
# values set in the previous cell.
modulation_mat_size = (modulation_mat_rank, num_channels * 8)
embedding_hidden_size = 256
embedding_num_layers = 2
num_conv_embedding_layer = 4

# Embedding network; called later on a training task to produce `modulation`.
embedding_model = RegConvEmbeddingModel(
    input_size=np.prod(dataset.input_size),
    output_size=dataset.output_size,
    modulation_mat_size=modulation_mat_size,
    hidden_size=embedding_hidden_size,
    num_layers=embedding_num_layers,
    convolutional=True,
    num_conv=num_conv_embedding_layer,
    num_channels=num_channels,
    rnn_aggregation=False,
    linear_before_rnn=False,
    embedding_pooling='max',
    batch_norm=True,
    avgpool_after_conv=True,
    num_sample_embedding=0,
    # NOTE(review): 'abcdf' looks like a placeholder path; presumably unused
    # when num_sample_embedding=0 — confirm.
    sample_embedding_file='abcdf',
    img_size=dataset.input_size,
    verbose=False,
    original_conv=False,
    modulation_mat_spec_norm=10000,
)
embedding_model.to('cuda')

In [5]:
slow_lr = 0.001

# Two parameter groups (main model and embedding model), both using slow_lr.
optimizer_specs = [
    {'params': model.parameters(), 'lr': slow_lr},
    {'params': embedding_model.parameters(), 'lr': slow_lr},
]

In [6]:
# Presumably the L2 penalty weight of the inner-loop problem (confirm); the
# algorithm constructed below receives the same value (2) as its `l2_lambda`.
l2_lambda = 2

In [7]:
# Cross-entropy loss; passed to the inner-loop algorithm and reused for the
# test-task loss further down.
loss_func = torch.nn.CrossEntropyLoss()

In [8]:
# Inner-loop adaptation algorithm. `l2_lambda` is taken from the variable
# defined in the cell above rather than repeating the literal, so the two
# cannot drift apart.
algorithm = ImpRMAML_inner_algorithm(
    inner_loss_func=loss_func,
    fast_lr=1,
    first_order=False,
    num_updates=5,
    inner_loop_grad_clip=1,
    inner_loop_soft_clip_slope=0,
    device='cuda',
    is_classification=True,
    is_momentum=False,
    gamma_momentum=0,
    l2_lambda=l2_lambda)

Momentum :  False 0


In [9]:
# Take only the first (train, test) meta-batch. next() raises StopIteration on
# an empty dataset instead of silently leaving both names undefined.
train_task_batch, test_task_batch = next(iter(dataset))

In [24]:
# Number of tasks in the meta-batch; used below to scale the test loss and the
# gradient injected at features_train.
batch_size = len(train_task_batch)

In [10]:
# Work with the first task of the meta-batch only.
train_task, test_task = train_task_batch[0], test_task_batch[0]

In [11]:
# Modulation computed by the embedding model from the training task; it is
# passed to both the inner-loop adaptation and the test forward pass below.
modulation = embedding_model(train_task, return_task_embedding=False)

In [12]:
# Run the inner loop on the training task. Besides the adapted weights it
# returns the training features plus the Hessian and mixed partials that the
# implicit-gradient cells below consume.
(
    adapted_params,
    features_train,
    train_hessian,
    train_mixed_partials,
    train_measurements_trajectory,
    info_dict,
) = algorithm.inner_loop_adapt(train_task, model, modulation)

In [25]:
# Forward the test task through the modulated model, then apply the adapted
# weights as a bias-free linear layer on top of the features.
features_test = model(batch=test_task.x, modulation=modulation)
test_pred_after_adapt = F.linear(features_test, weight=adapted_params)

# Test loss after adaptation, divided by the meta-batch size.
test_loss_after_adapt = loss_func(test_pred_after_adapt, test_task.y) / batch_size


In [55]:
# NumPy versions of the test features, labels and adapted weights, as inputs
# to the logistic-regression gradient helper.
X_test = features_test.detach().cpu().numpy()
y_test = test_task.y.cpu().numpy()
w = adapted_params.detach().cpu().numpy()
test_grad_w = logistic_regression_grad_with_respect_to_w(X_test, y_test, w)

# Reference path: explicit inverse followed by a matrix product. The next
# cells compare this against np.linalg.solve.
train_hessian_inverse = np.linalg.inv(train_hessian)
train_hessian_inv_test_grad2 = np.matmul(train_hessian_inverse, test_grad_w)

In [56]:
# Same quantity as train_hessian_inv_test_grad2, obtained by solving
# train_hessian @ x = test_grad_w directly instead of forming the inverse.
train_hessian_inv_test_grad = np.linalg.solve(train_hessian, test_grad_w)

In [57]:
# Sanity check: the solve-based and inverse-based results agree within
# np.allclose's default tolerances.
np.allclose(train_hessian_inv_test_grad, train_hessian_inv_test_grad2)

True

In [66]:
# Wall-clock time for 1000 repetitions of the direct solve
# (timeit is imported in the notebook's import cell).
start_time = timeit.default_timer()
for _repeat in range(1000):
    train_hessian_inv_test_grad = np.linalg.solve(train_hessian, test_grad_w)
elapsed = timeit.default_timer() - start_time

In [67]:
# Seconds taken by the 1000 np.linalg.solve calls timed in the previous cell.
print(elapsed)

0.3316672621294856


In [68]:
# Wall-clock time for 1000 repetitions of the explicit inverse + matmul path,
# for comparison with the solve-based timing
# (timeit is imported in the notebook's import cell).
start_time = timeit.default_timer()
for _repeat in range(1000):
    train_hessian_inverse = np.linalg.inv(train_hessian)
    train_hessian_inv_test_grad2 = np.matmul(train_hessian_inverse, test_grad_w)
elapsed = timeit.default_timer() - start_time

In [69]:
# Seconds taken by the 1000 inverse + matmul iterations timed in the previous cell.
print(elapsed)

0.8142814019229263


In [32]:
# Negated product of the transposed mixed partials with the solved vector
# train_hessian_inv_test_grad. Presumably the implicit gradient of the test
# loss with respect to the (flattened) training features — confirm.
test_grad_features_train = - np.matmul(train_mixed_partials.T, train_hessian_inv_test_grad)

In [35]:
# Display the full array (a column vector at this point, before the reshape
# in the next cell).
test_grad_features_train

array([[-5.86689944e-06],
       [ 2.28183341e-04],
       [ 1.69732800e-04],
       [ 1.56727825e-04],
       [ 2.06517664e-04],
       [-8.55849017e-04],
       [-2.68292633e-04],
       [-3.00482910e-04],
       [-5.79355809e-04],
       [ 1.11498035e-05],
       [ 7.94717999e-05],
       [ 7.46266562e-04],
       [-4.23774640e-04],
       [ 3.46629313e-04],
       [-4.74869609e-04],
       [-3.52933504e-04],
       [-8.91328415e-05],
       [-4.85059940e-04],
       [ 2.11538605e-04],
       [-3.06287788e-04],
       [-9.41656689e-04],
       [ 4.10770683e-04],
       [ 1.53474383e-04],
       [-5.01453948e-05],
       [-6.30378068e-04],
       [-6.19105683e-04],
       [-5.84916811e-04],
       [ 4.31313114e-05],
       [-5.07634198e-04],
       [ 2.61401210e-04],
       [-1.41763186e-05],
       [ 3.56409741e-04],
       [-2.98130823e-05],
       [ 5.20891965e-04],
       [ 1.02040137e-04],
       [ 5.70311611e-05],
       [ 3.37088523e-04],
       [ 4.03662845e-05],
       [-3.5

In [36]:
# Reshape the column-vector gradient to the layout of features_train so it can
# be passed as the upstream gradient in features_train.backward below.
# (The source must be test_grad_features_train itself; no variable named
# test_grad_features_test is defined anywhere in this notebook.)
test_grad_features_train = test_grad_features_train.reshape(features_train.shape)

In [39]:
# Shape after the reshape; it should equal features_train.shape.
test_grad_features_train.shape

(25, 33)

In [44]:
# Back-propagate the implicit gradient through features_train into the model
# parameters. The gradient tensor is built with the dtype and device of
# features_train, so it does not depend on a hard-coded 'cuda' or on an
# implicit dtype cast (the array comes from NumPy routines and may be float64
# — confirm).
# retain_graph=True keeps the graph alive for the test-loss backward below.
# NOTE: re-running this cell accumulates into .grad a second time.
grad_wrt_features_train = torch.tensor(
    test_grad_features_train,
    dtype=features_train.dtype,
    device=features_train.device,
)
features_train.backward(
    gradient=grad_wrt_features_train / batch_size,
    retain_graph=True,
    create_graph=False,
)

In [49]:
# Gradient on the first conv layer's weights after the features_train.backward
# call above.
model.features.layer1_conv.weight.grad

tensor([[[[-1.9992e-03, -3.3771e-03, -4.5705e-03],
          [-4.3941e-03, -5.3211e-03, -6.2224e-03],
          [-6.3419e-03, -6.1599e-03, -5.2049e-03]],

         [[-1.4139e-03, -2.6787e-03, -4.0073e-03],
          [-3.3102e-03, -4.1965e-03, -5.2476e-03],
          [-5.0431e-03, -4.6816e-03, -4.0324e-03]],

         [[ 1.2603e-03, -4.9566e-04, -1.8361e-03],
          [-4.4502e-04, -1.8278e-03, -2.9421e-03],
          [-2.0906e-03, -2.2726e-03, -1.7270e-03]]],


        [[[ 1.3743e-04,  1.9117e-03,  3.7519e-04],
          [-2.2992e-04,  1.6246e-04,  8.1934e-04],
          [-6.0558e-05, -2.6684e-04,  6.1135e-04]],

         [[-1.1093e-03,  7.0195e-04, -5.7002e-04],
          [-1.4990e-03, -9.9679e-04, -1.8907e-04],
          [-1.1694e-03, -1.2289e-03, -1.3209e-04]],

         [[-4.0867e-04,  1.3512e-03,  9.2424e-05],
          [-6.0050e-04, -5.8705e-05,  7.3351e-04],
          [-4.1673e-04, -3.7217e-04,  4.5850e-04]]],


        [[[-5.4086e-04, -7.5547e-04,  3.7638e-04],
          [ 8.4

In [50]:
# Back-propagate the scaled test loss. retain_graph=False frees the graph, so
# this cell cannot be re-run without first recomputing test_loss_after_adapt.
test_loss_after_adapt.backward(retain_graph=False, create_graph=False)

In [51]:
# Same gradient tensor after the test-loss backward pass has been accumulated
# on top of the earlier contribution.
model.features.layer1_conv.weight.grad

tensor([[[[-4.6484e-03, -1.3440e-02,  2.8648e-03],
          [ 4.6435e-03, -3.9491e-03, -6.6009e-03],
          [ 1.5197e-02, -1.8413e-03, -2.6866e-03]],

         [[ 2.8611e-02,  1.5717e-02,  2.0388e-02],
          [ 3.6953e-02,  2.4397e-02,  1.2396e-02],
          [ 4.4305e-02,  2.3166e-02,  1.5496e-02]],

         [[ 2.0726e-02,  1.7660e-03,  5.0022e-03],
          [ 2.0838e-02,  5.9331e-03, -7.1890e-03],
          [ 2.4863e-02,  3.7700e-03, -4.5385e-03]]],


        [[[-4.4765e-02, -3.7990e-02, -1.7611e-02],
          [-5.1342e-02, -4.0510e-02, -3.4405e-02],
          [-5.0090e-02, -3.7025e-02, -2.8815e-02]],

         [[-2.3626e-02, -2.1135e-02, -8.5455e-03],
          [-2.4660e-02, -1.8697e-02, -1.7827e-02],
          [-2.6762e-02, -1.7995e-02, -1.2547e-02]],

         [[-1.3327e-02, -1.1032e-02,  9.8651e-04],
          [-1.5912e-02, -1.0534e-02, -1.2719e-02],
          [-1.7798e-02, -1.0051e-02, -8.9536e-03]]],


        [[[-3.2838e-02, -3.1895e-02, -1.4920e-02],
          [-4.0