In [1]:
from sklearn.metrics import balanced_accuracy_score
import numpy as np
import torch
import torch.nn as nn
import pandas as pd

In [2]:
df_test = pd.read_json("test_final.json")

In [16]:
df_synthetic_test = pd.read_json('synthetic_test.json')

In [4]:
df_test["lstm_pred"] = torch.load('lstm_pred.pt')
df_test["lstm_pred_label"] = torch.load('lstm_pred_label.pt')
df_test["bert_pred"] = torch.load('bert_pred.pt')
df_test["bert_pred_label"] = torch.load('bert_pred_label.pt')

In [17]:
df_synthetic_test["lstm_pred"] = torch.load('syn_lstm_pred.pt')
df_synthetic_test["lstm_pred_label"] = torch.load('syn_lstm_pred_label.pt')
df_synthetic_test["cnn_pred"] = torch.load('syn_cnn_pred.pt')
df_synthetic_test["cnn_pred_label"] = torch.load('syn_cnn_pred_label.pt')
df_synthetic_test["tansf_pred"] = torch.load('new_trans_pred.pt')
df_synthetic_test["transf_pred_label"] = torch.load('new_trans_pred_label.pt')

In [48]:
df_test["cnn_pred"] = torch.load('cnn_pred.pt')
df_test["cnn_pred_label"] = torch.load('cnn_pred_label.pt')

In [49]:
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using {device} device")

Using cuda device


In [15]:
checkpoint_path = '/data/sherin/checkpoint_lm/chkpt_lm_bert_wdcy_steplr_recall_best.pt.tar'

In [19]:
class MemNetwork(nn.Module):
    def __init__(self):
        super(MemNetwork, self).__init__()
        
        self.linear = nn.Sequential(
            nn.Linear(768, 768)
        )

    def forward(self, x, y):

        x_input = self.linear(x)
        op = torch.sum(x_input*y, dim=1)
        return op

In [23]:
# use this when the model state is saved
model = MemNetwork().to(device)
PATH = checkpoint_path
checkpoint = torch.load(PATH)
model.load_state_dict(checkpoint['model_state_dict'])
epoch = checkpoint['epoch']
loss = checkpoint['loss']
val_acc_bert = checkpoint['accuracy']

In [32]:
val_acc_bert

0.610660615917686

In [26]:
class NeuralNetwork(nn.Module):
    def __init__(self):
        super(NeuralNetwork, self).__init__()
        
        self.linear = nn.Sequential(
            nn.Linear(400, 400)
        )

    def forward(self, x, y):

        x_input = self.linear(x)
        op = torch.sum(x_input*y, dim=1)
        return op

In [27]:
checkpoint_path_lstm = '/data/sherin/checkpoint_lm/chkpt_lm_lstm_wdcy_steplr_recall_best.pt.tar'

In [30]:
# use this when the model state is saved
model_lstm = NeuralNetwork().to(device)
PATH = checkpoint_path_lstm
checkpoint_lstm = torch.load(PATH)
model_lstm.load_state_dict(checkpoint_lstm['model_state_dict'])
epoch = checkpoint_lstm['epoch']
loss = checkpoint_lstm['loss']
val_acc_lstm = checkpoint_lstm['accuracy']

In [31]:
val_acc_lstm

0.6045335981395695

In [51]:
val_acc_cnn = 0.586

In [12]:
val_acc_synth_lstm = 0.8162241887905605
val_acc_synth_cnn = 0.6882743362831858

In [36]:
val_acc_synth_transf = 0.7835763636625038

In [19]:
# computation of optimal tau from Repeat Before Forgetting paper 
# https://github.com/amirieb/RbF/blob/31f2608a3939a6a15d49a33c113065e1a4f73b65/rbf_keras/engine/training.py#L1268

def compute_optimal_tau(kern, avg_test_acc, y_true, y_pred, dist_test,
                        sequence_length_val, x):


    #test_accs = np.array(y_true.ravel()) & np.array(y_pred.ravel())
    #print(test_accs.shape)
    #test_accs = [0.1 if acc < 1. else 0.9 for acc in
    #             test_accs.squeeze().tolist()]
    # test accs now are continuous non-zero values

    test_accs = np.array(y_pred.ravel())
    test_accs = [0.001 if test_acc == 0 else test_acc for test_acc in test_accs]


    if kern == 'Gaussian':
        # throughout training - take average error
        # earlyon maybe a different model is better and maybe at the end a diff
        # model is good - good to capture
        # do this on the validation data

        # epochs1 - k use validation acc as strength of model
        # at teh end use test acc as strength of model
        # s and d normalize - s - 1 - 100  -> 0.01 - 1 d - 0.01 - 1
        # use validation acc instead of test acc - best validation acc
        # best epoch try all functions - both papers on val data
        # then do this for every epoch - val data
        # dont use test data to tune hyperparams
        # gaussian
        
        num = -1.0 * np.sum([np.log(a) for a in test_accs])
        den = np.sum(np.power(x, 2))

    if kern == "Laplacian":
        num = -1.0 * np.sum([np.log(a) for a in test_accs])
        den = np.sum(x)

    if kern == "Linear":
        num = np.sum([(1. - a) for a in test_accs])
        den = np.sum(x)

    if kern == "Cosine":
        num = np.sum([np.arccos(1. * a - 1.) for a in test_accs])
        den = np.pi * np.sum(x)

    if kern == "Quadratic":
        num = np.sum([(1. - a) for a in test_accs])
        den = np.sum(np.power(x, 2))

    if kern == "Secant":
        num = np.sum([np.log(1. / a + np.sqrt(1. / a - 1.)) for a in test_accs])
        den = np.sum(x)
        

    tau = num * 1.0 / den
    return tau, test_accs


In [20]:
def compute_l2_loss(tau, kern, test_accs, x):
    test_accs = np.array(test_accs)
    if kern == 'Gaussian':
        print("computing l2 loss")
        #f_gauss = np.exp(-1 * tau * np.sum(np.power(x, 2)))
        f_gauss = np.array([np.exp(-1 * tau * np.power(x_i, 2))for x_i in x])
        # test_acc b/w 0 and 1
        f_gauss_loss = np.mean(np.power((f_gauss - test_accs), 2))
        return f_gauss_loss

    if kern == "Laplacian":
        #f_lap = np.exp(-1 * tau * np.sum(x))
        f_lap = np.array([np.exp(-1 * tau * x_i) for x_i in x])
        # test_acc b/w 0 and 1
        f_lap_loss = np.mean(np.power((f_lap - test_accs), 2))
        return f_lap_loss

    if kern == "Linear":
        #f_lin = (1 - (1 * tau * np.sum(x)))
        f_lin = np.array([(1 - (tau*x_i)) for x_i in x])
        #f_lin = np.sum(1 - (tau * x))
        f_lin_loss = np.mean(np.power((f_lin - test_accs), 2))
        return f_lin_loss

    if kern == "Cosine":
        #f_cos = 1 / 2 * np.cos(tau * np.sum(x) * np.pi)
        f_cos = np.array([((0.5 * np.cos(tau * x_i * np.pi)) + 0.5) for x_i in x])
        f_cos_loss = np.mean(np.power((f_cos - test_accs), 2))
        return f_cos_loss

    if kern == "Quadratic":
        f_qua = np.array([(1 - (tau * np.power(x, 2))) for x_i in x])
        f_qua_loss = np.mean(np.power((f_qua - test_accs), 2))
        return f_qua_loss

    if kern == "Secant":
        #f_sec = 2 * 1.0 / (np.exp(-1 * tau * np.sum(np.power(x, 2))) + np.exp(
        #    1 * tau * np.sum(np.power(x, 2))))
        f_sec = np.array([(2*1.0 /(np.exp(-1 * tau * np.power(x_i, 2)) + np.exp(1 * tau * np.power(x, 2))) )for x_i in x])
        f_sec_loss = np.mean(np.power((f_sec - test_accs), 2))
        return f_sec_loss


In [21]:
def compute_loss_forgetting_functions(forgetting_function, avg_test_acc,
                                      dist_test, sequence_length_val, test_accs, val_acc):

    # difficulty = seq len; time elapsed since last review = dist; strength =
    # average accuracy.
    # exp(-seq_len*intervening_tokens/avg_test_acc)

    if forgetting_function == 'diff_dist_strength':
        x = [((s * d * 1.0) / ((val_acc+np.finfo(float).eps) * 100 * 100)) for s, d in
             zip(sequence_length_val, dist_test)]
        x = np.array(x)
        f_diff_dist_strength = np.exp(-x)
        f_diff_dist_strength_loss = np.mean(np.power
                                            ((f_diff_dist_strength - test_accs), 2))
        return f_diff_dist_strength_loss

    # exp(-seq_len*intervening_tokens)
    elif forgetting_function == 'diff_dist':
        x = [((s * d * 1.0) / (100 * 100)) for s, d in
             zip(sequence_length_val, dist_test)]
        x = np.array(x)
        f_diff_dist = np.exp(-x)
        f_diff_dist_loss = np.mean(np.power
                                   ((f_diff_dist - test_accs), 2))
        return f_diff_dist_loss

    # exp(-seq_len/avg_test_acc)
    elif forgetting_function == 'diff_strength':
        x = [((s * 1.0) / ((val_acc+np.finfo(float).eps) * 100 * 100)) for s, d in
             zip(sequence_length_val, dist_test)]
        x = np.array(x)
        f_diff_strength = np.exp(-x)
        f_diff_strength_loss = np.mean(np.power
                                       ((f_diff_strength - test_accs), 2))
        return f_diff_strength_loss

In [22]:
def kernel_matching(y_true, y_pred, dist_test, sequence_length_val,
                    y_pred_binary_pos_samples, x, val_acc):
    kernels = ['Gaussian', 'Laplacian', 'Linear', 'Cosine', 'Quadratic',
               'Secant']
    avg_test_acc = balanced_accuracy_score(y_true, y_pred_binary_pos_samples)
    print("computing optimal tau")
    kern_loss = []
    tau_kernels = []
    exp_forgetting_function_loss = []
    # compute x - seq_len*dist

    for kern in kernels:
        print("Kernel type is {}".format(kern))

        tau, test_accs = compute_optimal_tau(kern, avg_test_acc, y_true, y_pred,
                                             dist_test, sequence_length_val, x)
        tau_kernels.append(tau)
        print("optimal value of tau is {}".format(tau))
        l2_loss = compute_l2_loss(tau, kern, test_accs, x)
        print("L2 loss for kernel {} is {}".format(kern, l2_loss))
        kern_loss.append(l2_loss)
 
     # debug only
    # compute l2 loss for functions from Reddy et al paper
    exp_forgetting_functions = ['diff_dist_strength', 'diff_dist',
                                'diff_strength']
    
    #test_accs = np.array(y_true.ravel()) & np.array(y_pred.ravel())
    test_accs = np.array(y_pred.ravel())
    for exp_forgetting_function in exp_forgetting_functions:
        exp_forgetting_l2_loss = compute_loss_forgetting_functions(
            exp_forgetting_function, avg_test_acc, dist_test, sequence_length_val,
            test_accs, val_acc)
        exp_forgetting_function_loss.append(exp_forgetting_l2_loss)
        print("L2 loss for forgetting function {} is {}".format(exp_forgetting_function, exp_forgetting_l2_loss))


    # find the least loss
    min_index = kern_loss.index(min(kern_loss))
    print("The best kernel is {}".format(kernels[min_index]))
    print("the value of the loss is {}".format(min(kern_loss)))

    min_index_exp_forgetting_function = \
        exp_forgetting_function_loss.index(min(exp_forgetting_function_loss))
    print("The best forgetting function is {}".format(exp_forgetting_functions[min_index_exp_forgetting_function]))

    print("the value of the loss is {}".format(min(exp_forgetting_function_loss)))

    return kernels[min_index], tau_kernels[min_index]

In [56]:
# keep only the positive samples - 16901 samples
df_test_pos = df_test[df_test['label'] == 1]

In [57]:
def get_intervening_tokens(c, q, l):
    try:
        int_tok = 512 - c.split(" ").index(q) - 1
    except:
        int_tok = -1
    
    return int_tok

In [58]:
df_test_pos["int_tok"] = df_test_pos.apply(lambda x: get_intervening_tokens(x['context'], x['query'], x['context_length']), axis=1) 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [59]:
# extract the relevant columns and convert them into numpy array to reuse the synthetic code
target_val_pos_samples = df_test_pos['label'].to_numpy()

bert_y_pred_pos_samples = df_test_pos['bert_pred'].to_numpy()
lstm_y_pred_pos_samples = df_test_pos['lstm_pred'].to_numpy()
cnn_y_pred_pos_samples = df_test_pos['cnn_pred'].to_numpy()


dist_pos_samples = df_test_pos['int_tok'].to_numpy()
seq_len_test_pos_samples = df_test_pos['context_length'].to_numpy()

bert_y_pred_binary_pos_samples = df_test_pos['bert_pred_label'].to_numpy()
lstm_y_pred_binary_pos_samples = df_test_pos['lstm_pred_label'].to_numpy()
cnn_y_pred_binary_pos_samples = df_test_pos['cnn_pred_label'].to_numpy()


In [60]:
# compute x
# difficulty = seq len; time elapsed since last review = dist; strength =
# average accuracy.
# normalize s and d by dividing by 100
x_bert = [((s * d * 1.0) / ((val_acc_bert+np.finfo(float).eps) * 100 * 100)) for s, d in
         zip(seq_len_test_pos_samples, dist_pos_samples)]

x_lstm = [((s * d * 1.0) / ((val_acc_lstm+np.finfo(float).eps) * 100 * 100)) for s, d in
         zip(seq_len_test_pos_samples, dist_pos_samples)]
    
x_cnn = [((s * d * 1.0) / ((val_acc_cnn+np.finfo(float).eps) * 100 * 100)) for s, d in
         zip(seq_len_test_pos_samples, dist_pos_samples)]

In [42]:
# finding the best kernel to match the memory retention test
kernel, tau = kernel_matching(target_val_pos_samples, bert_y_pred_pos_samples,
                              dist_pos_samples, seq_len_test_pos_samples,
                              bert_y_pred_binary_pos_samples, x_bert, val_acc_bert)

computing optimal tau
Kernel type is Gaussian
optimal value of tau is 0.03264930054717812
computing l2 loss
L2 loss for kernel Gaussian is 0.6590702119650492
Kernel type is Laplacian
optimal value of tau is 0.5395132164006848
L2 loss for kernel Laplacian is 0.671934146396757
Kernel type is Linear
optimal value of tau is 0.022616831608982358
L2 loss for kernel Linear is 0.2531979153063546
Kernel type is Cosine
optimal value of tau is 0.04703716542737344
L2 loss for kernel Cosine is 0.4402867721655349
Kernel type is Quadratic
optimal value of tau is 0.0013686851594719262
L2 loss for kernel Quadratic is 0.3224170532340216
Kernel type is Secant
optimal value of tau is 0.541486383729324
L2 loss for kernel Secant is 0.6818845042600029
L2 loss for forgetting function diff_dist_strength is 0.6766405352510189
L2 loss for forgetting function diff_dist is 0.6738540179824547
L2 loss for forgetting function diff_strength is 0.2846763645712736
The best kernel is Linear
the value of the loss is 0.253

In [43]:
# finding the best kernel to match the memory retention test
kernel, tau = kernel_matching(target_val_pos_samples, lstm_y_pred_pos_samples,
                              dist_pos_samples, seq_len_test_pos_samples,
                              lstm_y_pred_binary_pos_samples, x_lstm, val_acc_lstm)

computing optimal tau
Kernel type is Gaussian
optimal value of tau is 0.006909447270277666
computing l2 loss
L2 loss for kernel Gaussian is 0.20768180854135618
Kernel type is Laplacian
optimal value of tau is 0.11533230768341526
L2 loss for kernel Laplacian is 0.18787674732125534
Kernel type is Linear
optimal value of tau is 0.040974960119359374
L2 loss for kernel Linear is 0.19452047453523882
Kernel type is Cosine
optimal value of tau is 0.05183361278558985
L2 loss for kernel Cosine is 0.21335240180492043
Kernel type is Quadratic
optimal value of tau is 0.0024547703244054276
L2 loss for kernel Quadratic is 0.3773132203972371
Kernel type is Secant
optimal value of tau is 0.13492258951837094
L2 loss for kernel Secant is 0.3171974889785323
L2 loss for forgetting function diff_dist_strength is 0.3009614907462594
L2 loss for forgetting function diff_dist is 0.2982823852944072
L2 loss for forgetting function diff_strength is 0.40519973640850815
The best kernel is Laplacian
the value of the 

In [None]:
len(df_test['query'].unique())

In [61]:
# finding the best kernel to match the memory retention test
kernel, tau = kernel_matching(target_val_pos_samples, cnn_y_pred_pos_samples,
                              dist_pos_samples, seq_len_test_pos_samples,
                              cnn_y_pred_binary_pos_samples, x_cnn, val_acc_cnn)

computing optimal tau
Kernel type is Gaussian
optimal value of tau is 0.024075917725875876
computing l2 loss
L2 loss for kernel Gaussian is 0.24260048350911195
Kernel type is Laplacian
optimal value of tau is 0.414584777856588
L2 loss for kernel Laplacian is 0.2479561775327922
Kernel type is Linear
optimal value of tau is 0.04851545321240991
L2 loss for kernel Linear is 0.2558018648411966
Kernel type is Cosine
optimal value of tau is 0.05702814756959793
L2 loss for kernel Cosine is 0.23302157758113093
Kernel type is Quadratic
optimal value of tau is 0.002817407011454751
L2 loss for kernel Quadratic is 0.5197170494377559
Kernel type is Secant
optimal value of tau is 0.42257829558560117
L2 loss for kernel Secant is 0.26058524508401376
L2 loss for forgetting function diff_dist_strength is 0.256449556657345
L2 loss for forgetting function diff_dist is 0.25319223200607516
L2 loss for forgetting function diff_strength is 0.6021174682773007
The best kernel is Cosine
the value of the loss is 0

In [23]:
ip_sequence_test = np.load('test_context_query.npy', allow_pickle=True)

In [24]:
int_tok = np.zeros(len(ip_sequence_test))
for i in range(len(ip_sequence_test)):
    a = np.where(ip_sequence_test[i][:-1] == ip_sequence_test[i][-1])
    if a[0].size == 0:
        int_tok[i] = -1 
        #print(len(a))
    else:
        int_tok[i] = len(ip_sequence_test[i][:-1]) - a[0][0] - 1

In [25]:
df_synthetic_test["int_tok"] = int_tok

In [26]:
df_synthetic_test_pos = df_synthetic_test[df_synthetic_test['target_val'] == 1]

In [31]:
df_synthetic_test_pos.head()

Unnamed: 0,index,seq_len,seq,rep_token_first_pos,query_token,target_val,lstm_pred,lstm_pred_label,cnn_pred,cnn_pred_label,tansf_pred,transf_pred_label,int_tok
84812,84812,45,[17 30 76 12 43 65 5 71 89 62 83 88 32 99 68 ...,33,98,1,1.0,1,0.335489,0,0.998704,1.0,11.0
163418,163418,84,[93 12 79 87 90 94 15 0 9 95 11 70 18 23 52 ...,22,85,1,1.0,1,0.984755,1,0.978039,1.0,61.0
154870,154870,80,[82 65 51 88 98 66 5 97 32 53 91 6 55 15 29 ...,54,20,1,0.9947405,1,0.974092,1,0.998642,1.0,25.0
128894,128894,67,[57 9 72 52 31 46 97 37 7 96 44 74 69 81 91 ...,9,96,1,7.574954e-18,0,0.936459,1,0.122071,0.0,57.0
165684,165684,85,[78 87 80 65 43 39 19 27 74 11 94 62 60 25 73 ...,9,11,1,1.0,1,0.695454,1,0.588875,1.0,75.0


In [33]:
# extract the relevant columns and convert them into numpy array to reuse the synthetic code
target_val_pos_samples = df_synthetic_test_pos['target_val'].to_numpy()

lstm_y_pred_pos_samples = df_synthetic_test_pos['lstm_pred'].to_numpy()
cnn_y_pred_pos_samples = df_synthetic_test_pos['cnn_pred'].to_numpy()
tansf_y_pred_pos_samples = df_synthetic_test_pos['tansf_pred'].to_numpy()

dist_pos_samples = df_synthetic_test_pos['int_tok'].to_numpy()
seq_len_test_pos_samples = df_synthetic_test_pos['seq_len'].to_numpy()

lstm_y_pred_binary_pos_samples = df_synthetic_test_pos['lstm_pred_label'].to_numpy()
cnn_y_pred_binary_pos_samples = df_synthetic_test_pos['cnn_pred_label'].to_numpy()
transf_y_pred_binary_pos_samples = df_synthetic_test_pos['transf_pred_label'].to_numpy()

In [34]:
# compute x
# difficulty = seq len; time elapsed since last review = dist; strength =
# average accuracy.
# normalize s and d by dividing by 100
#x_bert = [((s * d * 1.0) / ((val_acc_bert+np.finfo(float).eps) * 100 * 100)) for s, d in
#         zip(seq_len_test_pos_samples, dist_pos_samples)]

x_lstm_synth = [((s * d * 1.0) / ((val_acc_synth_lstm+np.finfo(float).eps) * 100 * 100)) for s, d in
         zip(seq_len_test_pos_samples, dist_pos_samples)]
    
x_cnn_synth = [((s * d * 1.0) / ((val_acc_synth_cnn+np.finfo(float).eps) * 100 * 100)) for s, d in
         zip(seq_len_test_pos_samples, dist_pos_samples)]

NameError: name 'val_acc_synth_lstm' is not defined

In [37]:
x_transf_synth = [((s * d * 1.0) / ((val_acc_synth_transf+np.finfo(float).eps) * 100 * 100)) for s, d in
         zip(seq_len_test_pos_samples, dist_pos_samples)]

In [21]:
# finding the best kernel to match the memory retention test
kernel, tau = kernel_matching(target_val_pos_samples, lstm_y_pred_pos_samples,
                              dist_pos_samples, seq_len_test_pos_samples,
                              lstm_y_pred_binary_pos_samples, x_lstm_synth, val_acc_synth_lstm)



computing optimal tau
Kernel type is Gaussian
optimal value of tau is 23.845543254239377
computing l2 loss
L2 loss for kernel Gaussian is 0.2575309079412811
Kernel type is Laplacian
optimal value of tau is 11.419156181330422
L2 loss for kernel Laplacian is 0.3563615088471858
Kernel type is Linear
optimal value of tau is 0.9655074939135232
L2 loss for kernel Linear is 0.1439239766526381
Kernel type is Cosine
optimal value of tau is 2.8856885101830296
L2 loss for kernel Cosine is 0.2198103798381932
Kernel type is Quadratic
optimal value of tau is 2.0161779331863743
L2 loss for kernel Quadratic is 0.22045687635941513
Kernel type is Secant
optimal value of tau is 11.589805745770215
L2 loss for kernel Secant is 0.32773486344682806
L2 loss for forgetting function diff_dist_strength is 0.1337452500329711
L2 loss for forgetting function diff_dist is 0.13480035937442192
L2 loss for forgetting function diff_strength is 0.18242147923892027
The best kernel is Linear
the value of the loss is 0.1439

In [22]:
# finding the best kernel to match the memory retention test
kernel, tau = kernel_matching(target_val_pos_samples, cnn_y_pred_pos_samples,
                              dist_pos_samples, seq_len_test_pos_samples,
                              cnn_y_pred_binary_pos_samples, x_cnn_synth, val_acc_synth_cnn)

computing optimal tau
Kernel type is Gaussian
optimal value of tau is 4.725299249063713
computing l2 loss
L2 loss for kernel Gaussian is 0.18777452761474392
Kernel type is Laplacian
optimal value of tau is 2.6835147928518195
L2 loss for kernel Laplacian is 0.16341907656093382
Kernel type is Linear
optimal value of tau is 1.2957197168992913
L2 loss for kernel Linear is 0.20218873979145927
Kernel type is Cosine
optimal value of tau is 2.528415213098354
L2 loss for kernel Cosine is 0.19375322844416693
Kernel type is Quadratic
optimal value of tau is 2.281583623675353
L2 loss for kernel Quadratic is 0.44167978694767923
Kernel type is Secant
optimal value of tau is 3.7186251360364784
L2 loss for kernel Secant is 0.3128883297610826
L2 loss for forgetting function diff_dist_strength is 0.13595279890471793
L2 loss for forgetting function diff_dist is 0.14146703354839776
L2 loss for forgetting function diff_strength is 0.1989300329202723
The best kernel is Laplacian
the value of the loss is 0.1

In [41]:
# finding the best kernel to match the memory retention test
kernel, tau = kernel_matching(target_val_pos_samples, tansf_y_pred_pos_samples,
                              dist_pos_samples, seq_len_test_pos_samples,
                              transf_y_pred_binary_pos_samples, x_transf_synth, val_acc_synth_transf)

computing optimal tau
Kernel type is Gaussian
optimal value of tau is 5.03393164004504
computing l2 loss
L2 loss for kernel Gaussian is 0.20707940090133176
Kernel type is Laplacian
optimal value of tau is 2.5110897725647376
L2 loss for kernel Laplacian is 0.18869507452724185
Kernel type is Linear
optimal value of tau is 1.1416449916109557
L2 loss for kernel Linear is 0.18402130330445773
Kernel type is Cosine
optimal value of tau is 2.760149001567948
L2 loss for kernel Cosine is 0.22296297873913878
Kernel type is Quadratic
optimal value of tau is 2.288632970337656
L2 loss for kernel Quadratic is 0.33977207070623794
Kernel type is Secant
optimal value of tau is 3.4954278831778263
L2 loss for kernel Secant is 0.27275157273665296
L2 loss for forgetting function diff_dist_strength is 0.13626779929915653
L2 loss for forgetting function diff_dist is 0.13259756722115076
L2 loss for forgetting function diff_strength is 0.15394165537219165
The best kernel is Linear
the value of the loss is 0.184