# TREC-6

## Libraries

In [1]:
import os
import sys
sys.path.insert(0, os.path.abspath("../.."))

In [2]:
import io
import re
import pickle
from timeit import default_timer as timer

from tqdm.notebook import tqdm

import numpy as np

import torch
import torch.nn as nn
import torch.optim as optim

from datasets import load_dataset, Dataset, concatenate_datasets
from transformers import AutoTokenizer
from transformers import BertModel
from transformers.data.data_collator import DataCollatorWithPadding

from ax import optimize
from ax.plot.contour import plot_contour
from ax.plot.trace import optimization_trace_single_method
from ax.service.managed_loop import optimize
from ax.utils.notebook.plotting import render, init_notebook_plotting

import esntorch.core.reservoir as res
import esntorch.core.learning_algo as la
import esntorch.core.merging_strategy as ms
import esntorch.core.esn as esn

In [3]:
# Disable Jedi-based tab completion in IPython
%config Completer.use_jedi = False
# Load the autoreload extension; mode 2 reloads all modules before each cell is executed
%load_ext autoreload
%autoreload 2

In [4]:
# Pick the compute device: the CUDA GPU when torch can see one, the CPU otherwise
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
device

device(type='cuda')

In [5]:
# Global seed constant; presumably meant to make stochastic steps reproducible — verify where it is consumed
SEED = 42

## Global variables

In [6]:
# Passed to the ESN as its 'merging_strategy' and embedded in the results file name
pooling_strategy = 'mean' # 'mean' or 'last'

In [7]:
# Output / cache locations. The defaults are the original machine-specific paths; they can be
# overridden without editing the notebook through the ESN_RESULTS_PATH and ESN_CACHE_DIR
# environment variables.
RESULTS_PATH = os.environ.get('ESN_RESULTS_PATH', '/raid/home/jeremiec/Ax_results/ESN_v3')
CACHE_DIR = os.environ.get('ESN_CACHE_DIR', '/raid/home/jeremiec/huggingface_datasets')
# The results file name encodes the pooling strategy selected above
RESULTS_FILE = f'trec-6_{pooling_strategy}.pkl'

In [8]:
# Display the full path of the pickle file the grid-search results are written to
os.path.join(RESULTS_PATH, RESULTS_FILE)

'/raid/home/jeremiec/Ax_results/ESN_v3/trec-6_mean.pkl'

## Dataset

In [9]:
# NOTE: which column gets renamed to 'labels' depends on the dataset you load

def tokenize(sample):
    """Tokenize the ``'text'`` field of ``sample`` with the module-level ``tokenizer``.

    Truncation is enabled, no padding is applied, and the tokenizer is asked to
    return the sequence length alongside its usual outputs.
    Returns whatever the tokenizer call returns.
    """
    return tokenizer(
        sample['text'],
        truncation=True,
        padding=False,
        return_length=True,
    )
    
def load_and_enrich_dataset(dataset_name, split, cache_dir, label_column='label-coarse'):
    """
    Load a dataset from the datasets library of HuggingFace.
    Tokenize and add length.

    Parameters
    ----------
    dataset_name : name forwarded to ``load_dataset``.
    split : split forwarded to ``load_dataset`` (e.g. 'train', 'test').
    cache_dir : directory forwarded to ``load_dataset`` as its cache directory.
    label_column : column renamed to 'labels'. Defaults to 'label-coarse';
        use 'label-fine' for fine-grained labels.

    Returns
    -------
    The tokenized dataset, formatted as torch tensors and exposing the columns
    'input_ids', 'attention_mask', 'labels' and 'lengths'.
    """

    # Load dataset, honouring the caller-supplied cache directory
    # (the global CACHE_DIR used to be read here, silently ignoring the argument)
    dataset = load_dataset(dataset_name, split=split, cache_dir=cache_dir)

    # Rename the chosen label column to 'labels'
    dataset = dataset.rename_column(label_column, 'labels')

    # Tokenize data; `tokenize` requests the length, which is exposed as 'lengths'
    dataset = dataset.map(tokenize, batched=True)
    dataset = dataset.rename_column('length', 'lengths')
    dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels', 'lengths'])

    return dataset

In [10]:
# tokenizer (same checkpoint name as ESN_PARAMS['embedding_weights'])
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

# full train set, sorted by sequence length
full_train_dataset = load_and_enrich_dataset('trec', split='train', cache_dir=CACHE_DIR).sort("lengths")

# 80/20 train/validation split; seeded with SEED so the split is identical across runs
train_val_datasets = full_train_dataset.train_test_split(train_size=0.8, shuffle=True, seed=SEED)
train_dataset = train_val_datasets['train'].sort("lengths")
val_dataset = train_val_datasets['test'].sort("lengths")

# test set, sorted by sequence length
test_dataset = load_and_enrich_dataset('trec', split='test', cache_dir=CACHE_DIR).sort("lengths")

dataset_d = {
    'full_train': full_train_dataset,
    'train': train_dataset,
    'val': val_dataset,
    'test': test_dataset
    }

# dataloaders: a single shared collator pads every batch, one dataloader per dataset
collator = DataCollatorWithPadding(tokenizer)
dataloader_d = {
    split_name: torch.utils.data.DataLoader(split_dataset, batch_size=256, collate_fn=collator)
    for split_name, split_dataset in dataset_d.items()
}

Using custom data configuration default
Reusing dataset trec (/raid/home/jeremiec/huggingface_datasets/trec/default/1.1.0/751da1ab101b8d297a3d6e9c79ee9b0173ff94c4497b75677b59b61d5467a9b9)
Loading cached processed dataset at /raid/home/jeremiec/huggingface_datasets/trec/default/1.1.0/751da1ab101b8d297a3d6e9c79ee9b0173ff94c4497b75677b59b61d5467a9b9/cache-848d2c8f39949357.arrow
Loading cached sorted indices for dataset at /raid/home/jeremiec/huggingface_datasets/trec/default/1.1.0/751da1ab101b8d297a3d6e9c79ee9b0173ff94c4497b75677b59b61d5467a9b9/cache-b41d7661aa7925e9.arrow
Loading cached split indices for dataset at /raid/home/jeremiec/huggingface_datasets/trec/default/1.1.0/751da1ab101b8d297a3d6e9c79ee9b0173ff94c4497b75677b59b61d5467a9b9/cache-9441de5c9d7aee43.arrow and /raid/home/jeremiec/huggingface_datasets/trec/default/1.1.0/751da1ab101b8d297a3d6e9c79ee9b0173ff94c4497b75677b59b61d5467a9b9/cache-6ef7be09f24a2150.arrow
Loading cached sorted indices for dataset at /raid/home/jeremiec/hu

In [11]:
# Display the features and row counts of every dataset split
dataset_d

{'full_train': Dataset({
     features: ['attention_mask', 'input_ids', 'label-fine', 'labels', 'lengths', 'text', 'token_type_ids'],
     num_rows: 5452
 }),
 'train': Dataset({
     features: ['attention_mask', 'input_ids', 'label-fine', 'labels', 'lengths', 'text', 'token_type_ids'],
     num_rows: 4361
 }),
 'val': Dataset({
     features: ['attention_mask', 'input_ids', 'label-fine', 'labels', 'lengths', 'text', 'token_type_ids'],
     num_rows: 1091
 }),
 'test': Dataset({
     features: ['attention_mask', 'input_ids', 'label-fine', 'labels', 'lengths', 'text', 'token_type_ids'],
     num_rows: 500
 })}

## Grid search and results

In [12]:
# Keyword arguments for esn.EchoStateNetwork. Entries left as None for reservoir_dim,
# spectral_radius, mode and seed are filled in by the grid-search loop before each model
# is built; the learning algorithm is attached to the model after construction.
ESN_PARAMS = dict(
    embedding_weights='bert-base-uncased',  # name of the pretrained embedding checkpoint
    distribution='uniform',                 # 'uniform' or 'gaussian'
    input_dim=768,                          # dimension of the input encoding
    reservoir_dim=None,                     # set by the grid search
    bias_scaling=0.0,
    sparsity=0.9,
    spectral_radius=None,                   # set by the grid search
    leaking_rate=0.5,
    activation_function='tanh',
    input_scaling=0.1,
    mean=0.0,
    std=1.0,
    learning_algo=None,                     # assigned on the model after construction
    criterion=None,
    optimizer=None,
    merging_strategy=pooling_strategy,      # 'mean' or 'last', see pooling_strategy
    lexicon=None,
    bidirectional=False,
    mode=None,                              # set by the grid search ('esn' or 'linear_layer')
    device=device,
    seed=None,                              # set by the grid search
)

In [13]:
def warm_up(ESN, dataset_d, nb_sentences=10):
    """Warm up the ESN on the first sentences of the training set.

    The first ``nb_sentences`` samples of ``dataset_d["train"]`` are fed one at a
    time (batch size 1) to ``ESN.warm_up``, in dataset order.

    Parameters
    ----------
    ESN : model exposing a ``warm_up(batch)`` method.
    dataset_d : dict of datasets; only the ``"train"`` entry is read.
    nb_sentences : number of leading training sentences to use (default 10).
        Clamped to the size of the training set; nothing is done when it is <= 0.
    """

    train_set = dataset_d["train"]

    # Never request more sentences than the training set actually holds
    nb_sentences = min(nb_sentences, len(train_set))
    if nb_sentences <= 0:
        return

    # One dataloader over the selected sentences; with batch_size=1 it yields the
    # same single-sentence batches, in the same order, as one loader per sentence.
    warm_up_loader = torch.utils.data.DataLoader(train_set.select(range(nb_sentences)),
                                                 batch_size=1,
                                                 collate_fn=DataCollatorWithPadding(tokenizer))

    for batch in warm_up_loader:
        ESN.warm_up(batch)

In [14]:
# Grid search over (reservoir_dim, spectral_radius, alpha).
# For every grid point, each mode is trained and evaluated once per seed; the summary is
# stored as
#   results_d[reservoir_dim][(spectral_radius, alpha)][mode] =
#       [time_mean, time_std, val_acc_mean, val_acc_std, test_acc_mean, test_acc_std]
results_d = {}

reservoir_dim_grid = [500, 1000, 3000, 5000]
spectral_radius_grid = [0.5, 1.0, 1.5]
alpha_grid = [0.1, 1.0, 10.0, 100.0]
seed_list = [888, 42, 19937456, 7, 1979]
mode_list = ['esn', 'linear_layer']
metric_names = ['training_time', 'val_acc', 'test_acc']

for reservoir_dim in tqdm(reservoir_dim_grid):
    print('\nreservoir_dim', reservoir_dim)

    results_d[reservoir_dim] = {}

    for spectral_radius in tqdm(spectral_radius_grid):
        print('\nspectral_radius', spectral_radius)

        for alpha in tqdm(alpha_grid):
            print('\nalpha', alpha)

            results_d[reservoir_dim][(spectral_radius, alpha)] = {}

            # Per-grid-point scratch storage: one list per metric and per mode,
            # each receiving one value per seed
            print('\nNEW TMP DICT')
            tmp = {m: {metric: [] for metric in metric_names} for m in mode_list}

            for seed in tqdm(seed_list):

                for mode in tqdm(mode_list):
                    print('\nmode', mode)

                    # model: the shared parameter dict is updated in place, then unpacked
                    ESN_PARAMS.update(reservoir_dim=reservoir_dim,
                                      spectral_radius=spectral_radius,
                                      mode=mode,
                                      seed=seed)
                    print('\nPARAMS', ESN_PARAMS)
                    ESN = esn.EchoStateNetwork(**ESN_PARAMS)
                    ESN.learning_algo = la.RidgeRegression(alpha = alpha)
                    ESN = ESN.to(device)

                    # warm up (only done in 'esn' mode)
                    if mode == 'esn':
                        warm_up(ESN, dataset_d)

                    # fit on the train split, score on the validation split
                    # (element [1] of predict's result is what gets recorded as accuracy)
                    LOSS = ESN.fit(dataloader_d["train"])
                    val_acc = ESN.predict(dataloader_d["val"], verbose=False)[1].item()

                    # refit on the full train set (this fit is the one being timed),
                    # then score on the test set
                    t0 = timer()
                    LOSS = ESN.fit(dataloader_d["full_train"])
                    t1 = timer()
                    test_acc = ESN.predict(dataloader_d["test"], verbose=False)[1].item()

                    for metric, value in zip(metric_names, (t1-t0, val_acc, test_acc)):
                        tmp[mode][metric].append(value)

                    # clean objects and release cached GPU memory before the next run
                    del ESN.learning_algo, ESN.criterion, ESN.merging_strategy, ESN
                    torch.cuda.empty_cache()

            # Aggregate the per-seed values of this grid point into mean / std per mode
            for mode in mode_list:

                print(f'\nMODE {mode}: 3 tmp lists...')
                for metric in metric_names:
                    print(tmp[mode][metric])

                time_mean, time_std = (np.mean(tmp[mode]['training_time']),
                                       np.std(tmp[mode]['training_time']))

                val_acc_mean, val_acc_std = (np.mean(tmp[mode]['val_acc']),
                                             np.std(tmp[mode]['val_acc']))

                test_acc_mean, test_acc_std = (np.mean(tmp[mode]['test_acc']),
                                               np.std(tmp[mode]['test_acc']))

                results_d[reservoir_dim][(spectral_radius, alpha)][mode] = [
                    time_mean, time_std,
                    val_acc_mean, val_acc_std,
                    test_acc_mean, test_acc_std,
                ]

  0%|          | 0/4 [00:00<?, ?it/s]


reservoir_dim 500


  0%|          | 0/3 [00:00<?, ?it/s]


spectral_radius 0.5


  0%|          | 0/4 [00:00<?, ?it/s]


alpha 0.1

NEW TMP DICT


  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]


mode esn

PARAMS {'embedding_weights': 'bert-base-uncased', 'distribution': 'uniform', 'input_dim': 768, 'reservoir_dim': 500, 'bias_scaling': 0.0, 'sparsity': 0.9, 'spectral_radius': 0.5, 'leaking_rate': 0.5, 'activation_function': 'tanh', 'input_scaling': 0.1, 'mean': 0.0, 'std': 1.0, 'learning_algo': None, 'criterion': None, 'optimizer': None, 'merging_strategy': 'mean', 'lexicon': None, 'bidirectional': False, 'mode': 'esn', 'device': device(type='cuda'), 'seed': 888}


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Model downloaded: bert-base-uncased

mode linear_layer

PARAMS {'embedding_weights': 'bert-base-uncased', 'distribution': 'uniform', 'input_dim': 768, 'reservoir_dim': 500, 'bias_scaling': 0.0, 'sparsity': 0.9, 'spectral_radius': 0.5, 'leaking_rate': 0.5, 'activation_function': 'tanh', 'input_scaling': 0.1, 'mean': 0.0, 'std': 1.0, 'learning_algo': None, 'criterion': None, 'optimizer': None, 'merging_strategy': 'mean', 'lexicon': None, 'bidirectional': False, 'mode': 'linear_layer', 'device': device(type='cuda'), 'seed': 888}


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Model downloaded: bert-base-uncased


  0%|          | 0/2 [00:00<?, ?it/s]


mode esn

PARAMS {'embedding_weights': 'bert-base-uncased', 'distribution': 'uniform', 'input_dim': 768, 'reservoir_dim': 500, 'bias_scaling': 0.0, 'sparsity': 0.9, 'spectral_radius': 0.5, 'leaking_rate': 0.5, 'activation_function': 'tanh', 'input_scaling': 0.1, 'mean': 0.0, 'std': 1.0, 'learning_algo': None, 'criterion': None, 'optimizer': None, 'merging_strategy': 'mean', 'lexicon': None, 'bidirectional': False, 'mode': 'esn', 'device': device(type='cuda'), 'seed': 42}


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Model downloaded: bert-base-uncased

mode linear_layer

PARAMS {'embedding_weights': 'bert-base-uncased', 'distribution': 'uniform', 'input_dim': 768, 'reservoir_dim': 500, 'bias_scaling': 0.0, 'sparsity': 0.9, 'spectral_radius': 0.5, 'leaking_rate': 0.5, 'activation_function': 'tanh', 'input_scaling': 0.1, 'mean': 0.0, 'std': 1.0, 'learning_algo': None, 'criterion': None, 'optimizer': None, 'merging_strategy': 'mean', 'lexicon': None, 'bidirectional': False, 'mode': 'linear_layer', 'device': device(type='cuda'), 'seed': 42}


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Model downloaded: bert-base-uncased


  0%|          | 0/2 [00:00<?, ?it/s]


mode esn

PARAMS {'embedding_weights': 'bert-base-uncased', 'distribution': 'uniform', 'input_dim': 768, 'reservoir_dim': 500, 'bias_scaling': 0.0, 'sparsity': 0.9, 'spectral_radius': 0.5, 'leaking_rate': 0.5, 'activation_function': 'tanh', 'input_scaling': 0.1, 'mean': 0.0, 'std': 1.0, 'learning_algo': None, 'criterion': None, 'optimizer': None, 'merging_strategy': 'mean', 'lexicon': None, 'bidirectional': False, 'mode': 'esn', 'device': device(type='cuda'), 'seed': 19937456}


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Model downloaded: bert-base-uncased

mode linear_layer

PARAMS {'embedding_weights': 'bert-base-uncased', 'distribution': 'uniform', 'input_dim': 768, 'reservoir_dim': 500, 'bias_scaling': 0.0, 'sparsity': 0.9, 'spectral_radius': 0.5, 'leaking_rate': 0.5, 'activation_function': 'tanh', 'input_scaling': 0.1, 'mean': 0.0, 'std': 1.0, 'learning_algo': None, 'criterion': None, 'optimizer': None, 'merging_strategy': 'mean', 'lexicon': None, 'bidirectional': False, 'mode': 'linear_layer', 'device': device(type='cuda'), 'seed': 19937456}


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Model downloaded: bert-base-uncased


  0%|          | 0/2 [00:00<?, ?it/s]


mode esn

PARAMS {'embedding_weights': 'bert-base-uncased', 'distribution': 'uniform', 'input_dim': 768, 'reservoir_dim': 500, 'bias_scaling': 0.0, 'sparsity': 0.9, 'spectral_radius': 0.5, 'leaking_rate': 0.5, 'activation_function': 'tanh', 'input_scaling': 0.1, 'mean': 0.0, 'std': 1.0, 'learning_algo': None, 'criterion': None, 'optimizer': None, 'merging_strategy': 'mean', 'lexicon': None, 'bidirectional': False, 'mode': 'esn', 'device': device(type='cuda'), 'seed': 7}


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Model downloaded: bert-base-uncased

mode linear_layer

PARAMS {'embedding_weights': 'bert-base-uncased', 'distribution': 'uniform', 'input_dim': 768, 'reservoir_dim': 500, 'bias_scaling': 0.0, 'sparsity': 0.9, 'spectral_radius': 0.5, 'leaking_rate': 0.5, 'activation_function': 'tanh', 'input_scaling': 0.1, 'mean': 0.0, 'std': 1.0, 'learning_algo': None, 'criterion': None, 'optimizer': None, 'merging_strategy': 'mean', 'lexicon': None, 'bidirectional': False, 'mode': 'linear_layer', 'device': device(type='cuda'), 'seed': 7}


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Model downloaded: bert-base-uncased


  0%|          | 0/2 [00:00<?, ?it/s]


mode esn

PARAMS {'embedding_weights': 'bert-base-uncased', 'distribution': 'uniform', 'input_dim': 768, 'reservoir_dim': 500, 'bias_scaling': 0.0, 'sparsity': 0.9, 'spectral_radius': 0.5, 'leaking_rate': 0.5, 'activation_function': 'tanh', 'input_scaling': 0.1, 'mean': 0.0, 'std': 1.0, 'learning_algo': None, 'criterion': None, 'optimizer': None, 'merging_strategy': 'mean', 'lexicon': None, 'bidirectional': False, 'mode': 'esn', 'device': device(type='cuda'), 'seed': 1979}


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Model downloaded: bert-base-uncased

mode linear_layer

PARAMS {'embedding_weights': 'bert-base-uncased', 'distribution': 'uniform', 'input_dim': 768, 'reservoir_dim': 500, 'bias_scaling': 0.0, 'sparsity': 0.9, 'spectral_radius': 0.5, 'leaking_rate': 0.5, 'activation_function': 'tanh', 'input_scaling': 0.1, 'mean': 0.0, 'std': 1.0, 'learning_algo': None, 'criterion': None, 'optimizer': None, 'merging_strategy': 'mean', 'lexicon': None, 'bidirectional': False, 'mode': 'linear_layer', 'device': device(type='cuda'), 'seed': 1979}


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Model downloaded: bert-base-uncased

MODE esn: 3 tmp lists...
[3.7147483257576823, 3.8625135645270348, 4.062860160134733, 3.760483148507774, 3.6797985695302486]
[81.11824035644531, 84.87625885009766, 82.951416015625, 82.30980682373047, 83.31805419921875]
[89.80000305175781, 89.20000457763672, 89.60000610351562, 89.80000305175781, 89.00000762939453]

MODE linear_layer: 3 tmp lists...
[3.8009906467050314, 3.8913081604987383, 3.9171852627769113, 3.806072784587741, 3.7440578509122133]
[80.84326171875, 83.40971374511719, 83.22639465332031, 82.67644500732422, 83.50137329101562]
[90.00000762939453, 89.00000762939453, 89.80000305175781, 89.20000457763672, 89.4000015258789]

alpha 1.0

NEW TMP DICT


  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]


mode esn

PARAMS {'embedding_weights': 'bert-base-uncased', 'distribution': 'uniform', 'input_dim': 768, 'reservoir_dim': 500, 'bias_scaling': 0.0, 'sparsity': 0.9, 'spectral_radius': 0.5, 'leaking_rate': 0.5, 'activation_function': 'tanh', 'input_scaling': 0.1, 'mean': 0.0, 'std': 1.0, 'learning_algo': None, 'criterion': None, 'optimizer': None, 'merging_strategy': 'mean', 'lexicon': None, 'bidirectional': False, 'mode': 'esn', 'device': device(type='cuda'), 'seed': 888}


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Model downloaded: bert-base-uncased

mode linear_layer

PARAMS {'embedding_weights': 'bert-base-uncased', 'distribution': 'uniform', 'input_dim': 768, 'reservoir_dim': 500, 'bias_scaling': 0.0, 'sparsity': 0.9, 'spectral_radius': 0.5, 'leaking_rate': 0.5, 'activation_function': 'tanh', 'input_scaling': 0.1, 'mean': 0.0, 'std': 1.0, 'learning_algo': None, 'criterion': None, 'optimizer': None, 'merging_strategy': 'mean', 'lexicon': None, 'bidirectional': False, 'mode': 'linear_layer', 'device': device(type='cuda'), 'seed': 888}


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Model downloaded: bert-base-uncased


  0%|          | 0/2 [00:00<?, ?it/s]


mode esn

PARAMS {'embedding_weights': 'bert-base-uncased', 'distribution': 'uniform', 'input_dim': 768, 'reservoir_dim': 500, 'bias_scaling': 0.0, 'sparsity': 0.9, 'spectral_radius': 0.5, 'leaking_rate': 0.5, 'activation_function': 'tanh', 'input_scaling': 0.1, 'mean': 0.0, 'std': 1.0, 'learning_algo': None, 'criterion': None, 'optimizer': None, 'merging_strategy': 'mean', 'lexicon': None, 'bidirectional': False, 'mode': 'esn', 'device': device(type='cuda'), 'seed': 42}


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Model downloaded: bert-base-uncased


KeyboardInterrupt: 

In [None]:
# Display the aggregated grid-search results
results_d

In [None]:
# Save the grid-search results (results_d) as a pickle file

# Create the output directory if needed so the write does not fail on a fresh machine
os.makedirs(RESULTS_PATH, exist_ok=True)

with open(os.path.join(RESULTS_PATH, RESULTS_FILE), 'wb') as fh:
    pickle.dump(results_d, fh)

In [19]:
# Load previously saved results from a pickle file
# NOTE(review): this reads 'trec-50_last.pkl' rather than RESULTS_FILE — confirm this is intended.
# NOTE: pickle.load can execute arbitrary code; only load files you trust.

with open(os.path.join(RESULTS_PATH, 'trec-50_last.pkl'), 'rb') as fh:
    results = pickle.load(fh)

In [20]:
# Display the results loaded from the pickle file
results

{500: {(0.5,
   0.1): {'esn': [3.90293134264648,
    0.2148812073389988,
    65.5178726196289,
    0.9514892871913357,
    68.4800018310547,
    0.41182455353211866], 'linear_layer': [3.834154766984284,
    0.03700454625030309,
    70.22914581298828,
    0.6695515395448655,
    72.44000396728515,
    0.5851508469413329]},
  (0.5,
   1.0): {'esn': [3.944436385296285,
    0.17941181349184146,
    65.51787414550782,
    0.7764561993686643,
    68.40000457763672,
    0.4560692332417033], 'linear_layer': [3.9227834928780796,
    0.15969742283216215,
    70.17414855957031,
    0.6096529482823224,
    72.24000396728516,
    0.697422815791013]},
  (0.5,
   10.0): {'esn': [3.9884867038577796,
    0.2030304448474599,
    65.02291412353516,
    0.727753401225769,
    68.08000335693359,
    0.6881856311313498], 'linear_layer': [3.902862263657153,
    0.09349263506407664,
    69.0742416381836,
    0.7161149444980832,
    70.4800033569336,
    0.8059794543686607]},
  (0.5,
   100.0): {'esn': [4.0763