# SST-2
# Simple Baselines using ``mean`` and ``last`` pooling

## Libraries

In [1]:
import os
import sys
sys.path.insert(0, os.path.abspath("../.."))

In [2]:
import io
import re
import pickle
from timeit import default_timer as timer

from tqdm.notebook import tqdm

import numpy as np

import torch
import torch.nn as nn
import torch.optim as optim

from datasets import load_dataset, Dataset, concatenate_datasets
from transformers import AutoTokenizer
from transformers import BertModel
from transformers.data.data_collator import DataCollatorWithPadding

from ax import optimize
from ax.plot.contour import plot_contour
from ax.plot.trace import optimization_trace_single_method
from ax.service.managed_loop import optimize
from ax.utils.notebook.plotting import render, init_notebook_plotting

import esntorch.core.reservoir as res
import esntorch.core.learning_algo as la
import esntorch.core.merging_strategy as ms
import esntorch.core.esn as esn

In [3]:
# Turn off Jedi-based tab completion in IPython.
%config Completer.use_jedi = False
# Load the autoreload extension and reload modules before each execution,
# so edits to imported source files are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2

In [4]:
# Use the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
device

device(type='cuda')

In [5]:
# Single seed for the sources of randomness used in this notebook.
SEED = 42

# SEED used to be defined but never applied anywhere. Seed the global NumPy
# and PyTorch generators so that a Restart & Run All is repeatable.
np.random.seed(SEED)
torch.manual_seed(SEED);  # trailing ';' hides the Generator repr in the cell output

## Global variables

In [7]:
# Filesystem locations used by the notebook.
# The defaults are the original author's paths; set the environment variables
# SST2_CACHE_DIR / SST2_RESULTS_FILE to run elsewhere without editing this cell.
# CACHE_DIR is passed as `cache_dir` to load_dataset; RESULTS_FILE is the pickle
# file the sweep results are written to.
CACHE_DIR = os.environ.get('SST2_CACHE_DIR', '/raid/home/jeremiec/huggingface_datasets')
RESULTS_FILE = os.environ.get('SST2_RESULTS_FILE', '/raid/home/jeremiec/Ax_results/Baselines_v2/sst-2_results_.pkl')

## Dataset

In [8]:
# download dataset

# full train, mini train, and val sets (GLUE SST-2)
raw_datasets = load_dataset('glue', 'sst2', cache_dir=CACHE_DIR)
# Rename 'sentence' so these splits expose their text under the same 'text'
# column as the separately loaded test set.
raw_datasets = raw_datasets.rename_column('sentence', 'text')

full_train_dataset = raw_datasets['train']
# 30% subsample of the training split. The shuffle is seeded with SEED so the
# subsample is identical from one run to the next (it was unseeded before).
train_dataset = full_train_dataset.train_test_split(train_size=0.3, shuffle=True, seed=SEED)['train']

val_dataset = raw_datasets['validation']

# special test set: the 'test' split of the 'gpt3mix/sst2' dataset
test_dataset = load_dataset('gpt3mix/sst2', split='test', cache_dir=CACHE_DIR)

def clean(example):
    """Normalize one test-set example in place and return it.

    The bracket placeholders '-LRB-' / '-RRB-' become '(' / ')', escaped
    slashes become plain '/', and a 0/1 label is flipped (0 -> 1, 1 -> 0).

    Args:
        example: mapping with a 'text' string and a numeric 'label'.

    Returns:
        The same mapping object, with 'text' and 'label' overwritten.
    """
    substitutions = (('-LRB-', '('), ('-RRB-', ')'), (r'\/', r'/'))
    text = example['text']
    for old, new in substitutions:
        text = text.replace(old, new)
    example['text'] = text
    # revert labels of test set: |label - 1| swaps 0 and 1
    example['label'] = np.abs(example['label'] - 1)
    return example

# Apply the text/label normalization to every test example.
test_dataset = test_dataset.map(clean)

# create dataset_d: split name -> dataset, in the order used downstream
dataset_d = {}
dataset_d['full_train'] = full_train_dataset
dataset_d['train'] = train_dataset
dataset_d['val'] = val_dataset
dataset_d['test'] = test_dataset

Reusing dataset glue (/raid/home/jeremiec/huggingface_datasets/glue/sst2/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)


  0%|          | 0/3 [00:00<?, ?it/s]

Loading cached split indices for dataset at /raid/home/jeremiec/huggingface_datasets/glue/sst2/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad/cache-553d8e7a475556fb.arrow and /raid/home/jeremiec/huggingface_datasets/glue/sst2/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad/cache-54ad2d89c117ec97.arrow
Using custom data configuration default
Reusing dataset sst2 (/raid/home/jeremiec/huggingface_datasets/gpt3mix___sst2/default/0.0.0/90167692658fa4abca2ffa3ede1a43a71e2bf671078c5c275c64c4231d5a62fa)
Loading cached processed dataset at /raid/home/jeremiec/huggingface_datasets/gpt3mix___sst2/default/0.0.0/90167692658fa4abca2ffa3ede1a43a71e2bf671078c5c275c64c4231d5a62fa/cache-a9d835d5a47d8794.arrow


In [9]:
# Display the splits held in dataset_d (columns and row counts).
dataset_d

{'full_train': Dataset({
     features: ['idx', 'label', 'text'],
     num_rows: 67349
 }),
 'train': Dataset({
     features: ['idx', 'label', 'text'],
     num_rows: 20204
 }),
 'val': Dataset({
     features: ['idx', 'label', 'text'],
     num_rows: 872
 }),
 'test': Dataset({
     features: ['text', 'label'],
     num_rows: 1821
 })}

In [10]:
# tokenize

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

def tokenize_function(examples):
    """Tokenize the 'text' field of a batch with the module-level tokenizer.

    No padding is applied here, truncation is enabled, and the token count of
    each example is requested via return_length.
    """
    encoding_options = dict(padding=False, truncation=True, return_length=True)
    return tokenizer(examples["text"], **encoding_options)

# Tokenize every split and replace the raw dataset in dataset_d by its
# tokenized version.
for split_name, split_ds in dataset_d.items():
    # Re-run guard: a split that already has 'input_ids' was tokenized by a
    # previous execution of this cell. Processing it again would fail (its
    # 'label' / 'length' columns are already renamed), so leave it untouched.
    if 'input_ids' in split_ds.column_names:
        continue

    tokenized = split_ds.map(tokenize_function, batched=True)
    tokenized = tokenized.rename_column('length', 'lengths')
    # Order examples by token count; the dataloaders are built without
    # shuffling, so each batch holds sequences of similar length
    # (presumably to limit padding - confirm).
    tokenized = tokenized.sort("lengths")
    tokenized = tokenized.rename_column('label', 'labels')
    # Only these columns are returned (as torch tensors) when indexing.
    tokenized.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels', 'lengths'])

    dataset_d[split_name] = tokenized

# dataloaders

# One DataLoader per split: batches of 256 examples, collated (padded) by
# DataCollatorWithPadding built on the same tokenizer.
dataloader_d = {
    split_name: torch.utils.data.DataLoader(
        split_ds,
        batch_size=256,
        collate_fn=DataCollatorWithPadding(tokenizer),
    )
    for split_name, split_ds in dataset_d.items()
}

Loading cached processed dataset at /raid/home/jeremiec/huggingface_datasets/glue/sst2/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad/cache-7b5404094c7fe582.arrow
Loading cached sorted indices for dataset at /raid/home/jeremiec/huggingface_datasets/glue/sst2/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad/cache-5794d9eb3ec77166.arrow
Loading cached processed dataset at /raid/home/jeremiec/huggingface_datasets/glue/sst2/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad/cache-b46598ca8b91b3b7.arrow
Loading cached sorted indices for dataset at /raid/home/jeremiec/huggingface_datasets/glue/sst2/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad/cache-70ef2b934e019961.arrow
Loading cached processed dataset at /raid/home/jeremiec/huggingface_datasets/glue/sst2/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad/cache-627f897c0ccc8bc0.arrow
Loading cached sorted indices for dataset at /r

In [11]:
# Display dataset_d again to check the columns added by tokenization.
dataset_d

{'full_train': Dataset({
     features: ['attention_mask', 'idx', 'input_ids', 'labels', 'lengths', 'text', 'token_type_ids'],
     num_rows: 67349
 }),
 'train': Dataset({
     features: ['attention_mask', 'idx', 'input_ids', 'labels', 'lengths', 'text', 'token_type_ids'],
     num_rows: 20204
 }),
 'val': Dataset({
     features: ['attention_mask', 'idx', 'input_ids', 'labels', 'lengths', 'text', 'token_type_ids'],
     num_rows: 872
 }),
 'test': Dataset({
     features: ['attention_mask', 'input_ids', 'labels', 'lengths', 'text', 'token_type_ids'],
     num_rows: 1821
 })}

## Optimization

In [12]:
# Keyword arguments passed to esn.EchoStateNetwork for the simple baselines.
# Entries tagged "not used" carry the original author's annotation for this mode.
baseline_params = {
    # --- embedding / encoding ---
    'embedding_weights': 'bert-base-uncased',
    'distribution': 'uniform',            # alternative noted by the author: gaussian
    'input_dim': 768,                     # dimension of the encoding
    # --- reservoir settings ---
    'reservoir_dim': 0,                   # not used
    'bias_scaling': 0.0,                  # not used
    'sparsity': 0.0,                      # not used
    'spectral_radius': None,
    'leaking_rate': 0.5,                  # not used
    'activation_function': 'tanh',
    'input_scaling': 0.1,
    'mean': 0.0,
    'std': 1.0,
    # --- learning components (left as None here) ---
    'learning_algo': None,
    'criterion': None,
    'optimizer': None,
    'merging_strategy': None,
    'lexicon': None,
    'bidirectional': False,
    'mode': 'no_layer',                   # simple baseline
    # --- runtime ---
    'device': device,
    'seed': 4,
}

In [13]:
# Sweep over pooling strategy x ridge alpha.
# results_d[pooling_strategy][alpha] is a list holding one
# [acc, fit_time_in_seconds] pair.
# NOTE(review): every alpha is scored directly on the test split and the 'val'
# split is not used in this loop - confirm this is intended before using these
# numbers to choose alpha.
results_d = {}

for pooling_strategy in tqdm(['last', 'mean']):

    results_d[pooling_strategy] = {}

    for alpha in tqdm([0.1, 1.0, 10.0, 100.0]):

        results_d[pooling_strategy][alpha] = []

        # model: build a per-run copy of the settings so the shared
        # baseline_params dict is not modified by this cell
        run_params = dict(baseline_params, merging_strategy=pooling_strategy, mode='no_layer')
        print(run_params)
        ESN = esn.EchoStateNetwork(**run_params)
        ESN.learning_algo = la.RidgeRegression(alpha=alpha)
        ESN = ESN.to(device)

        # train (only the fit call is timed; its return value is not needed)
        t0 = timer()
        ESN.fit(dataloader_d["full_train"])                              # full train set
        t1 = timer()
        acc = ESN.predict(dataloader_d["test"], verbose=False)[1].item() # full test set

        # results
        results_d[pooling_strategy][alpha].append([acc, t1 - t0])

        # clean objects: drop the model and release cached GPU memory before
        # the next configuration is built
        del ESN.learning_algo
        del ESN.criterion
        del ESN.merging_strategy
        del ESN
        torch.cuda.empty_cache()

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

{'embedding_weights': 'bert-base-uncased', 'distribution': 'uniform', 'input_dim': 768, 'reservoir_dim': 0, 'bias_scaling': 0.0, 'sparsity': 0.0, 'spectral_radius': None, 'leaking_rate': 0.5, 'activation_function': 'tanh', 'input_scaling': 0.1, 'mean': 0.0, 'std': 1.0, 'learning_algo': None, 'criterion': None, 'optimizer': None, 'merging_strategy': 'last', 'lexicon': None, 'bidirectional': False, 'mode': 'no_layer', 'device': device(type='cuda'), 'seed': 4}


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Model downloaded: bert-base-uncased
{'embedding_weights': 'bert-base-uncased', 'distribution': 'uniform', 'input_dim': 768, 'reservoir_dim': 0, 'bias_scaling': 0.0, 'sparsity': 0.0, 'spectral_radius': None, 'leaking_rate': 0.5, 'activation_function': 'tanh', 'input_scaling': 0.1, 'mean': 0.0, 'std': 1.0, 'learning_algo': None, 'criterion': None, 'optimizer': None, 'merging_strategy': 'last', 'lexicon': None, 'bidirectional': False, 'mode': 'no_layer', 'device': device(type='cuda'), 'seed': 4}


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Model downloaded: bert-base-uncased
{'embedding_weights': 'bert-base-uncased', 'distribution': 'uniform', 'input_dim': 768, 'reservoir_dim': 0, 'bias_scaling': 0.0, 'sparsity': 0.0, 'spectral_radius': None, 'leaking_rate': 0.5, 'activation_function': 'tanh', 'input_scaling': 0.1, 'mean': 0.0, 'std': 1.0, 'learning_algo': None, 'criterion': None, 'optimizer': None, 'merging_strategy': 'last', 'lexicon': None, 'bidirectional': False, 'mode': 'no_layer', 'device': device(type='cuda'), 'seed': 4}


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Model downloaded: bert-base-uncased
{'embedding_weights': 'bert-base-uncased', 'distribution': 'uniform', 'input_dim': 768, 'reservoir_dim': 0, 'bias_scaling': 0.0, 'sparsity': 0.0, 'spectral_radius': None, 'leaking_rate': 0.5, 'activation_function': 'tanh', 'input_scaling': 0.1, 'mean': 0.0, 'std': 1.0, 'learning_algo': None, 'criterion': None, 'optimizer': None, 'merging_strategy': 'last', 'lexicon': None, 'bidirectional': False, 'mode': 'no_layer', 'device': device(type='cuda'), 'seed': 4}


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Model downloaded: bert-base-uncased


  0%|          | 0/4 [00:00<?, ?it/s]

{'embedding_weights': 'bert-base-uncased', 'distribution': 'uniform', 'input_dim': 768, 'reservoir_dim': 0, 'bias_scaling': 0.0, 'sparsity': 0.0, 'spectral_radius': None, 'leaking_rate': 0.5, 'activation_function': 'tanh', 'input_scaling': 0.1, 'mean': 0.0, 'std': 1.0, 'learning_algo': None, 'criterion': None, 'optimizer': None, 'merging_strategy': 'mean', 'lexicon': None, 'bidirectional': False, 'mode': 'no_layer', 'device': device(type='cuda'), 'seed': 4}


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Model downloaded: bert-base-uncased
{'embedding_weights': 'bert-base-uncased', 'distribution': 'uniform', 'input_dim': 768, 'reservoir_dim': 0, 'bias_scaling': 0.0, 'sparsity': 0.0, 'spectral_radius': None, 'leaking_rate': 0.5, 'activation_function': 'tanh', 'input_scaling': 0.1, 'mean': 0.0, 'std': 1.0, 'learning_algo': None, 'criterion': None, 'optimizer': None, 'merging_strategy': 'mean', 'lexicon': None, 'bidirectional': False, 'mode': 'no_layer', 'device': device(type='cuda'), 'seed': 4}


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Model downloaded: bert-base-uncased
{'embedding_weights': 'bert-base-uncased', 'distribution': 'uniform', 'input_dim': 768, 'reservoir_dim': 0, 'bias_scaling': 0.0, 'sparsity': 0.0, 'spectral_radius': None, 'leaking_rate': 0.5, 'activation_function': 'tanh', 'input_scaling': 0.1, 'mean': 0.0, 'std': 1.0, 'learning_algo': None, 'criterion': None, 'optimizer': None, 'merging_strategy': 'mean', 'lexicon': None, 'bidirectional': False, 'mode': 'no_layer', 'device': device(type='cuda'), 'seed': 4}


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Model downloaded: bert-base-uncased
{'embedding_weights': 'bert-base-uncased', 'distribution': 'uniform', 'input_dim': 768, 'reservoir_dim': 0, 'bias_scaling': 0.0, 'sparsity': 0.0, 'spectral_radius': None, 'leaking_rate': 0.5, 'activation_function': 'tanh', 'input_scaling': 0.1, 'mean': 0.0, 'std': 1.0, 'learning_algo': None, 'criterion': None, 'optimizer': None, 'merging_strategy': 'mean', 'lexicon': None, 'bidirectional': False, 'mode': 'no_layer', 'device': device(type='cuda'), 'seed': 4}


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Model downloaded: bert-base-uncased


In [14]:
# Display the sweep results: results_d[pooling_strategy][alpha] is a list of
# [acc, fit time in seconds] pairs.
results_d

{'last': {0.1: [[80.66996002197266, 47.55896031111479]],
  1.0: [[80.61505126953125, 47.44986550882459]],
  10.0: [[80.66996002197266, 48.260296736843884]],
  100.0: [[80.61505126953125, 49.056894733570516]]},
 'mean': {0.1: [[85.6123046875, 35.88784305751324]],
  1.0: [[85.6123046875, 35.942259199917316]],
  10.0: [[85.72212982177734, 36.11868996359408]],
  100.0: [[85.6123046875, 36.40100293233991]]}}

## Results

In [15]:
# save results

# Create the target directory first: open() raises FileNotFoundError when the
# parent directory is missing. An empty dirname means RESULTS_FILE is a bare
# filename in the working directory, in which case nothing needs creating.
results_dir = os.path.dirname(RESULTS_FILE)
if results_dir:
    os.makedirs(results_dir, exist_ok=True)

with open(RESULTS_FILE, 'wb') as fh:
    pickle.dump(results_d, fh)

In [16]:
# # load results
# with open(RESULTS_FILE, 'rb') as fh:
#     results_d = pickle.load(fh)

In [17]:
# results_d