# SST-2: TEXT Classification + BERT + Ax

## Libraries

In [None]:
# !pip install transformers==4.8.2
# !pip install datasets==1.7.0
# !pip install ax-platform==0.1.20
# !pip install ipywidgets
# !jupyter nbextension enable --py widgetsnbextension

In [2]:
import os
import sys

In [5]:
import io
import re
import pickle
from timeit import default_timer as timer

import numpy as np

import torch
import torch.nn as nn
import torch.optim as optim

from datasets import load_dataset, Dataset, concatenate_datasets
from transformers import AutoTokenizer
from transformers import BertModel
from transformers.data.data_collator import DataCollatorWithPadding

from ax import optimize
from ax.plot.contour import plot_contour
from ax.plot.trace import optimization_trace_single_method
from ax.service.managed_loop import optimize
from ax.utils.notebook.plotting import render, init_notebook_plotting

import esntorch.core.reservoir as res
import esntorch.core.learning_algo as la
import esntorch.core.merging_strategy as ms
import esntorch.core.esn as esn

In [6]:
%config Completer.use_jedi = False
%load_ext autoreload
%autoreload 2

In [7]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(device_name)
device

device(type='cpu')

In [8]:
# Global seed constant.
# NOTE(review): no visible cell reads SEED; fitness() iterates over its own
# `seed_l` list instead — confirm whether this constant is still needed.
SEED = 42

## Global variables

In [8]:
# Home-relative locations are expanded here because `open()` treats a leading
# '~' as a literal directory name, not as the user's home directory.
RESULTS_PATH = os.path.expanduser('~/Results/Ax_results/ESN')  # path of your result folder
CACHE_DIR = os.path.expanduser('~/Data/huggignface/')          # path of your Hugging Face datasets cache folder

PARAMS_FILE = 'sst-2_params.pkl'    # file name used when pickling `best_params_d`
RESULTS_FILE = 'sst-2_results.pkl'  # add bidirectional if necessary

## Dataset

In [22]:
# rename correct column as 'labels': depends on the dataset you load

def load_and_enrich_dataset(*dataset_name, split, cache_dir):
    """Load one split of a Hugging Face dataset and prepare it for the model.

    Steps, in order:
      1. load the split with `load_dataset`;
      2. rename a 'sentence' column to 'text' when present;
      3. clean the text (restore parentheses and un-escape slashes);
      4. rename 'label' to 'labels';
      5. tokenize with the module-level `tokenizer` (truncation, no padding);
      6. expose 'input_ids', 'attention_mask' and 'labels' as torch tensors;
      7. add a 'lengths' column counting the non-zero token ids per example.

    Args:
        *dataset_name: positional arguments forwarded to `load_dataset`
            (e.g. 'glue', 'sst2').
        split: name of the split to load.
        cache_dir: directory in which the downloaded dataset is cached.

    Returns:
        The enriched dataset.

    Note:
        Relies on a global `tokenizer` that must be defined before the call.
    """
    # Use the caller-supplied cache directory (the argument used to be ignored
    # in favour of the global CACHE_DIR).
    dataset = load_dataset(*dataset_name, split=split, cache_dir=cache_dir)

    def clean(example):
        # Undo the treebank-style bracket tokens and escaped slashes.
        example['text'] = example['text'].replace('-LRB-', '(').replace('-RRB-', ')').replace(r'\/', r'/')
        return example

    if 'sentence' in dataset.column_names:
        dataset = dataset.rename_column('sentence', 'text')

    dataset = dataset.map(clean)
    dataset = dataset.rename_column('label', 'labels')

    dataset = dataset.map(lambda e: tokenizer(e['text'], truncation=True, padding=False), batched=True)

    dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

    def add_lengths(sample):
        # Number of token ids different from 0.
        # NOTE(review): assumes id 0 is the padding token — confirm for the tokenizer in use.
        sample["lengths"] = sum(sample["input_ids"] != 0)
        return sample

    dataset = dataset.map(add_lengths, batched=False)

    return dataset

In [None]:
# Tokenizer shared by the dataset preparation and the padding collators.
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

# Each split is sorted by sequence length after being enriched.
train_dataset = load_and_enrich_dataset(
    'glue', 'sst2', split='train', cache_dir=CACHE_DIR
).sort("lengths")
val_dataset = load_and_enrich_dataset(
    'glue', 'sst2', split='validation', cache_dir=CACHE_DIR
).sort("lengths")
# The test split comes from a different hub dataset than train/validation.
test_dataset = load_and_enrich_dataset(
    'gpt3mix/sst2', split='test', cache_dir=CACHE_DIR
).sort("lengths")
def revert(example):
    """Flip a binary label in place (0 <-> 1) and return the example."""
    flipped = np.abs(example['labels'] - 1)  # |label - 1| swaps 0 and 1
    example['labels'] = flipped
    return example
# Flip the labels of the test set.
test_dataset = test_dataset.map(revert)


# NOTE: 'full_train' and 'train' refer to the same dataset object.
dataset_d = dict(
    full_train=train_dataset,
    train=train_dataset,
    val=val_dataset,
    test=test_dataset,
)

# One DataLoader per split; batches are padded on the fly by the collator.
dataloader_d = {}
for split_name, split_dataset in dataset_d.items():
    dataloader_d[split_name] = torch.utils.data.DataLoader(
        split_dataset,
        batch_size=256,
        collate_fn=DataCollatorWithPadding(tokenizer),
    )

In [24]:
dataset_d

{'full_train': Dataset({
     features: ['attention_mask', 'idx', 'input_ids', 'labels', 'lengths', 'text', 'token_type_ids'],
     num_rows: 67349
 }),
 'train': Dataset({
     features: ['attention_mask', 'idx', 'input_ids', 'labels', 'lengths', 'text', 'token_type_ids'],
     num_rows: 67349
 }),
 'val': Dataset({
     features: ['attention_mask', 'idx', 'input_ids', 'labels', 'lengths', 'text', 'token_type_ids'],
     num_rows: 872
 }),
 'test': Dataset({
     features: ['attention_mask', 'input_ids', 'labels', 'lengths', 'text', 'token_type_ids'],
     num_rows: 1821
 })}

## Optimization

In [11]:
def fitness(leaking_rate,
            spectral_radius,
            input_scaling,
            bias_scaling,
            alpha,
            reservoir_dim,
            dataset_d,
            dataloader_d,
            seed_l=(1991, 420, 666, 1979, 7), # 5 seeds
            return_test_acc=False):
    """Train and evaluate an echo state network once per seed and aggregate.

    For every seed a fresh `esn.EchoStateNetwork` is built on top of
    'bert-base-uncased' embeddings, given a ridge-regression learning
    algorithm, warmed up on the first three training sentences, fitted,
    and scored.

    Args:
        leaking_rate, spectral_radius, input_scaling, bias_scaling:
            reservoir hyperparameters forwarded to the network.
        alpha: regularisation strength passed to `la.RidgeRegression`.
        reservoir_dim: reservoir size forwarded to the network.
        dataset_d: dict of datasets; only the 'train' entry is read (warm-up).
        dataloader_d: dict of DataLoaders with keys 'full_train', 'train',
            'val' and 'test'.
        seed_l: seeds to iterate over (one model per seed).
        return_test_acc: when False, fit on 'train' and score on 'val';
            when True, fit on 'full_train' (timed) and score on 'test'.

    Returns:
        When `return_test_acc` is False: the mean accuracy over seeds.
        When True: `(mean_acc, std_acc, mean_fit_time, std_fit_time)`, with
        fit times in seconds.

    Note:
        Relies on the module-level `device` and `tokenizer`.
    """
    acc_l = []
    time_l = []

    for seed in seed_l:

        # parameters
        esn_params = {
                    'embedding_weights': 'bert-base-uncased', # TEXT.vocab.vectors,
                    'distribution' : 'uniform',               # uniform, gaussian
                    'input_dim' : 768,                        # dim of encoding!
                    'reservoir_dim' : reservoir_dim,
                    'bias_scaling' : bias_scaling,
                    'sparsity' : 0.99,
                    'spectral_radius' : spectral_radius,
                    'leaking_rate': leaking_rate,
                    'activation_function' : 'tanh',
                    'input_scaling' : input_scaling,
                    'mean' : 0.0,
                    'std' : 1.0,
                    'learning_algo' : None,
                    'criterion' : None,
                    'optimizer' : None,
                    'merging_strategy' : 'mean',
                    'lexicon' : None,
                    'bidirectional' : True, # True
                    'device' : device,
                    'seed' : seed
                     }

        # model
        ESN = esn.EchoStateNetwork(**esn_params)

        ESN.learning_algo = la.RidgeRegression(alpha = alpha)# , mode='normalize')

        ESN = ESN.to(device)

        # warm up (new): feed the first sentences one at a time, in order.
        # A single loader with batch_size=1 yields the same batches as
        # building one loader per sentence.
        nb_sentences = 3
        warm_up_loader = torch.utils.data.DataLoader(
            dataset_d["train"].select(list(range(nb_sentences))),
            batch_size=1,
            collate_fn=DataCollatorWithPadding(tokenizer))

        for warm_up_batch in warm_up_loader:
            ESN.warm_up(warm_up_batch)

        # fit, then predict
        if return_test_acc:
            # Only the fit is timed, not the prediction.
            t0 = timer()
            ESN.fit(dataloader_d["full_train"])
            t1 = timer()
            time_l.append(t1 - t0)
            # The element at index 1 of predict()'s result is used as the accuracy.
            acc = ESN.predict(dataloader_d["test"], verbose=False)[1].item()
        else:
            ESN.fit(dataloader_d["train"])
            acc = ESN.predict(dataloader_d["val"], verbose=False)[1].item()

        acc_l.append(acc)

        # clean objects: drop the model and release cached GPU memory
        # before the next seed.
        del ESN.learning_algo
        del ESN.criterion
        del ESN.merging_strategy
        del ESN
        torch.cuda.empty_cache()

    if return_test_acc:
        return np.mean(acc_l), np.std(acc_l), np.mean(time_l), np.std(time_l)
    else:
        return np.mean(acc_l)

In [12]:
# %%time

# fitness(leaking_rate=0.2, spectral_radius=1.1, input_scaling=0.8, bias_scaling=1.0, alpha=10, reservoir_dim=500, dataset_d=dataset_d, dataloader_d=dataloader_d)

In [13]:
def wrapped_fitness(d, return_test_acc=False):
    """Call `fitness` with hyperparameters taken from a single dict.

    This is the single-argument form used as the Ax evaluation function.
    The datasets and dataloaders come from the module-level `dataset_d`
    and `dataloader_d`.

    Args:
        d: mapping with the keys 'leaking_rate', 'spectral_radius',
            'input_scaling', 'bias_scaling', 'alpha' and 'reservoir_dim'.
        return_test_acc: forwarded unchanged to `fitness`.

    Returns:
        Whatever `fitness` returns for these settings.
    """
    hyperparameter_names = (
        'leaking_rate',
        'spectral_radius',
        'input_scaling',
        'bias_scaling',
        'alpha',
        'reservoir_dim',  # will be in the loop
    )
    hyperparameters = {name: d[name] for name in hyperparameter_names}

    return fitness(**hyperparameters,
                   dataset_d=dataset_d,
                   dataloader_d=dataloader_d,
                   return_test_acc=return_test_acc)

In [14]:
# *** WARNING *** DO NOT EXECUTE THE NEXT CELLS IN BIDIRECTIONAL MODE (OPTIMIZATION ALREADY DONE)

In [None]:
# One Ax optimisation per reservoir size; results are collected per size.
best_params_d = {}

for res_dim in [500, 1000, 3000, 5000]:

    # Five tunable ranges plus the reservoir size, fixed for this run.
    search_space = [
        {"name": "leaking_rate", "value_type": "float", "type": "range", "bounds": [0.0, 0.999]},
        {"name": "spectral_radius", "value_type": "float", "type": "range", "bounds": [0.2, 1.7]},
        {"name": "input_scaling", "value_type": "float", "type": "range", "bounds": [0.1, 3.0]},
        {"name": "bias_scaling", "value_type": "float", "type": "range", "bounds": [0.1, 3.0]},
        {"name": "alpha", "value_type": "float", "type": "range", "log_scale": True, "bounds": [1e-3, 1e3]},
        {"name": "reservoir_dim", "value_type": "int", "type": "fixed", "value": res_dim},
    ]

    # Maximise the validation accuracy returned by wrapped_fitness.
    best_parameters, best_values, experiment, model = optimize(
        parameters=search_space,
        evaluation_function=wrapped_fitness,
        minimize=False,
        objective_name='val_accuracy',
        total_trials=40,
    )

    # results
    best_params_d[res_dim] = {
        'best_parameters': best_parameters,
        'best_values': best_values,
        'experiment': experiment,
        # 'model': model,
    }

[INFO 06-13 21:02:37] ax.modelbridge.dispatch_utils: Using Bayesian Optimization generation strategy: GenerationStrategy(name='Sobol+GPEI', steps=[Sobol for 6 trials, GPEI for subsequent trials]). Iterations after 6 will take longer to generate due to  model-fitting.
[INFO 06-13 21:02:37] ax.service.managed_loop: Started full optimization with 40 steps.
[INFO 06-13 21:02:37] ax.service.managed_loop: Running optimization trial 1...
[INFO 06-13 21:12:51] ax.service.managed_loop: Running optimization trial 2...
[INFO 06-13 21:23:06] ax.service.managed_loop: Running optimization trial 3...
[INFO 06-13 21:33:20] ax.service.managed_loop: Running optimization trial 4...
[INFO 06-13 21:43:32] ax.service.managed_loop: Running optimization trial 5...
[INFO 06-13 21:53:43] ax.service.managed_loop: Running optimization trial 6...
[INFO 06-13 22:03:52] ax.service.managed_loop: Running optimization trial 7...
[INFO 06-13 22:13:59] ax.service.managed_loop: Running optimization trial 8...


## Results

In [25]:
# best parameters

# `open()` does not expand '~', so resolve the home-relative results folder
# first and make sure it exists before writing into it.
results_dir = os.path.expanduser(RESULTS_PATH)
os.makedirs(results_dir, exist_ok=True)

with open(os.path.join(results_dir, PARAMS_FILE), 'wb') as fh:
    pickle.dump(best_params_d, fh)

In [14]:
# # load results
#
# with open(os.path.join(RESULTS_PATH, PARAMS_FILE), 'rb') as fh:
#     best_params_d = pickle.load(fh)

In [15]:
best_params_d

{500: {'best_parameters': {'leaking_rate': 0.47695718066691795,
   'spectral_radius': 0.9806841635334129,
   'input_scaling': 0.39442745732230466,
   'bias_scaling': 2.9913935509428167,
   'alpha': 187.24443736915234,
   'reservoir_dim': 500},
  'best_values': ({'val_accuracy': 85.42499653051203},
   {'val_accuracy': {'val_accuracy': 0.0002917334367039788}}),
  'experiment': SimpleExperiment(None)},
 1000: {'best_parameters': {'leaking_rate': 0.999,
   'spectral_radius': 1.4065436383676726,
   'input_scaling': 3.0,
   'bias_scaling': 1.204768167389038,
   'alpha': 368.1204122844207,
   'reservoir_dim': 1000},
  'best_values': ({'val_accuracy': 85.91037417355504},
   {'val_accuracy': {'val_accuracy': 0.002372900082448123}}),
  'experiment': SimpleExperiment(None)},
 3000: {'best_parameters': {'leaking_rate': 0.21235683692622584,
   'spectral_radius': 0.974603168832477,
   'input_scaling': 0.45175238088928504,
   'bias_scaling': 1.4961239017801158,
   'alpha': 163.07251835798152,
   'res

In [16]:
# results

# Re-evaluate the best configuration of each reservoir size in test mode.
results_d = {}

for res_dim in [500, 1000, 3000, 5000]:

    tuned_parameters = best_params_d[res_dim]['best_parameters']
    mean_acc, std_acc, mean_time, std_time = wrapped_fitness(tuned_parameters, return_test_acc=True)
    # Stored as (accuracy mean, accuracy std, fit-time mean, fit-time std).
    results_d[res_dim] = (mean_acc, std_acc, mean_time, std_time)
    print("Experiment finished.")

Experiment finished.
Experiment finished.
Experiment finished.
Experiment finished.


In [17]:
results_d

{500: (84.15156555175781,
  0.3782338275805986,
  136.6237069355324,
  0.9166563086117541),
 1000: (84.84349365234375,
  0.3331292053011683,
  136.48797907084227,
  0.5450728294595891),
 3000: (85.48050537109376,
  0.13270701340725286,
  149.14078205926344,
  0.4957212838617135),
 5000: (85.80999450683593,
  0.4059257223968821,
  205.13034001868218,
  1.0576259955703051)}

In [18]:
# with open(os.path.join(RESULTS_PATH, RESULTS_FILE), 'wb') as fh:
#     pickle.dump(results_d, fh)

In [17]:
# load results
# `open()` does not expand '~', so resolve the home-relative results folder first.
# SECURITY: pickle.load can execute arbitrary code; only load files you produced yourself.
results_file = os.path.join(os.path.expanduser(RESULTS_PATH), 'sst-2_bidirectional_results.pkl')
with open(results_file, 'rb') as fh:
    results = pickle.load(fh)

In [18]:
results

{500: (84.15156555175781,
  0.3782338275805986,
  136.6237069355324,
  0.9166563086117541),
 1000: (84.84349365234375,
  0.3331292053011683,
  136.48797907084227,
  0.5450728294595891),
 3000: (85.48050537109376,
  0.13270701340725286,
  149.14078205926344,
  0.4957212838617135),
 5000: (85.80999450683593,
  0.4059257223968821,
  205.13034001868218,
  1.0576259955703051)}