# SST-5: TEXT Classification + BERT + Ax

## Libraries

In [2]:
import os
import sys

In [5]:
import io
import re
import pickle
from timeit import default_timer as timer

import numpy as np

import torch
import torch.nn as nn
import torch.optim as optim

from datasets import load_dataset, Dataset, concatenate_datasets
from transformers import AutoTokenizer
from transformers import BertModel
from transformers.data.data_collator import DataCollatorWithPadding

from ax import optimize
from ax.plot.contour import plot_contour
from ax.plot.trace import optimization_trace_single_method
from ax.service.managed_loop import optimize
from ax.utils.notebook.plotting import render, init_notebook_plotting

import esntorch.core.reservoir as res
import esntorch.core.learning_algo as la
import esntorch.core.merging_strategy as ms
import esntorch.core.esn as esn

In [6]:
# Turn off Jedi-based tab completion for this kernel.
%config Completer.use_jedi = False
# Load the autoreload extension; mode 2 re-imports modules before each cell is
# executed (presumably so that edits to the local esntorch package are picked up
# without restarting the kernel).
%load_ext autoreload
%autoreload 2

In [7]:
# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
device

device(type='cpu')

In [8]:
SEED = 42

# Apply the seed to the global NumPy and PyTorch generators so that anything
# drawing from them is repeatable after a kernel restart.
np.random.seed(SEED)
# The trailing ';' keeps the returned Generator object out of the cell output.
torch.manual_seed(SEED);

## Global variables

In [1]:
# Filesystem locations. '~' is expanded here because neither open() nor
# os.path.join() expands it.
RESULTS_PATH = os.path.expanduser('~/Results/Ax_results/ESN')  # folder that receives the pickled results
CACHE_DIR = os.path.expanduser('~/Data/huggignface/')          # folder used as the Hugging Face datasets cache

# File names (joined with RESULTS_PATH) for the tuned hyper-parameters and the test scores.
PARAMS_FILE = 'sst-5_params.pkl'
RESULTS_FILE = 'sst-5_results.pkl'

## Dataset

In [8]:
# rename correct column as 'labels': depends on the dataset you load

def load_and_enrich_dataset(dataset_name, split, cache_dir):
    """Load one dataset split and add the columns used by the rest of the notebook.

    The split is loaded with ``load_dataset``, then three things are added:

    * ``labels``: integer class derived from the continuous ``label`` column;
    * ``input_ids`` / ``attention_mask``: produced by the module-level
      ``tokenizer`` applied to the ``sentence`` column (truncated, not padded);
    * ``lengths``: number of non-zero token ids in ``input_ids``.

    Parameters
    ----------
    dataset_name : str
        Name passed to ``load_dataset``.
    split : str
        Split passed to ``load_dataset`` (e.g. ``'train'`` or ``'test'``).
    cache_dir : str
        Cache directory passed to ``load_dataset``.

    Returns
    -------
    The enriched dataset, with its format set to torch tensors for
    ``input_ids``, ``attention_mask`` and ``labels``.
    """
    # Use the caller-supplied cache directory (not the global constant).
    dataset = load_dataset(dataset_name, split=split, cache_dir=cache_dir)

    # For a dataset whose 'label' column already holds integer classes, a plain
    # rename would do instead of the mapping below:
    # dataset = dataset.rename_column('label', 'labels')

    def update_label(sample):
        # Bucket the continuous label into fifths. The clamp keeps a label of
        # exactly 1.0 in the top class (4) instead of creating a sixth class (5).
        sample["labels"] = min(int(sample["label"] * 10 // 2), 4)
        return sample

    dataset = dataset.map(update_label, batched=False)

    # Here the text column is called 'sentence'.
    dataset = dataset.map(lambda e: tokenizer(e['sentence'], truncation=True, padding=False), batched=True)
    dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

    def add_lengths(sample):
        # Counts the token ids different from 0
        # (assumes input_ids arrives as a torch tensor after set_format -- TODO confirm).
        sample["lengths"] = sum(sample["input_ids"] != 0)
        return sample

    dataset = dataset.map(add_lengths, batched=False)

    return dataset

In [9]:
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

full_train_dataset = load_and_enrich_dataset('sst', split='train', cache_dir=CACHE_DIR).sort("lengths") # toriving/sst5

# Seed the shuffled 80/20 split so the train/validation partition is the same
# on every run.
train_val_datasets = full_train_dataset.train_test_split(train_size=0.8, shuffle=True, seed=SEED)
train_dataset = train_val_datasets['train'].sort("lengths")
val_dataset = train_val_datasets['test'].sort("lengths")

test_dataset = load_and_enrich_dataset('sst', split='test', cache_dir=CACHE_DIR).sort("lengths")

dataset_d = {
    'full_train': full_train_dataset,
    'train': train_dataset,
    'val': val_dataset,
    'test': test_dataset
    }

# One collator is shared by all loaders; every split is sorted by "lengths"
# before being batched.
collator = DataCollatorWithPadding(tokenizer)
dataloader_d = {
    name: torch.utils.data.DataLoader(split_dataset, batch_size=256, collate_fn=collator)
    for name, split_dataset in dataset_d.items()
}

No config specified, defaulting to: sst/default
Reusing dataset sst (/raid/home/jeremiec/huggingface_datasets/sst/default/1.0.0/b8a7889ef01c5d3ae8c379b84cc4080f8aad3ac2bc538701cbe0ac6416fb76ff)


HBox(children=(FloatProgress(value=0.0, max=8544.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=9.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=8544.0), HTML(value='')))

No config specified, defaulting to: sst/default
Reusing dataset sst (/raid/home/jeremiec/huggingface_datasets/sst/default/1.0.0/b8a7889ef01c5d3ae8c379b84cc4080f8aad3ac2bc538701cbe0ac6416fb76ff)





HBox(children=(FloatProgress(value=0.0, max=2210.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=2210.0), HTML(value='')))




In [10]:
# Show the four splits with their columns and row counts.
dataset_d

{'full_train': Dataset({
     features: ['attention_mask', 'input_ids', 'label', 'labels', 'lengths', 'sentence', 'token_type_ids', 'tokens', 'tree'],
     num_rows: 8544
 }),
 'train': Dataset({
     features: ['attention_mask', 'input_ids', 'label', 'labels', 'lengths', 'sentence', 'token_type_ids', 'tokens', 'tree'],
     num_rows: 6835
 }),
 'val': Dataset({
     features: ['attention_mask', 'input_ids', 'label', 'labels', 'lengths', 'sentence', 'token_type_ids', 'tokens', 'tree'],
     num_rows: 1709
 }),
 'test': Dataset({
     features: ['attention_mask', 'input_ids', 'label', 'labels', 'lengths', 'sentence', 'token_type_ids', 'tokens', 'tree'],
     num_rows: 2210
 })}

## Optimization

In [12]:
def fitness(leaking_rate, 
            spectral_radius, 
            input_scaling, 
            bias_scaling, 
            alpha, 
            reservoir_dim, 
            dataset_d, 
            dataloader_d, 
            seed_l=(1991, 420, 666, 1979, 7), # 5 seeds
            return_test_acc=False):
    """Train and score one ESN configuration, averaged over several seeds.

    For every seed an ``esn.EchoStateNetwork`` with a ridge-regression readout
    is built, the first three sentences of ``dataset_d["train"]`` are passed to
    ``ESN.warm_up``, and the network is fitted and evaluated.

    Parameters
    ----------
    leaking_rate, spectral_radius, input_scaling, bias_scaling : float
        Reservoir hyper-parameters forwarded to ``esn.EchoStateNetwork``.
    alpha : float
        Forwarded to ``la.RidgeRegression``.
    reservoir_dim : int
        Reservoir size forwarded to ``esn.EchoStateNetwork``.
    dataset_d : dict
        Must contain a ``"train"`` dataset (used for the warm-up sentences).
    dataloader_d : dict
        Must contain ``"train"`` and ``"val"`` loaders, or ``"full_train"`` and
        ``"test"`` loaders when ``return_test_acc`` is true.
    seed_l : sequence of int
        Seeds, one model per seed.
    return_test_acc : bool
        If false, fit on ``"train"`` and score on ``"val"``.
        If true, fit on ``"full_train"`` (timed) and score on ``"test"``.

    Returns
    -------
    float
        Mean accuracy over the seeds when ``return_test_acc`` is false.
    tuple
        ``(mean accuracy, std accuracy, mean fit time, std fit time)`` when
        ``return_test_acc`` is true; times are measured with ``timer()``.
    """
    acc_l = []
    time_l = []

    # The same collator serves every single-sentence warm-up loader.
    collator = DataCollatorWithPadding(tokenizer)

    for seed in seed_l:

        # parameters
        esn_params = {
                    'embedding_weights': 'bert-base-uncased', # TEXT.vocab.vectors,
                    'distribution' : 'gaussian',              # uniform, gaussian
                    'input_dim' : 768,                        # dim of encoding!
                    'reservoir_dim' : reservoir_dim,
                    'bias_scaling' : bias_scaling,
                    'sparsity' : 0.99,
                    'spectral_radius' : spectral_radius,
                    'leaking_rate': leaking_rate,
                    'activation_function' : 'tanh',
                    'input_scaling' : input_scaling,
                    'mean' : 0.0,
                    'std' : 1.0,
                    'learning_algo' : None,
                    'criterion' : None,
                    'optimizer' : None,
                    'merging_strategy' : 'mean',
                    'lexicon' : None,
                    'bidirectional' : False,
                    'device' : device,
                    'seed' : seed
                     }

        # model
        ESN = esn.EchoStateNetwork(**esn_params)

        ESN.learning_algo = la.RidgeRegression(alpha = alpha)# , mode='normalize')

        ESN = ESN.to(device)

        # warm up (new): feed the first few training sentences one at a time
        nb_sentences = 3
        for i in range(nb_sentences):

            single_sentence = dataset_d["train"].select([i])
            warm_up_loader = torch.utils.data.DataLoader(single_sentence,
                                                         batch_size=1,
                                                         collate_fn=collator)

            for batch in warm_up_loader:
                ESN.warm_up(batch)

        # fit, then predict
        if return_test_acc:
            # only the fit on the full training set is timed
            t0 = timer()
            ESN.fit(dataloader_d["full_train"])
            t1 = timer()
            time_l.append(t1 - t0)
            acc = ESN.predict(dataloader_d["test"], verbose=False)[1].item()
        else:
            ESN.fit(dataloader_d["train"])
            acc = ESN.predict(dataloader_d["val"], verbose=False)[1].item()

        acc_l.append(acc)

        # clean objects before the next seed
        del ESN.learning_algo
        del ESN.criterion
        del ESN.merging_strategy
        del ESN
        torch.cuda.empty_cache()

    if return_test_acc:
        return np.mean(acc_l), np.std(acc_l), np.mean(time_l), np.std(time_l)
    else:
        return np.mean(acc_l)

In [13]:
# %%time

# fitness(leaking_rate=0.2, spectral_radius=1.1, input_scaling=0.8, bias_scaling=1.0, alpha=10, reservoir_dim=500, dataset_d=dataset_d, dataloader_d=dataloader_d)

In [14]:
def wrapped_fitness(d, return_test_acc=False):
    """Single-dict adapter around ``fitness``.

    ``d`` must hold the six hyper-parameters listed below; they are unpacked
    and passed on together with the notebook-level ``dataset_d`` and
    ``dataloader_d``. ``return_test_acc`` is forwarded unchanged, and the
    return value is whatever ``fitness`` returns.
    """
    # 'reservoir_dim' is supplied per reservoir size by the optimization loop
    hyperparameter_names = (
        'leaking_rate',
        'spectral_radius',
        'input_scaling',
        'bias_scaling',
        'alpha',
        'reservoir_dim',
    )
    hyperparameters = {name: d[name] for name in hyperparameter_names}

    return fitness(dataset_d=dataset_d,
                   dataloader_d=dataloader_d,
                   return_test_acc=return_test_acc,
                   **hyperparameters)

In [None]:
def _search_space(res_dim):
    """Return the Ax parameter specs: five tuned ranges plus the fixed reservoir size."""
    return [
        {
            "name": "leaking_rate",
            "value_type": "float",
            "type": "range",
            "bounds": [0.0, 0.999],
        },
        {
            "name": "spectral_radius",
            "value_type": "float",
            "type": "range",
            "bounds": [0.2, 1.7],
        },
        {
            "name": "input_scaling",
            "value_type": "float",
            "type": "range",
            "bounds": [0.1, 3.0],
        },
        {
            "name": "bias_scaling",
            "value_type": "float",
            "type": "range",
            "bounds": [0.1, 3.0],
        },
        {
            "name": "alpha",
            "value_type": "float",
            "type": "range",
            "log_scale": True,
            "bounds": [1e-3, 1e3],
        },
        {
            # fixed (not tuned): passed through so that it reaches wrapped_fitness
            "name": "reservoir_dim",
            "value_type": "int",
            "type": "fixed",
            "value": res_dim,
        },
    ]


best_params_d = {}

# One independent optimization per reservoir size.
for res_dim in [500, 1000, 2000, 3000, 5000]:

    best_parameters, best_values, experiment, model = optimize(
            parameters=_search_space(res_dim),
            evaluation_function = wrapped_fitness,
            minimize = False,                   # the objective is an accuracy, so maximize
            objective_name = 'val_accuracy',
            total_trials = 30,
            random_seed = SEED                  # makes the trial sequence repeatable
        )

    # results
    best_params_d[res_dim] = {
        'best_parameters': best_parameters,
        'best_values': best_values,
        'experiment': experiment,
        # 'model': model,
    }

[INFO 05-20 15:49:54] ax.modelbridge.dispatch_utils: Using Bayesian Optimization generation strategy: GenerationStrategy(name='Sobol+GPEI', steps=[Sobol for 6 trials, GPEI for subsequent trials]). Iterations after 6 will take longer to generate due to  model-fitting.
[INFO 05-20 15:49:54] ax.service.managed_loop: Started full optimization with 30 steps.
[INFO 05-20 15:49:54] ax.service.managed_loop: Running optimization trial 1...
[INFO 05-20 15:51:36] ax.service.managed_loop: Running optimization trial 2...
[INFO 05-20 15:53:16] ax.service.managed_loop: Running optimization trial 3...
[INFO 05-20 15:54:55] ax.service.managed_loop: Running optimization trial 4...
[INFO 05-20 15:56:34] ax.service.managed_loop: Running optimization trial 5...


## Results

In [None]:
# best parameters

# open() does not expand '~' and does not create missing folders, so do both first
# (expanduser leaves an already-expanded path unchanged).
results_dir = os.path.expanduser(RESULTS_PATH)
os.makedirs(results_dir, exist_ok=True)

with open(os.path.join(results_dir, PARAMS_FILE), 'wb') as fh:
    pickle.dump(best_params_d, fh)

In [None]:
# # load results
# with open(os.path.join(RESULTS_PATH, PARAMS_FILE), 'rb') as fh:
#     best_params_d = pickle.load(fh)

In [None]:
# Inspect the best parameters, best values and experiment stored for each reservoir size.
best_params_d

In [None]:
# results

results_d = {}

# Iterate over the reservoir sizes that were actually optimized (dicts keep
# insertion order), rather than repeating the list of sizes by hand.
for res_dim, tuned in best_params_d.items():

    # retrain on the full training set and score on the test set
    acc, acc_std, fit_time, fit_time_std = wrapped_fitness(tuned['best_parameters'], return_test_acc=True)
    results_d[res_dim] = acc, acc_std, fit_time, fit_time_std
    print("Experiment finished.")

In [None]:
# Per reservoir size: (mean accuracy, std accuracy, mean fit time, std fit time).
results_d

In [None]:
# open() does not expand '~' and does not create missing folders, so do both first
# (expanduser leaves an already-expanded path unchanged).
results_dir = os.path.expanduser(RESULTS_PATH)
os.makedirs(results_dir, exist_ok=True)

with open(os.path.join(results_dir, RESULTS_FILE), 'wb') as fh:
    pickle.dump(results_d, fh)

In [24]:
# # load results
# with open(os.path.join(RESULTS_PATH, RESULTS_FILE), 'rb') as fh:
#     results_d = pickle.load(fh)

In [25]:
# Display results_d.
# NOTE(review): the recorded output below has the structure of best_params_d
# ('best_parameters', 'best_values', 'experiment'), which suggests the
# parameters pickle was loaded into results_d for that run -- confirm.
results_d

{500: {'best_parameters': {'leaking_rate': 0.999,
   'spectral_radius': 0.27674338131663206,
   'input_scaling': 1.8210240728005092,
   'bias_scaling': 2.9753017004325115,
   'alpha': 999.9999999995317,
   'reservoir_dim': 500},
  'best_values': ({'val_accuracy': 45.76865555148736},
   {'val_accuracy': {'val_accuracy': 0.0004233825293638899}}),
  'experiment': SimpleExperiment(None)},
 1000: {'best_parameters': {'leaking_rate': 0.28903720188652193,
   'spectral_radius': 0.2000000000506923,
   'input_scaling': 1.3797263557182773,
   'bias_scaling': 2.7036067121229217,
   'alpha': 999.999998190359,
   'reservoir_dim': 1000},
  'best_values': ({'val_accuracy': 45.58933193297141},
   {'val_accuracy': {'val_accuracy': 0.019043477688240135}}),
  'experiment': SimpleExperiment(None)},
 2000: {'best_parameters': {'leaking_rate': 0.4445926069873061,
   'spectral_radius': 0.2,
   'input_scaling': 2.236177918889967,
   'bias_scaling': 2.213347976203966,
   'alpha': 999.9999999938606,
   'reservoi

In [27]:
# Didn't work that well...
# Perhaps the label creation from [0, 1] (continuous) to {0, 1, 2, 3, 4} (discrete) is problematic