# AGnews-4: TEXT Classification + BERT + Ax

## Libraries

In [2]:
import os
import sys

In [5]:
import io
import re
import pickle
from timeit import default_timer as timer

import numpy as np

import torch
import torch.nn as nn
import torch.optim as optim

from datasets import load_dataset, Dataset, concatenate_datasets
from transformers import AutoTokenizer
from transformers import BertModel
from transformers.data.data_collator import DataCollatorWithPadding

from ax import optimize
from ax.plot.contour import plot_contour
from ax.plot.trace import optimization_trace_single_method
from ax.service.managed_loop import optimize
from ax.utils.notebook.plotting import render, init_notebook_plotting

import esntorch.core.reservoir as res
import esntorch.core.learning_algo as la
import esntorch.core.merging_strategy as ms
import esntorch.core.esn as esn

In [6]:
%config Completer.use_jedi = False
%load_ext autoreload
%autoreload 2

In [7]:
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(device_name)
device

device(type='cpu')

In [8]:
SEED = 42

## Global variables

In [1]:
# `open()` and `os.path.join()` do not expand '~', so resolve the home directory here;
# otherwise the pickle dumps/loads below look for a literal '~' directory.
RESULTS_PATH = os.path.expanduser('~/Results/Ax_results/ESN') # path of your result folder
CACHE_DIR = os.path.expanduser('~/Data/huggignface/')         # path of your Hugging Face datasets cache folder

# File names (inside RESULTS_PATH) for the tuned hyper-parameters and the final test results.
PARAMS_FILE = 'AGnews-4_params.pkl'
RESULTS_FILE = 'AGnews-4_results.pkl'

## Datasets

In [9]:
# rename correct column as 'labels': depends on the dataset you load

def load_and_enrich_dataset(dataset_name, split, cache_dir):
    """Load a dataset split, tokenize it and add a per-sample ``lengths`` column.

    Relies on the module-level ``tokenizer``, which must be defined before
    this function is called.

    Parameters
    ----------
    dataset_name : str
        Name forwarded to ``load_dataset``.
    split : str
        Split forwarded to ``load_dataset`` (e.g. 'train', 'test').
    cache_dir : str
        Directory forwarded to ``load_dataset`` as its ``cache_dir``.

    Returns
    -------
    The loaded dataset with the 'text' field tokenized, the 'label' column
    renamed to 'labels', torch formatting applied to 'input_ids',
    'attention_mask' and 'labels', and an extra 'lengths' column.
    """

    # Honour the caller-supplied cache directory (previously the global CACHE_DIR
    # was used and this argument was silently ignored).
    dataset = load_dataset(dataset_name, split=split, cache_dir=cache_dir)

    # Tokenize without padding: padding is applied later, per batch, by the collator.
    dataset = dataset.map(lambda e: tokenizer(e['text'], truncation=True, padding=False), batched=True)
    dataset = dataset.rename_column('label', 'labels') # 'label-fine'
    dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

    def add_lengths(sample):
        """Store in ``sample["lengths"]`` the number of ``input_ids`` entries that differ from 0."""
        # NOTE(review): 0 is presumably the padding token id of the tokenizer — confirm.
        sample["lengths"] = sum(sample["input_ids"] != 0)
        return sample

    dataset = dataset.map(add_lengths, batched=False)

    return dataset

In [10]:
# NOTE(review): machine-specific absolute path; it overrides the CACHE_DIR defined in the
# "Global variables" cell. Adjust it to your own environment.
CACHE_DIR = '/raid/home/jeremiec/huggingface_datasets'
BATCH_SIZE = 256

tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

# Datasets are sorted by "lengths" so that each batch groups samples of similar length.
full_train_dataset = load_and_enrich_dataset('ag_news', split='train', cache_dir=CACHE_DIR).sort("lengths") # toriving/sst5
# Seed the shuffle so the 80/20 train/validation split is the same on every run.
train_val_datasets = full_train_dataset.train_test_split(train_size=0.8, shuffle=True, seed=SEED)
train_dataset = train_val_datasets['train'].sort("lengths")
val_dataset = train_val_datasets['test'].sort("lengths")

test_dataset = load_and_enrich_dataset('ag_news', split='test', cache_dir=CACHE_DIR).sort("lengths")

dataset_d = {
    'full_train': full_train_dataset,
    'train': train_dataset,
    'val': val_dataset,
    'test': test_dataset
    }

# One dataloader per dataset; a single collator instance pads each batch.
padding_collator = DataCollatorWithPadding(tokenizer)
dataloader_d = {
    name: torch.utils.data.DataLoader(dataset, batch_size=BATCH_SIZE, collate_fn=padding_collator)
    for name, dataset in dataset_d.items()
}

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1780.0, style=ProgressStyle(description…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1227.0, style=ProgressStyle(description…

Using custom data configuration default
Reusing dataset ag_news (/raid/home/jeremiec/huggingface_datasets/ag_news/default/0.0.0/0eeeaaa5fb6dffd81458e293dfea1adba2881ffcbdc3fb56baeb5a892566c29a)





HBox(children=(FloatProgress(value=0.0, max=120.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=120000.0), HTML(value='')))




Using custom data configuration default
Reusing dataset ag_news (/raid/home/jeremiec/huggingface_datasets/ag_news/default/0.0.0/0eeeaaa5fb6dffd81458e293dfea1adba2881ffcbdc3fb56baeb5a892566c29a)


HBox(children=(FloatProgress(value=0.0, max=8.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=7600.0), HTML(value='')))




In [11]:
dataset_d

{'full_train': Dataset({
     features: ['attention_mask', 'input_ids', 'labels', 'lengths', 'text', 'token_type_ids'],
     num_rows: 120000
 }),
 'train': Dataset({
     features: ['attention_mask', 'input_ids', 'labels', 'lengths', 'text', 'token_type_ids'],
     num_rows: 96000
 }),
 'val': Dataset({
     features: ['attention_mask', 'input_ids', 'labels', 'lengths', 'text', 'token_type_ids'],
     num_rows: 24000
 }),
 'test': Dataset({
     features: ['attention_mask', 'input_ids', 'labels', 'lengths', 'text', 'token_type_ids'],
     num_rows: 7600
 })}

## Optimization

In [13]:
def fitness(leaking_rate, 
            spectral_radius, 
            input_scaling, 
            bias_scaling, 
            alpha, 
            reservoir_dim, 
            dataset_d, 
            dataloader_d, 
            seed_l=(1991, 420, 666, 1979, 7), # 5 seeds
            return_test_acc=False):
    """Train and evaluate one echo state network per seed and aggregate the accuracies.

    Relies on the module-level ``tokenizer`` and ``device``.

    Parameters
    ----------
    leaking_rate, spectral_radius, input_scaling, bias_scaling, reservoir_dim
        Forwarded as-is to ``esn.EchoStateNetwork``.
    alpha
        Forwarded to ``la.RidgeRegression``.
    dataset_d : dict
        Must contain a 'train' dataset; its first sentences are used for warm-up.
    dataloader_d : dict
        Must contain the 'train' and 'val' dataloaders, plus 'full_train' and
        'test' when ``return_test_acc`` is True.
    seed_l : iterable of int
        Seeds; one model is built, fitted and evaluated per seed.
    return_test_acc : bool
        If False, fit on 'train' and evaluate on 'val'.
        If True, fit on 'full_train' (timed) and evaluate on 'test'.

    Returns
    -------
    float
        Mean accuracy over the seeds, when ``return_test_acc`` is False.
    tuple
        ``(mean accuracy, std accuracy, mean fit time, std fit time)`` when
        ``return_test_acc`` is True; times come from ``timeit.default_timer``.
    """

    acc_l = []
    time_l = []

    # Number of training sentences fed to the reservoir before fitting.
    nb_sentences = 3

    for seed in seed_l:

        # parameters
        esn_params = {
                    'embedding_weights': 'bert-base-uncased', # TEXT.vocab.vectors,
                    'distribution' : 'uniform',               # uniform, gaussian
                    'input_dim' : 768,                        # dim of encoding!
                    'reservoir_dim' : reservoir_dim,
                    'bias_scaling' : bias_scaling,
                    'sparsity' : 0.99,
                    'spectral_radius' : spectral_radius,
                    'leaking_rate': leaking_rate,
                    'activation_function' : 'tanh',
                    'input_scaling' : input_scaling,
                    'mean' : 0.0,
                    'std' : 1.0,
                    'learning_algo' : None,
                    'criterion' : None,
                    'optimizer' : None,
                    'merging_strategy' : 'mean',
                    'lexicon' : None,
                    'bidirectional' : True, # False
                    'device' : device,
                    'seed' : seed
                     }

        # model
        ESN = esn.EchoStateNetwork(**esn_params)

        ESN.learning_algo = la.RidgeRegression(alpha = alpha)# , mode='normalize')

        ESN = ESN.to(device)

        # warm up (new): feed the first training sentences one at a time
        for i in range(nb_sentences):

            warm_up_dataset = dataset_d["train"].select([i])
            warm_up_loader = torch.utils.data.DataLoader(warm_up_dataset,
                                                         batch_size=1,
                                                         collate_fn=DataCollatorWithPadding(tokenizer))

            for warm_up_batch in warm_up_loader:
                ESN.warm_up(warm_up_batch)

        # fit + predict; the second element returned by `predict` is used as the accuracy
        if return_test_acc:
            t0 = timer()
            ESN.fit(dataloader_d["full_train"])
            t1 = timer()
            time_l.append(t1 - t0)
            acc = ESN.predict(dataloader_d["test"], verbose=False)[1].item()
        else:
            ESN.fit(dataloader_d["train"])
            acc = ESN.predict(dataloader_d["val"], verbose=False)[1].item()

        acc_l.append(acc)

        # clean objects: drop the model and release cached GPU memory before the next seed
        del ESN.learning_algo
        del ESN.criterion
        del ESN.merging_strategy
        del ESN
        torch.cuda.empty_cache()

    if return_test_acc:
        return np.mean(acc_l), np.std(acc_l), np.mean(time_l), np.std(time_l)
    else:
        return np.mean(acc_l)

In [14]:
# %%time

# fitness(leaking_rate=0.2, spectral_radius=1.1, input_scaling=0.8, bias_scaling=1.0, alpha=10, reservoir_dim=500, dataset_d=dataset_d, dataloader_d=dataloader_d)

In [15]:
def wrapped_fitness(d, return_test_acc=False):
    """Adapter calling ``fitness`` with hyper-parameters taken from the dict ``d``.

    ``d`` must provide the keys 'leaking_rate', 'spectral_radius',
    'input_scaling', 'bias_scaling', 'alpha' and 'reservoir_dim'. The
    module-level ``dataset_d`` and ``dataloader_d`` are passed along, and
    ``return_test_acc`` is forwarded unchanged. Returns whatever ``fitness``
    returns.
    """
    hyperparameter_names = (
        'leaking_rate',
        'spectral_radius',
        'input_scaling',
        'bias_scaling',
        'alpha',
        'reservoir_dim', # will be in the loop
    )
    hyperparameters = {name: d[name] for name in hyperparameter_names}

    return fitness(dataset_d=dataset_d,
                   dataloader_d=dataloader_d,
                   return_test_acc=return_test_acc,
                   **hyperparameters)

In [None]:
# *** WARNING *** DO NOT EXECUTE NEXT CELLS IF BIDIRECTIONAL MODE (OPTIM ALREADY DONE)

In [None]:
# Float hyper-parameters searched on a linear scale: (name, lower bound, upper bound).
LINEAR_FLOAT_RANGES = [
    ("leaking_rate", 0.0, 0.999),
    ("spectral_radius", 0.2, 1.7),
    ("input_scaling", 0.1, 3.0),
    ("bias_scaling", 0.1, 3.0),
]

best_params_d = {}

# One independent Ax optimization (40 trials, maximizing 'val_accuracy') per reservoir size.
for res_dim in [500, 1000, 3000, 5000]:

    search_space = [
        {"name": name, "value_type": "float", "type": "range", "bounds": [lower, upper]}
        for name, lower, upper in LINEAR_FLOAT_RANGES
    ]
    # The ridge penalty is searched on a log scale.
    search_space.append({
        "name": "alpha",
        "value_type": "float",
        "type": "range",
        "log_scale": True,
        "bounds": [1e-3, 1e3],
    })
    # The reservoir size is held fixed within a run.
    search_space.append({
        "name": "reservoir_dim",
        "value_type": "int",
        "type": "fixed",
        "value": res_dim,
    })

    best_parameters, best_values, experiment, model = optimize(
        parameters=search_space,
        evaluation_function=wrapped_fitness,
        minimize=False,
        objective_name='val_accuracy',
        total_trials=40,
    )

    # results (the fitted Ax model is intentionally not stored)
    best_params_d[res_dim] = {
        'best_parameters': best_parameters,
        'best_values': best_values,
        'experiment': experiment,
    }

[INFO 05-30 21:40:50] ax.modelbridge.dispatch_utils: Using Bayesian Optimization generation strategy: GenerationStrategy(name='Sobol+GPEI', steps=[Sobol for 6 trials, GPEI for subsequent trials]). Iterations after 6 will take longer to generate due to  model-fitting.
[INFO 05-30 21:40:50] ax.service.managed_loop: Started full optimization with 40 steps.
[INFO 05-30 21:40:50] ax.service.managed_loop: Running optimization trial 1...
[INFO 05-30 22:05:45] ax.service.managed_loop: Running optimization trial 2...
[INFO 05-30 22:30:37] ax.service.managed_loop: Running optimization trial 3...


## Results

In [None]:
# best parameters

# Persist the per-reservoir-size optimization outcome as a pickle file.
params_file_path = os.path.join(RESULTS_PATH, PARAMS_FILE)
with open(params_file_path, 'wb') as params_fh:
    pickle.dump(best_params_d, params_fh)

In [16]:
# # load results
# with open(os.path.join(RESULTS_PATH, PARAMS_FILE), 'rb') as fh:
#     best_params_d = pickle.load(fh)

In [17]:
best_params_d

{500: {'best_parameters': {'leaking_rate': 0.9040444911438811,
   'spectral_radius': 0.20000000000000126,
   'input_scaling': 0.10000000000005109,
   'bias_scaling': 0.10000000000001198,
   'alpha': 0.07885384448822817,
   'reservoir_dim': 500},
  'best_values': ({'val_accuracy': 90.43231354512487},
   {'val_accuracy': {'val_accuracy': 0.010369366007225113}}),
  'experiment': SimpleExperiment(None)},
 1000: {'best_parameters': {'leaking_rate': 0.7754061698629553,
   'spectral_radius': 0.2,
   'input_scaling': 0.10000000001325081,
   'bias_scaling': 2.1421750283029133,
   'alpha': 0.0011090268898778523,
   'reservoir_dim': 1000},
  'best_values': ({'val_accuracy': 90.97954458186517},
   {'val_accuracy': {'val_accuracy': 7.343272431617016e-05}}),
  'experiment': SimpleExperiment(None)},
 3000: {'best_parameters': {'leaking_rate': 0.5478866628014272,
   'spectral_radius': 1.050724183703662,
   'input_scaling': 0.5926856626014334,
   'bias_scaling': 3.0,
   'alpha': 0.305741473239415,
   '

In [18]:
# results

# For each reservoir size, re-run the best hyper-parameters with return_test_acc=True and keep
# (mean accuracy, accuracy std, mean fit time, fit time std).
results_d = {}

for res_dim in (500, 1000, 3000, 5000):

    tuned_parameters = best_params_d[res_dim]['best_parameters']
    mean_acc, std_acc, mean_time, std_time = wrapped_fitness(tuned_parameters, return_test_acc=True)
    results_d[res_dim] = (mean_acc, std_acc, mean_time, std_time)
    print("Experiment finished.")

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.decoder.weig

Experiment finished.


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.decoder.weig

Experiment finished.


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.decoder.weig

Experiment finished.


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.decoder.weig

Experiment finished.


In [22]:
results_d

{500: (90.73946990966797,
  0.11632705449085054,
  408.56813515666875,
  1.1455081803793339),
 1000: (91.22894134521485,
  0.04894974472903971,
  407.6003307029605,
  1.0327152541312996),
 3000: (91.58420867919922,
  0.08221293855280151,
  429.60137262716887,
  0.9028311085525002),
 5000: (91.67894287109375,
  0.053025874842887864,
  500.0101929595694,
  2.9950660369119095)}

In [23]:
# Persist the evaluation results as a pickle file.
results_file_path = os.path.join(RESULTS_PATH, RESULTS_FILE)
with open(results_file_path, 'wb') as results_fh:
    pickle.dump(results_d, results_fh)

In [24]:
results_d

{500: (90.73946990966797,
  0.11632705449085054,
  408.56813515666875,
  1.1455081803793339),
 1000: (91.22894134521485,
  0.04894974472903971,
  407.6003307029605,
  1.0327152541312996),
 3000: (91.58420867919922,
  0.08221293855280151,
  429.60137262716887,
  0.9028311085525002),
 5000: (91.67894287109375,
  0.053025874842887864,
  500.0101929595694,
  2.9950660369119095)}

In [34]:
# # load results
# with open(os.path.join(RESULTS_PATH, RESULTS_FILE), 'rb') as fh:
#     results_d = pickle.load(fh)

In [35]:
results_d

{500: (90.10525970458984,
  0.10945550684716364,
  297.90255975187756,
  3.3313685829787247),
 1000: (90.73420715332031,
  0.17562757065845663,
  297.3748939599842,
  4.823391432702744),
 3000: (91.11578521728515,
  0.13959970852840675,
  298.7870469949674,
  4.94682598925355),
 5000: (91.30262756347656,
  0.1755489999248224,
  304.08307738858275,
  5.16690769692509)}

In [10]:
# load results
# Read the results back from the same file the notebook writes (RESULTS_FILE), rather than
# repeating the file name as a literal.
# NOTE: pickle.load can execute arbitrary code; only load files you produced yourself.
with open(os.path.join(RESULTS_PATH, RESULTS_FILE), 'rb') as fh:
    results = pickle.load(fh)

In [11]:
results

{500: (90.10525970458984,
  0.10945550684716364,
  297.90255975187756,
  3.3313685829787247),
 1000: (90.73420715332031,
  0.17562757065845663,
  297.3748939599842,
  4.823391432702744),
 3000: (91.11578521728515,
  0.13959970852840675,
  298.7870469949674,
  4.94682598925355),
 5000: (91.30262756347656,
  0.1755489999248224,
  304.08307738858275,
  5.16690769692509)}