# TREC-50: TEXT Classification + BERT + Ax

## Libraries

- Need ``datasets==1.7.0``
- Need ``ax-platform==0.1.20``

Install them from the command line if necessary, e.g. ``pip install datasets==1.7.0 ax-platform==0.1.20``.

In [2]:
import os
import sys

In [5]:
import io
import re
import pickle
from timeit import default_timer as timer

import numpy as np

import torch
import torch.nn as nn
import torch.optim as optim

from datasets import load_dataset, Dataset, concatenate_datasets
from transformers import AutoTokenizer
from transformers import BertModel
from transformers.data.data_collator import DataCollatorWithPadding

from ax import optimize
from ax.plot.contour import plot_contour
from ax.plot.trace import optimization_trace_single_method
from ax.service.managed_loop import optimize
from ax.utils.notebook.plotting import render, init_notebook_plotting

import esntorch.core.reservoir as res
import esntorch.core.learning_algo as la
import esntorch.core.merging_strategy as ms
import esntorch.core.esn as esn

In [6]:
# Turn off Jedi-based tab completion in IPython.
%config Completer.use_jedi = False
# Load the autoreload extension; mode 2 reloads all modules before each cell runs,
# so edits to the local esntorch sources are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [7]:
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
device

device(type='cpu')

In [8]:
# Fixed random seed for the notebook's stochastic steps
# (TODO confirm that every random consumer actually receives it).
SEED = 42

## Global variables

In [7]:
# Output and cache locations.
# '~' is expanded explicitly: open() and os.path.join() treat it as a literal
# directory name, so the unexpanded form would not point at the home directory.
RESULTS_PATH = os.path.expanduser('~/Results/Ax_results/ESN') # path of your result folder
CACHE_DIR = os.path.expanduser('~/Data/huggignface/')         # path of your dataset cache folder

# File names (joined onto RESULTS_PATH) of the pickled optimization outputs.
PARAMS_FILE = 'trec-50_opt_params.pkl'
RESULTS_FILE = 'trec-50_opt_results.pkl'

## Dataset

In [8]:
# rename correct column as 'labels': depends on the dataset you load

def load_and_enrich_dataset(dataset_name, split, cache_dir):
    """Load one split of a dataset, tokenize it and add a ``lengths`` column.

    Steps, in order:
      1. load ``split`` of ``dataset_name`` with ``datasets.load_dataset``;
      2. rename the ``'label-fine'`` column to ``'labels'``;
      3. tokenize the ``'text'`` column with the module-level ``tokenizer``
         (truncation on, no padding);
      4. expose ``input_ids``, ``attention_mask`` and ``labels`` in torch format;
      5. add ``lengths``: the number of non-zero entries of ``input_ids``.

    Args:
        dataset_name: dataset identifier passed to ``load_dataset``.
        split: split name passed to ``load_dataset`` (e.g. ``'train'``).
        cache_dir: directory passed to ``load_dataset`` as its cache.

    Returns:
        The enriched dataset.

    Note:
        Relies on a module-level ``tokenizer`` being defined before the call.
    """
    # Honor the caller-supplied cache directory (the global CACHE_DIR was
    # previously used here, silently ignoring the argument).
    dataset = load_dataset(dataset_name, split=split, cache_dir=cache_dir)

    dataset = dataset.rename_column('label-fine', 'labels') # 'label-coarse'
    dataset = dataset.map(lambda e: tokenizer(e['text'], truncation=True, padding=False), batched=True)
    dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

    def add_lengths(sample):
        # Counts ids different from 0 — assumes 0 is the padding id and that
        # input_ids arrives as a tensor (TODO confirm).
        sample["lengths"] = sum(sample["input_ids"] != 0)
        return sample

    dataset = dataset.map(add_lengths, batched=False)

    return dataset

In [9]:
# Tokenizer shared by dataset preparation and by every padding collator below.
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

# Every split is sorted by "lengths" so that each batch groups samples of similar length.
full_train_dataset = load_and_enrich_dataset('trec', split='train', cache_dir=CACHE_DIR).sort("lengths")

# 80/20 train/validation split. The shuffle is seeded with SEED so that the
# split (and therefore every validation accuracy) is reproducible across runs.
train_val_datasets = full_train_dataset.train_test_split(train_size=0.8, shuffle=True, seed=SEED)
train_dataset = train_val_datasets['train'].sort("lengths")
val_dataset = train_val_datasets['test'].sort("lengths")

test_dataset = load_and_enrich_dataset('trec', split='test', cache_dir=CACHE_DIR).sort("lengths")

dataset_d = {
    'full_train': full_train_dataset,
    'train': train_dataset,
    'val': val_dataset,
    'test': test_dataset
    }

# One collator instance is enough for all loaders; it pads each batch on the fly.
padding_collator = DataCollatorWithPadding(tokenizer)

dataloader_d = {
    split_name: torch.utils.data.DataLoader(split_dataset, batch_size=256, collate_fn=padding_collator)
    for split_name, split_dataset in dataset_d.items()
}

Using custom data configuration default
Reusing dataset trec (/raid/home/jeremiec/huggingface_datasets/trec/default/1.1.0/751da1ab101b8d297a3d6e9c79ee9b0173ff94c4497b75677b59b61d5467a9b9)


HBox(children=(FloatProgress(value=0.0, max=6.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=5452.0), HTML(value='')))




Using custom data configuration default
Reusing dataset trec (/raid/home/jeremiec/huggingface_datasets/trec/default/1.1.0/751da1ab101b8d297a3d6e9c79ee9b0173ff94c4497b75677b59b61d5467a9b9)


HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=500.0), HTML(value='')))




In [10]:
# Display the prepared splits (features and row counts).
dataset_d

{'full_train': Dataset({
     features: ['attention_mask', 'input_ids', 'label-coarse', 'labels', 'lengths', 'text', 'token_type_ids'],
     num_rows: 5452
 }),
 'train': Dataset({
     features: ['attention_mask', 'input_ids', 'label-coarse', 'labels', 'lengths', 'text', 'token_type_ids'],
     num_rows: 4361
 }),
 'val': Dataset({
     features: ['attention_mask', 'input_ids', 'label-coarse', 'labels', 'lengths', 'text', 'token_type_ids'],
     num_rows: 1091
 }),
 'test': Dataset({
     features: ['attention_mask', 'input_ids', 'label-coarse', 'labels', 'lengths', 'text', 'token_type_ids'],
     num_rows: 500
 })}

## Optimization

In [11]:
def fitness(leaking_rate, 
            spectral_radius, 
            input_scaling, 
            bias_scaling, 
            alpha, 
            reservoir_dim, 
            dataset_d, 
            dataloader_d, 
            seed_l=(1991, 420, 666, 1979, 7), # 5 seeds
            return_test_acc=False):
    """Train and score an echo state network once per seed and aggregate the scores.

    For every seed a fresh ``esn.EchoStateNetwork`` is built with a
    ``la.RidgeRegression(alpha=alpha)`` readout, warmed up on the first three
    training sentences, fitted, and evaluated.

    Args:
        leaking_rate, spectral_radius, input_scaling, bias_scaling,
        reservoir_dim: forwarded to the ``EchoStateNetwork`` constructor.
        alpha: forwarded to ``la.RidgeRegression``.
        dataset_d: dict of datasets; only ``dataset_d["train"]`` is read (warm-up).
        dataloader_d: dict of dataloaders with keys ``"train"``, ``"val"``,
            ``"full_train"`` and ``"test"``.
        seed_l: iterable of seeds, one model per seed.
        return_test_acc: if truthy, fit on ``"full_train"`` and score on
            ``"test"``; otherwise fit on ``"train"`` and score on ``"val"``.

    Returns:
        ``np.mean`` of the per-seed accuracies when ``return_test_acc`` is falsy;
        otherwise the tuple ``(mean accuracy, std accuracy, mean fit time,
        std fit time)``, with times measured by ``timeit.default_timer``.

    Raises:
        ValueError: if ``seed_l`` is empty (the mean would otherwise be ``nan``).

    Note:
        Relies on the module-level ``tokenizer`` and ``device``.
    """
    seeds = list(seed_l)
    if not seeds:
        raise ValueError("seed_l must contain at least one seed")

    acc_l = []
    time_l = []

    # The collator does not depend on the seed, so it is built once and shared
    # by all warm-up dataloaders.
    warm_up_collator = DataCollatorWithPadding(tokenizer)
    nb_sentences = 3

    for seed in seeds:
    
        # parameters
        esn_params = {
                    'embedding_weights': 'bert-base-uncased', # TEXT.vocab.vectors,
                    'distribution' : 'uniform',              # uniform, gaussian
                    'input_dim' : 768,                        # dim of encoding!
                    'reservoir_dim' : reservoir_dim,
                    'bias_scaling' : bias_scaling,
                    'sparsity' : 0.99,
                    'spectral_radius' : spectral_radius,
                    'leaking_rate': leaking_rate,
                    'activation_function' : 'tanh',
                    'input_scaling' : input_scaling,
                    'mean' : 0.0,
                    'std' : 1.0,
                    'learning_algo' : None,
                    'criterion' : None,
                    'optimizer' : None,
                    'merging_strategy' : 'mean',
                    'lexicon' : None,
                    'bidirectional' : True, # False
                    'device' : device,
                    'seed' : seed
                     }

        # model
        ESN = esn.EchoStateNetwork(**esn_params)

        ESN.learning_algo = la.RidgeRegression(alpha = alpha)# , mode='normalize')

        ESN = ESN.to(device)

        # warm up (new): feed the first training sentences one at a time
        for i in range(nb_sentences): 

            single_sentence = dataset_d["train"].select([i])
            dataloader_tmp = torch.utils.data.DataLoader(single_sentence, 
                                                         batch_size=1, 
                                                         collate_fn=warm_up_collator)  

            for batch in dataloader_tmp:
                ESN.warm_up(batch)
        
        # fit + predict; the value returned by fit() is not needed here
        if return_test_acc:
            t0 = timer()
            ESN.fit(dataloader_d["full_train"])
            t1 = timer()
            time_l.append(t1 - t0)
            acc = ESN.predict(dataloader_d["test"], verbose=False)[1].item()
        else:
            ESN.fit(dataloader_d["train"])
            acc = ESN.predict(dataloader_d["val"], verbose=False)[1].item()

        acc_l.append(acc)
        
        # clean objects so memory is released before the next seed
        del ESN.learning_algo
        del ESN.criterion
        del ESN.merging_strategy
        del ESN
        torch.cuda.empty_cache()
    
    if return_test_acc:
        return np.mean(acc_l), np.std(acc_l), np.mean(time_l), np.std(time_l)
    else:
        return np.mean(acc_l)

In [12]:
# %%time

# fitness(leaking_rate=0.2, spectral_radius=1.1, input_scaling=0.8, bias_scaling=1.0, alpha=10, reservoir_dim=500, dataset_d=dataset_d, dataloader_d=dataloader_d)

In [13]:
def wrapped_fitness(d, return_test_acc=False):
    """Adapter between a hyper-parameter dict and ``fitness``.

    Reads the six tuned hyper-parameters from ``d`` and calls ``fitness`` with
    them plus the module-level ``dataset_d`` and ``dataloader_d``.

    Args:
        d: dict with keys ``leaking_rate``, ``spectral_radius``,
            ``input_scaling``, ``bias_scaling``, ``alpha`` and ``reservoir_dim``.
        return_test_acc: forwarded unchanged to ``fitness``.

    Returns:
        Whatever ``fitness`` returns for these arguments.

    NOTE(review): Ax's managed loop appears to call two-parameter evaluation
    functions as ``f(parameterization, weight)``, so ``return_test_acc`` may
    receive the trial weight there — confirm before changing this signature.
    """
    tuned_names = ('leaking_rate',
                   'spectral_radius',
                   'input_scaling',
                   'bias_scaling',
                   'alpha',
                   'reservoir_dim')  # reservoir_dim is fixed per optimization loop
    hyperparams = {name: d[name] for name in tuned_names}

    return fitness(**hyperparams,
                   dataset_d=dataset_d,
                   dataloader_d=dataloader_d,
                   return_test_acc=return_test_acc)

In [14]:
# *** WARNING *** DO NOT EXECUTE THE NEXT CELLS IN BIDIRECTIONAL MODE (OPTIMIZATION ALREADY DONE)

In [None]:
# Bayesian hyper-parameter search with Ax, run once per reservoir size.
# Results are collected in best_params_d, keyed by reservoir dimension.
best_params_d = {}

for res_dim in [500, 1000, 3000, 5000]:

    best_parameters, best_values, experiment, model = optimize(
            parameters=[
              {
                "name": "leaking_rate",
                "value_type": "float",
                "type": "range",
                "bounds": [0.0, 0.999],
              },
              {
                "name": "spectral_radius",
                "value_type": "float",
                "type": "range",
                "bounds": [0.2, 1.7],
              },
              {
                "name": "input_scaling",
                "value_type": "float",
                "type": "range",
                "bounds": [0.1, 3.0],
              },
              {
                "name": "bias_scaling",
                "value_type": "float",
                "type": "range",
                "bounds": [0.1, 3.0],
              },
              {
                "name": "alpha",
                "value_type": "float",
                "type": "range",
                "log_scale": True,  # searched on a log scale
                "bounds": [1e-3, 1e3],
              },
              {
                # not tuned: held fixed to the reservoir size of this iteration
                "name": "reservoir_dim",
                "value_type": "int",
                "type": "fixed",
                "value": res_dim,
              }
            ],
            # objective: accuracy returned by wrapped_fitness, to be maximized
            evaluation_function = wrapped_fitness,
            minimize = False,
            objective_name = 'val_accuracy',
            total_trials = 40,
            # seed the trial generation so the search is reproducible
            random_seed = SEED
        )
    
    # results
    best_params_d[res_dim] = {}
    best_params_d[res_dim]['best_parameters'] = best_parameters
    best_params_d[res_dim]['best_values'] = best_values
    best_params_d[res_dim]['experiment'] = experiment
    # best_params_d[res_dim]['model'] = model

[INFO 06-30 06:33:03] ax.modelbridge.dispatch_utils: Using Bayesian Optimization generation strategy: GenerationStrategy(name='Sobol+GPEI', steps=[Sobol for 6 trials, GPEI for subsequent trials]). Iterations after 6 will take longer to generate due to  model-fitting.
[INFO 06-30 06:33:03] ax.service.managed_loop: Started full optimization with 40 steps.
[INFO 06-30 06:33:03] ax.service.managed_loop: Running optimization trial 1...
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequence

## Results

In [18]:
# Persist the best parameters found for every reservoir size.
params_path = os.path.join(RESULTS_PATH, PARAMS_FILE)

with open(params_path, mode='wb') as params_fh:
    pickle.dump(best_params_d, params_fh)

In [23]:
# best_params_d

In [None]:
# Reload the saved best parameters.
# NOTE: pickle.load can execute arbitrary code; only open files you produced yourself.
params_path = os.path.join(RESULTS_PATH, PARAMS_FILE)

with open(params_path, mode='rb') as params_fh:
    best_params_d = pickle.load(params_fh)

In [None]:
# best_params_d

In [20]:
# Test-set evaluation: for each reservoir size, refit with the best parameters
# and store (mean accuracy, accuracy std, mean fit time, fit time std).

results_d = {}

for res_dim in [500, 1000, 3000, 5000]:

    best_parameters = best_params_d[res_dim]['best_parameters']
    test_stats = wrapped_fitness(best_parameters, return_test_acc=True)
    mean_acc, std_acc, mean_fit_time, std_fit_time = test_stats
    results_d[res_dim] = (mean_acc, std_acc, mean_fit_time, std_fit_time)
    print("Experiment finished.")

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.pr

Experiment finished.


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.pr

Experiment finished.


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.pr

Experiment finished.


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.pr

Experiment finished.


In [24]:
# Display the results per reservoir size: (accuracy mean, accuracy std, fit time mean, fit time std).
results_d

{500: (78.12000274658203,
  0.765244628092488,
  11.35018432578072,
  0.33659645269729005),
 1000: (80.36000366210938,
  0.2939396827814494,
  11.383961046859621,
  0.16903909171625667),
 3000: (82.4800048828125,
  0.7858756654361689,
  24.536760101187973,
  0.26341985473393625),
 5000: (86.00000305175782,
  0.5059657525733958,
  58.35188771234825,
  2.0243483393869184)}

In [25]:
# Persist the test-set results for every reservoir size.
results_path = os.path.join(RESULTS_PATH, RESULTS_FILE)

with open(results_path, mode='wb') as results_fh:
    pickle.dump(results_d, results_fh)

In [None]:
# Reload previously saved results.
# NOTE: pickle.load can execute arbitrary code; only open files you produced yourself.
# NOTE(review): this file name differs from RESULTS_FILE — confirm it is the intended file.
bidirectional_results_path = os.path.join(RESULTS_PATH, 'trec-50_bidirectional_results.pkl')

with open(bidirectional_results_path, mode='rb') as results_fh:
    results_d = pickle.load(results_fh)

In [None]:
# Display the results reloaded from disk.
results_d

In [None]:
# The optimization did not work well, probably because the validation set is too small to represent all 50 classes.