# Baseline SST-2: TEXT Classification + BERT + Ax

## Libraries

- Need ``datasets==1.7.0``
- Need ``ax-platform==0.1.20``

Install them from the command line if necessary.

In [1]:
import os
import sys

In [2]:
import io
import re
import pickle
from timeit import default_timer as timer

import numpy as np

import torch
import torch.nn as nn
import torch.optim as optim

from datasets import load_dataset, Dataset, concatenate_datasets
from transformers import AutoTokenizer
from transformers import BertModel
from transformers.data.data_collator import DataCollatorWithPadding

from ax import optimize
from ax.plot.contour import plot_contour
from ax.plot.trace import optimization_trace_single_method
from ax.service.managed_loop import optimize
from ax.utils.notebook.plotting import render, init_notebook_plotting

import esntorch.core.reservoir as res
import esntorch.core.learning_algo as la
import esntorch.core.merging_strategy as ms
import esntorch.core.baseline as bs

In [3]:
%config Completer.use_jedi = False
%load_ext autoreload
%autoreload 2

In [4]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device  # shown as the cell output

device(type='cpu')

In [5]:
# Seed constant for the experiment.
# NOTE(review): no visible cell reads SEED (`fitness` hard-codes 'seed': 42 and
# the Ax `optimize` call passes no seed) — wire it in or remove it.
SEED = 42

## Global variables

In [7]:
# Result and cache locations.
# `~` is expanded here because the built-in open() used when pickling the
# results does not expand it and would look for a literal "~" directory.
RESULTS_PATH = os.path.expanduser('~/Results/Ax_results/Baseline')  # path of your result folder
CACHE_DIR = os.path.expanduser('~/Data/huggignface/')               # path of your Hugging Face datasets cache folder

# Names of the pickle files written inside RESULTS_PATH.
PARAMS_FILE = 'sst-2_baseline_params.pkl'    # optimisation summary (best parameters, values, experiment)
RESULTS_FILE = 'sst-2_baseline_results.pkl'  # final (test accuracy, fit time) tuple

## Dataset

In [25]:
# rename correct column as 'labels': depends on the dataset you load

def load_and_enrich_dataset(*dataset_name, split, cache_dir):
    """Load one dataset split and prepare it for the BERT baseline.

    Steps, in order:
      1. load the split with ``load_dataset``;
      2. rename a ``sentence`` column to ``text`` when present;
      3. replace the ``-LRB-`` / ``-RRB-`` / ``\\/`` markers in the text;
      4. rename ``label`` to ``labels``;
      5. tokenize with the module-level ``tokenizer`` (truncation, no padding);
      6. expose ``input_ids``, ``attention_mask`` and ``labels`` as torch tensors;
      7. add a ``lengths`` column.

    Args:
        *dataset_name: positional arguments forwarded to ``load_dataset``
            (e.g. ``'glue', 'sst2'``).
        split: name of the split to load (e.g. ``'train'``).
        cache_dir: cache directory forwarded to ``load_dataset``.

    Returns:
        The processed dataset.

    Note:
        Relies on a module-level ``tokenizer`` that must exist before the call.
    """

    # Use the caller-supplied cache directory (the global CACHE_DIR used to be
    # read here, silently ignoring the `cache_dir` argument).
    dataset = load_dataset(*dataset_name, split=split, cache_dir=cache_dir)

    def clean(example):
        # Restore the parentheses and slashes that are escaped in the raw text.
        example['text'] = example['text'].replace('-LRB-', '(').replace('-RRB-', ')').replace(r'\/', r'/')
        return example

    # Some sources name the text column 'sentence'; normalise it to 'text'.
    if 'sentence' in dataset.column_names:
        dataset = dataset.rename_column('sentence', 'text')

    dataset = dataset.map(clean)
    dataset = dataset.rename_column('label', 'labels')

    # No padding at this stage: sequences keep their own length.
    dataset = dataset.map(lambda e: tokenizer(e['text'], truncation=True, padding=False), batched=True)

    dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

    def add_lengths(sample):
        # Number of non-zero token ids (assumes 0 is the padding id — TODO confirm).
        sample["lengths"] = sum(sample["input_ids"] != 0)
        return sample

    dataset = dataset.map(add_lengths, batched=False)

    return dataset

In [26]:
# Tokenizer used (as a module-level name) by load_and_enrich_dataset.
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

# Train / validation splits come from GLUE SST-2; each split is sorted by its
# "lengths" column once it has been loaded and enriched.
train_dataset = load_and_enrich_dataset('glue', 'sst2', split='train', cache_dir=CACHE_DIR)
train_dataset = train_dataset.sort("lengths")

val_dataset = load_and_enrich_dataset('glue', 'sst2', split='validation', cache_dir=CACHE_DIR)
val_dataset = val_dataset.sort("lengths")

# The test split is taken from the 'gpt3mix/sst2' dataset instead.
test_dataset = load_and_enrich_dataset('gpt3mix/sst2', split='test', cache_dir=CACHE_DIR)
test_dataset = test_dataset.sort("lengths")
def revert(example):
    """Flip the binary label of `example` in place and return the example."""
    flipped = np.abs(example['labels'] - 1)  # |label - 1| maps 0 -> 1 and 1 -> 0
    example['labels'] = flipped
    return example
# Flip the labels of the test set.
test_dataset = test_dataset.map(revert)

# Splits by role; 'full_train' and 'train' both point at the same training set.
dataset_d = dict(
    full_train=train_dataset,
    train=train_dataset,
    val=val_dataset,
    test=test_dataset,
)

# One DataLoader per split; each batch is padded by its own collator.
dataloader_d = {
    split_name: torch.utils.data.DataLoader(
        split_dataset,
        batch_size=256,
        collate_fn=DataCollatorWithPadding(tokenizer),
    )
    for split_name, split_dataset in dataset_d.items()
}

Reusing dataset glue (/raid/home/jeremiec/huggingface_datasets/glue/sst2/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)


HBox(children=(FloatProgress(value=0.0, max=67349.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=68.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=67349.0), HTML(value='')))

Reusing dataset glue (/raid/home/jeremiec/huggingface_datasets/glue/sst2/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)





HBox(children=(FloatProgress(value=0.0, max=872.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=872.0), HTML(value='')))

Using custom data configuration default
Reusing dataset ss_t2 (/raid/home/jeremiec/huggingface_datasets/ss_t2/default/0.0.0/90167692658fa4abca2ffa3ede1a43a71e2bf671078c5c275c64c4231d5a62fa)





HBox(children=(FloatProgress(value=0.0, max=1821.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=2.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1821.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1821.0), HTML(value='')))




In [27]:
# Inspect the prepared splits (features and number of rows).
dataset_d

{'full_train': Dataset({
     features: ['attention_mask', 'idx', 'input_ids', 'labels', 'lengths', 'text', 'token_type_ids'],
     num_rows: 67349
 }),
 'train': Dataset({
     features: ['attention_mask', 'idx', 'input_ids', 'labels', 'lengths', 'text', 'token_type_ids'],
     num_rows: 67349
 }),
 'val': Dataset({
     features: ['attention_mask', 'idx', 'input_ids', 'labels', 'lengths', 'text', 'token_type_ids'],
     num_rows: 872
 }),
 'test': Dataset({
     features: ['attention_mask', 'input_ids', 'labels', 'lengths', 'text', 'token_type_ids'],
     num_rows: 1821
 })}

## Optimization

In [11]:
def fitness(alpha, 
            dataset_d, 
            dataloader_d, 
            return_test_acc=False):
    """Fit the BERT baseline with a ridge-regression read-out and score it.

    Args:
        alpha: regularisation strength passed to ``la.RidgeRegression``.
        dataset_d: dictionary of datasets; accepted for interface
            compatibility but not read by this function.
        dataloader_d: dictionary of dataloaders. Reads the ``"train"`` and
            ``"val"`` entries, or ``"full_train"`` and ``"test"`` when
            ``return_test_acc`` is true.
        return_test_acc: when False (default), fit on ``"train"`` and score
            on ``"val"``; when True, fit on ``"full_train"``, score on
            ``"test"`` and also time the fit.

    Returns:
        The accuracy as a Python number, or the tuple
        ``(accuracy, fit_duration)`` when ``return_test_acc`` is true, where
        ``fit_duration`` is the ``timer()`` difference around ``fit``.
    """

    # parameters
    esn_params = {
                'embedding_weights': 'bert-base-uncased', # TEXT.vocab.vectors,
                'input_dim' : 768,                        # dim of encoding!
                'learning_algo' : None,
                'criterion' : None,
                'optimizer' : None,
                'merging_strategy' : 'mean',
                'lexicon' : None,
                'bidirectional' : False,
                'device' : device,
                'seed' : 42
                 }

    # model
    ESN = bs.Baseline(**esn_params)

    # The read-out is attached after construction (learning_algo is None above).
    ESN.learning_algo = la.RidgeRegression(alpha = alpha)# , mode='normalize')

    ESN = ESN.to(device)

    # fit + predict
    # The value returned by fit() is deliberately not bound to a name: it was
    # never used, and holding it would keep it referenced during the cleanup
    # below.
    if return_test_acc:
        t0 = timer()
        ESN.fit(dataloader_d["full_train"])
        t1 = timer()
        # Element [1] of predict()'s return value is used as the accuracy.
        acc = ESN.predict(dataloader_d["test"], verbose=False)[1].item()
    else:
        ESN.fit(dataloader_d["train"])
        acc = ESN.predict(dataloader_d["val"], verbose=False)[1].item()

    # clean objects: drop the model and release cached GPU memory so that
    # repeated calls (one per optimisation trial) do not accumulate memory.
    del ESN.learning_algo
    del ESN.criterion
    del ESN.merging_strategy
    del ESN
    torch.cuda.empty_cache()
    
    if return_test_acc:
        return acc, t1 - t0 
    else:
        return acc

In [12]:
# %%time

# fitness(alpha=10, dataset_d=dataset_d, dataloader_d=dataloader_d)

In [13]:
def wrapped_fitness(d, return_test_acc=False):
    """Adapter between an Ax-style parameter dict and `fitness`.

    Reads `d['alpha']` and forwards it, together with the module-level
    `dataset_d` and `dataloader_d`, to `fitness`; returns whatever
    `fitness` returns.
    """
    call_kwargs = {
        'alpha': d['alpha'],
        'dataset_d': dataset_d,
        'dataloader_d': dataloader_d,
        'return_test_acc': return_test_acc,
    }
    return fitness(**call_kwargs)

In [14]:
best_params_d = {}

# Search the ridge regularisation strength `alpha` on a log scale, maximising
# the validation accuracy returned by `wrapped_fitness`.
best_parameters, best_values, experiment, model = optimize(
        parameters=[
          {
            "name": "alpha",
            "value_type": "float",
            "type": "range",
            "log_scale": True,
            "bounds": [1e-3, 1e3],
          }
        ],
        # called once per trial with a {'alpha': value} dict
        evaluation_function = wrapped_fitness,
        minimize = False,
        objective_name = 'val_accuracy',
        total_trials = 10,
        # fixed seed so the generated trials are repeatable across runs
        random_seed = SEED,
    )

# results
best_params_d['best_parameters'] = best_parameters
best_params_d['best_values'] = best_values
best_params_d['experiment'] = experiment
# the returned `model` is not stored in best_params_d

[INFO 06-13 17:00:33] ax.modelbridge.dispatch_utils: Using Bayesian Optimization generation strategy: GenerationStrategy(name='Sobol+GPEI', steps=[Sobol for 5 trials, GPEI for subsequent trials]). Iterations after 5 will take longer to generate due to  model-fitting.
[INFO 06-13 17:00:33] ax.service.managed_loop: Started full optimization with 10 steps.
[INFO 06-13 17:00:33] ax.service.managed_loop: Running optimization trial 1...


Invalid distribution of reservoir ('uniform' or 'gaussian')...
Activation function unknown...


[INFO 06-13 17:02:28] ax.service.managed_loop: Running optimization trial 2...


Invalid distribution of reservoir ('uniform' or 'gaussian')...
Activation function unknown...


[INFO 06-13 17:04:19] ax.service.managed_loop: Running optimization trial 3...


Invalid distribution of reservoir ('uniform' or 'gaussian')...
Activation function unknown...


[INFO 06-13 17:06:11] ax.service.managed_loop: Running optimization trial 4...


Invalid distribution of reservoir ('uniform' or 'gaussian')...
Activation function unknown...


[INFO 06-13 17:08:02] ax.service.managed_loop: Running optimization trial 5...


Invalid distribution of reservoir ('uniform' or 'gaussian')...
Activation function unknown...


[INFO 06-13 17:09:55] ax.service.managed_loop: Running optimization trial 6...


Invalid distribution of reservoir ('uniform' or 'gaussian')...
Activation function unknown...


[INFO 06-13 17:11:47] ax.service.managed_loop: Running optimization trial 7...

A not p.d., added jitter of 1.0e-08 to the diagonal



Invalid distribution of reservoir ('uniform' or 'gaussian')...
Activation function unknown...


[INFO 06-13 17:13:38] ax.service.managed_loop: Running optimization trial 8...

A not p.d., added jitter of 1.0e-08 to the diagonal



Invalid distribution of reservoir ('uniform' or 'gaussian')...
Activation function unknown...


[INFO 06-13 17:15:28] ax.service.managed_loop: Running optimization trial 9...


Invalid distribution of reservoir ('uniform' or 'gaussian')...
Activation function unknown...


[INFO 06-13 17:17:19] ax.service.managed_loop: Running optimization trial 10...


Invalid distribution of reservoir ('uniform' or 'gaussian')...
Activation function unknown...


## Results

In [18]:
# best parameters

# Pickle the optimisation summary to RESULTS_PATH/PARAMS_FILE.
params_path = os.path.join(RESULTS_PATH, PARAMS_FILE)
with open(params_path, 'wb') as fh:
    pickle.dump(best_params_d, fh)

In [28]:
# # load results
# with open(os.path.join(RESULTS_PATH, PARAMS_FILE), 'rb') as fh:
#     best_params_d = pickle.load(fh)

In [30]:
# Inspect the optimisation summary: best parameters, best values and the experiment.
best_params_d

{'best_parameters': {'alpha': 0.41197415070304855},
 'best_values': ({'val_accuracy': 85.78022198433894},
  {'val_accuracy': {'val_accuracy': 2.3731999495468357e-06}}),
 'experiment': SimpleExperiment(None)}

In [31]:
# results

# Re-fit with the best parameters on the full training set and score on the
# test set; the call returns (accuracy, fit duration).
best_parameters = best_params_d['best_parameters']
results_tuple = wrapped_fitness(best_parameters, return_test_acc=True)
acc, time = results_tuple
print("Experiment finished.")

Invalid distribution of reservoir ('uniform' or 'gaussian')...
Activation function unknown...
Experiment finished.


In [32]:
# (test accuracy, duration of the fit as measured by timer())
results_tuple

(85.33773040771484, 105.52347468817607)

In [33]:
# Pickle the final (accuracy, fit duration) tuple to RESULTS_PATH/RESULTS_FILE.
results_path = os.path.join(RESULTS_PATH, RESULTS_FILE)
with open(results_path, 'wb') as fh:
    pickle.dump(results_tuple, fh)