# Baseline TREC-6: TEXT Classification + BERT + Ax

## Libraries

In [None]:
# !pip install transformers==4.8.2
# !pip install datasets==1.7.0
# !pip install ax-platform==0.1.20

In [1]:
import os
import sys

In [2]:
import io
import re
import pickle
from timeit import default_timer as timer

import numpy as np

import torch
import torch.nn as nn
import torch.optim as optim

from datasets import load_dataset, Dataset, concatenate_datasets
from transformers import AutoTokenizer
from transformers import BertModel
from transformers.data.data_collator import DataCollatorWithPadding

from ax import optimize
from ax.plot.contour import plot_contour
from ax.plot.trace import optimization_trace_single_method
from ax.service.managed_loop import optimize
from ax.utils.notebook.plotting import render, init_notebook_plotting

import esntorch.core.reservoir as res
import esntorch.core.learning_algo as la
import esntorch.core.merging_strategy as ms
import esntorch.core.baseline as bs

In [3]:
%config Completer.use_jedi = False
%load_ext autoreload
%autoreload 2

In [4]:
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
device

device(type='cpu')

In [5]:
# Random seed for the experiment.
SEED = 42

## Global variables

In [6]:
# Paths are expanded up front: open() and os.path.join() do not resolve '~',
# so an unexpanded path would point at a literal directory named '~'.
RESULTS_PATH = os.path.expanduser('~/Results/Ax_results/Baseline')  # path of your result folder
# NOTE(review): 'huggignface' looks like a typo for 'huggingface'; kept as-is so
# an existing cache directory is still found.
CACHE_DIR = os.path.expanduser('~/Data/huggignface/')               # path of your HuggingFace datasets cache folder

# File names (inside RESULTS_PATH) for the pickled optimization summary and
# for the final test results.
PARAMS_FILE = 'trec-6_baseline_params.pkl'
RESULTS_FILE = 'trec-6_baseline_results.pkl'

## Dataset

In [7]:
# rename correct column as 'labels': depends on the dataset you load

def load_and_enrich_dataset(dataset_name, split, cache_dir):
    """Load a dataset split, tokenize it and add a per-sample length column.

    Steps:
      1. load ``dataset_name``/``split`` with ``datasets.load_dataset``;
      2. rename the 'label-coarse' column to 'labels';
      3. tokenize the 'text' column with the module-level ``tokenizer``
         (truncation enabled, no padding);
      4. expose 'input_ids', 'attention_mask' and 'labels' as torch tensors;
      5. add a 'lengths' column counting the non-zero token ids of each sample.

    Args:
        dataset_name: name passed to ``load_dataset`` (e.g. 'trec').
        split: split passed to ``load_dataset`` (e.g. 'train', 'test').
        cache_dir: directory passed to ``load_dataset`` as its cache location.

    Returns:
        The enriched dataset.

    Note:
        Relies on a global ``tokenizer`` being defined before the call.
    """

    # Honor the caller-supplied cache directory (previously the global
    # CACHE_DIR was used and this argument was silently ignored).
    dataset = load_dataset(dataset_name, split=split, cache_dir=cache_dir)

    dataset = dataset.rename_column('label-coarse', 'labels') # 'label-fine'
    dataset = dataset.map(lambda e: tokenizer(e['text'], truncation=True, padding=False), batched=True)
    dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

    def add_lengths(sample):
        # Count token ids different from 0 with a single tensor reduction.
        # NOTE(review): 0 is presumably the padding id; since no padding is
        # applied above this should equal the token count -- confirm.
        sample["lengths"] = (sample["input_ids"] != 0).sum()
        return sample

    dataset = dataset.map(add_lengths, batched=False)

    return dataset

In [None]:
# Tokenizer shared by every split; load_and_enrich_dataset reads it as a global.
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

# Every split is sorted by the 'lengths' column.
full_train_dataset = load_and_enrich_dataset('trec', split='train', cache_dir=CACHE_DIR).sort("lengths")
# Seed the shuffle so the 80/20 train/validation partition is identical on every run.
train_val_datasets = full_train_dataset.train_test_split(train_size=0.8, shuffle=True, seed=SEED)
train_dataset = train_val_datasets['train'].sort("lengths")
val_dataset = train_val_datasets['test'].sort("lengths")

test_dataset = load_and_enrich_dataset('trec', split='test', cache_dir=CACHE_DIR).sort("lengths")

dataset_d = {
    'full_train': full_train_dataset,
    'train': train_dataset,
    'val': val_dataset,
    'test': test_dataset
    }

# One padding collator is enough for all loaders.
collator = DataCollatorWithPadding(tokenizer)
dataloader_d = {
    split_name: torch.utils.data.DataLoader(split_dataset, batch_size=256, collate_fn=collator)
    for split_name, split_dataset in dataset_d.items()
}

In [9]:
# Display the split-name -> Dataset mapping.
dataset_d

{'full_train': Dataset({
     features: ['attention_mask', 'input_ids', 'label-fine', 'labels', 'lengths', 'text', 'token_type_ids'],
     num_rows: 5452
 }),
 'train': Dataset({
     features: ['attention_mask', 'input_ids', 'label-fine', 'labels', 'lengths', 'text', 'token_type_ids'],
     num_rows: 4361
 }),
 'val': Dataset({
     features: ['attention_mask', 'input_ids', 'label-fine', 'labels', 'lengths', 'text', 'token_type_ids'],
     num_rows: 1091
 }),
 'test': Dataset({
     features: ['attention_mask', 'input_ids', 'label-fine', 'labels', 'lengths', 'text', 'token_type_ids'],
     num_rows: 500
 })}

## Optimization

In [12]:
def fitness(alpha, 
            dataset_d, 
            dataloader_d, 
            return_test_acc=False):
    """Fit the BERT baseline with a ridge-regression read-out and score it.

    The model is always fitted on ``dataloader_d["train"]``. It is then
    evaluated on ``dataloader_d["test"]`` when ``return_test_acc`` is true,
    and on ``dataloader_d["val"]`` otherwise.

    Args:
        alpha: value passed as ``alpha`` to ``la.RidgeRegression``.
        dataset_d: unused by this function; kept for interface compatibility.
        dataloader_d: dict of data loaders with keys "train", "val" and "test".
        return_test_acc: selects the evaluation split and the return shape.

    Returns:
        ``(acc, fit_seconds)`` when ``return_test_acc`` is true, else ``acc``.
        ``acc`` is element ``[1]`` of ``ESN.predict(...)`` converted with
        ``.item()`` (presumably the accuracy -- confirm against esntorch).
    """

    # parameters
    esn_params = {
                'embedding': 'bert-base-uncased', # TEXT.vocab.vectors,
                'input_dim' : 768,                        # dim of encoding!
                'learning_algo' : None,
                'criterion' : None,
                'optimizer' : None,
                'merging_strategy' : 'mean',
                'lexicon' : None,
                'bidirectional' : False,
                'device': device,
                'seed' : 42
                 }

    # model
    ESN = bs.Baseline(**esn_params)

    ESN.learning_algo = la.RidgeRegression(alpha = alpha)# , mode='normalize')

    ESN = ESN.to(device)

    eval_split = "test" if return_test_acc else "val"

    try:
        # fit once (timed), then predict on the selected split
        t0 = timer()
        ESN.fit(dataloader_d["train"])
        t1 = timer()
        acc = ESN.predict(dataloader_d[eval_split], verbose=False)[1].item()
    finally:
        # clean objects, even when fit/predict raised, so memory is released
        del ESN.learning_algo
        del ESN.criterion
        del ESN.merging_strategy
        del ESN
        torch.cuda.empty_cache()

    if return_test_acc:
        return acc, t1 - t0
    return acc

In [13]:
# %%time

# fitness(alpha=10, dataset_d=dataset_d, dataloader_d=dataloader_d)

Invalid distribution of reservoir ('uniform' or 'gaussian')...
Activation function unknown...


86.06782531738281

In [14]:
def wrapped_fitness(d, return_test_acc=False):
    """Call ``fitness`` from a parameter dictionary.

    Reads ``d['alpha']`` and forwards it, together with the module-level
    ``dataset_d`` and ``dataloader_d``, to ``fitness``.

    Args:
        d: dict containing the key 'alpha'.
        return_test_acc: forwarded unchanged to ``fitness``.

    Returns:
        Whatever ``fitness`` returns for these arguments.
    """
    call_kwargs = {
        'alpha': d['alpha'],
        'dataset_d': dataset_d,
        'dataloader_d': dataloader_d,
        'return_test_acc': return_test_acc,
    }
    return fitness(**call_kwargs)

In [15]:
best_params_d = {}

# Search alpha on a log scale, maximizing the value returned by wrapped_fitness.
best_parameters, best_values, experiment, model = optimize(
        parameters=[
          {
            "name": "alpha",
            "value_type": "float",
            "type": "range",
            "log_scale": True,
            "bounds": [1e-3, 1e3],
          }
        ],
        # Wrap in a one-argument callable: Ax passes an arm weight as the second
        # positional argument of two-parameter evaluation functions, which would
        # otherwise land in wrapped_fitness's `return_test_acc`.
        evaluation_function = lambda parameterization: wrapped_fitness(parameterization),
        minimize = False,
        objective_name = 'val_accuracy',
        total_trials = 10,
        # Seed the generation strategy so the search is reproducible.
        random_seed = SEED,
    )

# results
best_params_d['best_parameters'] = best_parameters
best_params_d['best_values'] = best_values
best_params_d['experiment'] = experiment
# `model` is not stored in best_params_d.

[INFO 05-30 11:41:47] ax.modelbridge.dispatch_utils: Using Bayesian Optimization generation strategy: GenerationStrategy(name='Sobol+GPEI', steps=[Sobol for 5 trials, GPEI for subsequent trials]). Iterations after 5 will take longer to generate due to  model-fitting.
[INFO 05-30 11:41:47] ax.service.managed_loop: Started full optimization with 10 steps.
[INFO 05-30 11:41:47] ax.service.managed_loop: Running optimization trial 1...


Invalid distribution of reservoir ('uniform' or 'gaussian')...
Activation function unknown...


[INFO 05-30 11:41:59] ax.service.managed_loop: Running optimization trial 2...


Invalid distribution of reservoir ('uniform' or 'gaussian')...
Activation function unknown...


[INFO 05-30 11:42:11] ax.service.managed_loop: Running optimization trial 3...


Invalid distribution of reservoir ('uniform' or 'gaussian')...
Activation function unknown...


[INFO 05-30 11:42:22] ax.service.managed_loop: Running optimization trial 4...


Invalid distribution of reservoir ('uniform' or 'gaussian')...
Activation function unknown...


[INFO 05-30 11:42:34] ax.service.managed_loop: Running optimization trial 5...


Invalid distribution of reservoir ('uniform' or 'gaussian')...
Activation function unknown...


[INFO 05-30 11:42:46] ax.service.managed_loop: Running optimization trial 6...


Invalid distribution of reservoir ('uniform' or 'gaussian')...
Activation function unknown...


[INFO 05-30 11:42:58] ax.service.managed_loop: Running optimization trial 7...


Invalid distribution of reservoir ('uniform' or 'gaussian')...
Activation function unknown...


[INFO 05-30 11:43:11] ax.service.managed_loop: Running optimization trial 8...


Invalid distribution of reservoir ('uniform' or 'gaussian')...
Activation function unknown...


[INFO 05-30 11:43:23] ax.service.managed_loop: Running optimization trial 9...


Invalid distribution of reservoir ('uniform' or 'gaussian')...
Activation function unknown...


[INFO 05-30 11:43:36] ax.service.managed_loop: Running optimization trial 10...


Invalid distribution of reservoir ('uniform' or 'gaussian')...
Activation function unknown...


## Results

In [19]:
# best parameters

# open() does not resolve '~', so expand it, and make sure the folder exists.
results_dir = os.path.expanduser(RESULTS_PATH)
os.makedirs(results_dir, exist_ok=True)

with open(os.path.join(results_dir, PARAMS_FILE), 'wb') as fh:
    pickle.dump(best_params_d, fh)

In [20]:
# # load results
# with open(os.path.join(RESULTS_PATH, PARAMS_FILE), 'rb') as fh:
#     best_params_d = pickle.load(fh)

In [21]:
# Display the optimization summary (best parameters, best values, experiment).
best_params_d

{'best_parameters': {'alpha': 4.358376639402463},
 'best_values': ({'val_accuracy': 86.66276397071313},
  {'val_accuracy': {'val_accuracy': 0.0008038403067139381}}),
 'experiment': SimpleExperiment(None)}

In [22]:
# results

# Refit with the best parameters and evaluate on the test split; with
# return_test_acc=True the wrapper's result is an (accuracy, fit time) pair.
best_parameters = best_params_d['best_parameters']
results_tuple = wrapped_fitness(best_parameters, return_test_acc=True)
acc, time = results_tuple
print("Experiment finished.")

Invalid distribution of reservoir ('uniform' or 'gaussian')...
Activation function unknown...
Experiment finished.


In [23]:
# Display the (accuracy, fit time) pair from the final run.
results_tuple

(89.80000305175781, 7.3243069420568645)

In [25]:
# open() does not resolve '~', so expand it, and make sure the folder exists.
results_dir = os.path.expanduser(RESULTS_PATH)
os.makedirs(results_dir, exist_ok=True)

with open(os.path.join(results_dir, RESULTS_FILE), 'wb') as fh:
    pickle.dump(results_tuple, fh)