# Tutorial

This notebook provides a use case example of the ``EsnTorch`` library.
It describes the implementation of the so-called Custom Baseline (CBS)
for text classification on the IMDB dataset.

The instantiation, training and evaluation of the CBS for text classification
is achieved via the following steps:
- Import the required modules
- Create the dataloaders
- Instantiate the CBS by specifying:
    - a reservoir (not recurrent)
    - a loss function
    - a learning algorithm
- Train the CBS
- Evaluate the CBS and report training and testing results

## Libraries

In [1]:
# !pip3 install datasets==1.7.0

In [2]:
# Comment out this cell if the EsnTorch library is already installed
import os
import sys
sys.path.insert(0, os.path.abspath(".."))

In [3]:
# import numpy as np
from sklearn.metrics import classification_report

import torch

from datasets import load_dataset, Dataset, concatenate_datasets

from transformers import AutoTokenizer
from transformers.data.data_collator import DataCollatorWithPadding

import esntorch.core.reservoir as res
import esntorch.core.learning_algo as la
import esntorch.core.merging_strategy as ms
import esntorch.core.esn as esn
import esntorch.core.baseline as bs

## Device and Seed

In [4]:
# Use the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
device

device(type='cuda')

## Load and Tokenize Data

In [5]:
# Custom functions for loading and preparing data

def tokenize(sample):
    """
    Tokenize the ``text`` field of ``sample`` with the module-level ``tokenizer``.

    The tokenizer is called with truncation enabled, no padding, and
    ``return_length=True`` so that a length entry is part of the output.
    The tokenizer's output is returned unchanged.
    """
    return tokenizer(
        sample['text'],
        truncation=True,
        padding=False,
        return_length=True,
    )
    
def load_and_prepare_dataset(dataset_name, split, cache_dir):
    """
    Load a dataset from the HuggingFace ``datasets`` library and tokenize it.

    The ``label`` column is renamed to ``labels``, the samples are tokenized
    in batches with ``tokenize``, the resulting ``length`` column is renamed
    to ``lengths``, and the output format is set to PyTorch for the columns
    ``input_ids``, ``attention_mask``, ``labels`` and ``lengths``.

    Parameters
    ----------
    dataset_name : str
        Name forwarded to ``load_dataset`` (e.g. ``'imdb'``).
    split : str or None
        Split forwarded to ``load_dataset``.
    cache_dir : str
        Directory forwarded to ``load_dataset`` for caching the data.

    Returns
    -------
    The object returned by ``load_dataset`` after renaming, tokenization
    and formatting.
    """

    # Load dataset (use the caller-supplied cache directory, not a global)
    dataset = load_dataset(dataset_name, split=split, cache_dir=cache_dir)

    # Rename label column (for tokenization purposes)
    dataset = dataset.rename_column('label', 'labels')

    # Tokenize data
    dataset = dataset.map(tokenize, batched=True)
    dataset = dataset.rename_column('length', 'lengths')
    dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels', 'lengths'])

    return dataset

In [6]:
# Load BERT tokenizer
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

# Directory where the HuggingFace `datasets` library caches its files.
# Kept relative so the notebook does not depend on one user's machine;
# change it to suit your setup.
CACHE_DIR = 'cache_dir/'  # put your path here

# Load and prepare data (split=None requests every available split)
full_dataset = load_and_prepare_dataset('imdb', split=None, cache_dir=CACHE_DIR)

# Sort each split by token count (the `lengths` column)
train_dataset = full_dataset['train'].sort("lengths")
test_dataset = full_dataset['test'].sort("lengths")

# Create dict of all datasets
dataset_d = {
    'train': train_dataset,
    'test': test_dataset
    }

Reusing dataset imdb (/raid/home/jeremiec/huggingface_datasets/imdb/plain_text/1.0.0/2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1)


  0%|          | 0/3 [00:00<?, ?it/s]

Loading cached processed dataset at /raid/home/jeremiec/huggingface_datasets/imdb/plain_text/1.0.0/2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1/cache-b0be00ab1a0eeba4.arrow
Loading cached processed dataset at /raid/home/jeremiec/huggingface_datasets/imdb/plain_text/1.0.0/2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1/cache-9b3c67140e21108b.arrow
Loading cached processed dataset at /raid/home/jeremiec/huggingface_datasets/imdb/plain_text/1.0.0/2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1/cache-373655659cfd37d5.arrow
Loading cached sorted indices for dataset at /raid/home/jeremiec/huggingface_datasets/imdb/plain_text/1.0.0/2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1/cache-b3d04800af07ba43.arrow
Loading cached sorted indices for dataset at /raid/home/jeremiec/huggingface_datasets/imdb/plain_text/1.0.0/2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1/cache-74355403e93a3952.arrow


In [7]:
dataset_d

{'train': Dataset({
     features: ['attention_mask', 'input_ids', 'labels', 'lengths', 'text', 'token_type_ids'],
     num_rows: 25000
 }),
 'test': Dataset({
     features: ['attention_mask', 'input_ids', 'labels', 'lengths', 'text', 'token_type_ids'],
     num_rows: 25000
 })}

In [8]:
# Build one DataLoader per split, using the same keys as `dataset_d`.
# Each loader batches 256 samples and collates them with DataCollatorWithPadding.
dataloader_d = {
    split_name: torch.utils.data.DataLoader(
        split_dataset,
        batch_size=256,
        collate_fn=DataCollatorWithPadding(tokenizer),
    )
    for split_name, split_dataset in dataset_d.items()
}

In [9]:
dataloader_d

{'train': <torch.utils.data.dataloader.DataLoader at 0x7f29f71bceb0>,
 'test': <torch.utils.data.dataloader.DataLoader at 0x7f29f2a4f7f0>}

## Model

In [10]:
# CBS parameters: keyword arguments forwarded to bs.CustomBaseline below.
# NOTE(review): the recorded output of this cell prints
# "Invalid distribution of reservoir ('uniform' or 'gaussian')..." --
# presumably a distribution setting is expected among these parameters;
# confirm against the CustomBaseline signature.
cbs_params = {
            'embedding_weights': 'bert-base-uncased', # name of the pretrained embedding model (alternative: TEXT.vocab.vectors)
            'input_dim' : 768,                        # dim of BERT encoding!
            'reservoir_dim' : 1000,
            'bias_scaling' : 1.0, # 1.0
            'input_scaling' : 1.0,
            'activation_function' : 'relu',           # 'tanh' or 'relu'
            #'learning_algo' : None, # initialized below
            #'criterion' : None,     # initialized below
            #'optimizer' : None,     # initialized below
            'merging_strategy' : 'mean',
            'bidirectional' : False, # or True
            'device' : device,
            'seed' : 42
             }

# Instantiate the CBS
CBS = bs.CustomBaseline(**cbs_params)

# Define the learning algo of the CBS (ridge regression with alpha=10)
CBS.learning_algo = la.RidgeRegression(alpha=10)

# Put the CBS on the device (CPU or GPU)
CBS = CBS.to(device)

Invalid distribution of reservoir ('uniform' or 'gaussian')...


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Model downloaded: bert-base-uncased


## Training

In [11]:
%%time
# Train the CBS on the training dataloader.
# NOTE(review): the recorded output ends with a KeyboardInterrupt, so the
# saved run did not complete training; rerun this cell to completion.
CBS.fit(dataloader_d["train"])

lenghts tensor([11, 13, 15, 16, 19, 21, 22, 22, 24, 25, 26, 27, 27, 28, 28, 28, 29, 30,
        30, 31, 32, 32, 32, 32, 33, 33, 34, 34, 34, 34, 35, 36, 36, 36, 36, 37,
        37, 37, 37, 38, 38, 39, 39, 39, 39, 39, 39, 40, 40, 40, 40, 40, 41, 41,
        41, 41, 41, 41, 42, 42, 42, 42, 42, 42, 43, 43, 43, 43, 43, 43, 44, 44,
        44, 44, 44, 44, 44, 44, 44, 44, 44, 45, 46, 46, 46, 46, 46, 46, 46, 46,
        46, 46, 46, 46, 46, 46, 46, 46, 46, 47, 47, 47, 47, 47, 47, 47, 47, 47,
        47, 47, 47, 47, 47, 47, 47, 47, 47, 48, 48, 48, 48, 48, 48, 48, 48, 48,
        48, 48, 48, 48, 48, 48, 49, 49, 49, 49, 49, 49, 49, 49, 50, 50, 50, 50,
        50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50,
        51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51,
        51, 51, 51, 51, 51, 51, 51, 51, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52,
        52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 53, 53, 53, 53,
        53, 53, 53, 53, 53, 53, 

KeyboardInterrupt: 

## Results

In [12]:
# Train predictions and accuracy
# CBS.predict is unpacked into (predictions, accuracy); .item() is called on
# the accuracy to display it (presumably a 0-dim tensor -- TODO confirm).
# NOTE(review): the recorded output shows a TypeError here, after the
# KeyboardInterrupt in the training cell -- presumably the model was never
# fitted in that run; confirm by rerunning training first.
train_pred, train_acc = CBS.predict(dataloader_d["train"], verbose=False)
train_acc.item()

lenghts tensor([11, 13, 15, 16, 19, 21, 22, 22, 24, 25, 26, 27, 27, 28, 28, 28, 29, 30,
        30, 31, 32, 32, 32, 32, 33, 33, 34, 34, 34, 34, 35, 36, 36, 36, 36, 37,
        37, 37, 37, 38, 38, 39, 39, 39, 39, 39, 39, 40, 40, 40, 40, 40, 41, 41,
        41, 41, 41, 41, 42, 42, 42, 42, 42, 42, 43, 43, 43, 43, 43, 43, 44, 44,
        44, 44, 44, 44, 44, 44, 44, 44, 44, 45, 46, 46, 46, 46, 46, 46, 46, 46,
        46, 46, 46, 46, 46, 46, 46, 46, 46, 47, 47, 47, 47, 47, 47, 47, 47, 47,
        47, 47, 47, 47, 47, 47, 47, 47, 47, 48, 48, 48, 48, 48, 48, 48, 48, 48,
        48, 48, 48, 48, 48, 48, 49, 49, 49, 49, 49, 49, 49, 49, 50, 50, 50, 50,
        50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50,
        51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51,
        51, 51, 51, 51, 51, 51, 51, 51, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52,
        52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 53, 53, 53, 53,
        53, 53, 53, 53, 53, 53, 

TypeError: mm(): argument 'mat2' (position 2) must be Tensor, not NoneType

In [None]:
# Test predictions and accuracy
# CBS.predict is unpacked into (predictions, accuracy); .item() is called on
# the accuracy to display it (presumably a 0-dim tensor -- TODO confirm).
test_pred, test_acc = CBS.predict(dataloader_d["test"], verbose=False)
test_acc.item()

In [None]:
# Test classification report
# scikit-learn expects the ground-truth labels first and the predictions
# second; passing them the other way round exchanges precision and recall.
print(classification_report(y_true=dataset_d['test']['labels'].tolist(),
                            y_pred=test_pred.tolist(),
                            digits=4))