# Tutorial

This notebook provides a use case example of the ``EsnTorch`` library.
It describes the implementation of an Echo State Network (ESN)
for text classification on the IMDB dataset.

The instantiation, training and evaluation of an ESN for text classification
is achieved via the following steps:
- Import the required modules
- Create the dataloaders
- Instantiate the ESN by specifying:
    - a reservoir
    - a loss function
    - a learning algorithm
- Train the ESN
- Training and testing results

## Libraries

In [4]:
# !pip install -U scikit-learn
# !pip install transformers==4.8.2
# !pip install datasets==1.7.0
# !pip install ipywidgets

To display tqdm progress bars in JupyterLab, run the following steps (commands 1-4 in a terminal):
1. `conda install -c conda-forge nodejs`
2. `jupyter labextension install @jupyter-widgets/jupyterlab-manager`
3. `jupyter nbextension enable --py widgetsnbextension`
4. `jupyter lab clean`
5. Refresh web page...

In [5]:
# Put the parent of the current working directory first on the import path,
# so the `esntorch` imports below resolve against the local checkout.
# Comment this cell out if the library is installed.
import os
import sys
sys.path.insert(0, os.path.abspath(".."))

In [6]:
# import numpy as np
from sklearn.metrics import classification_report

import time

import numpy as np

from sklearn.linear_model import RidgeClassifier

import torch

from datasets import load_dataset, Dataset, concatenate_datasets

from transformers import AutoTokenizer
from transformers.data.data_collator import DataCollatorWithPadding

import esntorch.core.reservoir as res
import esntorch.core.learning_algo as la
import esntorch.core.merging_strategy as ms
import esntorch.core.esn as esn

In [7]:
%load_ext autoreload
%autoreload 2

## Device and Seed

In [8]:
# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
device

device(type='cuda')

## Load and Tokenize Data

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report

In [10]:
# Custom functions for loading and preparing data

def tokenize(sample):
    """Run the module-level ``tokenizer`` on ``sample['text']``.

    Sequences are truncated but not padded, and the tokenizer is asked to
    return the length of each encoded sequence alongside its other outputs.
    The tokenizer output is returned as is.
    """
    return tokenizer(sample['text'], truncation=True, padding=False, return_length=True)

def load_and_prepare_dataset(dataset_name, split, cache_dir):
    """
    Load a dataset from the HuggingFace ``datasets`` library and tokenize it.

    Parameters
    ----------
    dataset_name : str
        Name forwarded to ``load_dataset`` (e.g. ``'imdb'``).
    split : str or None
        Split specification forwarded to ``load_dataset``.
    cache_dir : str
        Cache directory forwarded to ``load_dataset``.

    Returns
    -------
    The object returned by ``load_dataset`` after renaming ``label`` to
    ``labels``, mapping ``tokenize`` over it in batched mode, and renaming
    the resulting ``length`` column to ``lengths``.
    """

    # Honour the caller-supplied cache directory. The previous version read
    # the global CACHE_DIR here, silently ignoring the `cache_dir` argument
    # (and raising NameError if that global had not been defined yet).
    dataset = load_dataset(dataset_name, split=split, cache_dir=cache_dir)

    # Rename the label column before tokenization
    dataset = dataset.rename_column('label', 'labels')

    # Tokenize the data; `tokenize` requests lengths, exposed here as 'lengths'
    dataset = dataset.map(tokenize, batched=True)
    dataset = dataset.rename_column('length', 'lengths')

    return dataset

In [11]:
# BERT tokenizer; `tokenize` reads this module-level name.
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

# Directory used as the datasets cache (put your path here).
CACHE_DIR = 'cache_dir/'

# With split=None the result is indexed by split name below ('train', 'test').
full_dataset = load_and_prepare_dataset('imdb', split=None, cache_dir=CACHE_DIR)

# Optional quick-experiment mode: uncomment to work on a 10% sample of the
# training split, re-split 80/20.
# full_dataset = full_dataset['train'].train_test_split(test_size=0.1)['test'] # sample 10%
# full_dataset = full_dataset.train_test_split(test_size=0.2)
# for split in full_dataset.keys():
#     full_dataset[split] = full_dataset[split].flatten_indices() # needs this to reset indices

# Order every split by tokenized length
# (presumably so that batches need little padding - TODO confirm).
train_dataset, test_dataset = (
    full_dataset[split_name].sort("lengths") for split_name in ('train', 'test')
)

# Dict of all datasets, keyed by split name
dataset_d = dict(train=train_dataset, test=test_dataset)

Reusing dataset imdb (cache_dir/imdb/plain_text/1.0.0/4ea52f2e58a08dbc12c2bd52d0d92b30b88c00230b4522801b3636782f625c5b)


HBox(children=(FloatProgress(value=0.0, max=25.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=25.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))




In [12]:
dataset_d

{'train': Dataset({
     features: ['attention_mask', 'input_ids', 'labels', 'lengths', 'text', 'token_type_ids'],
     num_rows: 25000
 }),
 'test': Dataset({
     features: ['attention_mask', 'input_ids', 'labels', 'lengths', 'text', 'token_type_ids'],
     num_rows: 25000
 })}

In [13]:
def get_tfidf_features(dataset_d, max_features=4000):
    """Compute tf-idf features for every split and attach them as a column.

    The vectorizer is fitted on the ``'train'`` split only and then applied
    to each split found in ``dataset_d``. For every split, the dense tf-idf
    rows are added as an ``'additional_fts'`` column, the output format is
    set to torch for the model-facing columns, and the raw ``'text'`` column
    is removed.

    Parameters
    ----------
    dataset_d : dict
        Maps split names to datasets exposing a ``'text'`` column; must
        contain a ``'train'`` key. The dict itself is left untouched: a new
        dict is returned, so re-binding the result (as the notebook does)
        behaves as before while the input stays inspectable.
    max_features : int, optional
        Passed to ``TfidfVectorizer(max_features=...)``. Defaults to 4000,
        the value that was previously hard-coded.

    Returns
    -------
    tuple
        ``(datasets_with_features, elapsed_seconds)``.
    """

    t0 = time.perf_counter()

    vectorizer = TfidfVectorizer(max_features=max_features)
    vectorizer.fit(dataset_d['train']['text'])

    featured_d = {}

    for split, dataset in dataset_d.items():
        # Densify the sparse tf-idf matrix; iterating a 2-D ndarray yields
        # one 1-D row per document, which is what add_column receives.
        rows = list(vectorizer.transform(dataset['text']).toarray())

        dataset = dataset.add_column("additional_fts", rows)
        dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels', 'lengths', 'additional_fts'])
        featured_d[split] = dataset.remove_columns("text")

    return featured_d, time.perf_counter() - t0

In [14]:
dataset_d, fts_time = get_tfidf_features(dataset_d)

In [15]:
dataset_d

{'train': Dataset({
     features: ['attention_mask', 'input_ids', 'labels', 'lengths', 'token_type_ids', 'additional_fts'],
     num_rows: 25000
 }),
 'test': Dataset({
     features: ['attention_mask', 'input_ids', 'labels', 'lengths', 'token_type_ids', 'additional_fts'],
     num_rows: 25000
 })}

In [16]:
fts_time

21.66327428817749

In [17]:
dataset_d

{'train': Dataset({
     features: ['attention_mask', 'input_ids', 'labels', 'lengths', 'token_type_ids', 'additional_fts'],
     num_rows: 25000
 }),
 'test': Dataset({
     features: ['attention_mask', 'input_ids', 'labels', 'lengths', 'token_type_ids', 'additional_fts'],
     num_rows: 25000
 })}

In [18]:
# One DataLoader per split; the collator pads each batch of 256 samples.
dataloader_d = {
    split_name: torch.utils.data.DataLoader(
        split_dataset,
        batch_size=256,
        collate_fn=DataCollatorWithPadding(tokenizer),
    )
    for split_name, split_dataset in dataset_d.items()
}

In [19]:
dataloader_d

{'train': <torch.utils.data.dataloader.DataLoader at 0x7f8b60397cd0>,
 'test': <torch.utils.data.dataloader.DataLoader at 0x7f8b60397250>}

In [20]:
# Fetch the first training batch for inspection.
# next(iter(...)) raises StopIteration on an empty loader, whereas the former
# `for ...: break` idiom would silently leave `b` undefined (or stale).
b = next(iter(dataloader_d['train']))

In [21]:
b['additional_fts'].shape

torch.Size([256, 4000])

## Model

In [27]:
# ESN parameters (keyword form; keys and values are forwarded unchanged)
esn_params = dict(
    embedding='bert-base-uncased',               # pretrained model used for the embedding
    distribution='gaussian',                     # 'uniform' or 'gaussian'
    input_dim=768,                               # dim of BERT encoding!
    reservoir_dim=1000,
    input_scaling=1.0,
    bias_scaling=1.0,                            # can be None
    sparsity=0.99,
    spectral_radius=0.9,
    leaking_rate=0.5,
    activation_function='relu',                  # 'tanh' or 'relu'
    mean=0.0,
    std=1.0,
    merging_strategy='mean_and_additional_fts',  # or 'mean'
    bidirectional=False,                         # or True
    device=device,
    seed=42,
    mode='no_layer',                             # other values: 'esn', 'custom_baseline'
)
# The learning algorithm is not part of the constructor arguments; it is
# attached to the instance right after construction.

# Instantiate the ESN
ESN = esn.EchoStateNetwork(**esn_params)

# Define the learning algo of the ESN.
# Alternatives tried: la.RidgeRegression2(), la.LinearSVC(), la.LogisticRegression2()
ESN.learning_algo = la.RidgeRegression(alpha=10)

# Put the ESN on the device (CPU or GPU)
ESN = ESN.to(device)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Model downloaded: bert-base-uncased


In [28]:
# Warm up the ESN on the first 5 training sentences, fed one at a time
nb_sentences = 5

# A single loader with batch_size=1 over the first `nb_sentences` rows yields
# the same one-sentence batches, in the same order, as one loader per row.
warm_up_subset = dataset_d["train"].select(list(range(nb_sentences)))
warm_up_loader = torch.utils.data.DataLoader(warm_up_subset,
                                             batch_size=1,
                                             collate_fn=DataCollatorWithPadding(tokenizer))

for warm_up_batch in warm_up_loader:
    ESN.warm_up(warm_up_batch)

## Training

In [29]:
%%time
# Train the ESN on the training dataloader. The %%time cell magic (which must
# stay on the first line of the cell) reports how long the fit takes.
ESN.fit(dataloader_d["train"])

CPU times: user 1min 48s, sys: 31.5 s, total: 2min 19s
Wall time: 2min 11s


## Results

In [30]:
# Predictions and accuracy on the training split.
# NOTE(review): the displayed accuracy looks like a percentage (0-100) - confirm in ESN.predict.
train_pred, train_acc = ESN.predict(dataloader_d["train"], verbose=False)
train_acc

96.532

In [31]:
# Predictions and accuracy on the held-out test split.
# NOTE(review): the displayed accuracy looks like a percentage (0-100) - confirm in ESN.predict.
test_pred, test_acc = ESN.predict(dataloader_d["test"], verbose=False)
test_acc

95.272

In [32]:
# Per-class precision / recall / F1 of the ESN on the test split.
# Labels and predictions are converted with .tolist() before being handed to
# sklearn (dataset_d was put in torch format by get_tfidf_features).
print(classification_report(dataset_d['test']['labels'].tolist(),
                            test_pred.tolist(),
                            digits=4))

              precision    recall  f1-score   support

           0     0.9585    0.9464    0.9524     12500
           1     0.9471    0.9590    0.9530     12500

    accuracy                         0.9527     25000
   macro avg     0.9528    0.9527    0.9527     25000
weighted avg     0.9528    0.9527    0.9527     25000



In [10]:
# Baseline: tf-idf features fed directly to a linear classifier (no ESN).
# The raw text is read from `train_dataset` / `test_dataset`: the 'text'
# column is removed from the entries of `dataset_d` by `get_tfidf_features`,
# so `dataset_d[...]['text']` fails when the notebook is run top to bottom.
corpus = train_dataset['text']
vectorizer = TfidfVectorizer(max_features=3000)
X_train = vectorizer.fit_transform(corpus)
X_test = vectorizer.transform(test_dataset['text'])

In [12]:
# Alternative baselines:
# model = LinearSVC()
# model = LogisticRegression(C=2)
model = RidgeClassifier(alpha=10)

# Labels are taken from the raw `train_dataset` (plain Python format) rather
# than from `dataset_d`, whose entries are torch-formatted at this point.
model.fit(X_train, train_dataset['labels'])
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

In [13]:
# Baseline report on the training split; labels come from the raw dataset
# because `dataset_d` entries are torch-formatted at this point.
print(classification_report(train_dataset['labels'], y_train_pred, digits=4))

              precision    recall  f1-score   support

           0     0.9073    0.8845    0.8957     12500
           1     0.8873    0.9096    0.8983     12500

    accuracy                         0.8970     25000
   macro avg     0.8973    0.8970    0.8970     25000
weighted avg     0.8973    0.8970    0.8970     25000



In [14]:
# Baseline report on the test split; labels come from the raw dataset
# because `dataset_d` entries are torch-formatted at this point.
print(classification_report(test_dataset['labels'], y_test_pred, digits=4))

              precision    recall  f1-score   support

           0     0.8864    0.8677    0.8769     12500
           1     0.8704    0.8888    0.8795     12500

    accuracy                         0.8782     25000
   macro avg     0.8784    0.8782    0.8782     25000
weighted avg     0.8784    0.8782    0.8782     25000

