# Tutorial

This notebook provides a use case example of the ``EsnTorch`` library.
It describes the implementation of an Echo State Network (ESN)
for text classification on the IMDB dataset.

The instantiation, training and evaluation of an ESN for text classification
is achieved via the following steps:
- Import the required modules
- Create the dataloaders
- Instantiate the ESN by specifying:
    - a reservoir
    - a loss function
    - a learning algorithm
- Train the ESN
- Training and testing results

## Libraries

In [None]:
# pip install huggingface-hub==0.0.19

In [None]:
# !pip install -U scikit-learn
# !pip install transformers==4.8.2 # !!!
# !pip install datasets==1.7.0     # !!! # run !pip install huggingface-hub==0.0.19 before!!!
# !pip install ipywidgets

In [3]:
import sklearn
import datasets
import transformers

# Report the installed version of each key dependency, one per line
print("Current versions:")
print(*(pkg.__version__ for pkg in (sklearn, datasets, transformers)), sep="\n")

Current versions:
1.0.2
2.0.0
4.17.0


To enable tqdm progress bars in JupyterLab, run the following commands in a terminal:
1. `conda install -c conda-forge nodejs`
2. `jupyter labextension install @jupyter-widgets/jupyterlab-manager`
3. `jupyter nbextension enable --py widgetsnbextension`
4. `jupyter lab clean`
5. Refresh web page...

In [4]:
# Make the parent directory importable so the local copy of the library is found.
# Comment this cell out if the library is installed.
import os
import sys

parent_dir = os.path.abspath(os.pardir)
sys.path.insert(0, parent_dir)

In [5]:
# import numpy as np
from sklearn.metrics import classification_report

import time

import numpy as np

from sklearn.linear_model import RidgeClassifier

import torch

from datasets import load_dataset, Dataset, concatenate_datasets

from transformers import AutoTokenizer
from transformers.data.data_collator import DataCollatorWithPadding

import esntorch.core.reservoir as res
import esntorch.core.learning_algo as la
import esntorch.core.merging_strategy as ms
import esntorch.core.esn as esn
import esntorch.utils.embedding as emb

In [6]:
%load_ext autoreload
%autoreload 2

In [7]:
import random

# New seed values, chosen to check whether the problem comes from the seed.
# torch, Python's `random` and NumPy each get their own explicit seed.
torch.manual_seed(123)
random.seed(665)
np.random.seed(17375)

## Device and Seed

In [8]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
device

device(type='cuda')

## Load and Tokenize Data

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report

In [10]:
# Custom functions for loading and preparing data

def tokenize(sample):
    """Apply the notebook-level ``tokenizer`` to ``sample['text']``.

    The text is truncated but not padded, and the tokenizer is asked to
    report the length of each sequence (``return_length=True``).
    The tokenizer output is returned as is.
    """
    return tokenizer(
        sample['text'],
        truncation=True,
        padding=False,
        return_length=True,
    )

def load_and_prepare_dataset(dataset_name, split, cache_dir):
    """
    Load a dataset from the datasets library of HuggingFace and tokenize it.

    Parameters
    ----------
    dataset_name : str
        Name forwarded to ``load_dataset`` (e.g. ``'imdb'``).
    split :
        Split specification forwarded unchanged to ``load_dataset``
        (the notebook passes ``None``).
    cache_dir : str
        Directory forwarded to ``load_dataset`` as its ``cache_dir``.

    Returns
    -------
    The loaded dataset with the ``'label'`` column renamed to ``'labels'``,
    the columns produced by ``tokenize`` added, and the ``'length'`` column
    renamed to ``'lengths'``.
    """
    # Honour the caller-supplied cache directory rather than the
    # notebook-level CACHE_DIR global.
    dataset = load_dataset(dataset_name, split=split, cache_dir=cache_dir)

    # Rename label column for tokenization purposes
    dataset = dataset.rename_column('label', 'labels')

    # Tokenize in batches; `tokenize` requests the sequence lengths,
    # which are then exposed under the plural name 'lengths'.
    dataset = dataset.map(tokenize, batched=True)
    dataset = dataset.rename_column('length', 'lengths')

    return dataset

In [11]:
# Directory used as cache by the datasets library
CACHE_DIR = 'cache_dir/' # put your path here

# BERT tokenizer (the `tokenize` helper reads this notebook-level name)
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

# Load and tokenize IMDB; with split=None the result is indexed by split name below
full_dataset = load_and_prepare_dataset('imdb', split=None, cache_dir=CACHE_DIR)

# *** JUST FOR NOW *** (optional: work on a 10% sample, re-split 80/20)
# full_dataset = full_dataset['train'].train_test_split(test_size=0.1)['test'] # sample 10%
# full_dataset = full_dataset.train_test_split(test_size=0.2)
# for split in full_dataset.keys():
#     full_dataset[split] = full_dataset[split].flatten_indices() # needs this to reset indices
# *** JUST FOR NOW ***

# Dict of all datasets, each split sorted by its 'lengths' column
dataset_d = {
    split_name: full_dataset[split_name].sort("lengths")
    for split_name in ('train', 'test')
}
train_dataset = dataset_d['train']
test_dataset = dataset_d['test']

Downloading and preparing dataset imdb/plain_text (download: 80.23 MiB, generated: 127.02 MiB, post-processed: Unknown size, total: 207.25 MiB) to cache_dir/imdb/plain_text/1.0.0/2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1...


Downloading data:   0%|          | 0.00/84.1M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

Dataset imdb downloaded and prepared to cache_dir/imdb/plain_text/1.0.0/2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/25 [00:00<?, ?ba/s]

  0%|          | 0/25 [00:00<?, ?ba/s]

  0%|          | 0/50 [00:00<?, ?ba/s]

In [12]:
# Inspect the prepared splits: columns and number of rows
dataset_d

{'train': Dataset({
     features: ['text', 'labels', 'input_ids', 'token_type_ids', 'attention_mask', 'lengths'],
     num_rows: 25000
 }),
 'test': Dataset({
     features: ['text', 'labels', 'input_ids', 'token_type_ids', 'attention_mask', 'lengths'],
     num_rows: 25000
 })}

## Model

In [13]:
def get_tfidf_features(dataset_d):
    """Compute tf-idf features for every split and time the computation.

    A ``TfidfVectorizer`` limited to 4000 features is fitted on the ``'text'``
    column of ``dataset_d['train']`` and then applied to each split. For each
    split, the dense tf-idf rows are added as an ``'additional_fts'`` column,
    the output format is set to torch for the listed columns, and the
    ``'text'`` column is removed.

    Parameters
    ----------
    dataset_d : dict
        Maps split names to datasets. Must contain a ``'train'`` entry, and
        every dataset must have a ``'text'`` column.

    Returns
    -------
    tuple
        ``(augmented_d, elapsed)`` where ``augmented_d`` is a new dict holding
        the augmented splits (the dict passed in is not modified) and
        ``elapsed`` is the elapsed time in seconds measured with ``time.time``.
    """
    t0 = time.time()

    vectorizer = TfidfVectorizer(max_features=4000)
    vectorizer.fit(dataset_d['train']['text'])

    # Build a fresh dict so the caller's mapping is left untouched
    augmented_d = {}

    for split, dataset in dataset_d.items():
        # One dense 1-D array per sample
        fts = list(vectorizer.transform(dataset['text']).toarray())

        dataset = dataset.add_column("additional_fts", fts)
        dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels', 'lengths', 'additional_fts'])
        augmented_d[split] = dataset.remove_columns("text")

    return augmented_d, time.time() - t0

In [14]:
# Add the tf-idf features to every split; fts_time is the elapsed time in seconds.
# NOTE: this cell is not idempotent - the helper reads the 'text' column and the
# returned datasets no longer have it, so running the cell a second time fails.
dataset_d, fts_time = get_tfidf_features(dataset_d)

Flattening the indices:   0%|          | 0/25 [00:00<?, ?ba/s]

Flattening the indices:   0%|          | 0/25 [00:00<?, ?ba/s]

In [15]:
# Check the columns after adding tf-idf: 'additional_fts' is present, 'text' is gone
dataset_d

{'train': Dataset({
     features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask', 'lengths', 'additional_fts'],
     num_rows: 25000
 }),
 'test': Dataset({
     features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask', 'lengths', 'additional_fts'],
     num_rows: 25000
 })}

In [16]:
# Seconds spent computing the tf-idf features
fts_time

42.64738917350769

In [17]:
# Same display as before: dataset_d has not been modified since
dataset_d

{'train': Dataset({
     features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask', 'lengths', 'additional_fts'],
     num_rows: 25000
 }),
 'test': Dataset({
     features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask', 'lengths', 'additional_fts'],
     num_rows: 25000
 })}

In [18]:
# Create dict of dataloaders: one per split, batches of 256 samples,
# collated with a DataCollatorWithPadding built from the tokenizer
dataloader_d = {
    split_name: torch.utils.data.DataLoader(
        split_dataset,
        batch_size=256,
        collate_fn=DataCollatorWithPadding(tokenizer),
    )
    for split_name, split_dataset in dataset_d.items()
}

In [19]:
# One DataLoader per split
dataloader_d

{'train': <torch.utils.data.dataloader.DataLoader at 0x7f9353c016a0>,
 'test': <torch.utils.data.dataloader.DataLoader at 0x7f9353c018b0>}

In [20]:
# Grab the first training batch for inspection.
# next(iter(...)) raises StopIteration on an empty loader instead of
# silently leaving `b` undefined.
b = next(iter(dataloader_d['train']))

In [21]:
# Shape of the tf-idf features in the first batch
b['additional_fts'].shape

torch.Size([256, 4000])

# Temporary experiments

In [22]:
# # write datasets into files

# import pickle

# tmp_dir = '/raid/home/jeremiec/Data/tmp'

# for split in ['train', 'test']:
    
#     reviews_l = dataset_d[split]['text']
            
#     with open(os.path.join(tmp_dir, str(split)+"_datasets_"+str(datasets.__version__)+".txt"), 'wb') as fh:
        
#         pickle.dump(reviews_l, fh)

In [23]:
# Compare train/test files for old and new datasets versions
import os
import pickle

tmp_dir = '/raid/home/jeremiec/Data/tmp'

# Maps each file name without its '.txt' extension
# (e.g. 'train_datasets_2.0.0') to the object unpickled from that file
imdb_d = {}

for filename in os.listdir(tmp_dir):

    # Only handle files whose extension is exactly '.txt'
    key, ext = os.path.splitext(filename)
    if ext != '.txt':
        continue

    print(key)

    # NOTE: pickle.load can execute arbitrary code - only load files
    # written by this notebook, never untrusted ones.
    with open(os.path.join(tmp_dir, filename), 'rb') as fh:
        imdb_d[key] = pickle.load(fh)

test_datasets_1.7.0
test_datasets_2.0.0
train_datasets_2.0.0
train_datasets_1.7.0


In [24]:
# Print the index of every train entry of the 1.7.0 file that is missing from
# the 2.0.0 file. A set gives constant-time membership tests instead of
# scanning the whole 2.0.0 list for each entry (entries must be hashable).
train_entries_2_0_0 = set(imdb_d['train_datasets_2.0.0'])

for i, t in enumerate(imdb_d['train_datasets_1.7.0']):
    if t not in train_entries_2_0_0:
        print(i)

In [25]:
# Print the index of every test entry of the 1.7.0 file that is missing from
# the 2.0.0 file. A set gives constant-time membership tests instead of
# scanning the whole 2.0.0 list for each entry (entries must be hashable).
test_entries_2_0_0 = set(imdb_d['test_datasets_2.0.0'])

for i, t in enumerate(imdb_d['test_datasets_1.7.0']):
    if t not in test_entries_2_0_0:
        print(i)

Nothing is printed above: every review of the 1.7.0 files is also present in the 2.0.0 files. **The datasets are the same!**

In [26]:
# Token ids of the first training batch
b['input_ids']

tensor([[  101,  2023,  3185,  ...,     0,     0,     0],
        [  101,  1045,  2876,  ...,     0,     0,     0],
        [  101,  7918, 14674,  ...,     0,     0,     0],
        ...,
        [  101,  2054,  2064,  ...,  1005,  1012,   102],
        [  101,  2023,  3185,  ...,  1013,  1028,   102],
        [  101,  2023,  3185,  ...,  2156,   999,   102]])

In [27]:
# Instantiate the embedding model of esntorch.utils.embedding with its default arguments
embedding = emb.EmbeddingModel()

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Model downloaded: bert-base-uncased


In [28]:
# Embed the first training batch
x = embedding.get_embedding(b)

# Store it under a file name tagged with the datasets and transformers versions
embedding_file = f"x_{datasets.__version__}_{transformers.__version__}.pt"
torch.save(x, os.path.join(tmp_dir, embedding_file))

In [30]:
# Reload the embedding saved for the currently installed datasets/transformers versions
x2 = torch.load(
    os.path.join(tmp_dir, f"x_{datasets.__version__}_{transformers.__version__}.pt")
)

In [31]:
# Shape of the reloaded embedding tensor
x2.shape

torch.Size([57, 256, 768])

In [32]:
# Load a stored embedding; going by the file-name scheme used when saving,
# this one was produced with datasets 1.7.0 and transformers 4.8.2.
# NOTE: this overwrites the x2 loaded earlier.
x2 = torch.load(os.path.join(tmp_dir, "x_1.7.0_4.8.2.pt"))

In [33]:
# Load a stored embedding; going by the file-name scheme used when saving,
# this one was produced with datasets 1.7.0 and transformers 4.17.0.
x3 = torch.load(os.path.join(tmp_dir, "x_1.7.0_4.17.0.pt"))

In [34]:
# True only if the two stored embeddings are equal element by element
(x2 == x3).all()

tensor(True)

In [35]:
# ESN parameters
esn_params = dict(
    embedding_weights='bert-base-uncased',        # TEXT.vocab.vectors,
    distribution='gaussian',                      # uniform, gaussian
    input_dim=768,                                # dim of BERT encoding!
    reservoir_dim=1000,
    input_scaling=1.0,
    bias_scaling=1.0,                             # can be None
    sparsity=0.99,
    spectral_radius=0.9,
    leaking_rate=0.5,
    activation_function='relu',                   # 'tanh', 'relu'
    mean=0.0,
    std=1.0,
    # learning_algo, criterion and optimizer are not passed here;
    # the learning algo is assigned after instantiation
    merging_strategy='mean_and_additional_fts',   # 'mean_and_additional_fts', 'mean'
    bidirectional=False,                          # True
    device=device,
    seed=42,
    mode='no_layer',                              # 'esn', 'custom_baseline'
)

# Instantiate the ESN
ESN = esn.EchoStateNetwork(**esn_params)

# Define the learning algo of the ESN
# (alternatives: la.RidgeRegression2(), la.LinearSVC(), la.LogisticRegression2())
ESN.learning_algo = la.RidgeRegression(alpha=10)

# Put the ESN on the device (CPU or GPU)
ESN = ESN.to(device)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Model downloaded: bert-base-uncased


In [36]:
# Warm up the ESN on the first 5 training sentences, one sentence at a time
nb_sentences = 5

for sentence_idx in range(nb_sentences):
    single_sentence = dataset_d["train"].select([sentence_idx])
    single_loader = torch.utils.data.DataLoader(
        single_sentence,
        batch_size=1,
        collate_fn=DataCollatorWithPadding(tokenizer),
    )

    for batch in single_loader:
        ESN.warm_up(batch)

## Training

In [37]:
%%time
# Train the ESN on the training dataloader
ESN.fit(dataloader_d["train"])

CPU times: user 1min 34s, sys: 34.9 s, total: 2min 8s
Wall time: 2min 10s


## Results

In [38]:
# Train predictions and accuracy (the displayed value appears to be a percentage)
train_pred, train_acc = ESN.predict(dataloader_d["train"], verbose=False)
train_acc

92.604

In [39]:
# Test predictions and accuracy (the displayed value appears to be a percentage)
test_pred, test_acc = ESN.predict(dataloader_d["test"], verbose=False)
test_acc

90.32

In [40]:
# Test classification report: true labels vs ESN predictions, as plain lists
test_labels_l = dataset_d['test']['labels'].tolist()
test_pred_l = test_pred.tolist()

print(classification_report(test_labels_l, test_pred_l, digits=4))

              precision    recall  f1-score   support

           0     0.9019    0.9048    0.9034     12500
           1     0.9045    0.9016    0.9030     12500

    accuracy                         0.9032     25000
   macro avg     0.9032    0.9032    0.9032     25000
weighted avg     0.9032    0.9032    0.9032     25000



In [None]:
# tf-idf features for the baseline classifier.
# The raw reviews are read from train_dataset / test_dataset because the
# 'text' column was removed from the splits stored in dataset_d when the
# tf-idf features were added (reading it from dataset_d fails on Run All).
corpus = train_dataset['text']
vectorizer = TfidfVectorizer(max_features=3000)
X_train = vectorizer.fit_transform(corpus)
X_test = vectorizer.transform(test_dataset['text'])

In [None]:
# Baseline classifier on the tf-idf features
# (alternatives: LinearSVC(), LogisticRegression(C=2))
model = RidgeClassifier(alpha=10)
model.fit(X_train, dataset_d['train']['labels'])

# Predictions on both splits
y_train_pred, y_test_pred = model.predict(X_train), model.predict(X_test)

In [None]:
# Classification report of the tf-idf baseline on the training split
print(classification_report(dataset_d['train']['labels'], y_train_pred, digits=4))

In [None]:
# Classification report of the tf-idf baseline on the test split
print(classification_report(dataset_d['test']['labels'], y_test_pred, digits=4))