# ESN for NER

## Librairies

Inspired from : https://huggingface.co/transformers/custom_datasets.html#tok-ner

In [1]:
# !pip install nltk
# !python -m spacy download en_core_web_sm
# !pip install datasets==1.7.0
# !pip install transformers==4.10.3
# !pip install seqeval
# !pip install pytorch-crf==0.7.2
# !pip install ipywidgets

In [2]:
import os
import sys

#sys.path.insert(0, os.path.abspath(".."))
sys.path.insert(0, os.path.abspath("../.."))

In [3]:
import io
import re
import pickle
import random
from functools import reduce
from timeit import default_timer as timer
from tqdm.autonotebook import tqdm

import numpy as np
import pandas as pd

from sklearn.metrics import f1_score
from sklearn.metrics import classification_report

import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim.lr_scheduler import LambdaLR
import matplotlib.pyplot as plt

from datasets import load_dataset, Dataset, concatenate_datasets
from transformers import AutoTokenizer
from transformers import BertModel
from transformers import BatchEncoding
from transformers.data.data_collator import DataCollatorWithPadding, DataCollatorForTokenClassification
from datasets import load_metric

#from ax import optimize
#from ax.plot.contour import plot_contour
#from ax.plot.trace import optimization_trace_single_method
#from ax.service.managed_loop import optimize
#from ax.utils.notebook.plotting import render, init_notebook_plotting
# from ax.utils.tutorials.cnn_utils import load_mnist, train, evaluate, CNN
from matplotlib.ticker import MaxNLocator

import src.models.reservoir as res
import src.models.learning_algo as la
import src.models.merging_strategy as ms
import src.models.esn as esn
import src.models.baseline as bs
from src.utils.matrix import partial_merging
# from src.data.trec import *

plt.style.use('ggplot')

  from tqdm.autonotebook import tqdm


In [4]:
%config Completer.use_jedi = False
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [5]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') #torch.device('cpu') #
device

device(type='cuda')

In [6]:
SEED = 42

## Global variables

In [7]:
DATA_FILE = '/opt/matterhorn/Data/NER/' #/raid/home/jeremiec/Data/persuasive_essays/output.csv'
RESULTS_PATH = '/opt/matterhorn/Data/NER/results'

CACHE_DIR = "../" #'/opt/matterhorn/Data/NER/datasets' 

## Dataset

In [8]:
# Needed to get BatchEncoding as batch type, like with DataCollatorWithPadding
class CustomDataCollator(DataCollatorForTokenClassification):
    def __init__(self, *args, **k_args):
        super().__init__(*args, **k_args)
        
    def torch_call(self, features):
        
        to_pad = ['offset_mapping', 'original_labels']
        padding_token = [[0, 0], -100]
        
        features_ = [{k: v for k, v in e.items() if k not in to_pad} for e in features]
        batch = super().torch_call(features_)
        
        # Do the padding of our custom field manually as the collator doesn't do it by default
        padding_size = len(batch["input_ids"][0])
        
        for field, pad_tok in zip(to_pad, padding_token):
            field_l = [e[field] for e in features]
            field_padding_l = [[pad_tok]*(padding_size - actual_len) for actual_len in [len(x) for x in field_l]]
            field_l = [field_l[i] + field_padding_l[i] for i in range(len(field_l))]
            batch[field] = torch.tensor(field_l, dtype=torch.int64) 
        
        return batch
        
    def __call__(self, features):
        batch = BatchEncoding(super().__call__(features))
        return batch

In [9]:
# Stolen from https://huggingface.co/transformers/custom_datasets.html
def encode_tags(batch, padding_label=-100):
    
    # tags, encodings
    
    labels = batch['original_labels']
    encoded_labels = []
    for doc_labels, doc_offset in zip(labels, batch['offset_mapping']):
        # create an empty array of `padding_label`
        doc_enc_labels = np.ones(len(doc_offset),dtype=int) * padding_label
        arr_offset = np.array(doc_offset)

        # set labels whose first offset position is 0 and the second is not 0
        doc_enc_labels[(arr_offset[:,0] == 0) & (arr_offset[:,1] != 0)] = doc_labels
        encoded_labels.append(doc_enc_labels.tolist())

    return encoded_labels

In [10]:
# rename correct column as 'labels': depends on the dataset you load

def load_and_enrich_dataset(dataset_name, split, tokenizer, cache_dir):
    
    dataset = load_dataset(dataset_name, split=split, cache_dir=cache_dir)
    
    # d = pd.read_csv("toy_dataset.csv")
    # d.sentences = d.sentences.apply(lambda x: x[2:-2].split("', '"))
    # d.labels = d.labels.apply(lambda l: [int(x) for x in l[1:-1].split(", ")])
    # d.columns = ["tokens", "ner_tags"]
    # dataset = Dataset.from_pandas(d, split="train").train_test_split(test_size=0.2)
    
    def tokenize_ner(batch):
        # HACKING MODE ACTIVATED (Remove 0's entities)
        #mask = [np.argwhere(n).flatten().tolist() for n in batch["ner_tags"]]
        #batch['tokens'] = [[t[i] for i in m] for t,m in zip(batch['tokens'], mask) if len(m) > 0 ]
        #batch['ner_tags'] = [[n[i] for i in m] for n,m in zip(batch['ner_tags'], mask) if len(m) > 0]
        
        # Simplify labels
        #batch["ner_tags"] = [[int(x in [1, 2]) for x in l] for l in batch["ner_tags"]]
        
        batch_tokenized = tokenizer(batch['tokens'], truncation=True, padding=False, is_split_into_words=True, return_offsets_mapping=True)
        batch_tokenized["lengths"] = [ len(sample) for sample in batch_tokenized["input_ids"] ]
        batch_tokenized["original_labels"] = batch["ner_tags"]
        
        batch_tokenized["labels"] = encode_tags(batch_tokenized, padding_label=-100)

        return batch_tokenized
    
    # dataset = dataset.remove_columns(['id', 'chunk_tags', 'pos_tags']) # comment if toy dataset
    dataset = dataset.map(tokenize_ner, batched=True)
    
    # EITHER
    # dataset = dataset.remove_columns(['ner_tags', 'tokens']) # comment if toy dataset
    # OR
    dataset.set_format(type=None, columns=['input_ids', 'labels', 'original_labels', 'lengths', 'offset_mapping'])
    # IS STRICTLY NECESSARY FOR THE DATALOADER TO WORK
        
    return dataset

In [11]:
tokenizer = AutoTokenizer.from_pretrained('bert-base-cased', padding=True, truncation=True)

full_dataset = load_and_enrich_dataset('conll2003', split=None, tokenizer=tokenizer, cache_dir=CACHE_DIR)
#full_dataset = full_dataset.sort("lengths")  # BE CAREFUL: we are not sorting the offset mapping

Reusing dataset conll2003 (../conll2003/conll2003/1.0.0/40e7cb6bcc374f7c349c83acd1e9352a4f09474eb691f64f364ee62eb65d0ca6)


HBox(children=(FloatProgress(value=0.0, max=15.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))




In [12]:
len(full_dataset['train']), len(full_dataset['test'])

(14041, 3453)

In [13]:
# encode_tags(full_dataset['train'][:4])

In [14]:
# for a, b in zip(tokenizer.convert_ids_to_tokens(full_dataset['train'][0]['input_ids']),  encode_tags(full_dataset['train'][: 1])[0]):
#     print(a, b)

### For "ner_tags":

#### 82.5% of the true dataset labels are 0. 

#### 84.8% when adding the [CLS] + [SEP]

### For "pos_tags":

#### 13.7% of the true dataset labels are 0 when adding the [CLS] + [SEP]

In [15]:
l = []
for x in full_dataset["train"]["labels"]:
    l.extend(x[:])
for x in full_dataset["validation"]["labels"]:
    l.extend(x[:])
for x in full_dataset["test"]["labels"]:
    l.extend(x[:])
    
l = np.array(l)

Missing labels '2' and '9' for "pos_tags"

In [16]:
l2 = pd.DataFrame(l)
l2 = l2[l2[0] != -100]
cross_entropy_weight = (1/l2.value_counts(normalize=True).sort_index()).values
cross_entropy_weight = torch.Tensor(cross_entropy_weight)

In [17]:
cross_entropy_weight

tensor([  1.2025,  29.9650,  43.1151,  32.3306,  56.9788,  28.3155, 180.3818,
         59.5452, 175.5492])

In [18]:
np.unique(l)

array([-100,    0,    1,    2,    3,    4,    5,    6,    7,    8])

In [19]:
len(np.unique(l))

10

In [20]:
pd.DataFrame(l)[0].value_counts()

 0      250660
-100    144074
 5       10645
 1       10059
 3        9323
 2        6991
 4        5290
 7        5062
 8        1717
 6        1671
Name: 0, dtype: int64

In [21]:
nb_labels = np.max(l)+1
nb_labels

9

In [22]:
print(f"percentage of 0 in the dataset: {100*(l == 0).sum()/len(l):.2f}%")

percentage of 0 in the dataset: 56.27%


## Create modular 

In [23]:
class EmbeddingLayer(nn.Module):


    def __init__(self, model_name='bert-base-uncased', device=torch.device('cpu')):
        super(EmbeddingLayer, self).__init__()
        
        self.model_name = model_name
        self.model = BertModel.from_pretrained(self.model_name, output_hidden_states=True)
        self.device = device
        self.model.to(self.device).eval()
        #print('BERT model downloaded:', model_name)

    def forward(self, batch):
        with torch.no_grad():

            # method 1: dynamic embedding
            # batch_tkn = batch["input_ids"].to(self.device)
            batch = batch.to(self.device)
            batch_emb = self.model(batch["input_ids"], batch["attention_mask"])[0]

        return batch_emb


In [54]:
class ModularESN(nn.Module):
    
    def __init__(self, reservoir_dim, output_size, device):
        super(ModularESN, self).__init__()

        self.reservoir_dim = reservoir_dim
        
        self.embedding = EmbeddingLayer(model_name='bert-base-uncased', device=device)
        self.reservoir = nn.RNN(input_size=768, hidden_size=reservoir_dim,
                                num_layers=1, nonlinearity='tanh', bias=True,
                                batch_first=True, dropout=0, bidirectional=False)
        self.output_linear = nn.Linear(in_features=reservoir_dim, out_features=output_size)

    def forward(self, batch):
        
        embedded_input = self.embedding(batch)
        reservoir_states, _ = self.reservoir(embedded_input)
        
        print('\n')
        print('states', reservoir_states.shape)
        print('lengths', batch["lengths"].shape)
        print('labels', batch["labels"].shape)
        
        flattened_states = reservoir_states.reshape((-1, self.reservoir_dim))
        flattened_labels = batch["labels"].reshape((-1))
        
        # Mask for padding, SEP, CLS and subtokens
        mask = (flattened_labels != -100) & (flattened_labels != -200)
        
        filtered_states = flattened_states[mask]
        filtered_labels = flattened_labels[mask]
        filtered_lengths = data["lengths"] - (data["labels"] == -100).sum(axis=1)
        
        filtered_outputs = self.output_linear(filtered_states)
        
        print('\n')
        print('filtered_outputs', filtered_outputs.shape)
        print('filtered_lengths', filtered_lengths.shape)
        print('filtered_labels', filtered_labels.shape)
        
        return filtered_outputs, filtered_lengths, filtered_labels

In [55]:
torch.manual_seed(SEED)
random.seed(SEED)
np.random.seed(SEED)

In [56]:
# dataloader
dataloader_d = {}

for k, v in full_dataset.items():
    dataloader_d[k] = torch.utils.data.DataLoader(v, batch_size=128, shuffle=True, collate_fn=CustomDataCollator(tokenizer, label_pad_token_id=-200))  # NB: Used to be = -100

In [57]:
model = ModularESN(reservoir_dim=1000, output_size=nb_labels, device=device)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [58]:
model.to(device)
pass

In [59]:
criterion = nn.CrossEntropyLoss(cross_entropy_weight)
criterion.to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4, weight_decay=0.01, betas=(0.9, 0.999))

In [60]:
model.train()

for epoch in range(2):  # loop over the dataset multiple times

    running_loss = 0.0
    for i, data in tqdm(enumerate(dataloader_d["train"]), total=len(dataloader_d['train'])):
        
        data.to(device)
        
        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs, lengths, labels = model(data)
        
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # print statistics
        running_loss += loss.item()
        if i % 20 == 19:   
            print(f'[{epoch + 1}, {i + 1:5d}] loss: {running_loss / 20:.3f}')
            running_loss = 0.0

print('Finished Training')

HBox(children=(FloatProgress(value=0.0, max=110.0), HTML(value='')))



states torch.Size([128, 54, 1000])
lengths torch.Size([128])
labels torch.Size([128, 54])


filtered_outputs torch.Size([1801, 9])
filtered_lengths torch.Size([128])
filtered_labels torch.Size([1801])


states torch.Size([128, 60, 1000])
lengths torch.Size([128])
labels torch.Size([128, 60])


filtered_outputs torch.Size([2072, 9])
filtered_lengths torch.Size([128])
filtered_labels torch.Size([2072])


states torch.Size([128, 52, 1000])
lengths torch.Size([128])
labels torch.Size([128, 52])


filtered_outputs torch.Size([2031, 9])
filtered_lengths torch.Size([128])
filtered_labels torch.Size([2031])


states torch.Size([128, 58, 1000])
lengths torch.Size([128])
labels torch.Size([128, 58])


filtered_outputs torch.Size([1994, 9])
filtered_lengths torch.Size([128])
filtered_labels torch.Size([1994])


states torch.Size([128, 58, 1000])
lengths torch.Size([128])
labels torch.Size([128, 58])


filtered_outputs torch.Size([1903, 9])
filtered_lengths torch.Size([128])
filtered_labels torc

HBox(children=(FloatProgress(value=0.0, max=110.0), HTML(value='')))



states torch.Size([128, 73, 1000])
lengths torch.Size([128])
labels torch.Size([128, 73])


filtered_outputs torch.Size([2063, 9])
filtered_lengths torch.Size([128])
filtered_labels torch.Size([2063])


states torch.Size([128, 52, 1000])
lengths torch.Size([128])
labels torch.Size([128, 52])


filtered_outputs torch.Size([1916, 9])
filtered_lengths torch.Size([128])
filtered_labels torch.Size([1916])


states torch.Size([128, 66, 1000])
lengths torch.Size([128])
labels torch.Size([128, 66])


filtered_outputs torch.Size([1895, 9])
filtered_lengths torch.Size([128])
filtered_labels torch.Size([1895])


states torch.Size([128, 63, 1000])
lengths torch.Size([128])
labels torch.Size([128, 63])


filtered_outputs torch.Size([2022, 9])
filtered_lengths torch.Size([128])
filtered_labels torch.Size([2022])


states torch.Size([128, 62, 1000])
lengths torch.Size([128])
labels torch.Size([128, 62])


filtered_outputs torch.Size([1846, 9])
filtered_lengths torch.Size([128])
filtered_labels torc

In [61]:
print(classification_report(labels.cpu().numpy(), outputs.argmax(dim=1).cpu().numpy(), digits=3))

              precision    recall  f1-score   support

           0      0.968     0.464     0.627       973
           1      0.282     0.453     0.348        53
           2      0.188     0.585     0.284        41
           3      0.158     0.340     0.216        47
           4      0.114     0.385     0.175        26
           5      0.237     0.548     0.331        42
           6      0.111     0.700     0.192        10
           7      0.099     0.400     0.158        20
           8      0.028     1.000     0.055         3

    accuracy                          0.466      1215
   macro avg      0.243     0.542     0.265      1215
weighted avg      0.813     0.466     0.555      1215



In [282]:
# *** NEW ***
# Recreate the dataloaders with shuffle=False in order to ba able to compute the predictions correcly
# Otherwise, everything is re-shuffled every time we iterate through the dataloader!!!
dataloader_d = {}

for k, v in full_dataset.items():
    dataloader_d[k] = torch.utils.data.DataLoader(v, batch_size=128, shuffle=False, collate_fn=CustomDataCollator(tokenizer, label_pad_token_id=-100))  # NB: Used to be = -100

In [283]:
model.eval()
predictions_l = []
labels_l = []

for data in tqdm(dataloader_d["test"]):
    with torch.no_grad():
        data.to(device)
        
        labels = data["labels"].cpu().numpy()
        lengths = data["lengths"]

        # forward + backward + optimize
        outputs = model(data, lengths)

        argmax_output = outputs.cpu().numpy().argmax(axis=1)
        
        begin_i = 0 
        for i,l in enumerate(lengths):
            predictions_l.append(argmax_output[begin_i:begin_i+l])
            labels_l.append(labels[i][:l])
            begin_i += l

HBox(children=(FloatProgress(value=0.0, max=27.0), HTML(value='')))




TypeError: forward() takes 2 positional arguments but 3 were given

In [None]:
# Outside of the loop
if self.merging_strategy.merging_strategy is None :
    predictions_l = [np.array(x) for x in predictions_l]
else:
    predictions_l = np.concatenate([x for x in predictions_l], axis=0)


In [36]:
# preds is a list of tensors: 1 prediction tensor per batch

preds_l = ESN.predict(dataloader_d["test"])

HBox(children=(FloatProgress(value=0.0, max=108.0), HTML(value='')))




In [37]:
labels_l = []

for b in dataloader_d["test"]:
    batch_labels = b["original_labels"].cpu().detach().numpy()
    for s_labels in batch_labels:
        labels_l.append(s_labels[s_labels != -100])

In [38]:
metric = load_metric("seqeval")

In [39]:
# O (0), B-PER (1), I-PER (2), B-ORG (3), I-ORG (4) B-LOC (5), I-LOC (6) B-MISC (7), I-MISC (8).

In [40]:
label_list = ['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']

In [41]:
true_predictions = [
            [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
            for prediction, label in zip(preds_l, labels_l)
        ]

true_labels = [
            [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
            for prediction, label in zip(preds_l, labels_l)
        ]

In [42]:
results = metric.compute(predictions=true_predictions, references=true_labels)

In [43]:
results

{'LOC': {'precision': 0.7968659315147998,
  'recall': 0.8231414868105515,
  'f1': 0.8097906222353288,
  'number': 1668},
 'MISC': {'precision': 0.6449375866851595,
  'recall': 0.6623931623931624,
  'f1': 0.6535488404778637,
  'number': 702},
 'ORG': {'precision': 0.6913783635365184,
  'recall': 0.7579771222155328,
  'f1': 0.7231476163124642,
  'number': 1661},
 'PER': {'precision': 0.920049968769519,
  'recall': 0.9109461966604824,
  'f1': 0.9154754505904288,
  'number': 1617},
 'overall_precision': 0.7790658029321513,
 'overall_recall': 0.8091359773371105,
 'overall_f1': 0.7938162237276359,
 'overall_accuracy': 0.9656293743943146}

In [44]:
flattened_preds = []
for l in true_predictions:
    for p in l:
        flattened_preds.append(p)
        
flattened_labels = []
for l in true_labels:
    for p in l:
        flattened_labels.append(p)
        
print(classification_report(flattened_labels, flattened_preds))

              precision    recall  f1-score   support

       B-LOC       0.85      0.85      0.85      1668
      B-MISC       0.78      0.69      0.73       702
       B-ORG       0.81      0.80      0.80      1661
       B-PER       0.95      0.93      0.94      1617
       I-LOC       0.74      0.64      0.69       257
      I-MISC       0.60      0.61      0.60       216
       I-ORG       0.81      0.82      0.81       835
       I-PER       0.98      0.97      0.97      1156
           O       0.99      0.99      0.99     38323

    accuracy                           0.97     46435
   macro avg       0.83      0.81      0.82     46435
weighted avg       0.97      0.97      0.97     46435

