In [None]:
!pip install datasets
!pip install tokenizers
!pip install transformers

<h3><b>NER on Bengali Language</b></h3>
<h3>Things to notice</h3>
<b>
1. The model checkpoint should be mbert</br>
2. We need to use mbert cased version (new, recommended) not the uncased version ( refer to https://github.com/google-research/bert/blob/master/multilingual.md )</br>
3. In the uncased version, there are normalization issues for bengali, which I saw as the spelling of input word and decoded word is different</br>
4. The train loss is greater than eval loss in very initial stages of train, after which they start behaving "normally"</br>
5. seqeval python module for ner evaluation entity level
</br>
6. Due to the padding done with DataCollator, the output "labels" of prediction would have lot of -100 values which are to get discarded so as to focus on "actual" input token and "actual corresponding output value"</br>
7. My initial understanding of adjusting labels after tokenization is wrong, if word="XYZ" and token="XY" and "##Z" and original label for word="B-LOC", then adjusted labels would be "B-LOC", "B-LOC" and not "B-LOC","I-LOC"
<h2><b>Step 1 : Load the dataset</b></h2>

In [2]:
from datasets import load_dataset

dataset = load_dataset("wikiann", "bn")

Downloading:   0%|          | 0.00/3.94k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/12.6k [00:00<?, ?B/s]

Downloading and preparing dataset wikiann/bn (download: 223.17 MiB, generated: 2.70 MiB, post-processed: Unknown size, total: 225.86 MiB) to /root/.cache/huggingface/datasets/wikiann/bn/1.1.0/4bfd4fe4468ab78bb6e096968f61fab7a888f44f9d3371c2f3fea7e74a5a354e...


Downloading:   0%|          | 0.00/234M [00:00<?, ?B/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

Dataset wikiann downloaded and prepared to /root/.cache/huggingface/datasets/wikiann/bn/1.1.0/4bfd4fe4468ab78bb6e096968f61fab7a888f44f9d3371c2f3fea7e74a5a354e. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

<h2><b>Step 2 : Get list of label names </b></h2>

In [3]:
label_names = dataset["train"].features["ner_tags"].feature.names
label_names

['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC']

<h2><b>Step 3 : Tokenize dataset and adjust the labels</b></h2>
<ul>
  <li>encode method returns the required keys (input_ids, token_type_ids, attention_mask) but its not a Pytorch/hf dataset</li>
  <li>map allows adding new keys to existing splits of the hf dataset, so that's why map is used, no need to make a new dataset from the output of encode</li>
  <li>adjustment of labels needed as token like "Johnemma" will be split into "john" and "##emma" but label for it would still be "B-PER", to align it "john" will get label "B-PER" and ##emma gets label "B-PER"</li>
</ul>

In [4]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-multilingual-cased")

#Get the values for input_ids, token_type_ids, attention_mask
def tokenize_adjust_labels(all_samples_per_split):
  tokenized_samples = tokenizer.batch_encode_plus(all_samples_per_split["tokens"], is_split_into_words=True)
  #tokenized_samples is not a datasets object so this alone won't work with Trainer API, hence map is used 
  #so the new keys [input_ids, labels (after adjustment)]
  #can be added to the datasets dict for each train test validation split
  total_adjusted_labels = []
  print(len(tokenized_samples["input_ids"]))
  for k in range(0, len(tokenized_samples["input_ids"])):
    prev_wid = -1
    word_ids_list = tokenized_samples.word_ids(batch_index=k)
    existing_label_ids = all_samples_per_split["ner_tags"][k]
    i = -1
    adjusted_label_ids = []
    '''print(word_ids_list)
    print(existing_label_ids)
    print(all_samples_per_split["tokens"][k])
    print(tokenized_samples["input_ids"][k])'''
    for wid in word_ids_list:
      if(wid is None):
        adjusted_label_ids.append(-100)
      elif(wid!=prev_wid):
        i = i + 1
        adjusted_label_ids.append(existing_label_ids[i])
        prev_wid = wid
      else:
        label_name = label_names[existing_label_ids[i]]
        '''if(label_name == "O"):
          adjusted_label_ids.append(existing_label_ids[i])
        elif(label_name[0:2]=="B-"):
          adjusted_label_ids.append(label_names.index(label_name.replace("B-","I-")))
        else:
          adjusted_label_ids.append(existing_label_ids[i])'''
        adjusted_label_ids.append(existing_label_ids[i])
        
    total_adjusted_labels.append(adjusted_label_ids)
  tokenized_samples["labels"] = total_adjusted_labels
  return tokenized_samples

tokenized_dataset = dataset.map(tokenize_adjust_labels, batched=True)


Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/625 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/972k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.87M [00:00<?, ?B/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

1000


  0%|          | 0/1 [00:00<?, ?ba/s]

1000


  0%|          | 0/10 [00:00<?, ?ba/s]

1000
1000
1000
1000
1000
1000
1000
1000
1000
1000


In [5]:
tokenized_dataset["validation"][2]

{'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
 'input_ids': [101,
  978,
  12235,
  38044,
  40349,
  52245,
  950,
  21790,
  12079,
  89362,
  77045,
  117,
  978,
  12235,
  38044,
  40349,
  102],
 'labels': [-100, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 0, 5, 5, 5, 5, -100],
 'langs': ['bn', 'bn', 'bn', 'bn', 'bn'],
 'ner_tags': [5, 6, 6, 0, 5],
 'spans': ['LOC: সিডনি ক্রিকেট গ্রাউন্ড', 'LOC: সিডনি'],
 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 'tokens': ['সিডনি', 'ক্রিকেট', 'গ্রাউন্ড', ',', 'সিডনি']}

<h2><b>Step 4 : Pad the samples per split</b></h2>
<ul>
  <li>Each token list per sample will be split</li>
  <li>Sample x and sample y may not have same length so padding is needed
  <li>This will be used by Trainer API, this is the collate_fn equivalent from pytorch</li>
</ul>

In [6]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer)

<h2><b>Step 5 : Set up integration with Weights and Biases </b></h2>

In [7]:
!pip install wandb
!pip install seqeval

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
     |████████████████████████████████| 43 kB 197 kB/s            
[?25h  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25ldone
[?25h  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16181 sha256=e97a3acb615b55f02551d30c7dd53d609133960fe74ef0ba450f45c1

In [8]:
import os
import wandb
os.environ["WANDB_API_KEY"]="API KEY GOES HERE"
os.environ["WANDB_ENTITY"]="Suchandra"
os.environ["WANDB_PROJECT"]="ner_project"

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


<h2><b>Step 6 : Load model, define training_args, train</b></h2>

In [9]:
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer
import numpy as np
from datasets import load_metric
metric = load_metric("seqeval")
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    # Remove ignored index (special tokens)
    true_predictions = [
        [label_names[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_names[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = metric.compute(predictions=true_predictions, references=true_labels)
    flattened_results = {
        "overall_precision": results["overall_precision"],
        "overall_recall": results["overall_recall"],
        "overall_f1": results["overall_f1"],
        "overall_accuracy": results["overall_accuracy"],
    }
    for k in results.keys():
      if(k not in flattened_results.keys()):
        flattened_results[k+"_f1"]=results[k]["f1"]

    return flattened_results

model = AutoModelForTokenClassification.from_pretrained("bert-base-multilingual-cased", num_labels=len(label_names))
training_args = TrainingArguments(
    output_dir="./jan_14_all_2022",
    evaluation_strategy="steps",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=7,
    weight_decay=0.01,
    logging_steps = 1000,
    report_to="wandb",
    run_name = "ep_10_tokenized_11",
    save_strategy='no'
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

trainer.train()
wandb.finish()

Downloading:   0%|          | 0.00/2.48k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/681M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForTokenClassification: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at 

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Step,Training Loss,Validation Loss,Overall Precision,Overall Recall,Overall F1,Overall Accuracy,Loc F1,Org F1,Per F1
1000,0.3366,0.210477,0.940126,0.941262,0.940693,0.954287,0.934716,0.92431,0.961162
2000,0.1098,0.169222,0.954078,0.964225,0.959125,0.967807,0.956601,0.938577,0.9788
3000,0.0447,0.144407,0.968457,0.972202,0.970326,0.977293,0.968137,0.956946,0.983772
4000,0.0205,0.132434,0.968901,0.971477,0.970187,0.977965,0.969212,0.956831,0.982079


The following columns in the evaluation set  don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: spans, ner_tags, tokens, langs.
***** Running Evaluation *****
  Num examples = 1000
  Batch size = 16
The following columns in the evaluation set  don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: spans, ner_tags, tokens, langs.
***** Running Evaluation *****
  Num examples = 1000
  Batch size = 16
The following columns in the evaluation set  don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: spans, ner_tags, tokens, langs.
***** Running Evaluation *****
  Num examples = 1000
  Batch size = 16
The following columns in the evaluation set  don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: spans, ner_tags, tokens, langs.
***** Running Evaluation *****
  Num examples = 1000
  Batch size = 16


Training c

VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
eval/LOC_f1,▁▅██
eval/ORG_f1,▁▄██
eval/PER_f1,▁▆█▇
eval/loss,█▄▂▁
eval/overall_accuracy,▁▅██
eval/overall_f1,▁▅██
eval/overall_precision,▁▄██
eval/overall_recall,▁▆██
eval/runtime,▂▆▁█
eval/samples_per_second,▇▂█▁

0,1
eval/LOC_f1,0.96921
eval/ORG_f1,0.95683
eval/PER_f1,0.98208
eval/loss,0.13243
eval/overall_accuracy,0.97797
eval/overall_f1,0.97019
eval/overall_precision,0.9689
eval/overall_recall,0.97148
eval/runtime,2.2492
eval/samples_per_second,444.598


<h2><b>Step 7 : Check performance on test set</b></h2>

In [10]:
predictions, labels, _ = trainer.predict(tokenized_dataset["test"])
predictions = np.argmax(predictions, axis=2)
#Here we will see that the labels list will have lots of -100 in them however the corresponding label of the 
#tokenized_dataset doesnt have it, the reason is during DataCollator padding step, all padding tokens are added
#and assigned labels of -100 to get "ignored" in future computation of evaluation

# Remove ignored index (special tokens)
true_predictions = [
    [label_names[p] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]
true_labels = [
    [label_names[l] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]

results = metric.compute(predictions=true_predictions, references=true_labels)
results

The following columns in the test set  don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: spans, ner_tags, tokens, langs.
***** Running Prediction *****
  Num examples = 1000
  Batch size = 16


{'LOC': {'precision': 0.9700199866755497,
  'recall': 0.9642384105960264,
  'f1': 0.967120557954168,
  'number': 1510},
 'ORG': {'precision': 0.9548335974643423,
  'recall': 0.9725585149313963,
  'f1': 0.9636145541783286,
  'number': 1239},
 'PER': {'precision': 0.9716629381058911,
  'recall': 0.9702159344750558,
  'f1': 0.9709388971684053,
  'number': 1343},
 'overall_precision': 0.9658869395711501,
 'overall_recall': 0.9687194525904204,
 'overall_f1': 0.96730112249878,
 'overall_accuracy': 0.9744194618503501}

<h2><b>Step 8 : Save model for future use</b></h2>

In [11]:
model.save_pretrained("jan_14_all_2022")

Configuration saved in jan_14_all_2022/config.json
Model weights saved in jan_14_all_2022/pytorch_model.bin


In [12]:
saved_model = AutoModelForTokenClassification.from_pretrained("jan_14_all_2022")
saved_model

loading configuration file jan_14_all_2022/config.json
Model config BertConfig {
  "_name_or_path": "bert-base-multilingual-cased",
  "architectures": [
    "BertForTokenClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "directionality": "bidi",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4,
    "LABEL_5": 5,
    "LABEL_6": 6
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_h

BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwi

<h2><b>Step 9 : Predict on random sentences, no longer using Trainer API, model inputs expected to be tensors</b></h2>

In [19]:
import torch
random_sentence_from_internet = [ "মারভিন", "দি", "মারসিয়ান" ]  
input = tokenizer(random_sentence_from_internet, is_split_into_words=True, return_tensors='pt')
print(input)
output = saved_model(**input)
predictions = torch.nn.functional.softmax(output.logits, dim=-1)
predictions = predictions.detach().numpy()
predictions = np.argmax(predictions, axis=2)
print(predictions)

pred_names = [label_names[p] for p in predictions[0]]
for index, id in enumerate(input["input_ids"][0]):
  print("\nID: ", id, "Decoded ID: ", tokenizer.decode(id), "\tPred: ", pred_names[index])

{'input_ids': tensor([[  101, 18601, 11128, 80045, 11737,   965, 12235, 18601, 11128, 45733,
         96032,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}
[[2 1 1 1 1 2 2 2 2 2 2 2]]

ID:  tensor(101) Decoded ID:  [CLS] 	Pred:  I-PER

ID:  tensor(18601) Decoded ID:  মা 	Pred:  B-PER

ID:  tensor(11128) Decoded ID:  ##র 	Pred:  B-PER

ID:  tensor(80045) Decoded ID:  ##ভি 	Pred:  B-PER

ID:  tensor(11737) Decoded ID:  ##ন 	Pred:  B-PER

ID:  tensor(965) Decoded ID:  দ 	Pred:  I-PER

ID:  tensor(12235) Decoded ID:  ##ি 	Pred:  I-PER

ID:  tensor(18601) Decoded ID:  মা 	Pred:  I-PER

ID:  tensor(11128) Decoded ID:  ##র 	Pred:  I-PER

ID:  tensor(45733) Decoded ID:  ##সি 	Pred:  I-PER

ID:  tensor(96032) Decoded ID:  ##য়ান 	Pred:  I-PER

ID:  tensor(102) Decoded ID:  [SEP] 	Pred:  I-PER
