# Model **B** development notebook
The notebook contains the data preparation and model training for setup **B**

In [1]:
!pip install datasets
!pip install seqeval
!pip install accelerate -U



Restart notebook

In [2]:
import itertools

import datasets
import transformers
import torch

import numpy as np

from tqdm import tqdm
from torch.utils.data import DataLoader
from datasets import load_dataset
from datasets import Dataset
from transformers import AutoTokenizer, TrainingArguments, AutoModelForTokenClassification, Trainer
from transformers import trainer_utils, EarlyStoppingCallback, DataCollatorForTokenClassification
from seqeval.metrics import f1_score, precision_score, recall_score, classification_report
from seqeval.scheme import IOB1

## Load dataset from the Huggingface repository.

In [3]:
dataset = load_dataset('Babelscape/multinerd')
dataset.shape

Resolving data files:   0%|          | 0/20 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/20 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/20 [00:00<?, ?it/s]

{'train': (2678400, 3), 'validation': (334800, 3), 'test': (335986, 3)}

### Use the predefined train, validation, and test splits.

In [4]:
train_dataset = dataset['train']
validation_dataset = dataset['validation']
test_dataset = dataset['test']
train_dataset

Dataset({
    features: ['tokens', 'ner_tags', 'lang'],
    num_rows: 2678400
})

### Preprocess the dataset.

In [5]:
# Select the english samples
train_dataset = train_dataset.filter(lambda example: example["lang"].startswith("en"))
validation_dataset = validation_dataset.filter(lambda example: example["lang"].startswith("en"))
test_dataset = test_dataset.filter(lambda example: example["lang"].startswith("en"))
train_dataset

Dataset({
    features: ['tokens', 'ner_tags', 'lang'],
    num_rows: 262560
})

In [6]:
# Select for the labels that are to be present in model B
key2index = {
    "O": 0,
    "B-PER": 1,
    "I-PER": 2,
    "B-ORG": 3,
    "I-ORG": 4,
    "B-LOC": 5,
    "I-LOC": 6,
    "B-ANIM": 7,
    "I-ANIM": 8,
    "B-DIS": 13,
    "I-DIS": 14
  }

# Map old labels to new
oldindex2index = {
    0: 0,
    1: 1,
    2: 2,
    3: 3,
    4: 4,
    5: 5,
    6: 6,
    7: 7,
    8: 8,
    13: 9,
    14: 10
  }

# Create the tag to index and index to tag dictionaries to use later in model training and evaluation
index2key = {value: list(key2index.keys())[value] for value in oldindex2index.values()}
new_key2index = {value: key for key, value in index2key.items()}

In [7]:
# Change the labels that are to be excluded with 0
train_dataset = train_dataset.map(lambda x: {'ner_tags': [i if i in key2index.values() else 0 for i in x['ner_tags'] ]})
validation_dataset = validation_dataset.map(lambda x: {'ner_tags': [i if i in key2index.values() else 0 for i in x['ner_tags'] ]})
test_dataset = test_dataset.map(lambda x: {'ner_tags': [i if i in key2index.values() else 0 for i in x['ner_tags'] ]})

# Change the remaining label indices with new ones
train_dataset = train_dataset.map(lambda x: {'ner_tags': [oldindex2index[i] for i in x['ner_tags']]})
validation_dataset = validation_dataset.map(lambda x: {'ner_tags': [oldindex2index[i] for i in x['ner_tags']]})
test_dataset = test_dataset.map(lambda x: {'ner_tags': [oldindex2index[i] for i in x['ner_tags']]})

In [8]:
# Remove any samples that do not have named entity labels
train_dataset = train_dataset.filter(lambda example: any(x != 0 for x in example["ner_tags"]))
validation_dataset = validation_dataset.filter(lambda example: any(x != 0 for x in example["ner_tags"]))
test_dataset = test_dataset.filter(lambda example: any(x != 0 for x in example["ner_tags"]))
train_dataset

Dataset({
    features: ['tokens', 'ner_tags', 'lang'],
    num_rows: 221446
})

## Load the model.
The model in this assignment will be the english version of RoBERTa-base. This model has both a workable size for the resources available and is pretrained using masked language modeling on the English language which makes it a good choice for the task.

In [9]:
# Load the tokenizer, define max the max available length of the model based and the padding strategy
tokenizer = tokenizer = AutoTokenizer.from_pretrained(
    "roberta-base",
    model_max_length=512,
    max_len=512,
    truncation=True,
    add_prefix_space=True,
    padding='Longest')
# Load the model
model = AutoModelForTokenClassification.from_pretrained(
    "roberta-base",
    num_labels=len(list(index2key.keys())),
    label2id=new_key2index,
    id2label=index2key)

Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Define some helper functions.

In [10]:
def preprocess_data(examples):
  """
  Processes text with the model tokenizer
  """
  text = examples['tokens']
  encodings = tokenizer(
      text,
      is_split_into_words=True,
      return_offsets_mapping=True,
      padding=True,
      truncation=True
      )
  return encodings

def encode_tags(examples):
  """
  Processes the labels for the task, setting -100 as a label for the subtokens
  and special tokens added from the tokenizer. Using -100 will indicate to the
  loss function that these tokens should not contribute to the computation of
  the loss function.
  """
  labels = examples['ner_tags']
  mapping = examples['offset_mapping']
  enc_labels = np.ones(len(mapping), dtype=int) * -100
  arr_offset = np.array(mapping)
  try:
      enc_labels[(arr_offset[:, 0] == 0) & (arr_offset[:, 1] != 0)] = labels
  except:
      label_idx = 0
      for i in range(arr_offset.shape[0]):
          if (arr_offset[i, 0] == 0) & (arr_offset[i, 1] != 0):
              try:
                  enc_labels[i] = labels[label_idx]
                  label_idx += 1
              except:
                  pass
  examples['labels'] = enc_labels.tolist()
  return examples

def align_predictions(predictions: np.ndarray, label_ids: np.ndarray):
  """
  Helper function to align the predictions and labels, removing the ignored
  tokens.
  """
  preds = np.argmax(predictions, axis=2)
  batch_size, seq_len = preds.shape
  out_label_list = [[] for _ in range(batch_size)]
  preds_list = [[] for _ in range(batch_size)]
  for i in range(batch_size):
      for j in range(seq_len):
          if label_ids[i, j] != torch.nn.CrossEntropyLoss().ignore_index:
              out_label_list[i].append(index2key[label_ids[i][j]])
              preds_list[i].append(index2key[preds[i][j]])
  return preds_list, out_label_list

def compute_metrics(p):
  """
  Helper function to compute the metrics during the training process.
  """
  Y_pred, Y_true = align_predictions(p.predictions, p.label_ids)

  return {
      "precision": precision_score(Y_true, Y_pred),
      "recall": recall_score(Y_true, Y_pred),
      "f1-IOB": f1_score(Y_true, Y_pred, scheme=IOB1, average='weighted')
  }

def get_device(dev=None):
  """
  Helper function to set the device for the evaluation function.
  """
  if dev is None:
      if torch.cuda.is_available():
          return 'cuda:0'
      else:
          return 'cpu'
  else:
      return dev

@torch.no_grad()
def evaluate(model, collator: DataCollatorForTokenClassification, dataset: Dataset):
  """
  Function to evaluate the test set.
  """
  dev = get_device('cpu')
  model = model.to(dev)
  test_dataloader = DataLoader(dataset, collate_fn=collator, batch_size=32,
                                drop_last=True)
  logits, label_ids = [], []
  for sample in tqdm(test_dataloader, desc=f'Evaluation'):
      sample = {k: v.to(dev) for k, v in sample.items()}
      logits.append(model(**sample).logits.to('cpu').numpy())
      label_ids.append(sample['labels'].to('cpu').numpy())

  predictions, labels = align_eval_predictions(logits, label_ids)
  return predictions, labels

def align_eval_predictions(logits: list, label_ids: list):
  """
  Helper function to align the predictions and labels, removing the ignored
  tokens. It will return flat lists of the labels and corresponding predictions.
  """
  predictions, labels = [], []
  for batch in range(len(logits)):
      for sample_logits, sample_label_ids in zip(logits[batch], label_ids[batch]):
          sample_prediction_ids = np.argmax(sample_logits, axis=1)
          sample_predictions, sample_labels = [], []
          for prediction_id, label_id in zip(sample_prediction_ids, sample_label_ids):
              if label_id != torch.nn.CrossEntropyLoss().ignore_index:
                  sample_predictions.append(index2key[prediction_id])
                  sample_labels.append(index2key[label_id])
          predictions.append(sample_predictions)
          labels.append(sample_labels)
  return predictions, labels

In [11]:
# Preprocessing the data with the tokenizer
train_dataset = train_dataset.map(preprocess_data, batched=False, remove_columns=['tokens', 'lang'])
validation_dataset = validation_dataset.map(preprocess_data, batched=False, remove_columns=['tokens', 'lang'])
test_dataset = test_dataset.map(preprocess_data, batched=False, remove_columns=['tokens', 'lang'])

# Preprocessing the tags
train_dataset = train_dataset.map(encode_tags, batched=False, remove_columns=['ner_tags', 'offset_mapping'])
validation_dataset = validation_dataset.map(encode_tags, batched=False, remove_columns=['ner_tags', 'offset_mapping'])
test_dataset = test_dataset.map(encode_tags, batched=False, remove_columns=['ner_tags', 'offset_mapping'])
train_dataset

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 221446
})

## Define the Trainer and the arguments that will be used in training.

In [12]:
# Collator to use for serving batches to the model
collator = DataCollatorForTokenClassification(tokenizer=tokenizer, padding=True)

# Training arguments
training_args = TrainingArguments(
    output_dir='./finetuned_model',
    overwrite_output_dir=True,
    num_train_epochs=10,
    per_device_train_batch_size=8,
    gradient_accumulation_steps=8,
    per_device_eval_batch_size=64,
    save_total_limit=1,
    learning_rate=3e-5,
    evaluation_strategy=trainer_utils.IntervalStrategy.EPOCH,
    save_strategy=trainer_utils.IntervalStrategy.EPOCH,
    load_best_model_at_end=True,
    lr_scheduler_type=transformers.SchedulerType.CONSTANT
    )

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    callbacks=[EarlyStoppingCallback],
    data_collator=collator,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,
    compute_metrics=compute_metrics
    )

## Train the model

In [13]:
trainer.train()

# Save the best model
trainer.save_model(output_dir='./final_model')

You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1-iob
0,0.0191,0.025449,0.943466,0.957941,0.952264
1,0.0119,0.025271,0.944735,0.965731,0.957043
2,0.0075,0.030214,0.957632,0.951728,0.953854


## Evaluate the model on the test set

In [14]:
# Evaluate the best model on the test set
predictions, labels = evaluate(model=trainer.model,
                               collator=collator,
                               dataset=test_dataset)

# Compute the classification report on the test set
report = classification_report(labels, predictions)
# Save the classification report
with open('./test_results.txt', 'w') as handle:
  print(report, file=handle)

print(report)

Evaluation: 100%|██████████| 922/922 [1:19:10<00:00,  5.15s/it]


              precision    recall  f1-score   support

        ANIM       0.68      0.84      0.75      3208
         DIS       0.71      0.81      0.76      1518
         LOC       0.99      0.99      0.99     24043
         ORG       0.96      0.99      0.98      6616
         PER       0.99      1.00      0.99     10529

   micro avg       0.95      0.97      0.96     45914
   macro avg       0.87      0.93      0.89     45914
weighted avg       0.96      0.97      0.97     45914



In [15]:
!nvidia-smi

Mon Dec  4 21:27:42 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.105.17   Driver Version: 525.105.17   CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   71C    P0    31W /  70W |  11453MiB / 15360MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

## Save a dictionary with the test set labels and predictions.

In [16]:
import pickle as pkl

outputs = {
  'predictions': predictions,
  'labels': labels,
  'key2index': new_key2index,
  'index2key': index2key
}

with open('./model_outputs.pkl', 'wb') as handle:
  pkl.dump(outputs, handle)