# Model **A** development notebook
The notebook contains the data preparation and model training for setup **A**

In [1]:
!pip install datasets
!pip install seqeval
!pip install accelerate -U

Collecting datasets
  Downloading datasets-2.15.0-py3-none-any.whl (521 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m521.2/521.2 kB[0m [31m11.0 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow-hotfix (from datasets)
  Downloading pyarrow_hotfix-0.6-py3-none-any.whl (7.9 kB)
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m18.5 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m20.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pyarrow-hotfix, dill, multiprocess, datasets
Successfully installed datasets-2.15.0 dill-0.3.7 multiprocess-0.70.15 pyarrow-hotfix-0.6
Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━

Restart notebook

In [1]:
import itertools

import datasets
import transformers
import torch

import numpy as np

from tqdm import tqdm
from torch.utils.data import DataLoader
from datasets import load_dataset
from datasets import Dataset
from transformers import AutoTokenizer, TrainingArguments, AutoModelForTokenClassification, Trainer
from transformers import trainer_utils, EarlyStoppingCallback, DataCollatorForTokenClassification
from seqeval.metrics import f1_score, precision_score, recall_score, classification_report
from seqeval.scheme import IOB1

## Load dataset from the Huggingface repository.

In [2]:
dataset = load_dataset('Babelscape/multinerd')
dataset.shape

Downloading readme:   0%|          | 0.00/5.66k [00:00<?, ?B/s]

Resolving data files:   0%|          | 0/20 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/20 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/20 [00:00<?, ?it/s]

Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/32.7M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/37.9M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/46.0M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/46.2M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/50.1M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/34.1M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/38.6M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/44.0M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/42.9M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/60.4M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/4.15M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/5.04M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/6.05M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/6.25M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/6.28M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/4.35M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/5.01M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/5.36M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/5.18M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/6.86M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/4.22M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/4.65M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/5.85M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/6.28M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/6.51M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/4.29M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/4.82M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/5.25M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/5.28M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/7.47M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

{'train': (2678400, 3), 'validation': (334800, 3), 'test': (335986, 3)}

### Use the predefined train, validation, and test splits.

In [3]:
train_dataset = dataset['train']
validation_dataset = dataset['validation']
test_dataset = dataset['test']
train_dataset

Dataset({
    features: ['tokens', 'ner_tags', 'lang'],
    num_rows: 2678400
})

### Preprocess the dataset.

In [4]:
# Select the english samples
train_dataset = train_dataset.filter(lambda example: example["lang"].startswith("en"))
validation_dataset = validation_dataset.filter(lambda example: example["lang"].startswith("en"))
test_dataset = test_dataset.filter(lambda example: example["lang"].startswith("en"))
train_dataset

Filter:   0%|          | 0/2678400 [00:00<?, ? examples/s]

Filter:   0%|          | 0/334800 [00:00<?, ? examples/s]

Filter:   0%|          | 0/335986 [00:00<?, ? examples/s]

Dataset({
    features: ['tokens', 'ner_tags', 'lang'],
    num_rows: 262560
})

In [5]:
# Remove any samples that do not have named entity labels
train_dataset = train_dataset.filter(lambda example: any(x != 0 for x in example["ner_tags"]))
validation_dataset = validation_dataset.filter(lambda example: any(x != 0 for x in example["ner_tags"]))
test_dataset = test_dataset.filter(lambda example: any(x != 0 for x in example["ner_tags"]))
train_dataset

Filter:   0%|          | 0/262560 [00:00<?, ? examples/s]

Filter:   0%|          | 0/32820 [00:00<?, ? examples/s]

Filter:   0%|          | 0/32908 [00:00<?, ? examples/s]

Dataset({
    features: ['tokens', 'ner_tags', 'lang'],
    num_rows: 262492
})

In [6]:
# Define the labels based on the dataset description from the repository
key2index = {
    "O": 0,
    "B-PER": 1,
    "I-PER": 2,
    "B-ORG": 3,
    "I-ORG": 4,
    "B-LOC": 5,
    "I-LOC": 6,
    "B-ANIM": 7,
    "I-ANIM": 8,
    "B-BIO": 9,
    "I-BIO": 10,
    "B-CEL": 11,
    "I-CEL": 12,
    "B-DIS": 13,
    "I-DIS": 14,
    "B-EVE": 15,
    "I-EVE": 16,
    "B-FOOD": 17,
    "I-FOOD": 18,
    "B-INST": 19,
    "I-INST": 20,
    "B-MEDIA": 21,
    "I-MEDIA": 22,
    "B-MYTH": 23,
    "I-MYTH": 24,
    "B-PLANT": 25,
    "I-PLANT": 26,
    "B-TIME": 27,
    "I-TIME": 28,
    "B-VEHI": 29,
    "I-VEHI": 30,
  }
# Check which labels are still present in the current dataset after preprocessing
tag_indices_list = set(list(itertools.chain(*train_dataset['ner_tags'])))
# Create the tag to index and index to tag dictionaries to use later in model training and evaluation
key2index = {key: value for key, value in key2index.items() if value in tag_indices_list}
index2key = {value: key for key, value in key2index.items()}
key2index

{'O': 0,
 'B-PER': 1,
 'I-PER': 2,
 'B-ORG': 3,
 'I-ORG': 4,
 'B-LOC': 5,
 'I-LOC': 6,
 'B-ANIM': 7,
 'I-ANIM': 8,
 'B-BIO': 9,
 'I-BIO': 10,
 'B-CEL': 11,
 'I-CEL': 12,
 'B-DIS': 13,
 'I-DIS': 14,
 'B-EVE': 15,
 'I-EVE': 16,
 'B-FOOD': 17,
 'I-FOOD': 18,
 'B-INST': 19,
 'I-INST': 20,
 'B-MEDIA': 21,
 'I-MEDIA': 22,
 'B-MYTH': 23,
 'I-MYTH': 24,
 'B-PLANT': 25,
 'I-PLANT': 26,
 'B-TIME': 27,
 'I-TIME': 28,
 'B-VEHI': 29,
 'I-VEHI': 30}

## Load the model.
The model in this assignment will be the english version of RoBERTa-base. This model has both a workable size for the resources available and is pretrained using masked language modeling on the English language which makes it a good choice for the task.

In [7]:
# Load the tokenizer, define max the max available length of the model based and the padding strategy
tokenizer = tokenizer = AutoTokenizer.from_pretrained(
    "roberta-base",
    model_max_length=512,
    max_len=512,
    truncation=True,
    add_prefix_space=True,
    padding='Longest')
# Load the model
model = AutoModelForTokenClassification.from_pretrained(
    "roberta-base",
    num_labels=len(list(index2key.keys())),
    label2id=key2index,
    id2label=index2key)

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Define some helper functions.

In [8]:
def preprocess_data(examples):
  """
  Processes text with the model tokenizer
  """
  text = examples['tokens']
  encodings = tokenizer(
      text,
      is_split_into_words=True,
      return_offsets_mapping=True,
      padding=True,
      truncation=True
      )
  return encodings

def encode_tags(examples):
  """
  Processes the labels for the task, setting -100 as a label for the subtokens
  and special tokens added from the tokenizer. Using -100 will indicate to the
  loss function that these tokens should not contribute to the computation of
  the loss function.
  """
  labels = examples['ner_tags']
  mapping = examples['offset_mapping']
  enc_labels = np.ones(len(mapping), dtype=int) * -100
  arr_offset = np.array(mapping)
  try:
      enc_labels[(arr_offset[:, 0] == 0) & (arr_offset[:, 1] != 0)] = labels
  except:
      label_idx = 0
      for i in range(arr_offset.shape[0]):
          if (arr_offset[i, 0] == 0) & (arr_offset[i, 1] != 0):
              try:
                  enc_labels[i] = labels[label_idx]
                  label_idx += 1
              except:
                  pass
  examples['labels'] = enc_labels.tolist()
  return examples

def align_predictions(predictions: np.ndarray, label_ids: np.ndarray):
  """
  Helper function to align the predictions and labels, removing the ignored
  tokens.
  """
  preds = np.argmax(predictions, axis=2)
  batch_size, seq_len = preds.shape
  out_label_list = [[] for _ in range(batch_size)]
  preds_list = [[] for _ in range(batch_size)]
  for i in range(batch_size):
      for j in range(seq_len):
          if label_ids[i, j] != torch.nn.CrossEntropyLoss().ignore_index:
              out_label_list[i].append(index2key[label_ids[i][j]])
              preds_list[i].append(index2key[preds[i][j]])
  return preds_list, out_label_list

def compute_metrics(p):
  """
  Helper function to compute the metrics during the training process.
  """
  Y_pred, Y_true = align_predictions(p.predictions, p.label_ids)

  return {
      "precision": precision_score(Y_true, Y_pred),
      "recall": recall_score(Y_true, Y_pred),
      "f1-IOB": f1_score(Y_true, Y_pred, scheme=IOB1, average='weighted')
  }

def get_device(dev=None):
  """
  Helper function to set the device for the evaluation function.
  """
  if dev is None:
      if torch.cuda.is_available():
          return 'cuda:0'
      else:
          return 'cpu'
  else:
      return dev

@torch.no_grad()
def evaluate(model, collator: DataCollatorForTokenClassification, dataset: Dataset):
  """
  Function to evaluate the test set.
  """
  dev = get_device('cpu')
  model = model.to(dev)
  test_dataloader = DataLoader(dataset, collate_fn=collator, batch_size=32,
                                drop_last=True)
  logits, label_ids = [], []
  for sample in tqdm(test_dataloader, desc=f'Evaluation'):
      sample = {k: v.to(dev) for k, v in sample.items()}
      logits.append(model(**sample).logits.to('cpu').numpy())
      label_ids.append(sample['labels'].to('cpu').numpy())

  predictions, labels = align_eval_predictions(logits, label_ids)
  return predictions, labels

def align_eval_predictions(logits: list, label_ids: list):
  """
  Helper function to align the predictions and labels, removing the ignored
  tokens. It will return flat lists of the labels and corresponding predictions.
  """
  predictions, labels = [], []
  for batch in range(len(logits)):
      for sample_logits, sample_label_ids in zip(logits[batch], label_ids[batch]):
          sample_prediction_ids = np.argmax(sample_logits, axis=1)
          sample_predictions, sample_labels = [], []
          for prediction_id, label_id in zip(sample_prediction_ids, sample_label_ids):
              if label_id != torch.nn.CrossEntropyLoss().ignore_index:
                  sample_predictions.append(index2key[prediction_id])
                  sample_labels.append(index2key[label_id])
          predictions.append(sample_predictions)
          labels.append(sample_labels)
  return predictions, labels

In [9]:
# Preprocessing the data with the tokenizer
train_dataset = train_dataset.map(preprocess_data, batched=False, remove_columns=['tokens', 'lang'])
validation_dataset = validation_dataset.map(preprocess_data, batched=False, remove_columns=['tokens', 'lang'])
test_dataset = test_dataset.map(preprocess_data, batched=False, remove_columns=['tokens', 'lang'])

# Preprocessing the tags
train_dataset = train_dataset.map(encode_tags, batched=False, remove_columns=['ner_tags', 'offset_mapping'])
validation_dataset = validation_dataset.map(encode_tags, batched=False, remove_columns=['ner_tags', 'offset_mapping'])
test_dataset = test_dataset.map(encode_tags, batched=False, remove_columns=['ner_tags', 'offset_mapping'])
train_dataset

Map:   0%|          | 0/262492 [00:00<?, ? examples/s]

Map:   0%|          | 0/32810 [00:00<?, ? examples/s]

Map:   0%|          | 0/32892 [00:00<?, ? examples/s]

Map:   0%|          | 0/262492 [00:00<?, ? examples/s]

Map:   0%|          | 0/32810 [00:00<?, ? examples/s]

Map:   0%|          | 0/32892 [00:00<?, ? examples/s]

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 262492
})

## Define the Trainer and the arguments that will be used in training.

In [10]:
# Collator to use for serving batches to the model
collator = DataCollatorForTokenClassification(tokenizer=tokenizer, padding=True)

# Training arguments
training_args = TrainingArguments(
    output_dir='./finetuned_model',
    overwrite_output_dir=True,
    num_train_epochs=10,
    per_device_train_batch_size=8,
    gradient_accumulation_steps=8,
    per_device_eval_batch_size=64,
    save_total_limit=1,
    learning_rate=3e-5,
    evaluation_strategy=trainer_utils.IntervalStrategy.EPOCH,
    save_strategy=trainer_utils.IntervalStrategy.EPOCH,
    load_best_model_at_end=True,
    lr_scheduler_type=transformers.SchedulerType.CONSTANT
    )

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    callbacks=[EarlyStoppingCallback],
    data_collator=collator,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,
    compute_metrics=compute_metrics
    )

## Train the model

In [11]:
trainer.train()

# Save the best model
trainer.save_model(output_dir='./final_model')

# Evaluate the best model on the test set
predictions, labels = evaluate(model=trainer.model,
                               collator=collator,
                               dataset=test_dataset)

# Compute the classification report on the test set
report = classification_report(labels, predictions)
# Save the classification report
with open('./test_results.txt', 'w') as handle:
  print(report, file=handle)

print(report)

You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1-iob
0,0.0321,0.044664,0.892266,0.910871,0.903068
2,0.0222,0.046516,0.911489,0.911524,0.91119


Evaluation: 100%|██████████| 1027/1027 [55:05<00:00,  3.22s/it]


              precision    recall  f1-score   support

        ANIM       0.69      0.77      0.73      3208
         BIO       0.60      0.38      0.46        16
         CEL       0.73      0.80      0.77        82
         DIS       0.67      0.84      0.74      1517
         EVE       0.89      0.95      0.92       704
        FOOD       0.62      0.63      0.63      1132
        INST       0.53      0.75      0.62        24
         LOC       0.99      0.99      0.99     24015
       MEDIA       0.94      0.98      0.96       916
        MYTH       0.89      0.75      0.81        64
         ORG       0.98      0.96      0.97      6614
         PER       0.98      1.00      0.99     10525
       PLANT       0.60      0.66      0.63      1788
        TIME       0.79      0.81      0.80       578
        VEHI       0.88      0.88      0.88        64

   micro avg       0.93      0.95      0.94     51247
   macro avg       0.78      0.81      0.79     51247
weighted avg       0.93   

In [12]:
!nvidia-smi

Mon Dec  4 11:01:52 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.105.17   Driver Version: 525.105.17   CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   65C    P0    28W /  70W |  10975MiB / 15360MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

## Save a dictionary with the test set labels and predictions.

In [14]:
import pickle as pkl

outputs = {
  'predictions': predictions,
  'labels': labels,
  'key2index': key2index,
  'index2key': index2key
}

with open('./model_outputs.pkl', 'wb') as handle:
  pkl.dump(outputs, handle)