<a href="https://colab.research.google.com/github/Posfay/Named-Entity-Recognition-using-BERT/blob/main/NER_Training_and_Evaluation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Installs and imports

In [None]:
!pip install datasets -q
!pip install tokenizers -q
!pip install transformers -q
!pip install seqeval -q
!pip install evaluate -q

In [None]:
import logging

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import tensorflow as tf

from datasets import Dataset, Sequence, ClassLabel, Value
from datasets import load_dataset, load_from_disk
from datasets import load_metric

import evaluate
import transformers
from transformers import (
    CONFIG_MAPPING,
    AutoConfig,
    AutoTokenizer,
    DataCollatorForTokenClassification,
    HfArgumentParser,
    PushToHubCallback,
    TFAutoModelForTokenClassification,
    TFTrainingArguments,
    create_optimizer,
    set_seed,
    pipeline,
)

# `set_seed` seeds Python's `random`, NumPy and TensorFlow in one call, so the
# TensorFlow side of the run is seeded as well, not only NumPy.
SEED = 42
set_seed(SEED)

# Root logger, with INFO-level messages let through.
logger = logging.getLogger()
logger.setLevel(logging.INFO)

## Splitting the dataset

In [None]:
# The dataset is read from Google Drive, so Drive has to be mounted before this cell runs
dataset_dir = "/content/drive/MyDrive/sztaki_full_pretokenized_repaired"
not_tokenized_ds = load_from_disk(dataset_dir)

In [None]:
# Shuffled 80/20 split; the result holds a "train" and a "test" dataset
split_dataset = not_tokenized_ds.train_test_split(
    test_size=0.2,
    seed=42,
    shuffle=True,
)



In [None]:
# Display the summary of the training split (its features and number of rows)
split_dataset["train"]

Dataset({
    features: ['tokens', 'ner_tags'],
    num_rows: 1039987
})

In [None]:
# Two-way mapping between tag names and their integer ids (the model needs both directions)
label_names = split_dataset["train"].features["ner_tags"].feature.names
id2label = dict(enumerate(label_names))
label2id = {label: idx for idx, label in enumerate(label_names)}

## Training

In [None]:
# Hyperparameters
sentence_max_length = 256  # every tokenized sentence is padded / truncated to this many tokens
max_train_samples = 5000  # cap on the number of training samples; a value <= 0 disables the cap
max_eval_samples = 1000  # cap on the number of evaluation samples; a value <= 0 disables the cap
num_replicas = 1  # multiplier that turns per-device batch sizes into total batch sizes
per_device_train_batch_size = 16
per_device_eval_batch_size = 16
total_train_batch_size = per_device_train_batch_size * num_replicas
total_eval_batch_size = per_device_eval_batch_size * num_replicas
num_train_epochs = 2
learning_rate = 2e-5  # initial learning rate handed to the optimizer
warmup_steps = 0  # explicit number of warmup steps; takes precedence over warmup_ratio when > 0
warmup_ratio = 0  # share of the training steps used for warmup when warmup_steps is 0
# True: report every value returned by the metric, with nested per-entity results flattened;
# False: report only overall precision, recall, f1 and accuracy
return_entity_level_metrics = True
output_dir = "first_NER_model"  # directory the fine-tuned model is saved to

In [None]:
# Loading a tokenizer - this is a Hungarian pretrained model.
# The model itself is later loaded from the same checkpoint name.
tokenizer = AutoTokenizer.from_pretrained("SZTAKI-HLT/hubert-base-cc")

In [None]:
def tokenize_and_align_labels(all_samples_per_split):
    """Tokenize a batch of pre-split sentences and align the NER tags to the tokens.

    Written to be passed to ``map(..., batched=True)``. Reads the module-level
    ``tokenizer`` and ``sentence_max_length``.

    Args:
        all_samples_per_split: Batch with a ``"tokens"`` entry (one list of
            words per sentence) and a ``"ner_tags"`` entry (one list of label
            ids per sentence, one id per word).

    Returns:
        The tokenizer output with an added ``"labels"`` entry holding one
        label id per token: ``-100`` for tokens whose word id is ``None``
        (tokens that do not come from any input word), otherwise the label id
        of the word the token was produced from. Every sub-word piece of a
        word therefore carries that word's label.
    """
    tokenized_samples = tokenizer(
        all_samples_per_split["tokens"],
        max_length=sentence_max_length,
        padding="max_length",
        truncation=True,
        # We use this argument because the texts in our dataset are lists of words (with a label for each word).
        is_split_into_words=True,
    )

    # Read the column once instead of on every loop iteration.
    all_ner_tags = all_samples_per_split["ner_tags"]

    # labels replacing ner_tags in the dataset
    total_adjusted_labels = []

    for k in range(len(tokenized_samples["input_ids"])):
        word_label_ids = all_ner_tags[k]
        # The word id is used directly as the index into the sentence's labels.
        # A separate running counter would drift as soon as a word produces no
        # token at all, shifting the labels of every following word.
        # -100 marks positions that the loss function ignores.
        total_adjusted_labels.append(
            [
                -100 if word_idx is None else word_label_ids[word_idx]
                for word_idx in tokenized_samples.word_ids(batch_index=k)
            ]
        )

    # add adjusted labels to the tokenized dataset
    tokenized_samples["labels"] = total_adjusted_labels
    return tokenized_samples

In [None]:
# Limiting the number of train or eval samples if specified. The cap is applied
# BEFORE tokenization so that only the samples that are actually used get
# tokenized and padded; the kept rows are the same leading rows of each split.
raw_train_dataset = split_dataset["train"]
raw_eval_dataset = split_dataset["test"]

if max_train_samples > 0:
    max_train_samples = min(len(raw_train_dataset), max_train_samples)
    raw_train_dataset = raw_train_dataset.select(range(max_train_samples))

if max_eval_samples > 0:
    max_eval_samples = min(len(raw_eval_dataset), max_eval_samples)
    raw_eval_dataset = raw_eval_dataset.select(range(max_eval_samples))

# Rebuild a container of the same type as `split_dataset` so that both splits
# are still tokenized with a single `map` call.
selected_raw_datasets = type(split_dataset)(
    {"train": raw_train_dataset, "test": raw_eval_dataset}
)

# Tokenization on the (possibly reduced) datasets
processed_raw_datasets = selected_raw_datasets.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=split_dataset["train"].column_names,
    desc="Running tokenizer on dataset",
)

train_dataset = processed_raw_datasets["train"]
eval_dataset = processed_raw_datasets["test"]



In [None]:
# Loading a pretrained model for token classification - this is a Hungarian pretrained model.
# The checkpoint carries no trained classification head (see the message printed below):
# the `classifier` layer is newly initialized and has to be trained.
# The number of labels and both label mappings come from the dataset's tag names.
model = TFAutoModelForTokenClassification.from_pretrained(
    "SZTAKI-HLT/hubert-base-cc",
    num_labels=len(label_names),
    id2label=id2label,
    label2id=label2id,
)

All model checkpoint layers were used when initializing TFBertForTokenClassification.

Some layers of TFBertForTokenClassification were not initialized from the model checkpoint at SZTAKI-HLT/hubert-base-cc and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Grow the embedding matrix only when the tokenizer holds more entries than the
# model has embedding rows; this avoids index errors.
embedding_size = model.get_input_embeddings().weight.shape[0]
vocab_size = len(tokenizer)
if embedding_size < vocab_size:
    model.resize_token_embeddings(vocab_size)

In [None]:
# We need the DataCollatorForTokenClassification here,
# as we need to correctly pad labels as well as inputs.
# `return_tensors="tf"` requests TensorFlow tensors from the collator.
collate_fn = DataCollatorForTokenClassification(tokenizer=tokenizer, return_tensors="tf")

In [None]:
# tf.data options shared by the train and eval pipelines: automatic sharding is switched off.
dataset_options = tf.data.Options()
dataset_options.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.OFF

In [None]:
# Converting the HuggingFace datasets to tf.data.Dataset objects.
# Only the training data is shuffled; both pipelines get the shared options.
tf_train_dataset = model.prepare_tf_dataset(
    train_dataset,
    shuffle=True,
    batch_size=total_train_batch_size,
    collate_fn=collate_fn,
)
tf_train_dataset = tf_train_dataset.with_options(dataset_options)

tf_eval_dataset = model.prepare_tf_dataset(
    eval_dataset,
    shuffle=False,
    batch_size=total_eval_batch_size,
    collate_fn=collate_fn,
)
tf_eval_dataset = tf_eval_dataset.with_options(dataset_options)

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [None]:
# Total number of optimizer steps: batches per epoch times the number of epochs
steps_per_epoch = len(tf_train_dataset)
num_train_steps = int(steps_per_epoch * num_train_epochs)

# An explicit step count wins over the ratio; with neither set there is no warmup
if warmup_steps > 0:
    num_warmup_steps = warmup_steps
else:
    num_warmup_steps = int(num_train_steps * warmup_ratio) if warmup_ratio > 0 else 0

# Creating an optimizer for the model. Only the learning rate and the step
# counts are set; every other optimizer option keeps its default.
optimizer, lr_schedule = create_optimizer(
    num_warmup_steps=num_warmup_steps,
    num_train_steps=num_train_steps,
    init_lr=learning_rate,
)

# No loss is passed, so the model's internal loss computation is used
model.compile(optimizer=optimizer)

No loss specified in compile() - the model's internal loss computation will be used as the loss. Don't panic - this is a common way to train TensorFlow models in Transformers! To disable this behaviour please pass a loss argument, or explicitly pass `loss=None` if you do not want your model to compute a loss.


In [None]:
# Creating the evaluation function for the model

# seqeval metric object; predictions are added with `add_batch` and scored inside `compute_metrics`
metric = evaluate.load("seqeval")

def get_labels(y_pred, y_true):
    """Turn id sequences into tag-name sequences, dropping ignored positions.

    Args:
        y_pred: Predicted label ids, one sequence per sentence.
        y_true: Gold label ids aligned with ``y_pred``; a value of ``-100``
            marks a position that is left out of both outputs.

    Returns:
        A tuple ``(true_predictions, true_labels)``: for every sentence, the
        predicted and the gold tag names (looked up in the module-level
        ``label_names``) at the positions whose gold id is not ``-100``.
    """
    true_predictions = []
    true_labels = []
    for pred, gold_label in zip(y_pred, y_true):
        # Keep only the (prediction, gold) pairs at positions that are not ignored
        kept_pairs = [(p, l) for p, l in zip(pred, gold_label) if l != -100]
        true_predictions.append([label_names[p] for p, _ in kept_pairs])
        true_labels.append([label_names[l] for _, l in kept_pairs])
    return true_predictions, true_labels

def compute_metrics():
    """Score everything accumulated in the module-level ``metric``.

    Returns:
        A flat dict of results. With ``return_entity_level_metrics`` set,
        every entry of the metric output is kept and nested dicts are
        flattened into ``"<key>_<name>"`` entries. Otherwise only the overall
        precision, recall, f1 and accuracy are returned, under the keys
        ``"precision"``, ``"recall"``, ``"f1"`` and ``"accuracy"``.
    """
    results = metric.compute()

    if not return_entity_level_metrics:
        return {
            "precision": results["overall_precision"],
            "recall": results["overall_recall"],
            "f1": results["overall_f1"],
            "accuracy": results["overall_accuracy"],
        }

    # Unpack nested dictionaries into a single level
    final_results = {}
    for key, value in results.items():
        if isinstance(value, dict):
            final_results.update({f"{key}_{n}": v for n, v in value.items()})
        else:
            final_results[key] = value
    return final_results

In [None]:
# Log the run configuration before training starts
for message in (
    "***** Running training *****",
    f"  Num examples = {len(train_dataset)}",
    f"  Num Epochs = {num_train_epochs}",
    f"  Instantaneous batch size per device = {per_device_train_batch_size}",
    f"  Total train batch size = {total_train_batch_size}",
):
    logger.info(message)

# Train on the training pipeline and validate on the eval pipeline after each epoch
model.fit(
    tf_train_dataset,
    epochs=int(num_train_epochs),
    validation_data=tf_eval_dataset,
)

INFO:root:***** Running training *****
INFO:root:  Num examples = 5000
INFO:root:  Num Epochs = 2
INFO:root:  Instantaneous batch size per device = 16
INFO:root:  Total train batch size = 16


Epoch 1/2
  1/312 [..............................] - ETA: 4:39:29 - loss: 1.1885

KeyboardInterrupt: ignored

In [None]:
# Getting the predictions for the validation dataset. `tf_eval_dataset` already
# yields batches, so no `batch_size` is passed to `predict`.
logits = model.predict(tf_eval_dataset)["logits"]
# Leaving only the most likely label for each token
predictions = tf.math.argmax(logits, axis=-1).numpy()
# Gold labels and attention mask, both read from one formatted view of the eval set
eval_tensors = eval_dataset.with_format("tf")
labels = eval_tensors["labels"].numpy()
attention_mask = eval_tensors["attention_mask"].numpy()
# Hiding the predictions for any token that is hidden on the input
labels[attention_mask == 0] = -100
# Retrieving the true predictions and labels (excluding hidden tokens)
preds, refs = get_labels(predictions, labels)
metric.add_batch(
    predictions=preds,
    references=refs,
)
# Calculating and printing the metrics
eval_metric = compute_metrics()
logger.info("Evaluation metrics:")
for key, val in eval_metric.items():
    logger.info(f"{key}: {val:.4f}")



INFO:root:Evaluation metrics:
INFO:root:LOC_precision: 0.9272
INFO:root:LOC_recall: 0.9910
INFO:root:LOC_f1: 0.9580
INFO:root:LOC_number: 334.0000
INFO:root:MISC_precision: 0.8947
INFO:root:MISC_recall: 0.7727
INFO:root:MISC_f1: 0.8293
INFO:root:MISC_number: 22.0000
INFO:root:ORG_precision: 0.5652
INFO:root:ORG_recall: 0.7647
INFO:root:ORG_f1: 0.6500
INFO:root:ORG_number: 17.0000
INFO:root:PER_precision: 0.8908
INFO:root:PER_recall: 0.8465
INFO:root:PER_f1: 0.8681
INFO:root:PER_number: 241.0000
INFO:root:overall_precision: 0.8997
INFO:root:overall_recall: 0.9202
INFO:root:overall_f1: 0.9098
INFO:root:overall_accuracy: 0.9961


In [None]:
# Save the fine-tuned model together with its tokenizer, so that `output_dir`
# holds everything needed to load the model again later.
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

## Predicting with the model

In [None]:
# Wrap the in-memory fine-tuned model and its tokenizer in a HuggingFace
# pipeline to get a ready-to-use token classifier
token_classifier = pipeline(
    "token-classification",
    model=model,
    tokenizer=tokenizer,
    aggregation_strategy="simple",
)

In [None]:
# Hungarian example sentence, roughly: "Pista ate all the hamburgers in Los Angeles after meeting Trump"
input_text = "Pista megette az összes hamburgert Los Angelesben miután találkozott Trumppal"

In [None]:
# Run the pipeline on the example sentence; the result is a list with one dict
# per grouped span (entity_group, score, word, start, end)
token_classifier(input_text)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


[{'entity_group': 'LABEL_1',
  'score': 0.91885906,
  'word': 'Pista',
  'start': 0,
  'end': 5},
 {'entity_group': 'LABEL_0',
  'score': 0.9995634,
  'word': 'megette az összes hamburgert',
  'start': 6,
  'end': 34},
 {'entity_group': 'LABEL_3',
  'score': 0.961508,
  'word': 'Los Angelesben',
  'start': 35,
  'end': 49},
 {'entity_group': 'LABEL_0',
  'score': 0.9994347,
  'word': 'miután találkozott',
  'start': 50,
  'end': 68},
 {'entity_group': 'LABEL_1',
  'score': 0.9570334,
  'word': 'Trumppal',
  'start': 69,
  'end': 77}]