# Train BERT for Named Entity Recognition (exercise)

In this exercise, we'll create a Finnish named entity recognition model by fine-tuning BERT for the task.

You should already be familiar with most of this code from the exercise on [sentiment classification with BERT](https://moodle.utu.fi/mod/url/view.php?id=1095675) and the course material on Named Entity Recognition. However, some settings are missing or far from optimal. The goals of the exercise are:

1. Find a Finnish BERT model in the Hugging Face [model repository](https://huggingface.co/models) and the Turku NER corpus in the [dataset repository](https://huggingface.co/datasets)
2. Modify the code to use the Finnish BERT model and fine-tune on the Turku NER corpus
3. Modify the hyperparameters to optimize the performance of your Finnish NER tagger. What performance (F-score) can you achieve, and what are the best settings you found?
4. Try to find at least three example sentences (e.g. from Wikipedia or news) where the tagger makes a mistake. Can you explain the errors of the system?

Install libraries

In [None]:
!pip --quiet install transformers
!pip --quiet install datasets
!pip --quiet install seqeval

Imports and global settings

In [2]:
from transformers import AutoTokenizer
from transformers import AutoModelForTokenClassification
from transformers import TrainingArguments
from transformers import Trainer
from transformers import DataCollatorForTokenClassification
from datasets import load_dataset, load_metric


# Hugging Face identifiers. Both are intentionally left empty: choosing a
# Finnish BERT model and the Turku NER corpus is part of the exercise.
MODEL_NAME = ''    # passed to from_pretrained() for both tokenizer and model
DATASET = ''    # passed to load_dataset()
MAX_LENGTH=128    # inputs are truncated to at most this many tokens when encoded
# Label given to tokens that must not be scored: special tokens, non-initial
# word pieces and label padding. compute_metrics filters on this value too.
DUMMY_LABEL_ID = -100    # Don't change this!

# Fine-tuning configuration handed to the Trainer. These are starting values;
# tuning them is one of the goals of the exercise.
train_args = TrainingArguments(
    'output_dir',    # first positional argument: the output directory
    save_strategy='no',    # do not write checkpoints
    evaluation_strategy='epoch',    # evaluate once per epoch
    logging_strategy='epoch',    # log once per epoch
    learning_rate=1e-5,
    per_device_train_batch_size=4,
    num_train_epochs=1,
)

Load dataset

In [None]:
# Load the corpus identified by DATASET.
dataset = load_dataset(DATASET)
# Tag names read from the 'ner_tags' feature of the training split; the code
# below indexes this list with integer label ids to recover tag strings.
label_list = dataset["train"].features['ner_tags'].feature.names
num_labels = len(label_list)    # passed to the model as the number of output classes

Downsample training data

In [None]:
# Keep only every 10th training example (indices 0, 10, 20, ...).
dataset['train'] = dataset['train'].filter(lambda example, idx: idx % 10 == 0, with_indices=True)

Load tokenizer and model

In [None]:
# A fast tokenizer is requested because the encoding code relies on
# word_ids() to map tokens back to the words they came from.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)
# Pretrained model with a token-classification head producing num_labels classes.
model = AutoModelForTokenClassification.from_pretrained(MODEL_NAME, num_labels=num_labels)

Encode data

In [None]:
def encode_dataset(data):
    """Tokenize one pre-split example and attach per-token labels.

    The words in ``data['tokens']`` are tokenized (truncated to MAX_LENGTH
    tokens). The first token of every word receives that word's entry from
    ``data['ner_tags']``; tokens without a word and the remaining tokens of
    a word receive DUMMY_LABEL_ID.

    Returns the tokenizer output with an added ``'labels'`` entry.
    """
    encoding = tokenizer(
        data['tokens'],
        is_split_into_words=True,
        truncation=True,
        max_length=MAX_LENGTH
    )
    word_ids = list(encoding.word_ids())
    # Pair every token's word index with the word index of the token before it.
    preceding = [None] + word_ids[:-1]
    encoding['labels'] = [
        data['ner_tags'][current]
        if current is not None and current != before
        else DUMMY_LABEL_ID
        for before, current in zip(preceding, word_ids)
    ]
    return encoding


# Run encode_dataset over the dataset; the 'train' and 'validation' splits of
# the result are what the Trainer consumes.
encoded_dataset = dataset.map(encode_dataset)

Metrics

In [None]:
# Entity-level scorer used by compute_metrics.
# NOTE(review): load_metric is deprecated in recent `datasets` releases in
# favour of the separate `evaluate` package - confirm it is still available
# in the installed version.
seq_eval_metrics = load_metric('seqeval')


def compute_metrics(pred):
    """Score predictions with seqeval, ignoring dummy-labelled positions.

    ``pred.predictions`` is reduced with argmax over axis 2 to get predicted
    label ids, which are compared against ``pred.label_ids``. Positions whose
    true label is DUMMY_LABEL_ID are dropped from both sequences before the
    ids are mapped to tag names through ``label_list``.

    Returns a dict with overall 'precision', 'recall', 'f1' and 'accuracy'.
    """
    gold_ids = pred.label_ids
    guess_ids = pred.predictions.argmax(axis=2)

    gold_tags, guess_tags = [], []
    for guess_row, gold_row in zip(guess_ids, gold_ids):
        # Filter once per sentence, then split into the two parallel tag lists.
        scored = [
            (guess, gold)
            for guess, gold in zip(guess_row, gold_row)
            if gold != DUMMY_LABEL_ID
        ]
        gold_tags.append([label_list[gold] for _, gold in scored])
        guess_tags.append([label_list[guess] for guess, _ in scored])

    scores = seq_eval_metrics.compute(predictions=guess_tags, references=gold_tags)
    return {
        'precision': scores['overall_precision'],
        'recall': scores['overall_recall'],
        'f1': scores['overall_f1'],
        'accuracy': scores['overall_accuracy'],
    }

Training

In [None]:
# Collator that batches examples for token classification; padded label
# positions get DUMMY_LABEL_ID, the same value used for unscored tokens.
data_collator = DataCollatorForTokenClassification(
    tokenizer,
    label_pad_token_id=DUMMY_LABEL_ID,
)

# Fine-tune on the (downsampled) training split and evaluate on the
# validation split, reporting the metrics from compute_metrics.
trainer = Trainer(
    model,
    train_args,
    train_dataset=encoded_dataset['train'],
    eval_dataset=encoded_dataset['validation'],
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

trainer.train()

Evaluation

In [None]:
# Evaluate on the Trainer's eval dataset (the validation split) and print the
# metrics as percentages; the keys are compute_metrics' names with an
# 'eval_' prefix.
results = trainer.evaluate()
print(f'Accuracy : {results["eval_accuracy"]:.1%}\n'
      f'Precision: {results["eval_precision"]:.1%}\n'
      f'Recall   : {results["eval_recall"]:.1%}\n'
      f'F1-score : {results["eval_f1"]:.1%}')

Helper functions

In [10]:
# Take the fine-tuned model out of the Trainer for use in predict_ner.
model = trainer.model
model.eval()    # switch to evaluation mode
model.to('cpu')    # switch to CPU


def word_start_tokens(tokenized):
    """Return a list of bool, one per token, marking word-initial tokens.

    A token is word-initial when its entry in ``tokenized.word_ids()`` is
    not None and differs from the entry of the token directly before it.
    """
    word_ids = list(tokenized.word_ids())
    # Shift by one so each token is compared with its predecessor.
    preceding = [None] + word_ids[:-1]
    return [
        current is not None and current != before
        for before, current in zip(preceding, word_ids)
    ]


def predict_ner(words):
    """Predict one NER tag per word for a single pre-split sentence.

    The words are tokenized, run through the model, and the highest-scoring
    label of each token is looked up in ``label_list``. Only the label of
    the first token of every word is kept.

    Returns a list of tag strings.
    """
    encoding = tokenizer(words, is_split_into_words=True, return_tensors='pt')
    output = model(**encoding)
    best_ids = output.logits.detach().numpy().argmax(axis=2)
    # Flatten the batch dimension into one label per token.
    per_token = [label_list[i] for row in best_ids for i in row]
    starts = word_start_tokens(encoding)
    return [tag for tag, first in zip(per_token, starts) if first]

Visualization

In [None]:
from spacy import displacy


# Whitespace-separated sentences to tag and visualize below.
example_sentences = [
    'Turku on Suomen vanhin kaupunki',
    'Larry Page ja Sergey Brin ovat Googlen perustajat',
]

# Mapping of CoNLL'03 types for displacy: entity types listed here are
# renamed before rendering, all others are passed through unchanged.
type_map = {
    'PER': 'PERSON',
}

def render_with_displacy(words, tags):
    """Render word-level NER tags as highlighted entity spans with displacy.

    The words are joined with single spaces, and each run of a B- tag (or a
    stray I- tag) followed by I- tags becomes one character-offset span.
    Span labels are the tag's type, renamed through ``type_map`` if listed.

    Args:
        words: list of word strings.
        tags: list of tags ('O', 'B-TYPE' or 'I-TYPE'), one per word.

    Raises:
        AssertionError: if a tag is not 'O' and does not start with B or I.
    """
    tagged, offset, start, label = [], 0, None, None
    for word, tag in zip(words, tags):
        if tag[0] in 'OB' and start is not None:    # current ends
            # offset is where the current word starts; step back over the
            # separating space so the span ends with the previous word.
            tagged.append({
                'start': start,
                'end': offset - 1,
                'label': type_map.get(label, label)
            })
            start, label = None, None
        if tag[0] == 'B':
            start, label = offset, tag[2:]
        elif tag[0] == 'I':
            if start is None:    # I without B, but nevermind
                start, label = offset, tag[2:]
        else:
            assert tag == 'O', 'unexpected tag {}'.format(tag)
        offset += len(word) + 1    # +1 for space
    # Compare with None: a span starting at offset 0 is falsy but still open.
    if start is not None:    # span open at sentence end
        tagged.append({
            'start': start,
            'end': offset - 1,    # drop the +1 added after the last word
            'label': type_map.get(label, label)
        })
    doc = {
        'text': ' '.join(words),
        'ents': tagged
    }
    displacy.render(doc, style='ent', jupyter=True, manual=True)


# Tag every example sentence and display the predicted entities.
for e in example_sentences:
    words = e.split()    # Note: assumes white-space tokenization is OK
    ner_tags = predict_ner(words)
    render_with_displacy(words, ner_tags)