# Named Entity Recognition using BERT


In [None]:
!pip install transformers datasets sklearn

In [None]:
import numpy as np

import datasets
from transformers import BertTokenizer
from transformers import BertForTokenClassification, Trainer, TrainingArguments

import torch
from sklearn.metrics import f1_score

from random import shuffle


## 1. data load

In [3]:
# load dataset with huggingface load_dataset 
dataset = datasets.load_dataset('polyglot_ner', 'de', split='train[:5000]')

Downloading builder script:   0%|          | 0.00/6.01k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/86.1k [00:00<?, ?B/s]

Downloading and preparing dataset polyglot_ner/de (download: 1.03 GiB, generated: 149.48 MiB, post-processed: Unknown size, total: 1.18 GiB) to /root/.cache/huggingface/datasets/polyglot_ner/de/1.0.0/bb2e45c90cd345c87dfd757c8e2b808b78b0094543b511ac49bc0129699609c1...


Downloading data:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

Generating train split:   0%|          | 0/547578 [00:00<?, ? examples/s]

Dataset polyglot_ner downloaded and prepared to /root/.cache/huggingface/datasets/polyglot_ner/de/1.0.0/bb2e45c90cd345c87dfd757c8e2b808b78b0094543b511ac49bc0129699609c1. Subsequent calls will reuse this data.


In [7]:
dataset

Dataset({
    features: ['id', 'lang', 'words', 'ner'],
    num_rows: 5000
})

In [8]:
dataset[0]

{'id': '0',
 'lang': 'de',
 'words': ['Im',
  'Jahr',
  '2011',
  'hatte',
  'die',
  'Gemeinde',
  'etwas',
  'mehr',
  'als',
  '3700',
  'Mitglieder',
  '.'],
 'ner': ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']}

## 2. label encoding

In [4]:
# build a vocaburary set to map ner tags to labels

ner_vocab = set()

for row in dataset['ner']:
    for tag in row:
        ner_vocab.add(tag)

print(ner_vocab)

{'LOC', 'ORG', 'PER', 'O'}


In [5]:
# label encoding  

tags_to_labels = {tag: i for i, tag in enumerate(ner_vocab)}

print(tags_to_labels)

{'LOC': 0, 'ORG': 1, 'PER': 2, 'O': 3}


## 3. Tokenizing 

In [6]:
tokenizer = BertTokenizer.from_pretrained('bert-base-german-cased')

Downloading:   0%|          | 0.00/255k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/433 [00:00<?, ?B/s]

## 4. dataset encoding 

In [9]:
max_len = max([len(row['words']) for row in dataset])


print("the largest word vector in dataset: ", max_len)

the largest word vector in dataset:  154


In [10]:
def encode_dataset(dataset):
    encoded_dataset = []
    for words, ners in zip(dataset['words'], dataset['ner']):
        enc = tokenizer(words, return_tensors="pt", padding='max_length', max_length=max_len, truncation=True, is_split_into_words=True)
        enc['labels'] = torch.zeros(1, max_len, dtype=torch.long)
        for i, tag in enumerate(ners[:max_len]):
            enc['labels'][0][i] = tags_to_labels[tag]
        for key in enc:
            enc[key] = torch.squeeze(enc[key])
        encoded_dataset.append(enc)
        
    return encoded_dataset

In [11]:
encoded_dataset = encode_dataset(dataset)


## 5. split datasets 

In [12]:
shuffle(encoded_dataset)

train1 = encoded_dataset[:1000]
train2 = encoded_dataset[:3000]
testset = encoded_dataset[-2000:]

## 6. build models

In [13]:
def freeze_weights(model):
    for param in model.base_model.parameters():
        param.requires_grad = False
    return model

In [14]:
model = BertForTokenClassification.from_pretrained('bert-base-german-cased', num_labels=len(ner_vocab))

Downloading:   0%|          | 0.00/439M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-german-cased were not used when initializing BertForTokenClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-b

### 6.1. Parameter 1) fine-tuned with 1000 sentence

In [15]:
train_param1 = TrainingArguments(
    num_train_epochs=1,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=1,
    weight_decay=0.01,
    output_dir='result1',
    logging_dir='logs1',
    no_cuda=False,  
)

# we don't need the batch dimension when using the trainer
# because the trainer does batching for us 

trainer1 = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=train_param1,
    train_dataset=train1,
)

### 6.2. Parameter 2) fine-tuned with 3000 sentence

In [16]:
train_param2 = TrainingArguments(
    num_train_epochs=1,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=1,
    weight_decay=0.01,
    output_dir='result2',
    logging_dir='logs2',
    no_cuda=False,
)

trainer2 = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=train_param2,
    train_dataset=train2,
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


### 6.3. parameters 3) fine-tuned with 3000 sentences and frozen embeddings

In [17]:
train_param3 = TrainingArguments(
    num_train_epochs=1,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=1,
    weight_decay=0.01,
    output_dir='results3',
    logging_dir='logs3',
    no_cuda=False,
)

trainer3 = Trainer(
    model=freeze_weights(model),
    tokenizer=tokenizer,
    args=train_param2,
    train_dataset=train2,
)


PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


## 7. train and predict 

In [18]:
trainer1.train()


***** Running training *****
  Num examples = 1000
  Num Epochs = 1
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 250


Step,Training Loss




Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=250, training_loss=0.716073974609375, metrics={'train_runtime': 504.6648, 'train_samples_per_second': 1.982, 'train_steps_per_second': 0.495, 'total_flos': 78594586224000.0, 'train_loss': 0.716073974609375, 'epoch': 1.0})

In [19]:
preds1 = trainer1.predict(testset)

***** Running Prediction *****
  Num examples = 2000
  Batch size = 1


In [20]:
trainer2.train()


***** Running training *****
  Num examples = 3000
  Num Epochs = 1
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 750


Step,Training Loss
500,0.3084


Saving model checkpoint to result2/checkpoint-500
Configuration saved in result2/checkpoint-500/config.json
Model weights saved in result2/checkpoint-500/pytorch_model.bin
tokenizer config file saved in result2/checkpoint-500/tokenizer_config.json
Special tokens file saved in result2/checkpoint-500/special_tokens_map.json


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=750, training_loss=0.27826983642578124, metrics={'train_runtime': 1474.6284, 'train_samples_per_second': 2.034, 'train_steps_per_second': 0.509, 'total_flos': 235783758672000.0, 'train_loss': 0.27826983642578124, 'epoch': 1.0})

In [21]:
preds2 = trainer2.predict(testset)

***** Running Prediction *****
  Num examples = 2000
  Batch size = 1


In [22]:
trainer3.train()


***** Running training *****
  Num examples = 3000
  Num Epochs = 1
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 750


Step,Training Loss
500,0.1905


Saving model checkpoint to result2/checkpoint-500
Configuration saved in result2/checkpoint-500/config.json
Model weights saved in result2/checkpoint-500/pytorch_model.bin
tokenizer config file saved in result2/checkpoint-500/tokenizer_config.json
Special tokens file saved in result2/checkpoint-500/special_tokens_map.json


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=750, training_loss=0.18410054524739583, metrics={'train_runtime': 1467.9593, 'train_samples_per_second': 2.044, 'train_steps_per_second': 0.511, 'total_flos': 235783758672000.0, 'train_loss': 0.18410054524739583, 'epoch': 1.0})

In [23]:
preds3 = trainer3.predict(testset)

***** Running Prediction *****
  Num examples = 2000
  Batch size = 1


## 8. result 

In [24]:
def print_f1score(preds):

    preds_list = []
    labels_list = []
    
    for i, (label_ids, label_preds) in enumerate(zip(preds.label_ids, preds.predictions)):
        labels_list.extend(label_ids)
        preds = label_preds.argmax(-1)
        preds_list.extend(preds)
        
    f1_micro = f1_score(labels_list, preds_list, average='micro')
    f1_macro = f1_score(labels_list, preds_list, average='macro')

    print(f"fl_micro: {f1_micro} and f1_macro: {f1_macro}",)

In [26]:
print("="*20)
print("result of parameters 1 : fine-tuned with 1000 sentence")
print_f1score(preds1)

print("="*20)
print("result of parameters 2 : fine-tuned with 3000 sentence")
print_f1score(preds2)

print("="*20)
print("result of parameters 3 : fine-tuned with 3000 sentence and frozen embeddings")
print_f1score(preds3)

result of parameters 1 : fine-tuned with 1000 sentence
fl_micro: 0.8871006493506494 and f1_macro: 0.27012262051159314
result of parameters 2 : fine-tuned with 3000 sentence
fl_micro: 0.9293376623376624 and f1_macro: 0.4018194818601082
result of parameters 3 : fine-tuned with 3000 sentence and frozen embeddings
fl_micro: 0.9422175324675325 and f1_macro: 0.4270695073208298
