# Exercise 5 - Sequence and Sentiment Classification using Transformers


In [1]:
import datasets

In [2]:
import numpy as np

## part1. Named Entity Recognition using BERT

Implement a named entity recognition system for your chosen language. Use HuggingFace’s BertForTokenClassification-class and
initialize it with a pretrained Hugging Face BERT-base model of your chosen language. This HuggingFace guide for fine-tuning
serves as a good starting point. Before passing the data to the model, you need to encode it using a HuggingFace tokenizer. Use
the tokenizer corresponding to your BERT model. When provided with the right arguments, the tokenizer can also pad and truncate
the input.
You can reduce the amount of code for this exercise by using the Trainer class explained at the bottom of the HuggingFace guide.
You will create 3 fine-tuned versions of the system:
1. Fine-tuned with 1’000 sentences
2. Fine-tuned with 3’000 sentences
3. Fine-tuned with 3’000 sentences and frozen embeddings
Let each fine-tuned model predict on the evaluation set to compute f1-micro and f1-macro scores.


In [3]:
# load dataset with huggingface load_dataset 
train1 = datasets.load_dataset('polyglot_ner', 'de', split='train[:1000]')
train2 = datasets.load_dataset('polyglot_ner', 'de', split='train[:3000]')
test = datasets.load_dataset('polyglot_ner', 'de', split='train[-2000:]')

print(train1)
print(train2)
print(test)

Reusing dataset polyglot_ner (C:\Users\songy\.cache\huggingface\datasets\polyglot_ner\de\1.0.0\bb2e45c90cd345c87dfd757c8e2b808b78b0094543b511ac49bc0129699609c1)
Reusing dataset polyglot_ner (C:\Users\songy\.cache\huggingface\datasets\polyglot_ner\de\1.0.0\bb2e45c90cd345c87dfd757c8e2b808b78b0094543b511ac49bc0129699609c1)
Reusing dataset polyglot_ner (C:\Users\songy\.cache\huggingface\datasets\polyglot_ner\de\1.0.0\bb2e45c90cd345c87dfd757c8e2b808b78b0094543b511ac49bc0129699609c1)


Dataset({
    features: ['id', 'lang', 'words', 'ner'],
    num_rows: 1000
})
Dataset({
    features: ['id', 'lang', 'words', 'ner'],
    num_rows: 3000
})
Dataset({
    features: ['id', 'lang', 'words', 'ner'],
    num_rows: 2000
})


## label encoding

In [4]:
# build a vocaburary set to map ner tags to labels

ner_vocab = set()

for row in train1['ner']:
    for tag in row:
        ner_vocab.add(tag)

print(ner_vocab)

{'LOC', 'PER', 'O', 'ORG'}


In [5]:
# label encoding  

tags_to_labels = {tag: i for i, tag in enumerate(ner_vocab)}

print(tags_to_labels)

{'LOC': 0, 'PER': 1, 'O': 2, 'ORG': 3}


## Tokenizing

In [6]:
from transformers import BertTokenizer
import torch

tokenizer = BertTokenizer.from_pretrained('bert-base-german-cased')

In [7]:
max_len_1 = max([len(row['words']) for row in train1])
max_len_2 = max([len(row['words']) for row in train2])
max_len_3 = max([len(row['words']) for row in test])

print(max_len_1, max_len_2, max_len_3)

max_len = max_len_3

71 154 158


In [9]:
def encode_dataset(dataset):
    encoded_dataset = []
    for words, ners in zip(dataset['words'], dataset['ner']):
        enc = tokenizer(words, return_tensors="pt", padding='max_length', max_length=max_len, truncation=True, is_split_into_words=True)
        enc['labels'] = torch.zeros(1, max_len, dtype=torch.long)
        for i, tag in enumerate(ners[:max_len]):
            enc['labels'][0][i] = tags_to_labels[tag]
        for key in enc:
            enc[key] = torch.squeeze(enc[key])
        encoded_dataset.append(enc)
        
    return encoded_dataset

In [10]:
# encode the two training sets and the evaluation set

enc_train1 = encode_dataset(train1)
enc_train2 = encode_dataset(train2)
enc_test = encode_dataset(test)


## Prepare the models

In [11]:
from transformers import BertForTokenClassification, Trainer, TrainingArguments

In [12]:
def freeze_weights(model):
    for param in model.base_model.parameters():
        param.requires_grad = False
    return model

In [13]:
# define a model 
model = BertForTokenClassification.from_pretrained('bert-base-german-cased', num_labels=len(ner_vocab))

Some weights of the model checkpoint at bert-base-german-cased were not used when initializing BertForTokenClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-b

In [18]:
# parameters 1 : fine-tuned with 1000 sentence

train_param1 = TrainingArguments(
    num_train_epochs=1,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=1,
    weight_decay=0.01,
    output_dir='result1',
    logging_dir='logs1',
    no_cuda=False,  
)

trainer1 = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=train_param1,
    train_dataset=enc_train1,
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [19]:
# parameters 2 : fine-tuned with 3000 sentence

train_param2 = TrainingArguments(
    num_train_epochs=1,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=1,
    weight_decay=0.01,
    output_dir='result2',
    logging_dir='logs2',
    no_cuda=False,
)

trainer2 = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=train_param2,
    train_dataset=enc_train2,
)


PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [20]:
# parameters 3 : fine-tuned with 3000 sentences and frozen embeddings

train_param3 = TrainingArguments(
    num_train_epochs=1,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=1,
    weight_decay=0.01,
    output_dir='results3',
    logging_dir='logs3',
    no_cuda=False,
)

trainer3 = Trainer(
    model=freeze_weights(model),
    tokenizer=tokenizer,
    args=train_param2,
    train_dataset=enc_train2,
)


PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


## Train and Predict

In [21]:
trainer1.train()

***** Running training *****
  Num examples = 1000
  Num Epochs = 1
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 250


Step,Training Loss




Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=250, training_loss=1.155072021484375, metrics={'train_runtime': 567.227, 'train_samples_per_second': 1.763, 'train_steps_per_second': 0.441, 'total_flos': 80636004048000.0, 'train_loss': 1.155072021484375, 'epoch': 1.0})

In [24]:
preds1 = trainer1.predict(enc_test)

# play around with preds - preds.label_ids, preds.predictions

***** Running Prediction *****
  Num examples = 2000
  Batch size = 1


In [49]:
trainer2.train()

***** Running training *****
  Num examples = 3000
  Num Epochs = 1
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 750


Step,Training Loss
500,0.6891


Saving model checkpoint to result2\checkpoint-500
Configuration saved in result2\checkpoint-500\config.json
Model weights saved in result2\checkpoint-500\pytorch_model.bin
tokenizer config file saved in result2\checkpoint-500\tokenizer_config.json
Special tokens file saved in result2\checkpoint-500\special_tokens_map.json


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=750, training_loss=0.6657684529622396, metrics={'train_runtime': 2238.4188, 'train_samples_per_second': 1.34, 'train_steps_per_second': 0.335, 'total_flos': 235783758672000.0, 'train_loss': 0.6657684529622396, 'epoch': 1.0})

In [50]:
preds2 = trainer2.predict(enc_test)

***** Running Prediction *****
  Num examples = 2000
  Batch size = 1


In [None]:
trainer3.train()

In [None]:
preds3 = trainer3.predict(enc_test)

## Report the results

In [30]:
from sklearn.metrics import f1_score

num_words_list = [len(words) for words in test['words']]

def print_f1score(preds):

    preds_list = []
    labels_list = []
    
    for i, (num_words, label_ids, label_preds) in enumerate(zip(num_words_list, preds.label_ids, preds.predictions)):
        label_true = label_ids[:num_words]
        labels_list.extend(label_true)
        preds = label_preds[:num_words]
        preds = preds.argmax(-1)
        preds_list.extend(preds)
        
    f1_micro = f1_score(labels_list, preds_list, average='micro')
    f1_macro = f1_score(labels_list, preds_list, average='macro')

    print(f"fl_micro is {f1_micro} and f1_macro is {f1_macro}",)

In [31]:
print("="*60)
print("result of parameters 1) fine-tuned with 1000 sentence")
print_f1score(preds1)

print("="*60)
print("result of parameters 2) fine-tuned with 3000 sentence")
#print_f1score(preds2)

print("="*60)
print("result of parameters 3) fine-tuned with 3000 sentence and frozen embeddings")
#print_f1score(preds3)

result of parameters 1) fine-tuned with 1000 sentence
fl_micro is 0.7013919698314108 and f1_macro is 0.22175120580593963
result of parameters 2) fine-tuned with 3000 sentence
result of parameters 3) fine-tuned with 3000 sentence and frozen embeddings
