# NER using Transformer models and special tokenization

Dataset used is a dictionary with the following lists:
- tokens (list of words for each sentence): ["This", "is", "an", "example", "."]
- ner_tags (list of NER tags for each word): ['O', "O", 'O', 'O', 'O']

## Tokenization
Tokenizers separate words to subword level. To make the model work on world level, we need to create our own tokenizer.

In [1]:
# Define the gpu on the gpu machine
%env CUDA_DEVICE_ORDER=PCI_BUS_ID
%env CUDA_VISIBLE_DEVICES=7

env: CUDA_DEVICE_ORDER=PCI_BUS_ID
env: CUDA_VISIBLE_DEVICES=7


In [2]:
#!pip install transformers==4.28.0
from transformers import AutoTokenizer
import torch.nn as nn
from transformers import XLMRobertaConfig
from transformers.modeling_outputs import TokenClassifierOutput
from transformers.models.roberta.modeling_roberta import RobertaModel, RobertaPreTrainedModel
import json
import pandas as pd
from transformers import AutoConfig
import torch
# use the seqeval package which has measures for evaluation sequence classification
#!pip install seqeval
import numpy as np
from seqeval.metrics import f1_score

In [3]:
# Import the dataset

# Code for python script
"""
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument("dataset", help="path to the dataset in JSON format")
    args = parser.parse_args()

# Define the path to the dataset
dataset_path = args.dataset
"""
# Define the path to the dataset
dataset_path = "datasets/hr500k.conllup_extracted.json"

# Load the json file
with open(dataset_path, "r") as file:
    json_dict = json.load(file)

# Open the train, eval and test dictionaries as DataFrames
train_df = pd.DataFrame(json_dict["train"])
test_df = pd.DataFrame(json_dict["test"])
dev_df = pd.DataFrame(json_dict["dev"])

# Define the labels
LABELS = json_dict["labels"]
print(LABELS)

print(train_df.shape, test_df.shape, dev_df.shape)
print(train_df.head())

['O', 'B-loc', 'B-org', 'B-per', 'I-per', 'B-deriv-per', 'I-org', 'I-loc', 'B-misc', 'I-misc', 'I-deriv-per']
(398681, 3) (51190, 3) (49764, 3)
    sentence_id      words labels
717  set.hr-s36      Kazna      O
718  set.hr-s36  medijskom      O
719  set.hr-s36     mogulu      O
720  set.hr-s36   obnovila      O
721  set.hr-s36   raspravu      O


In [None]:
# To use torch and transformers, we need to transform the dataset into a specific format. We need to create two lists: token_docs which is a list of lists of token strings, and token_tags which is a list of lists of tag strings.

In [20]:
train_df.head()

Unnamed: 0,sentence_id,words,labels
717,set.hr-s36,Kazna,O
718,set.hr-s36,medijskom,O
719,set.hr-s36,mogulu,O
720,set.hr-s36,obnovila,O
721,set.hr-s36,raspravu,O


In [88]:
# Define encodings for the NER tags - get it from the dataset (labels), e.g., tags = ["O", "B-PER", "I-PER"]

index2tag = {idx: tag for idx, tag in enumerate(LABELS)}
tag2index = {tag: idx for idx, tag in enumerate(LABELS)}

print(index2tag, tag2index, sep="\n")

{0: 'O', 1: 'B-loc', 2: 'B-org', 3: 'B-per', 4: 'I-per', 5: 'B-deriv-per', 6: 'I-org', 7: 'I-loc', 8: 'B-misc', 9: 'I-misc', 10: 'I-deriv-per'}
{'O': 0, 'B-loc': 1, 'B-org': 2, 'B-per': 3, 'I-per': 4, 'B-deriv-per': 5, 'I-org': 6, 'I-loc': 7, 'B-misc': 8, 'I-misc': 9, 'I-deriv-per': 10}


In [87]:
train_df

Unnamed: 0,sentence_id,words,labels
717,set.hr-s36,Kazna,O
718,set.hr-s36,medijskom,O
719,set.hr-s36,mogulu,O
720,set.hr-s36,obnovila,O
721,set.hr-s36,raspravu,O
...,...,...,...
499630,prosir-s120,nećemo,O
499631,prosir-s120,tako,O
499632,prosir-s120,skoro,O
499633,prosir-s120,zaboraviti,O


In [90]:
# Map numbers for tags to the dataframe
train_df["labels_index"] = [tag2index[x] for x in train_df.labels]
train_df.head(20)

Unnamed: 0,sentence_id,words,labels,labels_index
717,set.hr-s36,Kazna,O,0
718,set.hr-s36,medijskom,O,0
719,set.hr-s36,mogulu,O,0
720,set.hr-s36,obnovila,O,0
721,set.hr-s36,raspravu,O,0
722,set.hr-s36,u,O,0
723,set.hr-s36,Makedoniji,B-loc,1
724,set.hr-s37,Neki,O,0
725,set.hr-s37,tvrde,O,0
726,set.hr-s37,da,O,0


In [93]:
# Map numbers for tags to the dataframe
dev_df["labels_index"] = [tag2index[x] for x in dev_df.labels]
dev_df.head(20)

Unnamed: 0,sentence_id,words,labels,labels_index
0,set.hr-s1,Proces,O,0
1,set.hr-s1,privatizacije,O,0
2,set.hr-s1,na,O,0
3,set.hr-s1,Kosovu,B-loc,1
4,set.hr-s1,pod,O,0
5,set.hr-s1,povećalom,O,0
6,set.hr-s2,Kosovo,B-loc,1
7,set.hr-s2,ozbiljno,O,0
8,set.hr-s2,analizira,O,0
9,set.hr-s2,proces,O,0


In [91]:
# Transform dataset to get the format we need

def read_dataframe(df):
    token_docs = []
    tag_docs = []

    for i in df.sentence_id.unique():
        subset = df[df["sentence_id"] == i]
        current_word_list = subset.words.to_list()
        current_label_list = subset.labels_index.to_list()
        token_docs.append(current_word_list)
        tag_docs.append(current_label_list)

    return token_docs, tag_docs


In [92]:
train_texts, train_tags = read_dataframe(train_df)

In [95]:
# Do the same for eval and test
eval_texts, eval_tags = read_dataframe(dev_df)
#test_texts, test_tags = read_dataframe(test_df)

In [96]:
print(train_texts[0], train_tags[0], sep="\n")

['Kazna', 'medijskom', 'mogulu', 'obnovila', 'raspravu', 'u', 'Makedoniji']
[0, 0, 0, 0, 0, 0, 1]


In [None]:
# For Slovenian datasets: create train, test, val split
#from sklearn.model_selection import train_test_split
#train_texts, val_texts, train_tags, val_tags = train_test_split(texts, tags, test_size=.2)

In [97]:
# Define the base model which serves as the foundation for tokenization and fine-tuning
xlmr_model_name = "xlm-roberta-base"

xlmr_tokenizer = AutoTokenizer.from_pretrained(xlmr_model_name)

# Define our own XLM-R-based tokenizer
class XLMRobertaForTokenClassification(RobertaPreTrainedModel):
    config_class = XLMRobertaConfig

    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.roberta = RobertaModel(config, add_pooling_layer=False)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
        self.init_weights()

    def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, labels=None, **kwargs):
        outputs = self.roberta(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, **kwargs)
        sequence_output = self.dropout(outputs[0])
        logits = self.classifier(sequence_output)
        loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

        return TokenClassifierOutput(loss=loss, logits=logits,
                                     hidden_states=outputs.hidden_states, attentions=outputs.attentions)

In [98]:
# Create a function to connect indices for tags with tag names

xlmr_config = AutoConfig.from_pretrained(xlmr_model_name, num_labels=len(LABELS), id2label=index2tag, label2id=tag2index)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
xlmr_model = (XLMRobertaForTokenClassification.from_pretrained(xlmr_model_name, config=xlmr_config).to(device))

In [99]:
# We create new id tags for each word - we iterate through the words
# and if it isn't a new word, we assign the IGN tag (ignore) to it

def tokenize_and_align_labels(texts, tags):
    encodings = xlmr_tokenizer(texts, truncation=True, is_split_into_words=True)
    encoded_labels = []
    for idx, label in enumerate(tags):
        word_ids = encodings.word_ids(batch_index=idx)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None or word_idx == previous_word_idx:
                label_ids.append(-100) # new IGN tag!
            else:
                label_ids.append(label[word_idx])
            previous_word_idx = word_idx
        encoded_labels.append(label_ids)
    return encodings, encoded_labels

In [100]:
train_encodings, train_encoded_labels = tokenize_and_align_labels(train_texts, train_tags)

In [101]:
# Repeat with eval set
eval_encodings, eval_encoded_labels = tokenize_and_align_labels(eval_texts, eval_tags)

In [102]:
# Now we need to create a torch dataset object

class NERDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

In [103]:
train_dataset = NERDataset(train_encodings, train_encoded_labels)
eval_dataset = NERDataset(eval_encodings, eval_encoded_labels)

In [104]:
def align_predictions(predictions, label_ids):
    preds = np.argmax(predictions, axis=2)
    batch_size, seq_len = preds.shape
    labels_list, preds_list = [], []

    for batch_idx in range(batch_size):
        example_labels, example_preds = [], []
        for seq_idx in range(seq_len):
            if label_ids[batch_idx, seq_idx] != -100:
                example_labels.append(index2tag[label_ids[batch_idx][seq_idx]])
                example_preds.append(index2tag[preds[batch_idx][seq_idx]])

        labels_list.append(example_labels)
        preds_list.append(example_preds)

    return preds_list, labels_list

def compute_metrics(eval_pred):
    y_pred, y_true = align_predictions(eval_pred.predictions, eval_pred.label_ids)
    return {"f1": f1_score(y_true, y_pred)}

def model_init():
    return (XLMRobertaForTokenClassification.from_pretrained(xlmr_model_name, config=xlmr_config).to(device))

## Model Fine-Tuning

In [105]:
from transformers import TrainingArguments, DataCollatorForTokenClassification, Trainer
num_epochs = 3
batch_size = 24
logging_steps = len(train_texts) // batch_size
model_name = f"{xlmr_model_name}-finetuned-test"
training_args = TrainingArguments(output_dir=model_name,
                                 log_level="error",
                                 num_train_epochs=num_epochs,
                                 per_device_train_batch_size=batch_size,
                                 per_device_eval_batch_size=batch_size,
                                 evaluation_strategy="epoch",
                                 save_steps=1e6,
                                 weight_decay=0.01,
                                 disable_tqdm=False,
                                 logging_steps=logging_steps,
                                 push_to_hub=False
                                )

data_collator = DataCollatorForTokenClassification(xlmr_tokenizer)

trainer = Trainer(model_init=model_init, args=training_args,
                  data_collator=data_collator, compute_metrics=compute_metrics,
                  train_dataset=train_dataset,
                  eval_dataset=eval_dataset,
                  tokenizer=xlmr_tokenizer)
trainer.train()

finetuned_model = trainer.model



Epoch,Training Loss,Validation Loss,F1
1,0.1,0.047518,0.838362
2,0.032,0.040806,0.870326
3,0.0149,0.042693,0.881725


In [119]:
# Let's test the model
text = "Ime mi je Taja."

def compute_metrics_test(text):
    tokens = xlmr_tokenizer(text).tokens()
    input_ids = xlmr_tokenizer(text, return_tensors="pt").input_ids.to(device)
    outputs  =  finetuned_model(input_ids)
    predictions = torch.argmax(outputs[0], dim=2)
    preds = [LABELS[p] for p in predictions[0].cpu().numpy()]
    y_pred, y_true = align_predictions(preds, input_ids)
    return {}

compute_metrics_test(text)

AxisError: axis 2 is out of bounds for array of dimension 1

In [117]:

print(preds)


['O', 'O', 'O', 'O', 'B-per', 'O', 'O', 'O']


In [108]:
# let's test the model
def tag_text(text, model, tokenizer):
    tokens = tokenizer(text).tokens()
    input_ids = xlmr_tokenizer(text, return_tensors="pt").input_ids.to(device)
    outputs = model(input_ids)
    predictions = torch.argmax(outputs[0], dim=2)
    preds = [LABELS[p] for p in predictions[0].cpu().numpy()]
    return pd.DataFrame([tokens, preds], index=["Tokens", "Tags"])

text_de = "Jeff Dean ist ein Informatiker bei Google in Kalifornien"
tag_text(text_de, finetuned_model, xlmr_tokenizer)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
Tokens,<s>,▁Jeff,▁De,an,▁ist,▁ein,▁Informati,ker,▁bei,▁Google,▁in,▁Kaliforni,en,</s>
Tags,O,B-per,I-per,I-per,O,O,O,O,O,B-org,O,B-loc,I-loc,O
