**Multilingual NER model trained over [Chia dataset](https://figshare.com/articles/dataset/Chia_Annotated_Datasets/11855817)**

We are going to train a BERT based multilingual language model over the Chia dataset in english and then we will use this model to create the synthetic version of the dataset in French. Our idea is supported by the experiments already done with [multiNERD](https://huggingface.co/datasets/Babelscape/multinerd) dataset for multilingual NER in English and French.

**Entities selection**

Among all the entities in the dataset, we are going to focus for this project on the most represented ones. Then, we are just going to consider those entities with more than 1000 samples in total.

In [108]:
# imports
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification, Trainer, TrainingArguments, DataCollatorForTokenClassification
import os
from preprocessing_dataset import *
import numpy as np
import pandas as pd
from datasets import Dataset, DatasetDict
import json
from datasets.features import ClassLabel

In [92]:
# dict for the entities (entity to int value)
sel_ent = {
    "O": 0,
    "B-Condition": 1,
    "I-Condition": 2,
    "B-Value": 3,
    "I-Value": 4,
    "B-Drug": 5,
    "I-Grug": 6,
    "B-Procedure": 7,
    "I-Procedure": 8,
    "B-Measurement": 9,
    "I-Measurement": 10,
    "B-Temporal": 11,
    "I-Temporal": 12,
    "B-Observation": 13,
    "I-Observation": 14,
    "B-Person": 15,
    "I-Person": 16
}
entities_list = list(sel_ent.keys())
sel_ent_inv = {v: k for k, v in sel_ent.items()}

In [93]:
# data paths
data_path = "../data"
chia_bio_path = f"{data_path}/chia_bio"
chia_prep_path = f"{data_path}/chia_prep"

In [94]:
# preprocessing dataset to get the data in the right format for dataset entity creation
preprocessing_dataset(data_path, chia_bio_path, output_path=chia_prep_path)

NameError: name 'preprocessing_dataset' is not defined

In [96]:
# read the data after preprocessing
files = os.listdir(chia_prep_path)
files

['Example.txt']

In [97]:
sentences = []

for file in files:
    with open(f"{chia_prep_path}/{file}", "r") as f:
        stc = json.load(f)
        sentences.extend(stc["sentences"])

In [98]:
# create the dataset
chia_eng_dataset = Dataset.from_pandas(pd.DataFrame(sentences))

In [99]:
chia_eng_dataset

Dataset({
    features: ['tokens', 'ner_tags'],
    num_rows: 10
})

In [103]:
chia_eng_train_test = chia_eng_dataset.train_test_split(test_size=0.2)
chia_eng_test_val = chia_eng_train_test["test"].train_test_split(test_size=0.5)
chia_eng_dataset = DatasetDict({
    "train": chia_eng_train_test["train"],
    "test": chia_eng_test_val["test"],
    "validation": chia_eng_test_val["train"]
})

In [104]:
chia_eng_dataset

DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 8
    })
    test: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 1
    })
    validation: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 1
    })
})

**Model Implementation**

In [84]:
model_name = 'xlm-roberta-base'

In [85]:
# get xlm-roberta tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)

# check the tokenizer
tokens_ = tokenizer("The AI master at Université Paris-Saclay is very good").tokens()
print(tokens_)

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

['<s>', '▁The', '▁AI', '▁master', '▁at', '▁', 'Université', '▁Paris', '-', 'S', 'ac', 'lay', '▁is', '▁very', '▁good', '</s>']


In [86]:
# tokenize and align the labels in the dataset
def tokenize_and_align_labels(sentence, flag = 'I'):
    """
    Tokenize the sentence and align the labels
    inputs:
        sentence: dict, the sentence from the dataset
        flag: str, the flag to indicate how to deal with the labels for subwords
            - 'I': use the label of the first subword for all subwords but as intermediate (I-ENT)
            - 'B': use the label of the first subword for all subwords as beginning (B-ENT)
            - None: use -100 for subwords
    outputs:
        tokenized_sentence: dict, the tokenized sentence now with a field for the labels
    """
    tokenized_sentence = tokenizer(sentence['tokens'], is_split_into_words=True, truncation=True)

    labels = []
    for i, labels_s in enumerate(sentence['ner_tags']):
        word_ids = tokenized_sentence.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            # if the word_idx is None, assign -100
            if word_idx is None:
                label_ids.append(-100)
            # if it is a new word, assign the corresponding label
            elif word_idx != previous_word_idx:
                label_ids.append(labels_s[word_idx])
            # if it is the same word, check the flag to assign
            else:
                if flag == 'I':
                    if label_list[labels_s[word_idx]].startswith('I'):
                      label_ids.append(labels_s[word_idx])
                    else:
                      label_ids.append(labels_s[word_idx] + 1)
                elif flag == 'B':
                    label_ids.append(labels_s[word_idx])
                elif flag == None:
                    label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)
    tokenized_sentence['labels'] = labels
    return tokenized_sentence

In [89]:
type(chia_eng_dataset)

dict

In [105]:
# apply the function to the dataset
chia_eng_dataset = chia_eng_dataset.map(tokenize_and_align_labels, batched=True)
chia_eng_dataset

Map:   0%|          | 0/8 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 8
    })
    test: Dataset({
        features: ['tokens', 'ner_tags', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 1
    })
    validation: Dataset({
        features: ['tokens', 'ner_tags', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 1
    })
})

In [107]:
# import the model
model = AutoModelForTokenClassification.from_pretrained(model_name, num_labels=len(entities_list), label2id=sel_ent, id2label=sel_ent_inv)
print(model)

Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


XLMRobertaForTokenClassification(
  (roberta): XLMRobertaModel(
    (embeddings): XLMRobertaEmbeddings(
      (word_embeddings): Embedding(250002, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): XLMRobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x XLMRobertaLayer(
          (attention): XLMRobertaAttention(
            (self): XLMRobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): XLMRobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bi

In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
model.to(device)

In [None]:
# define the training arguments
args = TrainingArguments(
    "chia-multilingual-ner",
    evaluation_strategy = "steps",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    push_to_hub=False,
    eval_steps=50,
    save_steps=50,
)

In [None]:
data_collator = DataCollatorForTokenClassification(tokenizer)

In [None]:
metric = load_metric("seqeval")

In [None]:
def compute_metrics(p):
    """
    Compute the metrics for the model
    inputs:
        p: tuple, the predictions and the labels
    outputs:
        dict: the metrics
    """
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    # Remove ignored index (special tokens)
    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

In [None]:
# define the trainer
trainer = Trainer(
    model,
    args,
    train_dataset=chia_eng_dataset["train"],
    eval_dataset=chia_eng_dataset["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [None]:
outputs_train = trainer.train()

In [None]:
print(outputs_train)

In [None]:
outputs_eval = trainer.evaluate(chia_eng_dataset["test"])

In [None]:
print(outputs_eval)

In [109]:
torch.save(model, f"{models_path}/chia-multilingual-ner.pt")

SyntaxError: f-string: expecting '}' (2477619478.py, line 1)