**Experiments training on the French corpus to compare with the cross-lingual approach**

We are going to train the same model (*xlm-roberta-base*) that we used for the English MultiNERD corpus, now on the French corpus, increasing the size of the training set each time. We then compare the results obtained on the French test set for each training-set size.

In [None]:
# install the required dependencies (needed in Colab; comment these lines out if they are already installed)
!pip install -q -U git+https://github.com/huggingface/transformers.git
!pip install -q -U git+https://github.com/huggingface/accelerate.git
!pip install -q -U datasets
!pip install seqeval
!pip install -q -U wandb

In [None]:
# mount Google Drive so that it is reachable under /content/drive in this Colab session
from google.colab import drive
drive.mount('/content/drive')

In [None]:
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer, DataCollatorForTokenClassification
from datasets import load_dataset, load_metric
import torch
import accelerate
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
import wandb

In [None]:
# Entity types in the order their tags are numbered.
_entity_types = (
    "PER", "ORG", "LOC", "ANIM", "BIO", "CEL", "DIS", "EVE",
    "FOOD", "INST", "MEDIA", "MYTH", "PLANT", "TIME", "VEHI",
)

# "O" is id 0; every entity type then gets two consecutive ids:
# B-<type> (odd id) immediately followed by I-<type> (the next even id).
label_list = ["O"] + [
    f"{prefix}-{entity}" for entity in _entity_types for prefix in ("B", "I")
]

# tag -> id and id -> tag lookups
labels_vocab = {tag: tag_id for tag_id, tag in enumerate(label_list)}
labels_vocab_reverse = dict(enumerate(label_list))

In [None]:
# checkpoint identifier used for both the tokenizer and the token-classification model
model_name = "xlm-roberta-base"

In [None]:
# load the Babelscape/multinerd dataset; it is narrowed down to French further below
dataset = load_dataset("Babelscape/multinerd")

In [None]:
# pull the three splits out of the loaded dataset
data_train, data_test, data_val = (
    dataset[split_name] for split_name in ('train', 'test', 'validation')
)

In [None]:
# display the dataset object to inspect its splits and columns
dataset

In [None]:
# keep only the French examples of every split
def _is_french(example):
    """Return True when the example's 'lang' field is 'fr'."""
    return example['lang'] == 'fr'


data_train_fr = data_train.filter(_is_french)
data_test_fr = data_test.filter(_is_french)
data_val_fr = data_val.filter(_is_french)

# report how many French examples each split contains
print(f"Distribution of French data:\nTrain: {len(data_train_fr)}\nTest: {len(data_test_fr)}\nVal: {len(data_val_fr)}")

In [None]:
# load the tokenizer that matches the checkpoint named in model_name
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
# tokenize and align the labels in the dataset
def tokenize_and_align_labels(sentence, flag = 'I'):
    """
    Tokenize a batch of pre-split sentences and align the word-level NER tags
    with the resulting sub-word tokens.

    The function is meant to be used with ``Dataset.map(..., batched=True)``:
    ``sentence['tokens']`` and ``sentence['ner_tags']`` are lists with one
    entry per sentence.

    inputs:
        sentence: dict, a batch from the dataset with the fields
            - 'tokens': list of sentences, each a list of words
            - 'ner_tags': list of sentences, each a list of label ids
        flag: str or None, how to label the sub-words after the first one of a word
            - 'I': continuation of the entity: a B-ENT word gets I-ENT on its
                   remaining sub-words; I-ENT and O words keep their own label
            - 'B': repeat the label of the word on every sub-word
            - None: use -100 so the remaining sub-words are ignored
    outputs:
        tokenized_sentence: the tokenizer output with an extra 'labels' field
            holding one list of label ids per sentence
    raises:
        ValueError: if flag is not one of 'I', 'B' or None
    """
    # an unknown flag would otherwise skip sub-words and silently produce
    # label lists that are shorter than the token lists
    if flag not in ('I', 'B', None):
        raise ValueError(f"flag must be 'I', 'B' or None, got {flag!r}")

    tokenized_sentence = tokenizer(sentence['tokens'], is_split_into_words=True, truncation=True)

    labels = []
    for i, labels_s in enumerate(sentence['ner_tags']):
        word_ids = tokenized_sentence.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                # token that does not belong to any word (e.g. special tokens)
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                # first sub-word of a word: keep the word's own label
                label_ids.append(labels_s[word_idx])
            elif flag is None:
                # remaining sub-words are ignored
                label_ids.append(-100)
            else:
                label = labels_s[word_idx]
                # Only B- tags are shifted to their I- counterpart (the next id
                # in the vocabulary). "O" must stay "O": adding 1 to it would
                # turn it into B-PER.
                if flag == 'I' and label_list[label].startswith('B-'):
                    label += 1
                label_ids.append(label)
            previous_word_idx = word_idx
        labels.append(label_ids)
    tokenized_sentence['labels'] = labels
    return tokenized_sentence

In [None]:
# run the tokenization / label alignment over each French split, in batches
tokenized_train_fr, tokenized_test_fr, tokenized_val_fr = (
    french_split.map(tokenize_and_align_labels, batched=True)
    for french_split in (data_train_fr, data_test_fr, data_val_fr)
)

In [None]:
# import the model
# model = AutoModelForTokenClassification.from_pretrained(model_name, num_labels=len(label_list), label2id=labels_vocab, id2label=labels_vocab_reverse)
# print(model)

In [None]:
# use the GPU when CUDA is available, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
device

In [None]:
# model.to(device)

In [None]:
# authenticate with Weights & Biases (the module is imported as `wandb`)
wandb.login()

In [None]:
# start a Weights & Biases run in this project; the training arguments below report to wandb
wandb.init(project = "Multilingual-NER-multinerd_french")

In [None]:
# Hyper-parameters shared by every training run in this notebook.
# NOTE(review): no output_dir is passed, so checkpoints are written wherever the
# installed transformers version defaults to — confirm this is intended.
# NOTE(review): recent transformers releases appear to rename `evaluation_strategy`
# to `eval_strategy` — confirm against the installed version.
args = TrainingArguments(
    report_to = 'wandb',  # send training/evaluation logs to Weights & Biases
    run_name = "multinerd-multilingual-ner_french_training",
    evaluation_strategy = "steps",  # evaluate on a step schedule (see eval_steps)
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    push_to_hub=False,
    eval_steps=10000,  # evaluation interval, in steps
    save_steps=10000,  # checkpoint interval, in steps
)

In [None]:
# collator that assembles token-classification batches, built from the same tokenizer
data_collator = DataCollatorForTokenClassification(tokenizer)

In [None]:
# seqeval metric object used by compute_metrics
# NOTE(review): `load_metric` appears to have been removed from recent `datasets`
# releases in favour of the separate `evaluate` package — confirm against the installed version.
metric = load_metric("seqeval")

In [None]:
def compute_metrics(p):
    """
    Compute the overall seqeval scores for one evaluation pass.

    inputs:
        p: pair (predictions, labels); the predictions are reduced with an
           argmax over axis 2 and the labels use -100 for ignored positions
    outputs:
        dict with the keys 'precision', 'recall', 'f1' and 'accuracy'
    """
    scores_per_label, gold_ids = p
    predicted_ids = np.argmax(scores_per_label, axis=2)

    true_predictions = []
    true_labels = []
    for predicted_row, gold_row in zip(predicted_ids, gold_ids):
        # drop every position whose gold label is the ignore index (-100)
        kept_pairs = [
            (predicted_id, gold_id)
            for predicted_id, gold_id in zip(predicted_row, gold_row)
            if gold_id != -100
        ]
        # seqeval works on tag names, so map the ids back to their tags
        true_predictions.append([label_list[predicted_id] for predicted_id, _ in kept_pairs])
        true_labels.append([label_list[gold_id] for _, gold_id in kept_pairs])

    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        score_name: results[f"overall_{score_name}"]
        for score_name in ("precision", "recall", "f1", "accuracy")
    }

In [None]:
# Train one fresh model per training-set fraction and evaluate each one on the
# French test set, so the effect of the training-set size can be compared.
train_fractions = [0.25, 0.5, 0.75, 1]

# Shuffle once with a fixed seed: the subsets are reproducible and every larger
# subset contains the smaller ones.
shuffled_train_fr = tokenized_train_fr.shuffle(seed=42)

for fraction in train_fractions:
    # start every run from the pretrained checkpoint, not from the previous run
    model = AutoModelForTokenClassification.from_pretrained(model_name, num_labels=len(label_list), label2id=labels_vocab, id2label=labels_vocab_reverse)
    model.to(device)

    # keep only the first `fraction` of the shuffled training data
    subset_size = int(len(shuffled_train_fr) * fraction)
    train_subset = shuffled_train_fr.select(range(subset_size))
    print(f"Training with {fraction:.0%} of the French training set ({subset_size} sentences)")

    trainer = Trainer(
        model,
        args,
        train_dataset=train_subset,
        eval_dataset=tokenized_test_fr,
        data_collator=data_collator,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics
    )
    outputs_train = trainer.train()
    print(outputs_train)
    outputs_eval = trainer.evaluate()
    print(outputs_eval)

    # release the model and trainer before the next checkpoint is loaded
    del model
    del trainer
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

In [None]:
# close the Weights & Biases run started with wandb.init
wandb.finish()

In [1]:
print("Training finished")

Training finished
