In [1]:
from transformers import AutoConfig, AutoTokenizer, Trainer, TrainingArguments, DataCollatorForTokenClassification
import pandas as pd

from named_entity_recognition.utils import read_dataset, tokenize_adjust_inputs
from named_entity_recognition.metrics import prepare_compute_metrics
from named_entity_recognition.model import XLMRobertaForTokenClassification

In [2]:
pd.set_option('display.max_colwidth', None)

In [3]:
xlmr_model_name = 'xlm-roberta-base'
device = 'cuda'

### Dataset reading

In [4]:
langs = ['de', 'fr', 'it', 'en'] # languages with their fractions in final dataset
fracs = [0.629, 0.229, 0.084, 0.059]
panx_ch = read_dataset(langs=langs, fracs=fracs)

In [5]:
pd.DataFrame({f'{l}_{s}': [panx_ch[l][s].num_rows] for l in langs for s in ['train', 'validation', 'test']})

Unnamed: 0,de_train,de_validation,de_test,fr_train,fr_validation,fr_test,it_train,it_validation,it_test,en_train,en_validation,en_test
0,12580,6290,6290,4580,2290,2290,1680,840,840,1180,590,590


### Model

In [6]:
tags = panx_ch['de']['train'].features['ner_tags'].feature
id2label = {id: label for id, label in enumerate(tags.names)}
label2id = {label: id for id, label in enumerate(tags.names)}

In [7]:
xlmr_config = AutoConfig.from_pretrained(
    xlmr_model_name, 
    num_labels=tags.num_classes, 
    label2id=label2id, 
    id2label=id2label
)

In [8]:
tokenizer = AutoTokenizer.from_pretrained(xlmr_model_name)

In [9]:
for lang in langs:
    panx_ch[lang] = panx_ch[lang].map(tokenize_adjust_inputs, batched=True, batch_size=None, fn_kwargs={'tokenizer': tokenizer}, remove_columns=['langs', 'ner_tags', 'tokens'])

Map:   0%|          | 0/2290 [00:00<?, ? examples/s]

In [10]:
panx_ch['de']

DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids'],
        num_rows: 12580
    })
    validation: Dataset({
        features: ['labels', 'input_ids'],
        num_rows: 6290
    })
    test: Dataset({
        features: ['labels', 'input_ids'],
        num_rows: 6290
    })
})

In [11]:
example = panx_ch['de']['train'][0]
pd.DataFrame({
    'tokens': tokenizer.convert_ids_to_tokens(example['input_ids']),
    'input_ids': example['input_ids'],
    'labels': example['labels']
}).T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,15,16,17,18,19,20,21,22,23,24
tokens,<s>,▁2.000,▁Einwohner,n,▁an,▁der,▁Dan,zi,ger,▁Buch,...,▁Wo,i,wod,schaft,▁Po,mmer,n,▁,.,</s>
input_ids,0,70101,176581,19,142,122,2290,708,1505,18363,...,13787,14,15263,18917,663,6947,19,6,5,2
labels,-100,0,0,-100,0,0,5,-100,-100,6,...,5,-100,-100,-100,6,-100,-100,0,-100,-100


### Model training

In [12]:
for lang in langs:
    
    tokenizer = AutoTokenizer.from_pretrained(xlmr_model_name)
    model = XLMRobertaForTokenClassification.from_pretrained(
        xlmr_model_name,
        config=xlmr_config
    ).to(device)
    
    learning_rate = 3e-5
    weight_decay = 1e-5
    num_epochs = 5
    batch_size = 32
    logging_steps = len(panx_ch['de']['train']) // batch_size

    data_collator = DataCollatorForTokenClassification(tokenizer)
    compute_metrics = prepare_compute_metrics(id2label=id2label)

    training_args = TrainingArguments(
        learning_rate=learning_rate,
        weight_decay=weight_decay,
        output_dir=f'{xlmr_model_name}_panx_de_multilingual_transfer',
        num_train_epochs=num_epochs,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        evaluation_strategy='epoch',
        disable_tqdm=False,
        logging_steps=logging_steps
    )

    trainer = Trainer(
        model,
        tokenizer=tokenizer,
        args=training_args,
        compute_metrics=compute_metrics,
        data_collator=data_collator,
        train_dataset=panx_ch['de']['train'],
        eval_dataset=panx_ch[lang]['validation']
    )
    
    print(f'Model trained on "de" subset, validated on "{lang}" subset')
    trainer.train()

Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['clf.1.bias', 'clf.1.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Model trained on "de" subset, validated on "de" subset


Epoch,Training Loss,Validation Loss,F1
1,0.2907,0.164168,0.810185
2,0.1341,0.14322,0.840231
3,0.0957,0.139495,0.853297
4,0.0733,0.14139,0.868055
5,0.0564,0.142796,0.868702


Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['clf.1.bias', 'clf.1.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Model trained on "de" subset, validated on "fr" subset


Epoch,Training Loss,Validation Loss,F1
1,0.2869,0.672042,0.626502
2,0.1372,0.659244,0.717304
3,0.0968,0.764746,0.717153
4,0.0734,0.894019,0.71926
5,0.0564,0.916796,0.722652


Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['clf.1.bias', 'clf.1.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Model trained on "de" subset, validated on "it" subset


Epoch,Training Loss,Validation Loss,F1
1,0.2869,0.566917,0.678193
2,0.1372,0.602084,0.706356
3,0.0968,0.766653,0.708249
4,0.0734,0.942742,0.692737
5,0.0564,0.968039,0.707395


Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['clf.1.bias', 'clf.1.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Model trained on "de" subset, validated on "en" subset


Epoch,Training Loss,Validation Loss,F1
1,0.2869,0.748343,0.557325
2,0.1372,0.756538,0.561992
3,0.0968,0.874883,0.582334
4,0.0734,1.04248,0.566542
5,0.0564,1.062922,0.570199
