In [1]:
from datetime import datetime

from datasets import load_dataset, load_from_disk
from transformers import BatchEncoding, PreTrainedTokenizer, AutoTokenizer, Trainer, TrainingArguments
from transformers.data import data_collator

from modelling_xlm_roberta import XLMRobertaForTokenClassification
import nervaluate

from functools import partial
import torch

from typing import Iterable
from torch import Tensor
from torch.nn.utils.rnn import pad_sequence

import numpy as np
import wandb

device = 'cuda'
model_dtype = torch.bfloat16
torch.cuda.get_device_name(0)

AssertionError: Torch not compiled with CUDA enabled

# 1. Test that layer cutting works

In [2]:
model_test = XLMRobertaForTokenClassification.from_pretrained('facebook/xlm-v-base')
model_test

Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at facebook/xlm-v-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


XLMRobertaForTokenClassification(
  (roberta): XLMRobertaModel(
    (embeddings): XLMRobertaEmbeddings(
      (word_embeddings): Embedding(901629, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): XLMRobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x XLMRobertaLayer(
          (attention): XLMRobertaAttention(
            (self): XLMRobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): XLMRobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768

In [3]:
model_test = XLMRobertaForTokenClassification.from_pretrained('facebook/xlm-v-base', skip_last_layer=True)
model_test

Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at facebook/xlm-v-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


XLMRobertaForTokenClassification(
  (roberta): XLMRobertaModel(
    (embeddings): XLMRobertaEmbeddings(
      (word_embeddings): Embedding(901629, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): XLMRobertaEncoder(
      (layer): ModuleList(
        (0-10): 11 x XLMRobertaLayer(
          (attention): XLMRobertaAttention(
            (self): XLMRobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): XLMRobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768

Works! Passing `skip_last_layer=True` removes the last layer in the transformer stack (11 x XLMRobertaLayer instead of 12 x XLMRobertaLayer)

# 2. Train models on the downstream tagging task and evaluate the knowledge transfer to a different language
For this we will use CoNLL 2003 corpus (`eriktks/conll2003`, 14k examples) to train the model and Afrikaans NER Corpus (`nwu-ctext/afrikaans_ner_corpus`, 9k examples) to test the model. The validation is done over CoNLL 2003, only the final scores for Afrikaans are reported.

In [4]:
train_dataset = load_dataset('eriktks/conll2003', split='train')
valid_dataset = load_dataset('eriktks/conll2003', split='validation')
test_dataset = load_dataset('nwu-ctext/afrikaans_ner_corpus', split='train')

conll2003.py:   0%|          | 0.00/9.57k [00:00<?, ?B/s]

README.md:   0%|          | 0.00/12.3k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/983k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/14041 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3250 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3453 [00:00<?, ? examples/s]

README.md:   0%|          | 0.00/5.82k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/945k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/8962 [00:00<?, ? examples/s]

Make sure that the labelling scheme is identical across datasets

In [5]:
train_dataset.features['ner_tags']

Sequence(feature=ClassLabel(names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC'], id=None), length=-1, id=None)

In [6]:
valid_dataset.features['ner_tags']

Sequence(feature=ClassLabel(names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC'], id=None), length=-1, id=None)

In [7]:
test_dataset.features['ner_tags']

Sequence(feature=ClassLabel(names=['OUT', 'B-PERS', 'I-PERS', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC'], id=None), length=-1, id=None)

The names are a bit different, but otherwise the schemes are identical

## 2.1 Convert word-level tags to subtoken-level tags

In [2]:
xlm_tok = AutoTokenizer.from_pretrained('facebook/xlm-v-base')
xlm_tok_name = 'xlm-v'

xlm_tok('test <mask> test', return_offsets_mapping=True)



{'input_ids': [0, 1340, 901628, 1340, 2], 'attention_mask': [1, 1, 1, 1, 1], 'offset_mapping': [(0, 0), (0, 4), (4, 11), (11, 16), (0, 0)]}

In [2]:
# for reference
ner_tags_scheme = np.array(['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC'])
ner_tags_ext    =          [  0,       2,       2,       4,       4,       6,       6,        8,        8]
# the ext is used when we need to split one word into multiple sub tokens

In [9]:
def tokenize(example: dict, tokenizer: PreTrainedTokenizer, tokenizer_name: str, max_length: int = 512) -> dict:
    ner_tags: list[int] = example['ner_tags']
    example_words: list[str] = example['tokens']
    text = ' '.join(example_words)
    
    # map words to positions in text
    word_positions: list[int] = example.get('word_positions', [])
    
    if len(word_positions) != len(example_words):
        text_iterator = 0
        for word in example_words:
            while text[text_iterator:text_iterator + len(word)] != word:
                text_iterator += 1
                assert text_iterator < len(text)
            
            word_positions.append(text_iterator)
    
    encoding: BatchEncoding = tokenizer(text, return_offsets_mapping=True, truncation=True, max_length=max_length)
    num_sub_tokens = len(encoding.offset_mapping)
    
    sub_token_iterator = 0
    sub_token_ner_tags: list[int] = []
    for word_id, ner_tag in enumerate(ner_tags):
        word_start = word_positions[word_id]
        word_end = word_start + len(example_words[word_id])
        
        # there may be some empty space between words. the sub tokens that include this empty space receive O label
        # we compare with the end ([1]) to ensure that 0-length tokens are labelled as O (for example <CLS>)
        while sub_token_iterator < num_sub_tokens and  encoding.offset_mapping[sub_token_iterator][1] <= word_start:
            if encoding.offset_mapping[sub_token_iterator][1] - encoding.offset_mapping[sub_token_iterator][0] == 0:
                # set to -100 for special tokens like <CLS>
                sub_token_ner_tags.append(-100)
            else:
                sub_token_ner_tags.append(0)  # 0 = O
            sub_token_iterator += 1
            
        ext_tag = ner_tags_ext[ner_tag]
        
        if sub_token_iterator < num_sub_tokens:
            # the first sub token of a word receives original label, the rest receive extended label
            sub_token_ner_tags.append(ner_tag)
            sub_token_iterator += 1
        
        # again, we need to be careful about 0-length tokens, so we compare start ([0]) with the word end
        while sub_token_iterator < num_sub_tokens and encoding.offset_mapping[sub_token_iterator][0] < word_end:
            
            # there is a weird quirk with transformers tokenizers: <SEP> token has (0, 0) offset 
            #   regardless of its real position, see https://github.com/huggingface/transformers/issues/35125
            if encoding.offset_mapping[sub_token_iterator][1] - encoding.offset_mapping[sub_token_iterator][0] == 0:
                sub_token_ner_tags.append(-100)
            else:
                sub_token_ner_tags.append(ext_tag)
                
            sub_token_iterator += 1
    
    # any tokens at the end (like <SEP>) receive O tokens
    while sub_token_iterator < num_sub_tokens:
        sub_token_iterator += 1
        sub_token_ner_tags.append(0)
        
    return {
        'word_positions': word_positions,
        f'{tokenizer_name}_sub_tokens': encoding.input_ids,
        f'{tokenizer_name}_sub_token_offsets': encoding.offset_mapping,
        f'{tokenizer_name}_sub_token_ner_tags': sub_token_ner_tags,
        'length': len(encoding.offset_mapping)
    }

tokenize_fn = partial(tokenize, tokenizer=xlm_tok, tokenizer_name=xlm_tok_name, max_length=512)

train_dataset = train_dataset.map(tokenize_fn)
valid_dataset = valid_dataset.map(tokenize_fn)
test_dataset = test_dataset.map(tokenize_fn)

Map:   0%|          | 0/14041 [00:00<?, ? examples/s]

Map:   0%|          | 0/3250 [00:00<?, ? examples/s]

Map:   0%|          | 0/8962 [00:00<?, ? examples/s]

In [10]:
for test_idx in range(25):
    ner_tags = torch.as_tensor(train_dataset[test_idx]['xlm-v_sub_token_ner_tags'])
    tokens = torch.as_tensor(train_dataset[test_idx]['xlm-v_sub_tokens'])
    print('Text:', ' '.join(train_dataset[test_idx]['tokens']))
    print('Ents:', xlm_tok.decode(tokens[ner_tags > 0]))
    print()

Text: EU rejects German call to boycott British lamb .
Ents: EU German British

Text: Peter Blackburn
Ents: Peter Blackburn

Text: BRUSSELS 1996-08-22
Ents: BRUSSELS

Text: The European Commission said on Thursday it disagreed with German advice to consumers to shun British lamb until scientists determine whether mad cow disease can be transmitted to sheep .
Ents: European Commission German British

Text: Germany 's representative to the European Union 's veterinary committee Werner Zwingmann said on Wednesday consumers should buy sheepmeat from countries other than Britain until the scientific advice was clearer .
Ents: Germany European Union Werner Zwingmann Britain

Text: " We do n't support any such recommendation because we do n't see any grounds for it , " the Commission 's chief spokesman Nikolaus van der Pas told a news briefing .
Ents: Commission Nikolaus van der Pas

Text: He said further scientific study was required and if it was found that action was needed it should be ta

Looks nice!

In [18]:
train_dataset.save_to_disk('data/train')
valid_dataset.save_to_disk('data/valid')
test_dataset.save_to_disk('data/test')

Saving the dataset (0/1 shards):   0%|          | 0/14041 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/3250 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/8962 [00:00<?, ? examples/s]

In [3]:
class Dataset(torch.utils.data.Dataset):
    def __init__(self, examples: Iterable[dict], tokenizer_name: str):
        self.input_ids = []
        self.labels = []
        
        for example in examples:
            self.input_ids.append(torch.as_tensor(example[f'{tokenizer_name}_sub_tokens']))
            self.labels.append(torch.as_tensor(example[f'{tokenizer_name}_sub_token_ner_tags']))
        
    def __len__(self):
        return len(self.input_ids)
    
    def __getitem__(self, idx):
        return self.input_ids[idx], self.labels[idx]
    

def collate_fn(inputs: list[(Tensor, Tensor)], *, pad_token: int) -> dict:
    all_input_ids = []
    all_labels = []
    for input_ids, labels in inputs:
        all_input_ids.append(input_ids)
        all_labels.append(labels)
    
    input_ids = pad_sequence(all_input_ids, batch_first=True, padding_value=pad_token)
    
    batch_size, seq_length = input_ids.shape

    # do not attend to pad and pad does not attend to anything
    pad_mask = (input_ids != pad_token)
    attention_mask = (pad_mask.reshape(batch_size, 1, -1) != pad_mask.reshape(batch_size, -1, 1))
    return {
        'input_ids': input_ids,
        'labels': pad_sequence(all_labels, batch_first=True, padding_value=-100),
        'attention_mask': attention_mask
    }

In [7]:
def compute_ner_metrics(eval_pred) -> dict:
    predictions, labels = eval_pred
    
    predictions = np.argmax(predictions, axis=-1)
    padding = (labels < 0)
    
    predictions = predictions[~padding]
    labels = labels[~padding]

    predictions = ner_tags_scheme[predictions]
    labels = ner_tags_scheme[labels]

    evaluator = nervaluate.Evaluator([labels], [predictions], tags=['PER', 'LOC', 'ORG', 'MISC'], loader='list')
    results, results_per_tag, _, _ = evaluator.evaluate()

    overall_metrics = results["strict"]
    
    metrics = {
        'overall_precision': overall_metrics['precision'],
        'overall_recall': overall_metrics['recall'],
        'overall_f1': overall_metrics['f1'],
    }
    
    for tag, tag_metrics in results_per_tag.items():
        metrics[f'{tag}_precision'] = tag_metrics['strict']['precision']
        metrics[f'{tag}_recall'] = tag_metrics['strict']['recall']
        metrics[f'{tag}_f1'] = tag_metrics['strict']['f1']

    # Return desired metrics
    return metrics

## 2.2 Train a conventional model

In [8]:
n_run = 0
model_name = 'facebook/xlm-v-base'

In [30]:
run_name = f'xlm-v-base-finetuned-l12-conll03/{datetime.now().strftime("%m-%d")}/{n_run}'
wandb.init(
    project='ner-alignment',
    name=run_name,
    dir=run_name,
    resume=False
)
n_run += 1

model = XLMRobertaForTokenClassification.from_pretrained(
    'facebook/xlm-v-base', 
    num_labels=9, 
    classifier_dropout=0.1, 
    hidden_dropout_prob=0.1
)
model.roberta.embeddings.requires_grad_(False)  # freeze input embeddings to avoid parameter shift (training on english and inferencing on africaans -> different tokens are activated)
print(f"Percentage of frozen modules: {100 * sum(1 for module in model.modules() if not any(p.requires_grad for p in module.parameters())) / sum(1 for module in model.modules()):.2f}%")
print(f"Percentage of frozen parameters: {100 * sum(p.numel() for p in model.parameters() if not p.requires_grad) / sum(p.numel() for p in model.parameters()):.2f}%")


xlm_tok = AutoTokenizer.from_pretrained('facebook/xlm-v-base')


trainer = Trainer(
    model=model,
    args=TrainingArguments(
        output_dir=run_name,
        overwrite_output_dir=True,
        eval_strategy='steps',
        eval_delay=0.001,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=128,
        learning_rate=2e-5,
        max_grad_norm=10,
        max_steps=5000,
        lr_scheduler_type='cosine',
        lr_scheduler_kwargs={ 'num_cycles': 0.5 },
        warmup_ratio=0.05,
        adam_epsilon=1e-8,
        adam_beta1=0.9,
        adam_beta2=0.999,
        weight_decay=0.01,
        logging_steps=100,
        eval_steps=200,
        dataloader_num_workers=4,
        torch_compile=False,
        include_num_input_tokens_seen=True,
        disable_tqdm=True,
        report_to='wandb'
    ),
    data_collator=partial(collate_fn, pad_token=xlm_tok.pad_token_id),
    train_dataset=Dataset(load_from_disk('data/train'), tokenizer_name='xlm-v'),
    eval_dataset=Dataset(load_from_disk('data/valid'), tokenizer_name='xlm-v'),
    compute_metrics=compute_ner_metrics
)
trainer.train()
wandb.finish()



VBox(children=(Label(value='0.020 MB of 0.020 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
eval/LOC_f1,▁▁
eval/LOC_precision,▁▁
eval/LOC_recall,▁▁
eval/MISC_f1,▁▁
eval/MISC_precision,▁▁
eval/MISC_recall,▁▁
eval/ORG_f1,▁▁
eval/ORG_precision,▁▁
eval/ORG_recall,▁▁
eval/PER_f1,▁▁

0,1
eval/LOC_f1,0.0
eval/LOC_precision,0.0
eval/LOC_recall,0.0
eval/MISC_f1,0.0
eval/MISC_precision,0.0
eval/MISC_recall,0.0
eval/ORG_f1,0.0
eval/ORG_precision,0.0
eval/ORG_recall,0.0
eval/PER_f1,0.0




Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at facebook/xlm-v-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
max_steps is given, it will override any value given in num_train_epochs


{'loss': 1.8089, 'grad_norm': 20.174489974975586, 'learning_rate': 8.000000000000001e-06, 'epoch': 0.11389521640091116, 'num_input_tokens_seen': 76672}
{'loss': 1.1744, 'grad_norm': 2.2620325088500977, 'learning_rate': 1.6000000000000003e-05, 'epoch': 0.22779043280182232, 'num_input_tokens_seen': 158272}
{'eval_loss': 0.95058673620224, 'eval_overall_precision': 0.010752688172043012, 'eval_overall_recall': 0.00016809547823163558, 'eval_overall_f1': 0.00033101621979476995, 'eval_PER_precision': 0.0, 'eval_PER_recall': 0.0, 'eval_PER_f1': 0, 'eval_LOC_precision': 0.0, 'eval_LOC_recall': 0.0, 'eval_LOC_f1': 0, 'eval_ORG_precision': 0.0, 'eval_ORG_recall': 0.0, 'eval_ORG_f1': 0, 'eval_MISC_precision': 0.034482758620689655, 'eval_MISC_recall': 0.0010799136069114472, 'eval_MISC_f1': 0.0020942408376963353, 'eval_runtime': 0.9845, 'eval_samples_per_second': 3301.272, 'eval_steps_per_second': 26.41, 'epoch': 0.22779043280182232, 'num_input_tokens_seen': 158272}
{'loss': 0.7959, 'grad_norm': 6.60

TrainOutput(global_step=5000, training_loss=0.2733513090133667, metrics={'train_runtime': 448.4382, 'train_samples_per_second': 178.397, 'train_steps_per_second': 11.15, 'train_loss': 0.2733513090133667, 'epoch': 5.694760820045558, 'num_input_tokens_seen': 3906776})

In [31]:
test_results_l12 = trainer.evaluate(Dataset(load_from_disk('data/test'), tokenizer_name='xlm-v'), metric_key_prefix='transfer')
test_results_l12

{'transfer_loss': 0.6437795758247375, 'transfer_overall_precision': 0.08365958539246518, 'transfer_overall_recall': 0.18153331940672654, 'transfer_overall_f1': 0.11453550952265888, 'transfer_PER_precision': 0.049324023176348236, 'transfer_PER_recall': 0.16093068347067377, 'transfer_PER_f1': 0.07550602683647942, 'transfer_LOC_precision': 0.1402452140245214, 'transfer_LOC_recall': 0.35726027397260274, 'transfer_LOC_f1': 0.2014210688909484, 'transfer_ORG_precision': 0.09822494627079519, 'transfer_ORG_recall': 0.34421199442119943, 'transfer_ORG_f1': 0.1528362645528858, 'transfer_MISC_precision': 0.053885579720182854, 'transfer_MISC_recall': 0.05647502903600465, 'transfer_MISC_f1': 0.05514992556886652, 'transfer_runtime': 104.7546, 'transfer_samples_per_second': 85.552, 'transfer_steps_per_second': 0.678, 'epoch': 5.694760820045558, 'num_input_tokens_seen': 3906776}


{'transfer_loss': 0.6437795758247375,
 'transfer_overall_precision': 0.08365958539246518,
 'transfer_overall_recall': 0.18153331940672654,
 'transfer_overall_f1': 0.11453550952265888,
 'transfer_PER_precision': 0.049324023176348236,
 'transfer_PER_recall': 0.16093068347067377,
 'transfer_PER_f1': 0.07550602683647942,
 'transfer_LOC_precision': 0.1402452140245214,
 'transfer_LOC_recall': 0.35726027397260274,
 'transfer_LOC_f1': 0.2014210688909484,
 'transfer_ORG_precision': 0.09822494627079519,
 'transfer_ORG_recall': 0.34421199442119943,
 'transfer_ORG_f1': 0.1528362645528858,
 'transfer_MISC_precision': 0.053885579720182854,
 'transfer_MISC_recall': 0.05647502903600465,
 'transfer_MISC_f1': 0.05514992556886652,
 'transfer_runtime': 104.7546,
 'transfer_samples_per_second': 85.552,
 'transfer_steps_per_second': 0.678,
 'epoch': 5.694760820045558,
 'num_input_tokens_seen': 3906776}

In [32]:
run_name = f'xlm-v-base-finetuned-l11-conll03/{datetime.now().strftime("%m-%d")}/{n_run}'
wandb.init(resume=False, reinit=True, dir=run_name, project='ner-alignment')
n_run += 1
model = XLMRobertaForTokenClassification.from_pretrained('facebook/xlm-v-base', num_labels=9, classifier_dropout=0.1, hidden_dropout_prob=0.1, skip_last_layer=True)
model.roberta.embeddings.requires_grad_(False)  # freeze input embeddings to avoid parameter shift (training on english and inferencing on africaans -> different tokens are activated)
xlm_tok = AutoTokenizer.from_pretrained('facebook/xlm-v-base')
trainer = Trainer(
    model=model,
    args=TrainingArguments(
        output_dir=run_name,
        overwrite_output_dir=True,
        eval_strategy='steps',
        eval_delay=0.001,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=128,
        learning_rate=2e-5,
        max_grad_norm=10,
        max_steps=5000,
        lr_scheduler_type='cosine',
        lr_scheduler_kwargs={ 'num_cycles': 0.5 },
        warmup_ratio=0.05,
        adam_epsilon=1e-8,
        adam_beta1=0.9,
        adam_beta2=0.999,
        weight_decay=0.01,
        logging_steps=100,
        bf16=True,
        eval_steps=200,
        dataloader_num_workers=4,
        torch_compile=True,
        include_num_input_tokens_seen=True,
        disable_tqdm=True
    ),
    data_collator=partial(collate_fn, pad_token=xlm_tok.pad_token_id),
    train_dataset=Dataset(load_from_disk('data/train'), tokenizer_name='xlm-v'),
    eval_dataset=Dataset(load_from_disk('data/valid'), tokenizer_name='xlm-v'),
    compute_metrics=compute_ner_metrics
)
trainer.train()



VBox(children=(Label(value='0.021 MB of 0.021 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
eval/LOC_f1,▁▁▁▅▇▇▇██████▇███████████
eval/LOC_precision,▁▁▁▅▇▇▇██████▇██▇█▇██████
eval/LOC_recall,▁▁▁▅▇▇▇▇████▇▇███▇██▇████
eval/MISC_f1,▁▁▁▁▁▁▃▆▇▇████▇██████████
eval/MISC_precision,▂▁▁▁▁▁▃▆▇▇███▇▇███▇▇▇██▇█
eval/MISC_recall,▁▁▁▁▁▁▃▅▆▆▇▇▇▇▇▇▇▇████▇▇█
eval/ORG_f1,▁▁▁▂▄▅▅▇▆▇▇▇▇▇▇█▇█▇██▇███
eval/ORG_precision,▁▁▂▂▄▅▅▇▆▇▇▇▇▇▇█▇█▇▇█▇█▇█
eval/ORG_recall,▁▁▁▂▄▅▅█▆▆█▇▇▇██▇█▇██▇███
eval/PER_f1,▁▁▆▃▆▅▆▆▆▇▇▇▇█▇▇█▇███████

0,1
eval/LOC_f1,0.65123
eval/LOC_precision,0.58959
eval/LOC_recall,0.72727
eval/MISC_f1,0.47735
eval/MISC_precision,0.42058
eval/MISC_recall,0.55184
eval/ORG_f1,0.4714
eval/ORG_precision,0.39585
eval/ORG_recall,0.58259
eval/PER_f1,0.55253




Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at facebook/xlm-v-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
max_steps is given, it will override any value given in num_train_epochs


{'loss': 1.3132, 'grad_norm': 2.603902578353882, 'learning_rate': 8.000000000000001e-06, 'epoch': 0.11389521640091116, 'num_input_tokens_seen': 76672}
{'loss': 0.7704, 'grad_norm': 3.464730978012085, 'learning_rate': 1.6000000000000003e-05, 'epoch': 0.22779043280182232, 'num_input_tokens_seen': 158272}
{'eval_loss': 0.5994405746459961, 'eval_overall_precision': 0.27987543791358505, 'eval_overall_recall': 0.12086064884854597, 'eval_overall_f1': 0.168818971589575, 'eval_PER_precision': 0.40070921985815605, 'eval_PER_recall': 0.30673181324647125, 'eval_PER_f1': 0.34747847478474786, 'eval_LOC_precision': 0.19207317073170732, 'eval_LOC_recall': 0.06859009254218836, 'eval_LOC_f1': 0.10108303249097474, 'eval_ORG_precision': 0.06796116504854369, 'eval_ORG_recall': 0.020833333333333332, 'eval_ORG_f1': 0.03189066059225513, 'eval_MISC_precision': 0.0, 'eval_MISC_recall': 0.0, 'eval_MISC_f1': 0, 'eval_runtime': 3.5957, 'eval_samples_per_second': 903.866, 'eval_steps_per_second': 7.231, 'epoch': 0.

TrainOutput(global_step=5000, training_loss=0.19429620170593262, metrics={'train_runtime': 430.9478, 'train_samples_per_second': 185.637, 'train_steps_per_second': 11.602, 'train_loss': 0.19429620170593262, 'epoch': 5.694760820045558, 'num_input_tokens_seen': 3906776})

In [33]:
test_results_l12 = trainer.evaluate(Dataset(load_from_disk('data/test'), tokenizer_name='xlm-v'), metric_key_prefix='transfer')
test_results_l12

{'transfer_loss': 0.6994850039482117, 'transfer_overall_precision': 0.07787755102040816, 'transfer_overall_recall': 0.19928974305410488, 'transfer_overall_f1': 0.11199154780771264, 'transfer_PER_precision': 0.05136986301369863, 'transfer_PER_recall': 0.15996122152205525, 'transfer_PER_f1': 0.07776599505125485, 'transfer_LOC_precision': 0.12736799677549376, 'transfer_LOC_recall': 0.3463013698630137, 'transfer_LOC_f1': 0.18623839693531755, 'transfer_ORG_precision': 0.08954361640670133, 'transfer_ORG_recall': 0.3891213389121339, 'transfer_ORG_f1': 0.1455854727614277, 'transfer_MISC_precision': 0.0516096065406234, 'transfer_MISC_recall': 0.07331591173054587, 'transfer_MISC_f1': 0.060576980747316024, 'transfer_runtime': 120.9374, 'transfer_samples_per_second': 74.104, 'transfer_steps_per_second': 0.587, 'epoch': 5.694760820045558, 'num_input_tokens_seen': 3906776}


{'transfer_loss': 0.6994850039482117,
 'transfer_overall_precision': 0.07787755102040816,
 'transfer_overall_recall': 0.19928974305410488,
 'transfer_overall_f1': 0.11199154780771264,
 'transfer_PER_precision': 0.05136986301369863,
 'transfer_PER_recall': 0.15996122152205525,
 'transfer_PER_f1': 0.07776599505125485,
 'transfer_LOC_precision': 0.12736799677549376,
 'transfer_LOC_recall': 0.3463013698630137,
 'transfer_LOC_f1': 0.18623839693531755,
 'transfer_ORG_precision': 0.08954361640670133,
 'transfer_ORG_recall': 0.3891213389121339,
 'transfer_ORG_f1': 0.1455854727614277,
 'transfer_MISC_precision': 0.0516096065406234,
 'transfer_MISC_recall': 0.07331591173054587,
 'transfer_MISC_f1': 0.060576980747316024,
 'transfer_runtime': 120.9374,
 'transfer_samples_per_second': 74.104,
 'transfer_steps_per_second': 0.587,
 'epoch': 5.694760820045558,
 'num_input_tokens_seen': 3906776}

In [35]:
dataset = load_from_disk('data/valid')

In [43]:
test_idx = 3
ner_tags = torch.as_tensor(dataset[test_idx]['xlm-v_sub_token_ner_tags'])
tokens = torch.as_tensor(dataset[test_idx]['xlm-v_sub_tokens'])
print('Text:', ' '.join(dataset[test_idx]['tokens']))
print('Ents:', xlm_tok.decode(tokens[ner_tags > 0]))
print()

Text: Their stay on top , though , may be short-lived as title rivals Essex , Derbyshire and Surrey all closed in on victory while Kent made up for lost time in their rain-affected match against Nottinghamshire .
Ents: Essex Derbyshire Surrey Kent Nottinghamshire

