In [44]:
import numpy as np
import pandas as pd

from seqeval.metrics import f1_score, classification_report
from transformers import EvalPrediction, DataCollatorForTokenClassification, Trainer, XLMRobertaConfig

from transformers import TrainingArguments, AutoTokenizer, AutoConfig
from datasets import get_dataset_config_names, load_dataset, DatasetDict, concatenate_datasets

from collections import defaultdict, Counter

import torch.nn as nn
import torch
from torch.nn.functional import cross_entropy
import matplotlib.pyplot as plt

from transformers.models.roberta.modeling_roberta import RobertaModel, RobertaPreTrainedModel
from transformers.modeling_outputs import TokenClassifierOutput

from sklearn.metrics import f1_score



In [2]:
# Get available datasets in XTREME
xtreme_subset = get_dataset_config_names('xtreme')
print(len(xtreme_subset))

183


In [3]:
# Filter PAN-X datasets
panx_subset = [s for s in xtreme_subset if s.startswith('PAN')]
print(panx_subset)


['PAN-X.af', 'PAN-X.ar', 'PAN-X.bg', 'PAN-X.bn', 'PAN-X.de', 'PAN-X.el', 'PAN-X.en', 'PAN-X.es', 'PAN-X.et', 'PAN-X.eu', 'PAN-X.fa', 'PAN-X.fi', 'PAN-X.fr', 'PAN-X.he', 'PAN-X.hi', 'PAN-X.hu', 'PAN-X.id', 'PAN-X.it', 'PAN-X.ja', 'PAN-X.jv', 'PAN-X.ka', 'PAN-X.kk', 'PAN-X.ko', 'PAN-X.ml', 'PAN-X.mr', 'PAN-X.ms', 'PAN-X.my', 'PAN-X.nl', 'PAN-X.pt', 'PAN-X.ru', 'PAN-X.sw', 'PAN-X.ta', 'PAN-X.te', 'PAN-X.th', 'PAN-X.tl', 'PAN-X.tr', 'PAN-X.ur', 'PAN-X.vi', 'PAN-X.yo', 'PAN-X.zh']


In [4]:
# Load a single language dataset for testing
sample_ds = load_dataset('xtreme', name='PAN-X.de')
print(sample_ds)

DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags', 'langs'],
        num_rows: 20000
    })
    validation: Dataset({
        features: ['tokens', 'ner_tags', 'langs'],
        num_rows: 10000
    })
    test: Dataset({
        features: ['tokens', 'ner_tags', 'langs'],
        num_rows: 10000
    })
})


In [5]:
language = ['hi', 'en', 'fr', 'it']
percentage_lang_spoken = [0.80, 0.90, 0.60, 0.50]

panx_combined_dataset = defaultdict(DatasetDict)


In [6]:
# Load and filter datasets
for lang, percentage in zip(language, percentage_lang_spoken):
    ds = load_dataset('xtreme', name=f'PAN-X.{lang}')
    for split in ['train', 'validation', 'test']:
        num_rows = int(percentage * ds[split].num_rows)
        panx_combined_dataset[lang][split] = ds[split].shuffle(seed=0).select(range(num_rows))

In [7]:
panx_combined_dataset

defaultdict(datasets.dataset_dict.DatasetDict,
            {'hi': DatasetDict({
                 train: Dataset({
                     features: ['tokens', 'ner_tags', 'langs'],
                     num_rows: 4000
                 })
                 validation: Dataset({
                     features: ['tokens', 'ner_tags', 'langs'],
                     num_rows: 800
                 })
                 test: Dataset({
                     features: ['tokens', 'ner_tags', 'langs'],
                     num_rows: 800
                 })
             }),
             'en': DatasetDict({
                 train: Dataset({
                     features: ['tokens', 'ner_tags', 'langs'],
                     num_rows: 18000
                 })
                 validation: Dataset({
                     features: ['tokens', 'ner_tags', 'langs'],
                     num_rows: 9000
                 })
                 test: Dataset({
                     features: ['tokens', 'ner_tags', 'lang

In [8]:
# Print dataset summary
df = pd.DataFrame({
    "Language": language,
    "Num Rows": [panx_combined_dataset[lang]['train'].num_rows for lang in language]
})
print(df)



  Language  Num Rows
0       hi      4000
1       en     18000
2       fr     12000
3       it     10000


In [9]:
# Print dataset features and sample rows
element = panx_combined_dataset['hi']['train'][0]
print("Features:", panx_combined_dataset['hi']['train'].features)
print("Example Row:", element)

# Display first few rows
for i in range(5):
    row = panx_combined_dataset['hi']['train'][i]
    print(f"Row {i+1}:")
    print(f"Tokens: {row['tokens']}")
    print(f"NER Tags: {row['ner_tags']}")
    print(f"Languages: {row['langs']}")
    print("-" * 40)



Features: {'tokens': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None), 'ner_tags': Sequence(feature=ClassLabel(names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC'], id=None), length=-1, id=None), 'langs': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None)}
Example Row: {'tokens': ['**', 'काहिरा', '(', 'दूतावास', ')'], 'ner_tags': [0, 5, 0, 0, 0], 'langs': ['hi', 'hi', 'hi', 'hi', 'hi']}
Row 1:
Tokens: ['**', 'काहिरा', '(', 'दूतावास', ')']
NER Tags: [0, 5, 0, 0, 0]
Languages: ['hi', 'hi', 'hi', 'hi', 'hi']
----------------------------------------
Row 2:
Tokens: ['दाशोग़ुज़', 'प्रान्त', '(', 'Daşoguz', ')']
NER Tags: [5, 6, 0, 0, 0]
Languages: ['hi', 'hi', 'hi', 'hi', 'hi']
----------------------------------------
Row 3:
Tokens: ['इफ़्तेख़ार', '-', 'पुलिस', 'कमिश्नर']
NER Tags: [1, 0, 0, 0]
Languages: ['hi', 'hi', 'hi', 'hi']
----------------------------------------
Row 4:
Tokens: ['काजा', ',', 'स्पीति', ',', 'हिमाचल', 'प्रदेश']
NER Tags

In [10]:
# Extract tag names
tags = panx_combined_dataset['hi']['train'].features['ner_tags'].feature
print("NER Tag Mapping:", tags)

NER Tag Mapping: ClassLabel(names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC'], id=None)


In [11]:
# Function to convert tag indices to tag names
def create_tag_names(batch):
    return {'ner_tags_str': [tags.int2str(idx) for idx in batch['ner_tags']]}

# Apply function to all splits
panx_hi = {split: panx_combined_dataset['hi'][split].map(create_tag_names) for split in ['train', 'validation', 'test']}

# Print transformed dataset
print(panx_hi['train'][0])

{'tokens': ['**', 'काहिरा', '(', 'दूतावास', ')'], 'ner_tags': [0, 5, 0, 0, 0], 'langs': ['hi', 'hi', 'hi', 'hi', 'hi'], 'ner_tags_str': ['O', 'B-LOC', 'O', 'O', 'O']}


In [12]:
hi_example = panx_hi['train'][0]
pd.DataFrame([hi_example['tokens'], hi_example['ner_tags']], ['tokens', 'tags'])

Unnamed: 0,0,1,2,3,4
tokens,**,काहिरा,(,दूतावास,)
tags,0,5,0,0,0


In [13]:
panx_hi.items()

dict_items([('train', Dataset({
    features: ['tokens', 'ner_tags', 'langs', 'ner_tags_str'],
    num_rows: 4000
})), ('validation', Dataset({
    features: ['tokens', 'ner_tags', 'langs', 'ner_tags_str'],
    num_rows: 800
})), ('test', Dataset({
    features: ['tokens', 'ner_tags', 'langs', 'ner_tags_str'],
    num_rows: 800
}))])

In [14]:
from collections import defaultdict, Counter

def get_ner_frequencies(panx_hi):
    split2freqs = defaultdict(Counter)

    for split, dataset in panx_hi.items():
        for row in dataset:
            for tag in row["ner_tags_str"]:
                # Ensure the tag follows "B-LOC", "I-PER" format
                if "-" in tag:  
                    tag_type = tag.split("-")[1]  # Extract entity type (e.g., LOC, PER)
                    split2freqs[split][tag_type] += 1

    return split2freqs

frequencies = get_ner_frequencies(panx_hi)
print(frequencies)


defaultdict(<class 'collections.Counter'>, {'train': Counter({'ORG': 5598, 'PER': 4331, 'LOC': 3181}), 'validation': Counter({'ORG': 1106, 'PER': 864, 'LOC': 691}), 'test': Counter({'ORG': 1226, 'PER': 823, 'LOC': 633})})


In [15]:
pd.DataFrame.from_dict(frequencies, orient='index')

Unnamed: 0,LOC,PER,ORG
train,3181,4331,5598
validation,691,864,1106
test,633,823,1226


## Creating token model for token classification

In [16]:
class XLMRobertaForTokenClassification(RobertaPreTrainedModel):
    config_class = XLMRobertaConfig

    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.roberta = RobertaModel(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)  # Added dropout
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
        self.init_weights()

    def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, labels=None, **kwargs):
        # Remove the unexpected keyword argument if it exists
        if 'num_items_in_batch' in kwargs:
            kwargs.pop('num_items_in_batch')
        output = self.roberta(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, **kwargs)
        sequence_output = self.dropout(output[0])
        logits = self.classifier(sequence_output)
        loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
        return TokenClassifierOutput(loss=loss, logits=logits, hidden_states=output.hidden_states, attentions=output.attentions)


In [17]:
tags

ClassLabel(names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC'], id=None)

In [18]:
index2tag = {idx: tag for idx, tag in enumerate(tags.names)}
tag2index = {tag: idx for idx, tag in enumerate(tags.names)}

In [19]:
print(index2tag)
print(tag2index)

{0: 'O', 1: 'B-PER', 2: 'I-PER', 3: 'B-ORG', 4: 'I-ORG', 5: 'B-LOC', 6: 'I-LOC'}
{'O': 0, 'B-PER': 1, 'I-PER': 2, 'B-ORG': 3, 'I-ORG': 4, 'B-LOC': 5, 'I-LOC': 6}


In [20]:
xlm_model_name = 'xlm-roberta-base'

xlm_tokenizer = AutoTokenizer.from_pretrained(xlm_model_name)

In [21]:
xlm_config = AutoConfig.from_pretrained(
    xlm_model_name, 
    num_labels=tags.num_classes, 
    index2tag=index2tag,  # Use keyword argument format
    label2id=tag2index
)


In [22]:
import torch
import os

# Ensure CUDA is available
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    torch.cuda.reset_max_memory_allocated()
    torch.cuda.reset_max_memory_cached()

# Set CUDA Launch Blocking for debugging
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

# Select device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

# Reload the model
xlm_my_model = XLMRobertaForTokenClassification.from_pretrained(xlm_model_name, config=xlm_config).to(device)




Using device: cuda


Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [23]:
example_string = 'world will be changed with AI'
xlm_token = xlm_tokenizer(example_string).tokens()
xlm_token

['<s>', '▁world', '▁will', '▁be', '▁changed', '▁with', '▁AI', '</s>']

In [24]:
def get_ner_tags_from_text_and_model(text, tags, model, tokenizer):
    tokens = tokenizer(text).tokens()

    input_ids = tokenizer(text, return_tensors='pt').input_ids.to(device)
    print('input_ids', input_ids)
    
    output = model(input_ids)[0]
    print(f'shape of output {output.shape}')

    predictions = torch.argmax(output, dim=2)
    print(f'predictions : {predictions}')

    preds = [tags.names[p] for p in predictions[0].cpu().numpy()]
    return pd.DataFrame([tokens, preds], index=['token', 'NER tags'])

In [25]:
get_ner_tags_from_text_and_model(example_string, tags, xlm_my_model, xlm_tokenizer)


input_ids tensor([[    0,  8999,  1221,   186, 98816,   678, 38730,     2]],
       device='cuda:0')
shape of output torch.Size([1, 8, 7])
predictions : tensor([[6, 1, 1, 1, 1, 1, 1, 6]], device='cuda:0')


Unnamed: 0,0,1,2,3,4,5,6,7
token,<s>,▁world,▁will,▁be,▁changed,▁with,▁AI,</s>
NER tags,I-LOC,B-PER,B-PER,B-PER,B-PER,B-PER,B-PER,I-LOC


## tokenize text for ner

In [26]:
hi_example

{'tokens': ['**', 'काहिरा', '(', 'दूतावास', ')'],
 'ner_tags': [0, 5, 0, 0, 0],
 'langs': ['hi', 'hi', 'hi', 'hi', 'hi'],
 'ner_tags_str': ['O', 'B-LOC', 'O', 'O', 'O']}

In [27]:
words, labels = hi_example['tokens'], hi_example['ner_tags']

In [28]:
tokenized_input = xlm_tokenizer(hi_example['tokens'], is_split_into_words=True)

tokens = xlm_tokenizer.convert_ids_to_tokens(tokenized_input['input_ids'])

In [29]:
pd.DataFrame([tokens], index=['tokens'])

Unnamed: 0,0,1,2,3,4,5,6,7,8
tokens,<s>,▁**,▁का,हि,रा,▁(,▁दूतावास,▁),</s>


In [30]:
word_ids = tokenized_input.word_ids()
print(word_ids)
pd.DataFrame([tokens, word_ids], index=['tokens', 'word_ids'])

[None, 0, 1, 1, 1, 2, 3, 4, None]


Unnamed: 0,0,1,2,3,4,5,6,7,8
tokens,<s>,▁**,▁का,हि,रा,▁(,▁दूतावास,▁),</s>
word_ids,,0,1,1,1,2,3,4,


In [31]:
previous_word_idx = None
label_ids = []

for word_idx in word_ids:
    if word_idx is None or word_idx == previous_word_idx:
        label_ids.append(-100)
    elif word_idx != previous_word_idx:
        label_ids.append(labels[word_idx])

    previous_word_idx = word_idx

print(label_ids)


[-100, 0, 5, -100, -100, 0, 0, 0, -100]


In [32]:
labels = [index2tag[l] if l != -100 else 'IGN' for l in label_ids]

index = ['tokens', 'word_ids', 'label_ids', 'labels']

pd.DataFrame([tokens, word_ids, label_ids, labels], index=index)

Unnamed: 0,0,1,2,3,4,5,6,7,8
tokens,<s>,▁**,▁का,हि,रा,▁(,▁दूतावास,▁),</s>
word_ids,,0,1,1,1,2,3,4,
label_ids,-100,0,5,-100,-100,0,0,0,-100
labels,IGN,O,B-LOC,IGN,IGN,O,O,O,IGN


In [33]:
num_labels = len(index2tag)  # Define index2tag list before this
xlm_config = XLMRobertaConfig.from_pretrained(xlm_model_name, num_labels=num_labels)

In [34]:
def tokenize_mask_modify_labels(examples):
    """Tokenizes input text and aligns NER labels for a batch of examples."""
    tokenized_inputs = xlm_tokenizer(
        examples["tokens"], truncation=True, is_split_into_words=True, padding=True
    )
    all_labels = []
    for batch_idx, tokens in enumerate(examples["tokens"]):
        word_ids = tokenized_inputs.word_ids(batch_index=batch_idx)
        label_ids = []
        previous_word_idx = None
        for word_idx in word_ids:
            if word_idx is None or word_idx == previous_word_idx:
                label_ids.append(-100)  # Ignore subwords
            else:
                label_ids.append(examples["ner_tags"][batch_idx][word_idx])
            previous_word_idx = word_idx
        all_labels.append(label_ids)
    tokenized_inputs["labels"] = all_labels
    return tokenized_inputs


In [35]:
def encode_panx_datasets(corpus):
    return corpus.map(tokenize_mask_modify_labels, batched=True, remove_columns=['langs', 'ner_tags', 'tokens'])

# Load and encode dataset for Hindi
panx_hi_encoded = encode_panx_datasets(panx_combined_dataset['hi'])

Map:   0%|          | 0/4000 [00:00<?, ? examples/s]

Map:   0%|          | 0/800 [00:00<?, ? examples/s]

Map:   0%|          | 0/800 [00:00<?, ? examples/s]

In [36]:
def generate_list_for_compute_metrics(predictions, label_ids):
    preds = np.argmax(predictions, axis=2)
    batch_size, seq_len = preds.shape
    pred_label_list, true_label_list = [], []
    for batch_idx in range(batch_size):
        example_labels, example_preds = [], []
        for seq_idx in range(seq_len):
            if label_ids[batch_idx, seq_idx] != -100:
                example_preds.append(index2tag[preds[batch_idx][seq_idx]])
                example_labels.append(index2tag[label_ids[batch_idx][seq_idx]])
        pred_label_list.append(example_preds)
        true_label_list.append(example_labels)
    return pred_label_list, true_label_list


In [37]:
def compute_metrics(eval_pred: EvalPrediction):
    y_pred, y_true = generate_list_for_compute_metrics(eval_pred.predictions, eval_pred.label_ids)
    return {'f1': f1_score(y_true, y_pred, average="macro")}

In [38]:
training_args = TrainingArguments(
    output_dir="output_model",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_steps=10,
    push_to_hub=False
)




In [39]:
data_collator = DataCollatorForTokenClassification(
    tokenizer=xlm_tokenizer, padding=True, return_tensors="pt"
)


In [40]:
def model_init():
    # Instantiate your custom model using the configuration
    model = XLMRobertaForTokenClassification(xlm_config)
    # Load pretrained RoBERTa weights and update the roberta component of your model
    pretrained_roberta = RobertaModel.from_pretrained(xlm_model_name, config=xlm_config)
    model.roberta.load_state_dict(pretrained_roberta.state_dict())
    return model


In [41]:
trainer = Trainer(
    model=model_init(), 
    args=training_args,
    data_collator=data_collator,
    train_dataset=panx_hi_encoded['train'],
    eval_dataset=panx_hi_encoded['validation'],
    tokenizer=xlm_tokenizer,
    compute_metrics=compute_metrics
)


  trainer = Trainer(


In [42]:
print(panx_hi_encoded['train'][0])  # Check dataset format


{'input_ids': [0, 16459, 641, 15159, 2815, 15, 220085, 1388, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'labels': [-100, 0, 5, -100, -100, 0, 0, 0, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100

In [43]:
trainer.train()

Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

In [None]:
pd = pd.DataFrame(trainer.state.log_history)[['epochs', 'loss', 'eval_loss', 'eval_f1']]

df = df.rename(columns={'epochs': 'Epoch', 'Loss': 'Training Loss', 'Eval Loss': 'Validation Loss', 'eval F1': 'F1'})

df['Epoch'] = df['Epoch'].apply(lambda x: round(x))

df['Training Loss'] = df['Training Loss'].ffill()

df[['Validation Loss', 'F1']] = df[['Vallidation Loss', 'F1']].bfill().ffill()

df.drop_duplicates()

In [None]:
random_german_sentence = "Die alten Mauern erzählen Geschichten aus vergangenen Zeiten."
get_ner_tags_from_text_and_model(random_german_sentence, tags, trainer.model, xlm_tokenizer)


## error analysis

In [None]:
valid_set_batch = panx_hi_encoded['validation']
valid_set_batch

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 800
})

In [None]:
valid_set_batch.features

{'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None),
 'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None),
 'labels': Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None)}

In [None]:
def get_forward_loss_and_labels(batch):
  features = [dict(zip(batch, t)) for t in zip(*batch.values())]

  batch = data_collator(features)

  input_ids = batch['input_ids'].to(device)
  attention_mask =  batch['attention_mask'].to(device)
  labels = batch['labels'].to(device)

  with torch.no_grad():
    output = trainer.model(input_ids, attention_mask)
    predicted_label = torch.argmax(output.logits, axis=-1).cpu().numpy()
    loss = cross_entropy(output.logits.view(-1, 7), labels.view(-1), reduction='none')

  loss = loss.view(len(input_ids), -1).cpu().numpy()

  return {'loss': loss, 'predicted_label': predicted_label}


In [None]:
valid_set_with_loss = valid_set_batch.map(get_forward_loss_and_labels, batched=True, batch_size=32)
df = valid_set_with_loss.to_pandas()
df.head()

In [None]:
def get_f1_score(trainer, dataset):
    return trainer.predict(dataset).metrics['test_f1']

In [None]:
f1_scores = defaultdict(dict)

f1_scores['hi']['hi'] = get_f1_score(trainer, panx_hi_encoded['test'])

f1_scores['hi']['hi']

In [None]:
def evaluate_lang_performance(lang, trainer):
    panx_ds = encode_panx_datasets(panx_combined_dataset[lang])
    return get_f1_score(trainer, panx_ds['test'])

In [None]:
f1_scores['hi']['de'] = evaluate_lang_performance('de', trainer)
f1_scores['hi']['de']

In [None]:
panx_fr_encoded = encode_panx_datasets(panx_combined_dataset['fr'])


In [None]:
metrics_df = fine_tuning_training_on_single_corpus(panx_fr_encoded, 250)
metrics_df

In [None]:
for num_samples in [500, 1000, 2000, 4000]:
    metrics_df = metrics_df.append(fine_tuning_training_on_single_corpus(panx_fr_encoded, num_samples), ignore_index=True)


In [None]:
fig, ax = plt.Subplots()
ax.axhline(f1_score['hi']['de'], ls='--', color='r')
metrics_df.set_index('num samples').plot(ax=ax)

plt.legend(['zero-shot from hindi dataset', 'fine tuned on german ds'], loc='lower right')

plt.ylim((0, 1))
plt.xlabel('number of training samples')
plt.ylabel('f1 score')
plt.show()

In [None]:
def cocatenate_splits(corpora):
    multi_corpus = DatasetDict()
    for train_val_test_split in corpora[0].keys():
        multi_corpus[train_val_test_split] = concatenate_datasets(
            [corpus[train_val_test_split] for corpus in corpora]
        ).shuffle(seed=42)

    return multi_corpus

In [None]:
panx_hi_fr_concatenated_encoded = cocatenate_splits([panx_hi_encoded, panx_fr_encoded])


In [None]:

training_args.logging_steps = len(panx_hi_fr_concatenated_encoded["train"]) // batch_size
# training_args.push_to_hub = True
training_args.push_to_hub = False  # PAUL - Changing to False
training_args.output_dir = "xlm-roberta-base-finetuned-panx-de-fr"

trainer = Trainer(model_init=model_init, args=training_args,
    data_collator=data_collator, compute_metrics=compute_metrics,
    tokenizer=xlm_tokenizer, train_dataset=panx_hi_fr_concatenated_encoded["train"],
    eval_dataset=panx_hi_fr_concatenated_encoded["validation"])

trainer.train()

In [None]:

for lang in language:
    f1 = evaluate_lang_performance(lang, trainer)
    print(f"F1-score of [de-fr] model on [{lang}] dataset: {f1:.3f}")

In [None]:
corpora = [panx_hi_encoded]

for lang in language[1:]:
    training_args.output_dir = f'xlm-roberta-base -finetuned-panx-hi-{lang}'

    hi_encoded = encode_panx_datasets(panx_hi_encoded[lang])
    metrics = fine_tuning_training_on_single_corpus(hi_encoded, hi_encoded['train'].num_rows)

    f1_scores[lang][lang] = metrics['f1 score'][0] 

    corpora.append(hi_encoded)

In [None]:
corpora_encoded = concatenate_splits(corpora)

In [None]:
training_args.logging_steps = len(corpora_encoded["train"]) // batch_size
training_args.output_dir = "xlm-roberta-base-finetuned-panx-all"

trainer = Trainer(model_init=model_init, args=training_args,
    data_collator=data_collator, compute_metrics=compute_metrics,
    tokenizer=xlmr_tokenizer, train_dataset=corpora_encoded["train"],
    eval_dataset=corpora_encoded["validation"])

trainer.train()

In [None]:
scores_data = {"de": f1_scores["de"],
               "each": {lang: f1_scores[lang][lang] for lang in languages},
               "all": f1_scores["all"]}
f1_scores_df = pd.DataFrame(scores_data).T.round(4)
f1_scores_df.rename_axis(index="Fine-tune on", columns="Evaluated on",
                         inplace=True)
f1_scores_df