In [1]:
# Install the necessary libraries
!pip install transformers[torch] -q datasets -q accelerate -U -q seqeval -q

## Dataset
### Data Analysis and Pre-Processing

In [2]:
# cloning the github repo that has the datasets available in apache arrow format
!git clone https://github.com/Vedakashyap7/LJMU_Thesis.git
# paste the datasets inside google colaboratory environment
!cp -r /content/LJMU_Thesis/dataset /content/dataset
# remove the cloned github repo to save disk space in colab environment
!rm -r /content/LJMU_Thesis

Cloning into 'LJMU_Thesis'...
remote: Enumerating objects: 92, done.[K
remote: Counting objects: 100% (92/92), done.[K
remote: Compressing objects: 100% (45/45), done.[K
remote: Total 92 (delta 44), reused 89 (delta 44), pack-reused 0[K
Receiving objects: 100% (92/92), 3.57 MiB | 21.50 MiB/s, done.
Resolving deltas: 100% (44/44), done.


In [3]:
# loading the data
from datasets import load_from_disk, DatasetDict
from collections import defaultdict

panx_main = defaultdict(DatasetDict)

panx_main['en'] = load_from_disk("/content/dataset/english")
panx_main['de'] = load_from_disk("/content/dataset/german")
panx_main['fr'] = load_from_disk("/content/dataset/french")
panx_main['es'] = load_from_disk("/content/dataset/spanish")
panx_main['it'] = load_from_disk("/content/dataset/italian")

In [4]:
panx_main['en']

DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags', 'langs'],
        num_rows: 10000
    })
    validation: Dataset({
        features: ['tokens', 'ner_tags', 'langs'],
        num_rows: 5000
    })
    test: Dataset({
        features: ['tokens', 'ner_tags', 'langs'],
        num_rows: 5000
    })
})

In [5]:
panx_main['de']

DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags', 'langs'],
        num_rows: 1000
    })
    validation: Dataset({
        features: ['tokens', 'ner_tags', 'langs'],
        num_rows: 500
    })
    test: Dataset({
        features: ['tokens', 'ner_tags', 'langs'],
        num_rows: 500
    })
})

In [6]:
panx_main['fr']

DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags', 'langs'],
        num_rows: 1000
    })
    validation: Dataset({
        features: ['tokens', 'ner_tags', 'langs'],
        num_rows: 500
    })
    test: Dataset({
        features: ['tokens', 'ner_tags', 'langs'],
        num_rows: 500
    })
})

In [7]:
panx_main['es']

DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags', 'langs'],
        num_rows: 1000
    })
    validation: Dataset({
        features: ['tokens', 'ner_tags', 'langs'],
        num_rows: 500
    })
    test: Dataset({
        features: ['tokens', 'ner_tags', 'langs'],
        num_rows: 500
    })
})

In [8]:
panx_main['it']

DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags', 'langs'],
        num_rows: 1000
    })
    validation: Dataset({
        features: ['tokens', 'ner_tags', 'langs'],
        num_rows: 500
    })
    test: Dataset({
        features: ['tokens', 'ner_tags', 'langs'],
        num_rows: 500
    })
})

In [9]:
# Check a random sample in the train split of PANX-english subset
element = panx_main['en']['train'][10]
for key, value in element.items():
  print(f"{key}: {value}")

tokens: ['Its', 'source', 'is', 'near', 'Mega', 'Dereio', '.']
ner_tags: [0, 0, 0, 0, 5, 6, 0]
langs: ['en', 'en', 'en', 'en', 'en', 'en', 'en']


In [10]:
# Check the features in train split of english
for key, value in panx_main['en']['train'].features.items():
  print(f"{key}: {value}")

tokens: Sequence(feature=Value(dtype='string', id=None), length=-1, id=None)
ner_tags: Sequence(feature=ClassLabel(names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC'], id=None), length=-1, id=None)
langs: Sequence(feature=Value(dtype='string', id=None), length=-1, id=None)


In [11]:
# Check the class labels
tags = panx_main['en']['train'].features['ner_tags'].feature
print(tags)

ClassLabel(names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC'], id=None)


In [12]:
# Visualize the Tokens and their respective tags in english
import pandas as pd

def create_tag_names(batch):
  return {'ner_tags_str': [tags.int2str(idx) for idx in batch['ner_tags']]}

panx_en = panx_main['en'].map(create_tag_names)

en_example = panx_en['train'][10]
pd.DataFrame([en_example['tokens'],en_example['ner_tags_str']],
             ['Tokens','Tags'])

Unnamed: 0,0,1,2,3,4,5,6
Tokens,Its,source,is,near,Mega,Dereio,.
Tags,O,O,O,O,B-LOC,I-LOC,O


In [13]:
# Visualize the Tokens and their respective tags in German
panx_de = panx_main['de'].map(create_tag_names)

de_example = panx_de['train'][1]
pd.DataFrame([de_example['tokens'],de_example['ner_tags_str']],
             ['Tokens','Tags'])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
Tokens,Sie,geht,hinter,Walluf,nahtlos,in,die,Bundesautobahn,66,über,.
Tags,O,O,O,B-ORG,O,O,O,B-ORG,I-ORG,O,O


In [14]:
# Visualize the Tokens and their respective tags in French
panx_fr = panx_main['fr'].map(create_tag_names)

fr_example = panx_fr['train'][17]
pd.DataFrame([fr_example['tokens'],fr_example['ner_tags_str']],
             ['Tokens','Tags'])

Unnamed: 0,0,1,2,3,4
Tokens,@,Warriors,de,Golden,State
Tags,O,B-ORG,I-ORG,I-ORG,I-ORG


In [15]:
# Visualize the Tokens and their respective tags in Spanish
panx_es = panx_main['es'].map(create_tag_names)

es_example = panx_es['train'][17]
pd.DataFrame([es_example['tokens'],es_example['ner_tags_str']],
             ['Tokens','Tags'])

Unnamed: 0,0,1,2,3
Tokens,Winnipeg,",",Canadá,.
Tags,B-LOC,O,B-LOC,O


In [16]:
# Visualize the Tokens and their respective tags in Italian
panx_it = panx_main['it'].map(create_tag_names)

it_example = panx_it['train'][17]
pd.DataFrame([it_example['tokens'],it_example['ner_tags_str']],
             ['Tokens','Tags'])

Unnamed: 0,0,1,2,3,4,5,6
Tokens,È,inoltre,presente,Arabia,Saudita,sud-occidentale,.
Tags,O,O,O,B-LOC,I-LOC,O,O


In [17]:
# Check the classes distribution in the data in English dataset
from collections import Counter

split2freqs = defaultdict(Counter)
for split, dataset in panx_en.items():
  for row in dataset["ner_tags_str"]:
    for tag in row:
      if tag.startswith("B"):
        tag_type = tag.split("-")[1]
        split2freqs[split][tag_type] += 1
pd.DataFrame.from_dict(split2freqs, orient="index")

Unnamed: 0,ORG,PER,LOC
train,4681,4625,4592
validation,2340,2284,2378
test,2341,2350,2385


In [18]:
# Check the classes distribution in the data in German dataset
from collections import Counter

split2freqs = defaultdict(Counter)
for split, dataset in panx_de.items():
  for row in dataset["ner_tags_str"]:
    for tag in row:
      if tag.startswith("B"):
        tag_type = tag.split("-")[1]
        split2freqs[split][tag_type] += 1
pd.DataFrame.from_dict(split2freqs, orient="index")

Unnamed: 0,LOC,ORG,PER
train,474,416,505
validation,246,214,254
test,239,235,271


In [19]:
# Check the classes distribution in the data in French dataset
from collections import Counter

split2freqs = defaultdict(Counter)
for split, dataset in panx_fr.items():
  for row in dataset["ner_tags_str"]:
    for tag in row:
      if tag.startswith("B"):
        tag_type = tag.split("-")[1]
        split2freqs[split][tag_type] += 1
pd.DataFrame.from_dict(split2freqs, orient="index")

Unnamed: 0,LOC,ORG,PER
train,527,373,420
validation,248,176,223
test,271,206,214


In [20]:
# Check the classes distribution in the data in Spanish dataset
from collections import Counter

split2freqs = defaultdict(Counter)
for split, dataset in panx_es.items():
  for row in dataset["ner_tags_str"]:
    for tag in row:
      if tag.startswith("B"):
        tag_type = tag.split("-")[1]
        split2freqs[split][tag_type] += 1
pd.DataFrame.from_dict(split2freqs, orient="index")

Unnamed: 0,ORG,PER,LOC
train,375,400,458
validation,191,180,212
test,163,211,231


In [21]:
# Check the classes distribution in the data in Italian dataset
from collections import Counter

split2freqs = defaultdict(Counter)
for split, dataset in panx_it.items():
  for row in dataset["ner_tags_str"]:
    for tag in row:
      if tag.startswith("B"):
        tag_type = tag.split("-")[1]
        split2freqs[split][tag_type] += 1
pd.DataFrame.from_dict(split2freqs, orient="index")

Unnamed: 0,ORG,PER,LOC
train,405,492,473
validation,225,267,215
test,202,219,231


In [22]:
label_names = tags.names
label_names

['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC']

## Importing the tokenizers and Model classes from transformers library
### Making use of HuggingFace library's AutoTokenizer and AutoModel Class

In [23]:
# Importing the AutoTokenizer and AutoModelForTokenClassification class from transformers library
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification

checkpoint = 'xlm-roberta-base'
xlmr_tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [24]:
# Testing the tokenzier on a sample data
text = "Tim cook is the CEO of Apple Inc"
xlmr_tokens = xlmr_tokenizer(text,return_tensors='pt')
xlmr_tokens

{'input_ids': tensor([[     0,  13320, 110309,     83,     70,  45733,    111,   4129,   8942,
              2]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [25]:
# Testing the tokenizer on a random sentence in train split in english corpus
idx=2
t = xlmr_tokenizer(panx_main['en']['train'][idx]['tokens'],
                   is_split_into_words=True)
t

{'input_ids': [0, 9079, 7113, 202104, 11491, 6, 4, 9079, 7113, 15, 5106, 210298, 1104, 151210, 6, 4, 20271, 30839, 6, 167618, 5106, 1388, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [26]:
xlmr_tokenizer.decode(t['input_ids'])

"<s> Vaivara concentration camp, Vaivara ( ''1943–1944, during German occupation '' )</s>"

In [27]:
t.tokens()

['<s>',
 '▁Vai',
 'vara',
 '▁concentration',
 '▁camp',
 '▁',
 ',',
 '▁Vai',
 'vara',
 '▁(',
 "▁''",
 '1943',
 '–',
 '1944',
 '▁',
 ',',
 '▁during',
 '▁German',
 '▁',
 'occupation',
 "▁''",
 '▁)',
 '</s>']

In [28]:
t.word_ids()

[None, 0, 0, 1, 2, 3, 3, 4, 4, 5, 6, 6, 6, 6, 7, 7, 8, 9, 10, 10, 11, 12, None]

In [29]:
# ['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC'']

begin2inside = {
    1:2,
    3:4,
    5:6,
}

In [30]:
def align_targets(labels, word_ids):
  aligned_labels = []
  last_word = None
  for word in word_ids:
    if word is None:
      label = -100  # For [CLS] and [SEP] tokens
    elif word != last_word:
      label = labels[word] # For a new word assigning according to index in labels
    else:
      label = labels[word] # Even for the repeated word id the label should be same as before

      # Change the B-Tag to I-Tag after the above step
      if label in begin2inside:
        label = begin2inside[label]

    #add the label
    aligned_labels.append(label)

    #update last word
    last_word = word

  return aligned_labels

In [31]:
# try our function
labels = panx_main['en']['train'][idx]['ner_tags']
word_ids = t.word_ids()
aligned_targets = align_targets(labels,word_ids)

aligned_targets

[-100, 5, 6, 6, 6, 0, 0, 5, 6, 0, 0, 0, 0, 0, 0, 0, 0, 3, 4, 4, 0, 0, -100]

In [32]:
aligned_labels = [label_names[t] if t >= 0 else None for t in aligned_targets]
for x,y in zip(t.tokens(),aligned_labels):
  print(f'{x}\t{y}')

<s>	None
▁Vai	B-LOC
vara	I-LOC
▁concentration	I-LOC
▁camp	I-LOC
▁	O
,	O
▁Vai	B-LOC
vara	I-LOC
▁(	O
▁''	O
1943	O
–	O
1944	O
▁	O
,	O
▁during	O
▁German	B-ORG
▁	I-ORG
occupation	I-ORG
▁''	O
▁)	O
</s>	None


In [33]:
# tokenize both inputs and targets
# Creating a function to tokenize

def tokenize_fn(batch):
  # tokenize the input sequence first
  # this populates input_ids, attention_mask etc,
  tokenized_inputs = xlmr_tokenizer(batch['tokens'], truncation =True,
                               is_split_into_words = True)

  labels_batch = batch['ner_tags'] # Original targets
  aligned_labels_batch = []
  for i, labels in enumerate(labels_batch):
    word_ids = tokenized_inputs.word_ids(i)
    aligned_labels_batch.append(align_targets(labels, word_ids))

   # recall the 'target' must be stored in the key called 'labels'
  tokenized_inputs['labels'] = aligned_labels_batch

  return tokenized_inputs

In [34]:
panx_en['train'].column_names

['tokens', 'ner_tags', 'langs', 'ner_tags_str']

In [35]:
tokenized_panx_en = panx_en.map(tokenize_fn,
                                batched=True,
                                remove_columns = panx_en['train'].column_names)

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

In [36]:
tokenized_panx_en

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 10000
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 5000
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 5000
    })
})

In [37]:
tokenized_panx_de = panx_de.map(tokenize_fn,
                                batched=True,
                                remove_columns = panx_de['train'].column_names)

In [38]:
tokenized_panx_fr = panx_fr.map(tokenize_fn,
                                batched=True,
                                remove_columns = panx_fr['train'].column_names)

In [39]:
tokenized_panx_es = panx_es.map(tokenize_fn,
                                batched=True,
                                remove_columns = panx_es['train'].column_names)

In [40]:
tokenized_panx_it = panx_it.map(tokenize_fn,
                                batched=True,
                                remove_columns = panx_it['train'].column_names)

In [41]:
from transformers import DataCollatorForTokenClassification
data_collator = DataCollatorForTokenClassification(tokenizer=xlmr_tokenizer)

In [42]:
#example
batch = data_collator([tokenized_panx_en['train'][i] for i in range(2)])
batch['labels']


You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


tensor([[-100,    0,    0,    3,    4,    4,    0,    0, -100],
        [-100,    3,    4,    0,    1,    2, -100, -100, -100]])

In [43]:
from datasets import load_metric
metric = load_metric('seqeval')

  metric = load_metric('seqeval')


In [44]:
# testing the "seqeval" library
metric.compute(
    predictions=[['O','O','I-ORG','B-PER']],
    references =[['O','B-ORG','I-ORG','B-PER']]
)

{'ORG': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 1},
 'PER': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 1},
 'overall_precision': 0.5,
 'overall_recall': 0.5,
 'overall_f1': 0.5,
 'overall_accuracy': 0.75}

In [45]:
# Creating a function to compute metrics
import numpy as np

def compute_metrics(logits_and_labels):
  logits,labels = logits_and_labels
  preds = np.argmax(logits, axis=-1)

  # remove -100 from labels and predictions
  # and convert the label_ids to label_names

  str_labels = [
      [label_names[t] for t in label if t != -100] for label in labels
  ]

  # do the same for predictions whenever true label is -100
  str_preds = [
      [label_names[p] for p,t in zip(pred,targ) if t!= -100 ] for pred, targ in zip(preds,labels)
  ]

  the_metrics = metric.compute(predictions=str_preds, references=str_labels)
  return {
      'precision': the_metrics['overall_precision'],
      'recall': the_metrics['overall_recall'],
      'f1' : the_metrics['overall_f1'],
      'accuracy': the_metrics['overall_accuracy']
  }


In [46]:
id2label = {k:v for k,v in enumerate(label_names)}
label2id = {v:k for k,v in id2label.items()}

In [47]:
id2label

{0: 'O',
 1: 'B-PER',
 2: 'I-PER',
 3: 'B-ORG',
 4: 'I-ORG',
 5: 'B-LOC',
 6: 'I-LOC'}

In [48]:
label2id

{'O': 0,
 'B-PER': 1,
 'I-PER': 2,
 'B-ORG': 3,
 'I-ORG': 4,
 'B-LOC': 5,
 'I-LOC': 6}

## Experiment-1 : Finetune on English and check zeroshot capabilities in other languages

In [49]:
# Import the XLM-RoBERTa model from the checkpoint

xlmr_model = AutoModelForTokenClassification.from_pretrained(
    checkpoint,
    id2label=id2label,
    label2id=label2id,
)

Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [50]:
from transformers import TrainingArguments

num_epochs = 3
batch_size=16
model_name = f"english-finetuned-experiment-1"
training_args = TrainingArguments(output_dir=model_name,
                                  log_level="error",
                                  num_train_epochs=num_epochs,
                                  per_device_train_batch_size=batch_size,
                                  per_device_eval_batch_size=batch_size,
                                  evaluation_strategy="epoch",
                                  save_steps=1e6, weight_decay=0.01,
                                  disable_tqdm=False,
                                  )

In [51]:
from transformers import Trainer

exp1_trainer = Trainer(
    model=xlmr_model,
    args=training_args,
    train_dataset = tokenized_panx_en['train'],
    eval_dataset = tokenized_panx_en['validation'],
    data_collator = data_collator,
    compute_metrics = compute_metrics,
    tokenizer = xlmr_tokenizer,
)

In [52]:
exp1_trainer.train()

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.5345,0.357929,0.691973,0.749786,0.71972,0.888702
2,0.32,0.307801,0.738352,0.783062,0.76005,0.901847
3,0.2438,0.317112,0.763855,0.7992,0.781128,0.909303


TrainOutput(global_step=1875, training_loss=0.3308401611328125, metrics={'train_runtime': 180.3809, 'train_samples_per_second': 166.315, 'train_steps_per_second': 10.395, 'total_flos': 516819353286720.0, 'train_loss': 0.3308401611328125, 'epoch': 3.0})

In [53]:
en_test_results = exp1_trainer.evaluate(tokenized_panx_en['test'])

In [54]:
en_test_results

{'eval_loss': 0.293182373046875,
 'eval_precision': 0.7723764387271497,
 'eval_recall': 0.8061051441492368,
 'eval_f1': 0.7888804370375494,
 'eval_accuracy': 0.9159936449093076,
 'eval_runtime': 9.3695,
 'eval_samples_per_second': 533.648,
 'eval_steps_per_second': 33.406,
 'epoch': 3.0}

In [55]:
de_test_results = exp1_trainer.evaluate(tokenized_panx_de['test'])
fr_test_results = exp1_trainer.evaluate(tokenized_panx_fr['test'])
es_test_results = exp1_trainer.evaluate(tokenized_panx_es['test'])
it_test_results = exp1_trainer.evaluate(tokenized_panx_it['test'])

In [56]:
de_test_results

{'eval_loss': 0.40376782417297363,
 'eval_precision': 0.6847826086956522,
 'eval_recall': 0.7610738255033557,
 'eval_f1': 0.7209154481881754,
 'eval_accuracy': 0.8984611656367382,
 'eval_runtime': 0.8777,
 'eval_samples_per_second': 569.692,
 'eval_steps_per_second': 36.46,
 'epoch': 3.0}

In [57]:
fr_test_results

{'eval_loss': 0.5701165795326233,
 'eval_precision': 0.7238095238095238,
 'eval_recall': 0.7698986975397974,
 'eval_f1': 0.7461430575035063,
 'eval_accuracy': 0.8402668070914516,
 'eval_runtime': 0.778,
 'eval_samples_per_second': 642.68,
 'eval_steps_per_second': 41.132,
 'epoch': 3.0}

In [58]:
es_test_results

{'eval_loss': 0.577140748500824,
 'eval_precision': 0.655266757865937,
 'eval_recall': 0.7917355371900826,
 'eval_f1': 0.7170658682634731,
 'eval_accuracy': 0.8430423843622992,
 'eval_runtime': 0.7568,
 'eval_samples_per_second': 660.64,
 'eval_steps_per_second': 42.281,
 'epoch': 3.0}

In [59]:
it_test_results

{'eval_loss': 0.4473014175891876,
 'eval_precision': 0.7142857142857143,
 'eval_recall': 0.75920245398773,
 'eval_f1': 0.7360594795539035,
 'eval_accuracy': 0.8824302134646962,
 'eval_runtime': 0.8035,
 'eval_samples_per_second': 622.295,
 'eval_steps_per_second': 39.827,
 'epoch': 3.0}

In [60]:
exp_1_results = pd.DataFrame([en_test_results,
                              de_test_results,
                              fr_test_results,
                              es_test_results,
                              it_test_results],
                             index=['English',
                                    'German',
                                    'French',
                                    'Spanish',
                                    'Italian'])

exp_1_results

Unnamed: 0,eval_loss,eval_precision,eval_recall,eval_f1,eval_accuracy,eval_runtime,eval_samples_per_second,eval_steps_per_second,epoch
English,0.293182,0.772376,0.806105,0.78888,0.915994,9.3695,533.648,33.406,3.0
German,0.403768,0.684783,0.761074,0.720915,0.898461,0.8777,569.692,36.46,3.0
French,0.570117,0.72381,0.769899,0.746143,0.840267,0.778,642.68,41.132,3.0
Spanish,0.577141,0.655267,0.791736,0.717066,0.843042,0.7568,660.64,42.281,3.0
Italian,0.447301,0.714286,0.759202,0.736059,0.88243,0.8035,622.295,39.827,3.0


In [61]:
exp_1_results = exp_1_results.drop(columns=['eval_loss',
                                            'eval_runtime',
                                            'eval_samples_per_second',
                                            'eval_steps_per_second',
                                            'epoch'])

exp_1_results.T

Unnamed: 0,English,German,French,Spanish,Italian
eval_precision,0.772376,0.684783,0.72381,0.655267,0.714286
eval_recall,0.806105,0.761074,0.769899,0.791736,0.759202
eval_f1,0.78888,0.720915,0.746143,0.717066,0.736059
eval_accuracy,0.915994,0.898461,0.840267,0.843042,0.88243


## Experiment-2 : Finetune Monolingual models one for each Language

### Finetune and Evaluate on German Langauge

In [62]:
num_epochs = 3
batch_size=16
model_name = f"de-monolingual-finetuned-experiment-2"
training_args = TrainingArguments(output_dir=model_name,
                                  log_level="error",
                                  num_train_epochs=num_epochs,
                                  per_device_train_batch_size=batch_size,
                                  per_device_eval_batch_size=batch_size,
                                  evaluation_strategy="epoch",
                                  save_steps=1e6, weight_decay=0.01,
                                  disable_tqdm=False,
                                  )

de_exp2_trainer = Trainer(
    model=xlmr_model,
    args=training_args,
    train_dataset = tokenized_panx_de['train'],
    eval_dataset = tokenized_panx_de['validation'],
    data_collator = data_collator,
    compute_metrics = compute_metrics,
    tokenizer = xlmr_tokenizer,
)


In [63]:
de_exp2_trainer.train()

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.248441,0.699874,0.777311,0.736563,0.915927
2,No log,0.259404,0.750327,0.803922,0.7762,0.926083
3,No log,0.284446,0.745124,0.802521,0.772758,0.926083


TrainOutput(global_step=189, training_loss=0.16807116150225282, metrics={'train_runtime': 18.5493, 'train_samples_per_second': 161.731, 'train_steps_per_second': 10.189, 'total_flos': 64800442060464.0, 'train_loss': 0.16807116150225282, 'epoch': 3.0})

In [64]:
de_exp2_test_results = de_exp2_trainer.evaluate(tokenized_panx_de['test'])

In [65]:
de_exp2_test_results

{'eval_loss': 0.26773253083229065,
 'eval_precision': 0.7868421052631579,
 'eval_recall': 0.8026845637583893,
 'eval_f1': 0.7946843853820598,
 'eval_accuracy': 0.9355385920271416,
 'eval_runtime': 0.7782,
 'eval_samples_per_second': 642.512,
 'eval_steps_per_second': 41.121,
 'epoch': 3.0}

### Finetune and Evaluate on French Langauge

In [66]:
num_epochs = 3
batch_size=16
model_name = f"fr-monolingual-finetuned-experiment-2"
training_args = TrainingArguments(output_dir=model_name,
                                  log_level="error",
                                  num_train_epochs=num_epochs,
                                  per_device_train_batch_size=batch_size,
                                  per_device_eval_batch_size=batch_size,
                                  evaluation_strategy="epoch",
                                  save_steps=1e6, weight_decay=0.01,
                                  disable_tqdm=False,
                                  )

fr_exp2_trainer = Trainer(
    model=xlmr_model,
    args=training_args,
    train_dataset = tokenized_panx_fr['train'],
    eval_dataset = tokenized_panx_fr['validation'],
    data_collator = data_collator,
    compute_metrics = compute_metrics,
    tokenizer = xlmr_tokenizer,
)


In [67]:
fr_exp2_trainer.train()

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.292787,0.771223,0.828439,0.798808,0.916681
2,No log,0.278905,0.756757,0.822257,0.788148,0.91826
3,No log,0.306323,0.79646,0.834621,0.815094,0.918085


TrainOutput(global_step=189, training_loss=0.23975509562820355, metrics={'train_runtime': 17.7695, 'train_samples_per_second': 168.829, 'train_steps_per_second': 10.636, 'total_flos': 47280519126720.0, 'train_loss': 0.23975509562820355, 'epoch': 3.0})

In [68]:
fr_exp2_test_results = fr_exp2_trainer.evaluate(tokenized_panx_fr['test'])

In [69]:
fr_exp2_test_results

{'eval_loss': 0.37674480676651,
 'eval_precision': 0.8286516853932584,
 'eval_recall': 0.85383502170767,
 'eval_f1': 0.8410548823948681,
 'eval_accuracy': 0.9139898192030893,
 'eval_runtime': 1.0972,
 'eval_samples_per_second': 455.696,
 'eval_steps_per_second': 29.165,
 'epoch': 3.0}

### Finetune and Evaluate on Spanish Langauge

In [70]:
num_epochs = 3
batch_size=16
model_name = f"es-monolingual-finetuned-experiment-2"
training_args = TrainingArguments(output_dir=model_name,
                                  log_level="error",
                                  num_train_epochs=num_epochs,
                                  per_device_train_batch_size=batch_size,
                                  per_device_eval_batch_size=batch_size,
                                  evaluation_strategy="epoch",
                                  save_steps=1e6, weight_decay=0.01,
                                  disable_tqdm=False,
                                  )

es_exp2_trainer = Trainer(
    model=xlmr_model,
    args=training_args,
    train_dataset = tokenized_panx_es['train'],
    eval_dataset = tokenized_panx_es['validation'],
    data_collator = data_collator,
    compute_metrics = compute_metrics,
    tokenizer = xlmr_tokenizer,
)


In [71]:
es_exp2_trainer.train()

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.287004,0.829716,0.852487,0.840948,0.91665
2,No log,0.350603,0.788618,0.831904,0.809683,0.910963
3,No log,0.34892,0.818482,0.850772,0.834315,0.918415


TrainOutput(global_step=189, training_loss=0.18482749171988674, metrics={'train_runtime': 17.327, 'train_samples_per_second': 173.14, 'train_steps_per_second': 10.908, 'total_flos': 39535169836272.0, 'train_loss': 0.18482749171988674, 'epoch': 3.0})

In [72]:
es_exp2_test_results = es_exp2_trainer.evaluate(tokenized_panx_es['test'])

In [73]:
es_exp2_test_results

{'eval_loss': 0.3347281515598297,
 'eval_precision': 0.8161648177496038,
 'eval_recall': 0.8512396694214877,
 'eval_f1': 0.8333333333333334,
 'eval_accuracy': 0.9173601703115928,
 'eval_runtime': 1.0669,
 'eval_samples_per_second': 468.649,
 'eval_steps_per_second': 29.994,
 'epoch': 3.0}

### Finetune and Evaluate on Italian Langauge

In [74]:
from transformers import TrainingArguments, Trainer
num_epochs = 3
batch_size=16
model_name = f"it-monolingual-finetuned-experiment-2"
training_args = TrainingArguments(output_dir=model_name,
                                  log_level="error",
                                  num_train_epochs=num_epochs,
                                  per_device_train_batch_size=batch_size,
                                  per_device_eval_batch_size=batch_size,
                                  evaluation_strategy="epoch",
                                  save_steps=1e6, weight_decay=0.01,
                                  disable_tqdm=False,
                                  )

it_exp2_trainer = Trainer(
    model=xlmr_model,
    args=training_args,
    train_dataset = tokenized_panx_it['train'],
    eval_dataset = tokenized_panx_it['validation'],
    data_collator = data_collator,
    compute_metrics = compute_metrics,
    tokenizer = xlmr_tokenizer,
)


In [75]:
it_exp2_trainer.train()

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.25572,0.796496,0.835926,0.815735,0.930416
2,No log,0.290514,0.802153,0.842999,0.822069,0.92648
3,No log,0.304713,0.823609,0.858557,0.84072,0.93073


TrainOutput(global_step=189, training_loss=0.18148410131060888, metrics={'train_runtime': 18.1418, 'train_samples_per_second': 165.364, 'train_steps_per_second': 10.418, 'total_flos': 51886082647872.0, 'train_loss': 0.18148410131060888, 'epoch': 3.0})

In [76]:
it_exp2_test_results = it_exp2_trainer.evaluate(tokenized_panx_it['test'])

In [77]:
it_exp2_test_results

{'eval_loss': 0.28023815155029297,
 'eval_precision': 0.8064992614475628,
 'eval_recall': 0.8374233128834356,
 'eval_f1': 0.8216704288939052,
 'eval_accuracy': 0.9343185550082101,
 'eval_runtime': 0.6937,
 'eval_samples_per_second': 720.725,
 'eval_steps_per_second': 46.126,
 'epoch': 3.0}

In [78]:
exp_2_results = pd.DataFrame([en_test_results,
                              de_exp2_test_results,
                              fr_exp2_test_results,
                              es_exp2_test_results,
                              it_exp2_test_results],
                             index=['English',
                                    'German',
                                    'French',
                                    'Spanish',
                                    'Italian'])

exp_2_results = exp_2_results.drop(columns=['eval_loss',
                                            'eval_runtime',
                                            'eval_samples_per_second',
                                            'eval_steps_per_second',
                                            'epoch'])

exp_2_results.T

Unnamed: 0,English,German,French,Spanish,Italian
eval_precision,0.772376,0.786842,0.828652,0.816165,0.806499
eval_recall,0.806105,0.802685,0.853835,0.85124,0.837423
eval_f1,0.78888,0.794684,0.841055,0.833333,0.82167
eval_accuracy,0.915994,0.935539,0.91399,0.91736,0.934319


## Experiment 3

In [50]:
# concatenate all the curated datasets of high and low resource languages

from datasets import concatenate_datasets

def concatenate_splits(corpora):
  multi_corpus = DatasetDict()
  for split in corpora[0].keys():
    multi_corpus[split] = concatenate_datasets([corpus[split] for corpus in corpora]).shuffle(seed=42)
  return multi_corpus


In [51]:
panx_en_de_fr_es_it = concatenate_splits([tokenized_panx_en,
                                          tokenized_panx_de,
                                          tokenized_panx_fr,
                                          tokenized_panx_es,
                                          tokenized_panx_it])

In [52]:
panx_en_de_fr_es_it

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 14000
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 7000
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 7000
    })
})

In [53]:
from transformers import Trainer,TrainingArguments

In [54]:
num_epochs = 3
batch_size=16
model_name = f"multilingual-finetuned-experiment-3"
training_args = TrainingArguments(output_dir=model_name,
                                  log_level="error",
                                  num_train_epochs=num_epochs,
                                  per_device_train_batch_size=batch_size,
                                  per_device_eval_batch_size=batch_size,
                                  evaluation_strategy="epoch",
                                  save_steps=1e6, weight_decay=0.01,
                                  disable_tqdm=False,
                                  )

multilingual_exp3_trainer = Trainer(
    model=xlmr_model,
    args=training_args,
    train_dataset = panx_en_de_fr_es_it['train'],
    eval_dataset = panx_en_de_fr_es_it['validation'],
    data_collator = data_collator,
    compute_metrics = compute_metrics,
    tokenizer = xlmr_tokenizer,
)


In [55]:
multilingual_exp3_trainer.train()

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.5088,0.310563,0.739903,0.772402,0.755803,0.901529
2,0.2557,0.295478,0.773589,0.807728,0.79029,0.91172
3,0.1707,0.301301,0.787503,0.814669,0.800855,0.917152


TrainOutput(global_step=2625, training_loss=0.28948103623163135, metrics={'train_runtime': 269.7846, 'train_samples_per_second': 155.68, 'train_steps_per_second': 9.73, 'total_flos': 727409561880672.0, 'train_loss': 0.28948103623163135, 'epoch': 3.0})

In [56]:
en_multilingual_test_results = multilingual_exp3_trainer.evaluate(tokenized_panx_en['test'])
de_multilingual_test_results = multilingual_exp3_trainer.evaluate(tokenized_panx_de['test'])
fr_multilingual_test_results = multilingual_exp3_trainer.evaluate(tokenized_panx_fr['test'])
es_multilingual_test_results = multilingual_exp3_trainer.evaluate(tokenized_panx_es['test'])
it_multilingual_test_results = multilingual_exp3_trainer.evaluate(tokenized_panx_it['test'])

In [57]:
multilingual_test_results = pd.DataFrame([en_multilingual_test_results,
                                          de_multilingual_test_results,
                                          fr_multilingual_test_results,
                                          es_multilingual_test_results,
                                          it_multilingual_test_results],
                index=['English','German','French','Spanish','Italian'])

multilingual_test_results = multilingual_test_results.drop(columns=['eval_loss',
                                                                    'eval_runtime',
                                                                    'eval_samples_per_second',
                                                                    'eval_steps_per_second',
                                                                    'epoch'])

multilingual_test_results.T

Unnamed: 0,English,German,French,Spanish,Italian
eval_precision,0.787257,0.805812,0.817923,0.849515,0.835866
eval_recall,0.813737,0.818792,0.832127,0.867769,0.843558
eval_f1,0.800278,0.81225,0.824964,0.858545,0.839695
eval_accuracy,0.917947,0.941233,0.911006,0.924521,0.930378
