# XLM-RoBERTa and XLM-V finetuning using Massive Dataset

Please note that this notebook has been prepared to work on Google Colab. If you want to run it locally, you will need to make some changes.

## Basic preparation

In [None]:
from google.colab import drive

# Mount the user's Google Drive into the Colab filesystem.
drive.mount('/content/drive')
# Base directory on the mounted Drive; the notebook prefixes its data,
# checkpoint (.out) and log (.log) paths with it.
PATH_PREFIX = '/content/drive/My Drive/nlp/'

In [None]:
!git clone https://github.com/Tsilkow/NLP-group-project.git
!pip install -r NLP-group-project/requirements.txt
!copy NLP-group-project/src/* .

In [None]:
# Run the project's dataset setup script from the working directory
# (presumably it prepares the data files read below -- confirm in the script).
!sh setup_dataset.sh

In [None]:
import json
import torch
from torch.utils.data import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
import evaluate
import numpy as np
from datasets import MassiveDatasetXLMR, MassiveDatasetXLMV

In [None]:
def compute_metrics(eval_pred):
  """Compute the accuracy metric for one evaluation pass.

  Args:
    eval_pred: A pair that unpacks into ``(logits, labels)``. Both arrays are
      reduced with ``argmax`` along axis 1, so the labels are presumably
      one-hot encoded -- TODO confirm against the dataset classes.

  Returns:
    The result of the 'accuracy' metric's ``compute`` call.
  """
  accuracy = evaluate.load('accuracy')
  raw_logits, raw_labels = eval_pred

  # Turn per-class scores into class indices for both sides of the comparison.
  predicted_ids = np.argmax(raw_logits, axis=1)
  reference_ids = np.argmax(raw_labels, axis=1)

  return accuracy.compute(predictions=predicted_ids, references=reference_ids)

def new_train_environment(train_dataset, test_dataset, train_args, model_name):
  """Create a fresh classification model and a Trainer wired to it.

  Args:
    train_dataset: Dataset used for training.
    test_dataset: Dataset handed to the Trainer as its evaluation set.
    train_args: Training arguments passed to the Trainer as ``args``.
    model_name: Name of the pretrained checkpoint to load.

  Returns:
    A ``(model, trainer)`` tuple.
  """
  # The classification head size comes from the notebook-level NUM_LABELS.
  classifier = AutoModelForSequenceClassification.from_pretrained(
      model_name, num_labels=NUM_LABELS)

  hf_trainer = Trainer(
      model=classifier,
      args=train_args,
      train_dataset=train_dataset,
      eval_dataset=test_dataset,
      compute_metrics=compute_metrics,
  )
  return classifier, hf_trainer

## Dataset preparation

In [None]:
# Read the label definitions from the data directory on Drive.
labels_path = PATH_PREFIX + 'data/labels.json'
with open(labels_path, 'r') as labels_file:
    labels = json.load(labels_file)

In [None]:
# Number of target classes; used as `num_labels` when the models are created.
NUM_LABELS = len(labels)

## XLM-RoBERTa-base finetuning
This section contains all the code used for finetuning the `xlm-roberta-base` model.
We performed some experiments with different learning rates and found that the best results are obtained with a learning rate of $10^{-5}$.



### Preparing the environment

In [None]:
# XLMRobertaTokenizer is not part of the notebook's import cell, so it is
# imported here to keep this cell runnable on its own.
from transformers import XLMRobertaTokenizer

# Tokenizer matching the `xlm-roberta-base` checkpoint fine-tuned below.
roberta_tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-base', do_lower_case=True)

In [None]:
# Polish-only splits for XLM-RoBERTa.
train_dataset_r_pl = MassiveDatasetXLMR(PATH_PREFIX + 'data/pl-PL/train.json', roberta_tokenizer)
test_dataset_r_pl = MassiveDatasetXLMR(PATH_PREFIX + 'data/pl-PL/test.json', roberta_tokenizer)
val_dataset_r_pl = MassiveDatasetXLMR(PATH_PREFIX + 'data/pl-PL/val.json', roberta_tokenizer)

# Combined-language splits for XLM-RoBERTa. They use the same dataset class
# and tokenizer as the Polish splits: the only dataset classes imported are
# MassiveDatasetXLMR / MassiveDatasetXLMV, and the only XLM-RoBERTa tokenizer
# defined in the notebook is `roberta_tokenizer`.
train_dataset_r_all = MassiveDatasetXLMR(PATH_PREFIX + 'data/combined/train.json', roberta_tokenizer)
test_dataset_r_all = MassiveDatasetXLMR(PATH_PREFIX + 'data/combined/test.json', roberta_tokenizer)
val_dataset_r_all = MassiveDatasetXLMR(PATH_PREFIX + 'data/combined/val.json', roberta_tokenizer)

### Finetuning - Polish language

In [None]:
# XLM-RoBERTa, Polish data, learning rate 5e-5. Logging, evaluation and
# checkpointing all happen once per epoch.
train_args_lr5e5 = TrainingArguments(
    learning_rate=5e-5,
    num_train_epochs=10,
    output_dir=PATH_PREFIX + ".out/xlm-roberta-lr5e5/",
    logging_dir=PATH_PREFIX + ".log/xlm-roberta-lr5e5/",
    logging_strategy='epoch',
    evaluation_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,
)

# The validation split serves as the trainer's evaluation set.
model_lr5e5, trainer_lr5e5 = new_train_environment(
    train_dataset=train_dataset_r_pl,
    test_dataset=val_dataset_r_pl,
    train_args=train_args_lr5e5,
    model_name='xlm-roberta-base',
)

In [None]:
# Fine-tune XLM-RoBERTa on the Polish training split (learning rate 5e-5).
trainer_lr5e5.train()

In [None]:
# Evaluate the lr=5e-5 run on the Polish test split.
trainer_lr5e5.evaluate(test_dataset_r_pl)

In [None]:
# XLM-RoBERTa, Polish data, learning rate 1e-4. Logging, evaluation and
# checkpointing all happen once per epoch.
train_args_lr1e4 = TrainingArguments(
    learning_rate=1e-4,
    num_train_epochs=10,
    output_dir=PATH_PREFIX + ".out/xlm-roberta-lr1e4/",
    logging_dir=PATH_PREFIX + ".log/xlm-roberta-lr1e4/",
    logging_strategy='epoch',
    evaluation_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,
)

# The validation split serves as the trainer's evaluation set.
model_lr1e4, trainer_lr1e4 = new_train_environment(
    train_dataset=train_dataset_r_pl,
    test_dataset=val_dataset_r_pl,
    train_args=train_args_lr1e4,
    model_name='xlm-roberta-base',
)

In [None]:
# Fine-tune XLM-RoBERTa on the Polish training split (learning rate 1e-4).
trainer_lr1e4.train()

In [None]:
# Evaluate the lr=1e-4 run on the Polish test split.
trainer_lr1e4.evaluate(test_dataset_r_pl)

In [None]:
# XLM-RoBERTa, Polish data, learning rate 1e-5. Logging, evaluation and
# checkpointing all happen once per epoch.
train_args_lr1e5 = TrainingArguments(
    learning_rate=1e-5,
    num_train_epochs=10,
    output_dir=PATH_PREFIX + ".out/xlm-roberta-lr1e5/",
    logging_dir=PATH_PREFIX + ".log/xlm-roberta-lr1e5/",
    logging_strategy='epoch',
    evaluation_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,
)

# The validation split serves as the trainer's evaluation set.
model_lr1e5, trainer_lr1e5 = new_train_environment(
    train_dataset=train_dataset_r_pl,
    test_dataset=val_dataset_r_pl,
    train_args=train_args_lr1e5,
    model_name='xlm-roberta-base',
)

In [None]:
# Fine-tune XLM-RoBERTa on the Polish training split (learning rate 1e-5).
trainer_lr1e5.train()

In [None]:
# Evaluate the lr=1e-5 run on the Polish test split.
trainer_lr1e5.evaluate(test_dataset_r_pl)

### Finetuning - combined languages

In [None]:
# XLM-RoBERTa, combined-language data, learning rate 5e-5. Logging,
# evaluation and checkpointing all happen once per epoch.
train_args_lr5e5_all = TrainingArguments(
    learning_rate=5e-5,
    num_train_epochs=10,
    output_dir=PATH_PREFIX + ".out/xlm-roberta-lr5e5-combined/",
    logging_dir=PATH_PREFIX + ".log/xlm-roberta-lr5e5-combined/",
    logging_strategy='epoch',
    evaluation_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,
)

# The combined validation split serves as the trainer's evaluation set.
model_lr5e5_all, trainer_lr5e5_all = new_train_environment(
    train_dataset=train_dataset_r_all,
    test_dataset=val_dataset_r_all,
    train_args=train_args_lr5e5_all,
    model_name='xlm-roberta-base',
)

In [None]:
# Fine-tune XLM-RoBERTa on the combined-language training split.
trainer_lr5e5_all.train()

In [None]:
# Evaluate the combined-language run on the combined test split.
trainer_lr5e5_all.evaluate(test_dataset_r_all)

## XLM-V-base finetuning

This section contains all the code used for finetuning the `xlm-v-base` model. As this model is bigger than `xlm-roberta-base`, we performed fewer tests than previously due to limited computational resources.

### Preparing the environment

In [None]:
# Tokenizer matching the `facebook/xlm-v-base` checkpoint fine-tuned below.
v_tokenizer = AutoTokenizer.from_pretrained('facebook/xlm-v-base', do_lower_case=True)

# Polish-only splits for XLM-V.
train_dataset_v_pl = MassiveDatasetXLMV(PATH_PREFIX + 'data/pl-PL/train.json', v_tokenizer)
test_dataset_v_pl = MassiveDatasetXLMV(PATH_PREFIX + 'data/pl-PL/test.json', v_tokenizer)
val_dataset_v_pl = MassiveDatasetXLMV(PATH_PREFIX + 'data/pl-PL/val.json', v_tokenizer)

# Combined-language splits for XLM-V. They must use `v_tokenizer` as well:
# no plain `tokenizer` variable exists in the notebook, and the XLM-V model
# needs inputs produced by its own tokenizer.
train_dataset_v_all = MassiveDatasetXLMV(PATH_PREFIX + 'data/combined/train.json', v_tokenizer)
test_dataset_v_all = MassiveDatasetXLMV(PATH_PREFIX + 'data/combined/test.json', v_tokenizer)
val_dataset_v_all = MassiveDatasetXLMV(PATH_PREFIX + 'data/combined/val.json', v_tokenizer)

### Finetuning - Polish language

In [None]:
# XLM-V, Polish data, learning rate 5e-5, batch size 10 per device for both
# training and evaluation. Logging, evaluation and checkpointing all happen
# once per epoch.
train_args_v_lr5e5 = TrainingArguments(
    learning_rate=5e-5,
    num_train_epochs=10,
    per_device_train_batch_size=10,
    per_device_eval_batch_size=10,
    output_dir=PATH_PREFIX + ".out/xlm-v-lr5e5/",
    logging_dir=PATH_PREFIX + ".log/xlm-v-lr5e5/",
    logging_strategy='epoch',
    evaluation_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,
)

# The validation split serves as the trainer's evaluation set.
model_v_lr5e5, trainer_v_lr5e5 = new_train_environment(
    train_dataset=train_dataset_v_pl,
    test_dataset=val_dataset_v_pl,
    train_args=train_args_v_lr5e5,
    model_name='facebook/xlm-v-base',
)

In [None]:
# Fine-tune XLM-V on the Polish training split (learning rate 5e-5).
trainer_v_lr5e5.train()

In [None]:
# Evaluate the XLM-V lr=5e-5 run on the Polish test split.
trainer_v_lr5e5.evaluate(test_dataset_v_pl)

In [None]:
# NOTE(review): this call uses the same trainer and the same Polish test
# split as the preceding evaluation cell -- confirm the repeat is intended.
trainer_v_lr5e5.evaluate(test_dataset_v_pl)

In [None]:
# XLM-V, Polish data, learning rate 1e-4, batch size 10 per device for both
# training and evaluation. Logging, evaluation and checkpointing all happen
# once per epoch.
train_args_v_lr1e4 = TrainingArguments(
    learning_rate=1e-4,
    num_train_epochs=10,
    per_device_train_batch_size=10,
    per_device_eval_batch_size=10,
    output_dir=PATH_PREFIX + ".out/xlm-v-lr1e4/",
    logging_dir=PATH_PREFIX + ".log/xlm-v-lr1e4/",
    logging_strategy='epoch',
    evaluation_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,
)

# The validation split serves as the trainer's evaluation set.
model_v_lr1e4, trainer_v_lr1e4 = new_train_environment(
    train_dataset=train_dataset_v_pl,
    test_dataset=val_dataset_v_pl,
    train_args=train_args_v_lr1e4,
    model_name='facebook/xlm-v-base',
)

In [None]:
# Fine-tune XLM-V on the Polish training split (learning rate 1e-4).
trainer_v_lr1e4.train()

In [None]:
# Evaluate the XLM-V lr=1e-4 run on the Polish test split.
trainer_v_lr1e4.evaluate(test_dataset_v_pl)

### Finetuning - combined languages

In [None]:
# XLM-V, combined-language data, learning rate 5e-5, batch size 10 per device
# for both training and evaluation. Logging, evaluation and checkpointing all
# happen once per epoch.
train_args_v_lr5e5_all = TrainingArguments(
    learning_rate=5e-5,
    num_train_epochs=10,
    per_device_train_batch_size=10,
    per_device_eval_batch_size=10,
    output_dir=PATH_PREFIX + ".out/xlm-v-lr5e5-combined/",
    logging_dir=PATH_PREFIX + ".log/xlm-v-lr5e5-combined/",
    logging_strategy='epoch',
    evaluation_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,
)

# The combined validation split serves as the trainer's evaluation set.
model_v_lr5e5_all, trainer_v_lr5e5_all = new_train_environment(
    train_dataset=train_dataset_v_all,
    test_dataset=val_dataset_v_all,
    train_args=train_args_v_lr5e5_all,
    model_name='facebook/xlm-v-base',
)

In [None]:
# Fine-tune XLM-V on the combined-language training split.
trainer_v_lr5e5_all.train()

In [None]:
# Evaluate the combined-language XLM-V run on the combined test split.
# The combined-run trainer must be used here: `trainer_v_lr5e5` wraps the
# model trained on Polish data only and would report the wrong model's score.
trainer_v_lr5e5_all.evaluate(test_dataset_v_all)