# XLM-RoBERTa fine-tuning on the MASSIVE dataset

## Basic preparation

In [1]:
from google.colab import drive

# Root of the project checkout on Google Drive; the Python cells below build their
# data, log and output paths from this prefix.
PATH_PREFIX = '/content/drive/My Drive/nlp/NLP-group-project'

# Expose Google Drive to the Colab VM under /content/drive.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [13]:
!cd /content/drive/My\ Drive/nlp/NLP-group-project && pip install -r requirements.txt
!pip install evaluate

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting evaluate
  Downloading evaluate-0.4.0-py3-none-any.whl (81 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.4/81.4 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets>=2.0.0 (from evaluate)
  Downloading datasets-2.13.0-py3-none-any.whl (485 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m485.6/485.6 kB[0m [31m16.2 MB/s[0m eta [36m0:00:00[0m
Collecting dill (from evaluate)
  Downloading dill-0.3.6-py3-none-any.whl (110 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.5/110.5 kB[0m [31m16.4 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from evaluate)
  Downloading xxhash-3.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
# Run the repository's dataset preparation script from the project directory.
# Per the recorded output it yields train/val/test JSON files for pl-PL, ru-RU and sl-SL
# plus a labels.json with 60 entries.
!cd /content/drive/My\ Drive/nlp/NLP-group-project && sh setup_dataset.sh

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
1.0/data/pl-PL.jsonl
1.0/data/ru-RU.jsonl
1.0/data/sl-SL.jsonl
Dataset files found: ['dataset/pl-PL.jsonl', 'dataset/ru-RU.jsonl', 'dataset/sl-SL.jsonl']
11514 entries in pl-PL/train.json
2033 entries in pl-PL/val.json
2974 entries in pl-PL/test.json
11514 entries in ru-RU/train.json
2033 entries in ru-RU/val.json
2974 entries in ru-RU/test.json
11514 entries in sl-SL/train.json
2033 entries in sl-SL/val.json
2974 entries in sl-SL/test.json
60 entries in labels.json


In [17]:
import json
import torch
from torch.utils.data import Dataset
from transformers import XLMRobertaTokenizer, XLMRobertaForSequenceClassification, Trainer, TrainingArguments, EarlyStoppingCallback
import evaluate
import numpy as np

In [58]:
class MassiveDatasetXLM(Dataset):
    """
    In-memory torch ``Dataset`` over one prepared split file.

    The split file is a JSON object with two parallel lists: ``'x'`` (the texts handed
    to the tokenizer) and ``'y'`` (one label name per text). The whole split is
    tokenized once at construction time and the labels are turned into one-hot rows.

    Attributes:
        labels_values: list of label names loaded from the labels file.
        idx_to_label: mapping from class index to label name.
        label_to_idx: mapping from label name to class index.
    """

    def __init__(
        self,
        json_path: str,
        tokenizer: 'XLMRobertaTokenizer',
        labels_values_path = None,
    ):
        """
        :param json_path: Path to the split file (JSON object with 'x' and 'y' lists).
        :param tokenizer: Callable tokenizer; invoked as
            ``tokenizer(texts, padding='longest', return_tensors='pt')``.
        :param labels_values_path: Path to the JSON list of label names. When omitted,
            ``PATH_PREFIX + '/data/labels.json'`` is used.
        :raises KeyError: if a label in the split file is missing from the labels file.
        """
        if labels_values_path is None:
            # Resolved at call time rather than at class-definition time, so re-running
            # the cell that sets PATH_PREFIX takes effect without redefining the class.
            labels_values_path = PATH_PREFIX + '/data/labels.json'

        self._tokenizer = tokenizer
        self._inputs = None
        self._targets = None

        # Explicit UTF-8: JSON text must not depend on the platform's locale encoding.
        with open(labels_values_path, 'r', encoding='utf-8') as file:
            self.labels_values = json.load(file)
        self.idx_to_label = dict(enumerate(self.labels_values))
        self.label_to_idx = {label: i for i, label in enumerate(self.labels_values)}

        with open(json_path, 'r', encoding='utf-8') as file:
            data = json.load(file)
        self._encode(data)

    def __len__(self) -> int:
        """Number of examples, i.e. rows of the tokenized ``input_ids``."""
        return len(self._inputs['input_ids'])

    def __getitem__(self, index) -> dict[str, torch.Tensor]:
        """
        Return one example as a dict with the keys 'input_ids', 'attention_mask'
        and 'labels' (the one-hot target row).
        """
        item = {
          'input_ids': self._inputs['input_ids'][index],
          'attention_mask': self._inputs['attention_mask'][index],
          'labels': self._targets[index]
        }
        return item

    def _one_hot_encode_labels(self, labels: list[str]) -> torch.Tensor:
        """
        :param labels: Label names; each must be present in ``self.label_to_idx``.
        :return: Float tensor of shape (len(labels), len(self.labels_values)) with a
            single 1.0 per row at the label's class index.
        """
        encoded_labels = torch.zeros((len(labels), len(self.labels_values)))
        # Explicit long dtype keeps the index tensor valid for any number of labels.
        label_indices = torch.tensor(
            [self.label_to_idx[label] for label in labels], dtype=torch.long
        )
        encoded_labels[torch.arange(len(labels)), label_indices] = 1.0
        return encoded_labels

    def _decode_one_hot_labels(self, encoded_labels: torch.Tensor) -> list[str]:
        """
        :param encoded_labels: Two dimensional tensor where each row should contain single
        non zero value.
        :return: The label name for every row.
        """
        # torch.argmax returns a 0-d tensor; tensors hash by identity, so it has to be
        # converted to a plain int before it can be used as a key of idx_to_label.
        labels = [self.idx_to_label[int(torch.argmax(enc_label))] for enc_label in encoded_labels]
        return labels

    def _encode(self, data: dict):
        """
        Encode inputs with tokenizer and outputs into one-hot format.

        All texts of the split are padded to the longest sequence in this split.
        """
        self._inputs = self._tokenizer(data['x'], padding='longest', return_tensors='pt')
        # NOTE(review): targets are float one-hot rows. Hugging Face classification heads
        # choose their loss from the label dtype - confirm that the resulting loss is the
        # intended one for single-label data.
        self._targets = self._one_hot_encode_labels(data['y'])

In [5]:
# Label names for the classifier; read as UTF-8 explicitly so the result does not depend
# on the platform's default text encoding.
with open(PATH_PREFIX + '/data/labels.json', 'r', encoding='utf-8') as file:
    labels = json.load(file)

In [6]:
# Number of classes; passed as num_labels when the classification model is created.
NUM_LABELS = len(labels)

In [46]:
# Shared Trainer configuration for the XLM-R runs in this notebook.
TRAINING_ARGS = TrainingArguments(
    # Checkpoints and logs are written under the project directory.
    output_dir=PATH_PREFIX + "/.out/xlm/",
    logging_dir=PATH_PREFIX + "/.log/xlm/",
    logging_strategy='steps',
    # Optimisation schedule.
    learning_rate=5e-6,
    num_train_epochs=3,
    # Evaluate and checkpoint on the same per-epoch cadence, then restore the best
    # checkpoint once training finishes.
    evaluation_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,
)

## XLM Roberta base
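In this section I fine-tune the `xlm-roberta-base` checkpoint for sequence classification on the Polish (pl-PL) split of the dataset.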

In [8]:
# Tokenizer for the same 'xlm-roberta-base' checkpoint that the model is loaded from.
# NOTE(review): do_lower_case is not among the documented XLMRobertaTokenizer arguments
# and is presumably ignored - confirm, and drop it if so.
tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-base', do_lower_case=True)

In [59]:
# Polish (pl-PL) splits; each one is tokenized eagerly when the dataset object is built.
train_dataset_pl = MassiveDatasetXLM(
    json_path=PATH_PREFIX + '/data/pl-PL/train.json',
    tokenizer=tokenizer,
)
test_dataset_pl = MassiveDatasetXLM(
    json_path=PATH_PREFIX + '/data/pl-PL/test.json',
    tokenizer=tokenizer,
)
val_dataset_pl = MassiveDatasetXLM(
    json_path=PATH_PREFIX + '/data/pl-PL/val.json',
    tokenizer=tokenizer,
)

In [63]:
def compute_metrics(eval_pred):
  """
  Accuracy callback for the Trainer.

  :param eval_pred: Pair ``(logits, labels)``; both are two dimensional arrays with one
    row per example and one column per class (labels are one-hot rows).
  :return: ``{'accuracy': float}`` - the fraction of rows whose arg-max class matches.
  """
  logits, labels = eval_pred

  predictions = np.argmax(logits, axis=1)
  references = np.argmax(labels, axis=1)

  # Computed directly with NumPy instead of calling evaluate.load('accuracy') on every
  # evaluation pass: same result and key, without re-loading the metric each time.
  return {'accuracy': float(np.mean(predictions == references))}

def new_environment(train_dataset, test_dataset, model_name="xlm-roberta-base", training_args=None):
  """
  Create a freshly initialised classification model and a Trainer bound to it.

  :param train_dataset: Dataset used for training.
  :param test_dataset: Dataset the Trainer evaluates on (passed as ``eval_dataset``).
    NOTE(review): when the training arguments enable ``load_best_model_at_end`` this
    dataset also decides which checkpoint is kept, so a held-out validation split is
    the safer thing to pass here.
  :param model_name: Pretrained checkpoint to start from; defaults to the checkpoint
    that was previously hard-coded.
  :param training_args: ``TrainingArguments`` to use; defaults to the notebook-wide
    ``TRAINING_ARGS``.
  :return: Tuple ``(model, trainer)``.
  """
  if training_args is None:
    training_args = TRAINING_ARGS

  model = XLMRobertaForSequenceClassification.from_pretrained(
    model_name,
    num_labels=NUM_LABELS
  )

  trainer = Trainer(
      model=model,
      args=training_args,
      train_dataset=train_dataset,
      eval_dataset=test_dataset,
      compute_metrics=compute_metrics
  )

  return model, trainer

In [64]:
# Evaluate on the validation split during training: TRAINING_ARGS sets
# load_best_model_at_end=True, so the evaluation dataset decides which checkpoint is
# kept. Selecting on the test split would leak it into model selection; keep
# test_dataset_pl for the final, one-off evaluation.
model, trainer = new_environment(train_dataset_pl, val_dataset_pl)

Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaForSequenceClassification: ['lm_head.dense.bias', 'roberta.pooler.dense.weight', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'roberta.pooler.dense.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.out_proj.weight', 'classifier.d

In [None]:
# Run fine-tuning with the Trainer built above; the table rendered below this cell
# (epoch / training loss / validation loss) is its progress output.
trainer.train()



Epoch,Training Loss,Validation Loss
