In [None]:
# Colab-only helper used to attach Google Drive to this runtime.
from google.colab import drive

# Mount Drive so the project folder is reachable through the local filesystem.
drive.mount('/content/drive')
# Root of the project on Drive; later cells build their data, log and output
# paths by appending to this prefix.
PATH_PREFIX = '/content/drive/My Drive/NLP-group-project'

In [None]:
# Install the packages listed in the project's requirements.txt, running pip
# from inside the project folder on the mounted Drive.
!cd /content/drive/My\ Drive/NLP-group-project && pip install -r requirements.txt

In [None]:
# Run the project's setup_dataset.sh from inside the project folder on Drive.
# NOTE(review): presumably this produces the data/ files read by later cells -- confirm.
!cd /content/drive/My\ Drive/NLP-group-project && sh setup_dataset.sh

In [None]:
import json
import torch
from torch.utils.data import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, EarlyStoppingCallback
import evaluate
import numpy as np

In [None]:
class MassiveDatasetHerbert(Dataset):
    """Dataset that tokenizes all inputs up front and one-hot encodes their labels.

    The whole JSON file is read and encoded once in the constructor; indexing
    afterwards only slices the pre-computed values.

    Attributes:
        labels_values: Label names loaded from the labels file, in file order.
        idx_to_label: Maps a class index to its label name.
        label_to_idx: Maps a label name to its class index.
    """

    def __init__(
        self,
        json_path: str,
        tokenizer,
        labels_values_path=None,
    ):
        """Load the label vocabulary and encode the samples stored in ``json_path``.

        :param json_path: Path to a JSON file holding the inputs under key ``'x'``
            and their labels under key ``'y'``.
        :param tokenizer: Callable tokenizer (e.g. the object returned by
            ``AutoTokenizer.from_pretrained``) invoked as
            ``tokenizer(texts, padding='longest', return_tensors='pt')``.
        :param labels_values_path: Path to a JSON file with the list of label
            names. When omitted, ``PATH_PREFIX + '/data/labels.json'`` is used.
        """
        if labels_values_path is None:
            # Resolved at call time (not at class-definition time) so the default
            # always follows the current value of the notebook-level PATH_PREFIX.
            labels_values_path = PATH_PREFIX + '/data/labels.json'

        self._tokenizer = tokenizer
        self._inputs = None
        self._targets = None

        # JSON is UTF-8 by specification; do not depend on the platform default.
        with open(labels_values_path, 'r', encoding='utf-8') as file:
            self.labels_values = json.load(file)
        self.idx_to_label = dict(enumerate(self.labels_values))
        self.label_to_idx = {label: i for i, label in self.idx_to_label.items()}

        with open(json_path, 'r', encoding='utf-8') as file:
            data = json.load(file)
        self._encode(data)

    def __len__(self) -> int:
        """Return the number of encoded samples."""
        return len(self._inputs['input_ids'])

    def __getitem__(self, index) -> dict[str, torch.Tensor]:
        """Return one sample as a dict with ``input_ids``, ``attention_mask`` and ``labels``."""
        return {
            'input_ids': self._inputs['input_ids'][index],
            'attention_mask': self._inputs['attention_mask'][index],
            'labels': self._targets[index],
        }

    def _one_hot_encode_labels(self, labels: list[str]) -> torch.Tensor:
        """Turn label names into a ``(len(labels), num_classes)`` one-hot float tensor.

        :raises KeyError: If a label is not present in the label vocabulary.
        """
        encoded_labels = torch.zeros((len(labels), len(self.labels_values)))
        rows = torch.arange(len(labels))
        # Explicit integer dtype so the column indices are always valid indices.
        cols = torch.tensor(
            [self.label_to_idx[label] for label in labels], dtype=torch.long
        )
        encoded_labels[rows, cols] = 1.0
        return encoded_labels

    def _decode_one_hot_labels(self, encoded_labels: torch.Tensor) -> list[str]:
        """
        Map one-hot rows back to label names.

        :param encoded_labels: Two dimensional tensor where each row should contain single
        non zero value.
        """
        # torch.argmax returns a 0-dim tensor, and tensors hash by identity, so it
        # must be converted to a plain int before it can be used as a dict key.
        return [
            self.idx_to_label[int(torch.argmax(enc_label))]
            for enc_label in encoded_labels
        ]

    def _encode(self, data: dict):
        """
        Encode inputs with tokenizer and outputs into one-hot format.

        :param data: Mapping with the inputs under ``'x'`` and their labels under ``'y'``.
        """
        # padding='longest' pads every sample to the longest one in this file.
        self._inputs = self._tokenizer(data['x'], padding='longest', return_tensors='pt')
        self._targets = self._one_hot_encode_labels(data['y'])




In [None]:
# Load the list of label names; its length determines the size of the
# classification head. JSON is UTF-8 by specification, so the encoding is
# stated explicitly instead of relying on the platform default.
with open(PATH_PREFIX + '/data/labels.json', 'r', encoding='utf-8') as file:
    labels = json.load(file)

In [None]:
# Number of entries in the label list; passed as num_labels when the
# classification model is created.
NUM_LABELS = len(labels)

In [None]:
TRAINING_ARGS = TrainingArguments(
    # Where checkpoints and logs are written (inside the project folder).
    output_dir=PATH_PREFIX + "/.out/herbert/",
    logging_dir=PATH_PREFIX + "/.log/herbert/",
    # Optimisation schedule.
    learning_rate=4.5e-5,
    num_train_epochs=6,
    # Step-based logging; evaluation and checkpointing once per epoch, with the
    # best checkpoint reloaded when training finishes.
    logging_strategy='steps',
    evaluation_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,
)

In [None]:
def compute_metrics(eval_pred):
  """Compute accuracy for one evaluation pass.

  :param eval_pred: Pair ``(logits, labels)``; ``labels`` are one-hot rows, so
      both members are reduced to class indices with an argmax over axis 1.
  :return: Whatever the accuracy metric's ``compute`` returns for the
      predicted and reference class indices.
  """
  # Load the metric once and reuse it: calling evaluate.load on every
  # evaluation pass repeats the lookup/loading work for no benefit.
  metric = getattr(compute_metrics, '_accuracy_metric', None)
  if metric is None:
    metric = evaluate.load('accuracy')
    compute_metrics._accuracy_metric = metric

  logits, labels = eval_pred
  # Predictions can arrive as a tuple of arrays when a model returns extra
  # outputs; in that case the logits are assumed to be the first element.
  if isinstance(logits, tuple):
    logits = logits[0]

  predictions = np.argmax(logits, axis=1)
  references = np.argmax(labels, axis=1)

  return metric.compute(predictions=predictions, references=references)


Herbert Base

In [None]:
# Tokenizer for the 'allegro/herbert-base-cased' checkpoint, shared by all splits.
tokenizer = AutoTokenizer.from_pretrained('allegro/herbert-base-cased')

# One dataset per pl-PL split; each one reads and encodes its whole file when
# it is constructed.
train_dataset_pl = MassiveDatasetHerbert(PATH_PREFIX + '/data/pl-PL/train.json', tokenizer)
test_dataset_pl = MassiveDatasetHerbert(PATH_PREFIX + '/data/pl-PL/test.json', tokenizer)
val_dataset_pl = MassiveDatasetHerbert(PATH_PREFIX + '/data/pl-PL/val.json', tokenizer)

def new_environment(train_dataset, test_dataset, model_name="allegro/herbert-base-cased"):
  """Create a fresh classification model together with a Trainer wired to it.

  :param train_dataset: Dataset the trainer trains on.
  :param test_dataset: Dataset handed to the trainer as ``eval_dataset``, i.e.
      the one used for the evaluation passes run during training.
  :param model_name: Checkpoint to load; defaults to the HerBERT base model so
      existing two-argument calls behave as before.
  :return: Tuple ``(model, trainer)``.
  """
  # A new model is loaded on every call, so runs do not share weights.
  model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=NUM_LABELS,
  )

  # The training arguments and metric function are the notebook-level ones.
  trainer = Trainer(
    model=model,
    args=TRAINING_ARGS,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
  )

  return model, trainer

In [None]:
# Evaluate on the validation split while training. TRAINING_ARGS evaluates every
# epoch and reloads the best checkpoint at the end, so passing the test split
# here would select the model on test data and bias the reported test score.
model, trainer = new_environment(train_dataset_pl, val_dataset_pl)
trainer.train()