In [None]:
import os
from google.colab import drive
drive.mount('/content/gdrive')

PATH = '/content/gdrive/MyDrive/NLP-group-project'
if not os.path.exists(PATH):
    %mkdir -p $PATH

%cd $PATH

In [None]:
!pip install -q --upgrade -r requirements.txt

In [None]:
import os
import json
import torch
import pytorch_lightning as pl

from pytorch_lightning.callbacks import ModelCheckpoint
from torch.utils.data.dataloader import DataLoader
from transformers.modeling_outputs import Seq2SeqLMOutput
from transformers import T5Tokenizer, T5ForConditionalGeneration, \
    MT5Tokenizer, MT5ForConditionalGeneration
from sklearn.metrics import accuracy_score

from src.dataset import MassiveDatasetT5

In [None]:
class ClassifierT5(pl.LightningModule):
    """Text classifier built on a T5-style sequence-to-sequence model.

    Classification is cast as text generation: the wrapped model is trained to
    emit the label text for each input, and accuracy is an exact string match
    between the decoded prediction and the decoded target.

    Every batch is expected to be a pair ``(inputs, targets)`` in which both
    elements can be indexed with ``'ids'`` and ``'attention_mask'``.

    Args:
        tokenizer: Tokenizer providing ``pad_token_id`` and ``batch_decode``.
        model: Model that can be called Hugging Face style (returning an
            object with a ``.loss`` attribute) and that exposes ``generate``.
        lr: Learning rate handed to AdamW.
        weight_decay: Weight decay handed to AdamW.
        max_label_length: Optional ``max_length`` used when generating
            predictions. When left as ``None`` the module-level
            ``MAX_LABEL_LENGTH`` is looked up at prediction time, which is the
            behaviour this class had before the parameter existed.
    """

    def __init__(self, tokenizer, model, lr, weight_decay, max_label_length=None):
        super().__init__()
        self._tokenizer = tokenizer
        self._model = model
        self._lr = lr
        self._weight_decay = weight_decay
        self._max_label_length = max_label_length

    def configure_optimizers(self):
        """Return an AdamW optimizer over all parameters of the wrapped model."""
        return torch.optim.AdamW(
            self._model.parameters(),
            lr=self._lr, weight_decay=self._weight_decay
        )

    def forward(
        self,
        input_ids,
        attention_mask=None,
        decoder_attention_mask=None,
        labels=None
    ) -> Seq2SeqLMOutput:
        """Delegate to the wrapped model, always requesting a dict-style output."""
        return self._model(
            input_ids,
            attention_mask=attention_mask,
            decoder_attention_mask=decoder_attention_mask,
            labels=labels,
            return_dict=True
        )

    def _log_metrics(self, metrics, mode):
        """Log every entry of ``metrics`` under the name ``<mode>_<metric_name>``.

        Each value is logged both per step and per epoch and is shown in the
        progress bar.
        """
        for metric_name, metric_value in metrics.items():
            self.log(
                mode + '_' + metric_name,
                metric_value,
                on_step=True, on_epoch=True, prog_bar=True
            )

    def _step(self, batch):
        """Compute loss and exact-match accuracy for one batch.

        Shared by the training, validation and test steps.

        Returns:
            dict with keys ``'loss'`` (taken from the model output) and
            ``'accuracy'`` (value returned by ``accuracy_score``).
        """
        inputs, targets = batch

        # Work on a copy so the batch itself keeps its padding ids; they are
        # decoded again in _compute_accuracy. -100 is the label value that
        # Hugging Face seq2seq models skip when computing the loss.
        labels = targets['ids'].clone()
        labels[labels == self._tokenizer.pad_token_id] = -100
        output = self.forward(
            inputs['ids'],
            attention_mask=inputs['attention_mask'],
            decoder_attention_mask=targets['attention_mask'],
            labels=labels
        )

        # NOTE(review): this runs a full `generate` pass on every step,
        # training steps included, in addition to the forward pass above.
        predicted_labels = self.predict_step(batch)
        accuracy = self._compute_accuracy(batch, predicted_labels)

        return {'loss': output.loss, 'accuracy': accuracy}

    def training_step(self, batch, batch_idx):
        """Run ``_step`` and log its metrics with the ``train_`` prefix."""
        metrics = self._step(batch)
        self._log_metrics(metrics, 'train')
        return metrics

    def validation_step(self, batch, batch_idx):
        """Run ``_step`` and log its metrics with the ``val_`` prefix."""
        metrics = self._step(batch)
        self._log_metrics(metrics, 'val')
        return metrics

    def test_step(self, batch, batch_idx):
        """Run ``_step`` and log its metrics with the ``test_`` prefix."""
        metrics = self._step(batch)
        self._log_metrics(metrics, 'test')
        return metrics

    def _generation_max_length(self):
        """Return the ``max_length`` to use for generation.

        Prefers the value given to the constructor. Otherwise falls back to
        the module-level ``MAX_LABEL_LENGTH``, which must be defined by the
        time the first prediction is made.
        """
        if self._max_label_length is not None:
            return self._max_label_length
        return MAX_LABEL_LENGTH

    def predict_step(self, batch, batch_idx=None):
        """Generate label text for the inputs of ``batch``.

        Only ``batch[0]`` (the inputs) is used. Decoding is deterministic
        (``do_sample=False``) and capped at the longest label length.

        Returns:
            The generated sequences decoded by the tokenizer with special
            tokens removed.
        """
        inputs = batch[0]
        output = self._model.generate(
            inputs['ids'],
            attention_mask=inputs['attention_mask'],
            do_sample=False,
            max_length=self._generation_max_length()  # Length of longest label
        )
        return self._tokenizer.batch_decode(output, skip_special_tokens=True)

    def _compute_accuracy(self, batch, predicted_labels):
        """Return the share of predictions that exactly match the target text.

        Targets are decoded from ``batch[1]['ids']`` with special tokens
        removed, so the comparison is between plain strings.
        """
        targets = batch[1]
        target_labels = self._tokenizer.batch_decode(targets['ids'], skip_special_tokens=True)
        accuracy = accuracy_score(y_true=target_labels, y_pred=predicted_labels)
        return accuracy

In [None]:
# Data split locations: data/<LANGUAGE>/{train,val,test}.json
LANGUAGE = 'pl-PL'
# LANGUAGE = 'combined'
train_path, val_path, test_path = (
    os.path.join('data', LANGUAGE, split_file)
    for split_file in ('train.json', 'val.json', 'test.json')
)

# Pretrained tokenizer and model. To try a multilingual checkpoint, comment
# out the active pair and uncomment one of the alternative pairs below.
tokenizer_t5 = T5Tokenizer.from_pretrained('t5-base', model_max_length=256)
model_t5 = T5ForConditionalGeneration.from_pretrained('t5-base')
# tokenizer_t5 = MT5Tokenizer.from_pretrained('google/mt5-small')
# model_t5 = MT5ForConditionalGeneration.from_pretrained('google/mt5-small')
# tokenizer_t5 = MT5Tokenizer.from_pretrained('google/mt5-base')
# model_t5 = MT5ForConditionalGeneration.from_pretrained('google/mt5-base')

# One dataset per split, all built with the same tokenizer.
train_dataset, val_dataset, test_dataset = (
    MassiveDatasetT5(split_path, tokenizer_t5)
    for split_path in (train_path, val_path, test_path)
)

In [None]:
# Read the label values. The encoding is given explicitly so that decoding
# does not depend on the locale of the machine running the notebook.
with open('data/labels.json', 'r', encoding='utf-8') as file:
    labels_values = json.load(file)

# Tokenize all labels in one call, padded to the longest of them; the width of
# the resulting id tensor is therefore the token length of the longest label.
# ClassifierT5.predict_step uses this value as its generation length cap.
output = tokenizer_t5(labels_values, padding='longest', return_tensors='pt')
MAX_LABEL_LENGTH = output['input_ids'].shape[1]

In [None]:
# Load the TensorBoard notebook extension and display the dashboard in the
# cell output. `lightning_logs` is a relative path, so it is resolved against
# the current working directory.
# NOTE(review): presumably this is where the Trainer below writes its logs
# (Lightning's default location) - confirm if a custom logger is configured.
%load_ext tensorboard
%tensorboard --logdir lightning_logs

In [None]:
# --- Reproducibility ---------------------------------------------------------
# Training is stochastic (the training loader shuffles), so fix the seed before
# any loader is built. seed_everything seeds Python's `random`, NumPy and
# PyTorch; workers=True additionally has Lightning seed dataloader workers.
SEED = 42
pl.seed_everything(SEED, workers=True)

# --- Optimisation hyper-parameters ------------------------------------------
EPOCHS_NUM = 5
LEARNING_RATE = 3e-4
WEIGHT_DECAY = 1e-2

# --- Batching ----------------------------------------------------------------
TRAIN_BATCH_SIZE = 50
TEST_BATCH_SIZE = 100

# Options shared by every loader.
dataloader_kwargs = {'num_workers': 2, 'pin_memory': True}

# Only the training loader shuffles; validation and test share the
# non-shuffled settings with the larger batch size.
train_kwargs = {'batch_size': TRAIN_BATCH_SIZE, 'shuffle': True, **dataloader_kwargs}
test_kwargs = {'batch_size': TEST_BATCH_SIZE, 'shuffle': False, **dataloader_kwargs}

train_loader = DataLoader(train_dataset, **train_kwargs)
val_loader = DataLoader(val_dataset, **test_kwargs)
test_loader = DataLoader(test_dataset, **test_kwargs)

In [None]:
# Wrap the pretrained model and tokenizer in the Lightning module.
classifierT5 = ClassifierT5(
    tokenizer=tokenizer_t5, model=model_t5,
    lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY,
)

# Checkpointing follows the epoch-level validation accuracy; higher is better.
checkpoint_callback = ModelCheckpoint(mode='max', monitor='val_accuracy_epoch')

trainer = pl.Trainer(
    callbacks=[checkpoint_callback],
    max_epochs=EPOCHS_NUM,
    log_every_n_steps=10,
)

# Train on the training loader, validating on the validation loader.
trainer.fit(classifierT5, train_loader, val_loader)

In [None]:
# Evaluate the checkpoint that ModelCheckpoint ranked best on validation
# accuracy rather than whatever weights the last training epoch left in memory.
# Note: this loads the best checkpoint's weights into `classifierT5`.
trainer.test(classifierT5, test_loader, ckpt_path='best')