In [1]:
# Seed reused for the train/validation split generator and for the Trainer.
SEED = 1337

# Model identifier handed to `from_pretrained` for both the tokenizer and the model.
checkpoint = "cjvt/t5-sl-small"
#checkpoint = "cjvt/t5-sl-large"

max_len = 512 # token budget: inputs and targets are truncated/padded to this length

## Tokenizer

In [3]:
from transformers import AutoTokenizer

# Load the tokenizer that belongs to the selected checkpoint.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
# Smoke test: encode a single word and display the resulting tensors.
tokenizer(["test"], return_tensors="pt")

{'input_ids': tensor([[7477,    1]]), 'attention_mask': tensor([[1, 1]])}

## Dataset

In [4]:
import torch
from torch.utils.data import Dataset, random_split
import pandas as pd

class ParaDataset(Dataset):
  """Paragraph -> paraphrase pairs, tokenized once at construction time.

  The source file is read as a tab-separated table with two columns
  (paragraph, paraphrase). Source texts get `prefix` prepended; target texts
  do not. Both sides are truncated and padded to `max_len` tokens and kept
  in memory as PyTorch tensors.

  Args:
    fpath: path of the tab-separated file to read.
    tokenizer: callable tokenizer; must expose `pad_token_id` and return an
      encoding that offers `.items()` and `.input_ids`.
    prefix: task prefix prepended to every source text.
    max_len: number of tokens every source/target sequence is padded or
      truncated to.
  """

  # Label value excluded from the loss. Padded label positions are set to
  # this so the model is not trained to predict padding.
  IGNORE_INDEX = -100

  def __init__(self, fpath, tokenizer, prefix, max_len=512):
    super().__init__()
    self.raw_data = self._load(fpath)
    self.tokenizer = tokenizer
    self.prefix = prefix
    self.max_len = max_len

    self.inputs, self.targets = self._preprocess()

  def __len__(self):
    """Number of (paragraph, paraphrase) rows loaded from the file."""
    return len(self.raw_data)

  def __getitem__(self, index):
    """Return the encoded source fields plus `labels` for one example.

    Padding positions in the target ids are replaced by `IGNORE_INDEX`.
    Targets are padded to a fixed length up front, so without this masking
    the loss would also be computed over every padding position.
    """
    out = {k: v[index] for k, v in self.inputs.items()}

    target_ids = self.targets.input_ids[index]
    pad_id = self.tokenizer.pad_token_id
    if pad_id is None:
      # No pad token defined: nothing to mask, hand the ids through as-is.
      out["labels"] = target_ids
    else:
      # masked_fill returns a new tensor, so the cached encodings stay intact.
      out["labels"] = target_ids.masked_fill(target_ids == pad_id, self.IGNORE_INDEX)
    return out

  def _load(self, fpath):
    """Read the tab-separated file into a two-column DataFrame."""
    return pd.read_csv(fpath, sep="\t", names=["paragraph", "paraphrase"])

  def _preprocess(self):
    """Tokenize sources (with prefix) and targets (without prefix)."""
    inputs = self._tokenize(self.raw_data.paragraph)
    targets = self._tokenize(self.raw_data.paraphrase, prefix=False)
    return inputs, targets

  def _tokenize(self, text_list, prefix=True):
    """Encode `text_list` to fixed-length tensors, optionally adding the task prefix."""
    return self.tokenizer(
        [self.prefix + text if prefix else text for text in text_list],
        truncation=True, padding="max_length",
        max_length=self.max_len, return_tensors="pt"
        )

In [5]:
# Tab-separated back-translation corpus, addressed relative to this notebook.
dataset_path = "../../../data/backtranslate/backtranslate.csv"

# Preview the raw table; column names are supplied here rather than read from the file.
preview_columns = ["inputs", "targets"]
data = pd.read_csv(dataset_path, names=preview_columns, sep="\t")
data

Unnamed: 0,inputs,targets
0,"Amsterdam - Le nekaj mesecev potem, ko so nizo...","Amsterdam - Le nekaj mesecev po tem, ko so niz..."
1,"""S trenerjem sva načrtovala uvrstitev v najbol...","""S trenerjem sva načrtovala uvrstitev med najb..."
2,"Najprej zato, ker znajo gledalcem, ki se jih j...","Najprej zato, ker znajo gledalcem ponuditi, ki..."
3,Izidi: 1. kolo - skupina A: ZRJ - Grčija 83:72...,: Rezultati 1 kolo - skupina A: FRY - Grčija 8...
4,Tekmovanje se bo pravzaprav začelo že danes z ...,Tekmovanje se bo pravzaprav začelo danes z ura...
...,...,...
11306,"Bistvo vsega ni naše telo, temveč telo tehnolo...","Bistvo vsega ni naše telo, ampak telo tehnolog..."
11307,Crowley je bil tudi sam umetnik. Za njim je os...,"Crowley sam je bil umetnik, ki je zapustil pre..."
11308,"Vsi, ki jih ""prerok Horusovega eona"" tako ali ...","Vsi, ki so na tak ali drugačen način zgleduje ..."
11309,"Lib Demi, ki so obvladovali britansko političn...","Lib Demi, ki je prevladovala na britanski poli..."


In [6]:
# Tokenize the whole corpus up front, prefixing every source text with the task prompt.
paraset = ParaDataset(
    fpath=dataset_path,
    tokenizer=tokenizer,
    prefix="parafraziraj: ",
    max_len=max_len,
)

# Seeded generator so the 90/10 train/validation split is repeatable.
gen = torch.Generator()
gen.manual_seed(SEED)
train_set, val_set = random_split(paraset, lengths=[0.9, 0.1], generator=gen)

## Training

In [7]:
from transformers import AutoModelForSeq2SeqLM, Trainer, TrainingArguments, DataCollatorForSeq2Seq

# NOTE(review): the output recorded under this cell ends in a CUDA out-of-memory
# error on a ~3 GiB GPU even with batch size 1, so this configuration has not
# been shown to complete a run on that hardware.
run_options = dict(
    seed=SEED,
    output_dir="./t5",
    overwrite_output_dir=True,
    num_train_epochs=10,
    warmup_steps=500,
    # weight_decay=0.01 is intentionally left disabled.
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    logging_steps=10,
    # Evaluate and checkpoint once per epoch; the best checkpoint is reloaded at the end.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)
training_args = TrainingArguments(**run_options)

# Pretrained seq2seq model plus the collator that batches its examples.
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

trainer = Trainer(
    args=training_args,
    model=model,
    train_dataset=train_set,
    eval_dataset=val_set,
    data_collator=data_collator,
)

trainer.train()

  0%|          | 0/101800 [00:00<?, ?it/s]You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
  0%|          | 1/101800 [00:00<13:46:40,  2.05it/s]

OutOfMemoryError: CUDA out of memory. Tried to allocate 2.00 MiB (GPU 0; 2.94 GiB total capacity; 1.55 GiB already allocated; 61.12 MiB free; 1.63 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF