In [294]:
import torch
import pandas as pd
import numpy as np
import torch.optim as optim
from datasets import load_dataset
# This one is for paraphrasing
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

In [295]:
torch.backends.cuda.matmul.allow_tf32 = True

In [296]:
tokenizer = AutoTokenizer.from_pretrained("prithivida/parrot_paraphraser_on_T5")
model = AutoModelForSeq2SeqLM.from_pretrained("prithivida/parrot_paraphraser_on_T5")

In [297]:
model.to('cuda');

In [298]:
dataset = load_dataset("csv", data_files='./Paraphraser Train.csv');
dataset = dataset['train'];

Found cached dataset csv (C:/Users/vmpletsos/.cache/huggingface/datasets/csv/default-ee69a41b8fb3ac2d/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317)
100%|██████████| 1/1 [00:00<00:00, 999.60it/s]


In [299]:
# Shuffle the dataset
dataset = dataset.shuffle();

In [303]:
epochs = 3
max_length = 256
task_prefix = "paraphrase: "
batch_size = 5
gradient_accumulations_steps = 4

In [304]:
dataset_length = len(dataset)

In [305]:
optimizer = optim.Adam(
            params=model.parameters(),
            lr=1e-5,
            betas=(0.9, 0.999),
            eps=1e-08,
            amsgrad=False,
            weight_decay=1e-5
        )

In [312]:
for epoch in range(epochs):
    # Set the model to training mode
    model.train()
    epoch_loss = 0
    j = 0
    while j < dataset_length:
        if j + batch_size > dataset_length:
            originals = dataset[j:len(dataset)]['original']
            paraphrases = dataset[j:len(dataset)]['paraphrase']
        else:
            originals = dataset[j:j + batch_size]['original']
            paraphrases = dataset[j:j + batch_size]['paraphrase']
        j += batch_size 
        
        encoding = tokenizer(
            [task_prefix + original for original in originals],
            padding="longest",
            max_length=max_length,
            truncation=True,
            return_tensors="pt",
        )
        input_ids, attention_mask = encoding.input_ids, encoding.attention_mask
        
        target_encoding = tokenizer(
            paraphrases,
            padding="longest",
            max_length=max_length,
            truncation=True,
            return_tensors="pt",
        )
        labels = target_encoding.input_ids
        # replace padding token id's of the labels by -100 so it's ignored by the loss
        labels[labels == tokenizer.pad_token_id] = -100
        
        # Zero the gradients
        optimizer.zero_grad()
        
        input_ids = input_ids.to('cuda')
        attention_mask = attention_mask.to('cuda')
        labels = labels.to('cuda')
        
        # forward pass
        loss = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels).loss
        epoch_loss += loss.item()
        loss.backward()
        # (loss / gradient_accumulations_steps).backward()
        
        optimizer.step()
        # # Backward pass and optimization step
        # if (j+1) % gradient_accumulations_steps == 0:
        #     model.zero_grad()
    print('Epoch', epoch+1, 'Loss/train', epoch_loss/dataset_length)
    

Epoch 1 Loss/train 0.2979625934048703
Epoch 2 Loss/train 0.28260450195847897
Epoch 3 Loss/train 0.277863227484519


In [315]:
# get 2 instructions from the dataset
questions = ['The ancient temple of Kythnos is one of the most important monuments of the island.']
question_inputs = tokenizer([task_prefix + question for question in questions], return_tensors="pt", padding=True)
question_inputs = question_inputs.to('cuda')

In [316]:
preds = model.generate(
              question_inputs['input_ids'],
              do_sample=False, 
              max_length=max_length, 
              num_beams = 10,
              num_beam_groups = 5,
              diversity_penalty = 2.0,
              early_stopping=True,
              num_return_sequences=5
              )

In [317]:
predicted_answers = tokenizer.batch_decode(preds, skip_special_tokens=True, clean_up_tokenization_spaces=True)

In [319]:
predicted_answers = list(set(predicted_answers))

In [320]:
predicted_answers

['The ancient temple of Kythnos is one of the most important monuments on the island.',
 "Kythnos' ancient temple is one of the most important attractions of the island.",
 "Kythnos' ancient temple is one of the most important monuments of the island.",
 'The ancient temple of Kythnos is one of the most important monuments of the island.',
 'Ancient temple of Kythnos is one of the most important monuments of the island']

In [311]:
model.save_pretrained('./FineTunedParrotParaphraser')