In [1]:
import torch
import pandas as pd
import numpy as np
import torch.optim as optim
from datasets import load_dataset
# This one is for paraphrasing
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
torch.backends.cuda.matmul.allow_tf32 = True

In [3]:
tokenizer = AutoTokenizer.from_pretrained("prithivida/parrot_paraphraser_on_T5")
model = AutoModelForSeq2SeqLM.from_pretrained("prithivida/parrot_paraphraser_on_T5")

In [4]:
model.to('cuda');

In [5]:
dataset = load_dataset("csv", data_files='./Paraphraser Train.csv');
dataset = dataset['train'];

Downloading and preparing dataset csv/default to C:/Users/vmpletsos/.cache/huggingface/datasets/csv/default-10aa3212afa77684/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317...


Downloading data files: 100%|██████████| 1/1 [00:00<00:00, 998.17it/s]
Extracting data files: 100%|██████████| 1/1 [00:00<00:00, 333.36it/s]
  return pd.read_csv(xopen(filepath_or_buffer, "rb", use_auth_token=use_auth_token), **kwargs)
                                                        

Dataset csv downloaded and prepared to C:/Users/vmpletsos/.cache/huggingface/datasets/csv/default-10aa3212afa77684/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317. Subsequent calls will reuse this data.


100%|██████████| 1/1 [00:00<00:00, 250.09it/s]


In [6]:
# Shuffle the dataset
dataset = dataset.shuffle();

In [7]:
epochs = 10
max_length = 256
task_prefix = "Paraphrase: "
batch_size = 4
gradient_accumulations_steps = 4

In [8]:
dataset_length = len(dataset)

In [9]:
optimizer = optim.Adam(
            params=model.parameters(),
            lr=1e-5,
            betas=(0.9, 0.999),
            eps=1e-08,
            amsgrad=False,
            weight_decay=1e-5
        )

In [10]:
for epoch in range(epochs):
    # Set the model to training mode
    model.train()
    epoch_loss = 0
    j = 0
    while j < dataset_length:
        if j + batch_size > dataset_length:
            originals = dataset[j:len(dataset)]['original']
            paraphrases = dataset[j:len(dataset)]['paraphrase']
        else:
            originals = dataset[j:j + batch_size]['original']
            paraphrases = dataset[j:j + batch_size]['paraphrase']
        j += batch_size 
        
        encoding = tokenizer(
            [task_prefix + original for original in originals],
            padding="longest",
            max_length=max_length,
            truncation=True,
            return_tensors="pt",
        )
        input_ids, attention_mask = encoding.input_ids, encoding.attention_mask
        
        target_encoding = tokenizer(
            paraphrases,
            padding="longest",
            max_length=max_length,
            truncation=True,
            return_tensors="pt",
        )
        labels = target_encoding.input_ids
        # replace padding token id's of the labels by -100 so it's ignored by the loss
        labels[labels == tokenizer.pad_token_id] = -100
        
        # Zero the gradients
        optimizer.zero_grad()
        
        input_ids = input_ids.to('cuda')
        attention_mask = attention_mask.to('cuda')
        labels = labels.to('cuda')
        
        # forward pass
        loss = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels).loss
        epoch_loss += loss.item()
        loss.backward()
        # (loss / gradient_accumulations_steps).backward()
        
        optimizer.step()
        # # Backward pass and optimization step
        # if (j+1) % gradient_accumulations_steps == 0:
        #     model.zero_grad()
    print('Epoch', epoch+1, 'Loss/train', epoch_loss/dataset_length)
    

Epoch 1 Loss/train 0.40518665611743926
Epoch 2 Loss/train 0.3726112134754658
Epoch 3 Loss/train 0.34450956135988237
Epoch 4 Loss/train 0.32936406061053275
Epoch 5 Loss/train 0.3203792110085487
Epoch 6 Loss/train 0.309032890945673
Epoch 7 Loss/train 0.2965058274567127
Epoch 8 Loss/train 0.2886416710913181
Epoch 9 Loss/train 0.27944999262690545
Epoch 10 Loss/train 0.2702216446399689


In [11]:
# get 2 instructions from the dataset
sentences = ['The ancient temple is the most important structure of the old town.']
sentence_inputs = tokenizer([task_prefix + sentence for sentence in sentences], return_tensors="pt", padding=True)
sentence_inputs = sentence_inputs.to('cuda')

In [12]:
preds = model.generate(
              sentence_inputs['input_ids'],
              do_sample=False, 
              max_length=max_length, 
              num_beams = 10,
              num_beam_groups = 5,
              diversity_penalty = 2.0,
              early_stopping=True,
              num_return_sequences=5
              )

In [13]:
predicted_answers = tokenizer.batch_decode(preds, skip_special_tokens=True, clean_up_tokenization_spaces=True)

In [14]:
predicted_answers = list(set(predicted_answers))

In [15]:
predicted_answers

['Ancient temple is the most important building of the old town',
 'Ancient temple is the most important building in the old town',
 'The most important structure of the old town is an ancient temple',
 'The most important structure of the old town is the ancient temple',
 'The ancient temple is the most important structure of the old town']

In [16]:
tokenizer.save_pretrained('./FineTunedParrotParaphraser');
model.save_pretrained('./FineTunedParrotParaphraser');