In [7]:
import torch
import pandas as pd
import numpy as np
import torch.optim as optim
from datasets import load_dataset
# This one is for paraphrasing
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

In [8]:
torch.backends.cuda.matmul.allow_tf32 = True

In [9]:
tokenizer = AutoTokenizer.from_pretrained("prithivida/parrot_paraphraser_on_T5")
model = AutoModelForSeq2SeqLM.from_pretrained("prithivida/parrot_paraphraser_on_T5")

In [10]:
model.to('cuda');

In [16]:
dataset = load_dataset("csv", data_files='./Paraphraser Train.csv')
dataset = dataset['train']

Downloading and preparing dataset csv/default to C:/Users/vmpletsos/.cache/huggingface/datasets/csv/default-4468969375aafa41/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317...


Downloading data files: 100%|██████████| 1/1 [00:00<00:00, 1000.31it/s]
Extracting data files: 100%|██████████| 1/1 [00:00<00:00, 499.68it/s]
  return pd.read_csv(xopen(filepath_or_buffer, "rb", use_auth_token=use_auth_token), **kwargs)
                                                        

Dataset csv downloaded and prepared to C:/Users/vmpletsos/.cache/huggingface/datasets/csv/default-4468969375aafa41/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317. Subsequent calls will reuse this data.


100%|██████████| 1/1 [00:00<00:00, 500.51it/s]


In [17]:
epochs = 2
max_length = 256
task_prefix = "paraphrase: "
batch_size = 5
gradient_accumulations_steps = 4

In [18]:
dataset_length = len(dataset)

In [19]:
optimizer = optim.Adam(
            params=model.parameters(),
            lr=5e-4,
            betas=(0.9, 0.999),
            eps=1e-08,
            amsgrad=False,
            weight_decay=1e-5
        )

In [20]:
for epoch in range(epochs):
    # Set the model to training mode
    model.train()
    epoch_loss = 0
    j = 0
    while j < dataset_length:
        if j + batch_size > dataset_length:
            originals = dataset[j:len(dataset)]['original']
            paraphrases = dataset[j:len(dataset)]['paraphrase']
        else:
            originals = dataset[j:j + batch_size]['original']
            paraphrases = dataset[j:j + batch_size]['paraphrase']
        j += batch_size 
        
        encoding = tokenizer(
            [task_prefix + original for original in originals],
            padding="longest",
            max_length=max_length,
            truncation=True,
            return_tensors="pt",
        )
        input_ids, attention_mask = encoding.input_ids, encoding.attention_mask
        
        target_encoding = tokenizer(
            paraphrases,
            padding="longest",
            max_length=max_length,
            truncation=True,
            return_tensors="pt",
        )
        labels = target_encoding.input_ids
        # replace padding token id's of the labels by -100 so it's ignored by the loss
        labels[labels == tokenizer.pad_token_id] = -100
        
        # Zero the gradients
        optimizer.zero_grad()
        
        input_ids = input_ids.to('cuda')
        attention_mask = attention_mask.to('cuda')
        labels = labels.to('cuda')
        
        # forward pass
        loss = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels).loss
        epoch_loss += loss.item()
        loss.backward()
        # (loss / gradient_accumulations_steps).backward()
        
        optimizer.step()
        # # Backward pass and optimization step
        # if (j+1) % gradient_accumulations_steps == 0:
        #     model.zero_grad()
    print('Epoch', epoch+1, 'Loss/train', epoch_loss/dataset_length)
    

Epoch 1 Loss/train 3.4348686933517456
Epoch 2 Loss/train 1.6255252659320831


In [29]:
# get 2 instructions from the dataset
questions = ['How to maintain marble with cracks?', 'How to remove rust stains from corroded connectors?', 'Will the project be finished on time and everything will be ok']
question_inputs = tokenizer([task_prefix + question for question in questions], return_tensors="pt", padding=True)
question_inputs = question_inputs.to('cuda')

In [32]:
preds = model.generate(
              question_inputs['input_ids'],
              do_sample=False, 
              max_length=max_length, 
              num_beams = 10,
              num_beam_groups = 2,
              diversity_penalty = 2.0,
              early_stopping=True,
              num_return_sequences=3
              )

In [33]:
predicted_answers = tokenizer.batch_decode(preds, skip_special_tokens=True, clean_up_tokenization_spaces=True)

In [34]:
predicted_answers

['How to maintain marble with cracks?',
 'How to maintain the marble with cracks?',
 'How to maintain a marble with cracks?',
 'How to remove rust stains from corroded connectors?',
 'How to clean rust stains from corroded connectors?',
 'How to clear rust stains from corroded connectors?',
 'Will the project be completed on time and everything will be okay',
 'Will the project be finished on time and everything will be ok?',
 'Will the project be completed on time and everything will be ok']