In [72]:
import torch
import pandas as pd
import numpy as np
import torch.optim as optim
from datasets import load_dataset
# This one is for paraphrasing
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

In [73]:
torch.backends.cuda.matmul.allow_tf32 = True

In [74]:
tokenizer = AutoTokenizer.from_pretrained("allenai/t5-small-squad2-question-generation")
model = AutoModelForSeq2SeqLM.from_pretrained("allenai/t5-small-squad2-question-generation")

In [75]:
model.to('cuda');

In [76]:
dataset = load_dataset("csv", data_files='./Question Generation Train.csv');
dataset = dataset['train'];

Downloading and preparing dataset csv/default to C:/Users/vmpletsos/.cache/huggingface/datasets/csv/default-447a1f41a36463c4/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317...


Downloading data files: 100%|██████████| 1/1 [00:00<?, ?it/s]
Extracting data files: 100%|██████████| 1/1 [00:00<00:00, 1001.74it/s]
  return pd.read_csv(xopen(filepath_or_buffer, "rb", use_auth_token=use_auth_token), **kwargs)
                                                        

Dataset csv downloaded and prepared to C:/Users/vmpletsos/.cache/huggingface/datasets/csv/default-447a1f41a36463c4/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317. Subsequent calls will reuse this data.


100%|██████████| 1/1 [00:00<00:00, 999.36it/s]


In [77]:
# Shuffle the dataset
dataset = dataset.shuffle();

In [78]:
epochs = 20
max_length = 256
batch_size = 8
gradient_accumulations_steps = 4

In [79]:
dataset_length = len(dataset)

In [80]:
optimizer = optim.Adam(
            params=model.parameters(),
            lr=5e-5,
            betas=(0.9, 0.999),
            eps=1e-08,
            amsgrad=False,
            weight_decay=1e-5
        )

In [81]:
for epoch in range(epochs):
    # Set the model to training mode
    model.train()
    epoch_loss = 0
    j = 0
    while j < dataset_length:
        if j + batch_size > dataset_length:
            passages = dataset[j:len(dataset)]['passages']
            questions = dataset[j:len(dataset)]['questions']
        else:
            passages = dataset[j:j + batch_size]['passages']
            questions = dataset[j:j + batch_size]['questions']
        j += batch_size 
        
        encoding = tokenizer(
            [original for original in passages],
            padding="longest",
            max_length=max_length,
            truncation=True,
            return_tensors="pt",
        )
        input_ids, attention_mask = encoding.input_ids, encoding.attention_mask
        
        target_encoding = tokenizer(
            questions,
            padding="longest",
            max_length=max_length,
            truncation=True,
            return_tensors="pt",
        )
        labels = target_encoding.input_ids
        # replace padding token id's of the labels by -100 so it's ignored by the loss
        labels[labels == tokenizer.pad_token_id] = -100
        
        # Zero the gradients
        optimizer.zero_grad()
        
        input_ids = input_ids.to('cuda')
        attention_mask = attention_mask.to('cuda')
        labels = labels.to('cuda')
        
        # forward pass
        loss = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels).loss
        epoch_loss += loss.item()
        loss.backward()
        # (loss / gradient_accumulations_steps).backward()
        
        optimizer.step()
        # # Backward pass and optimization step
        # if (j+1) % gradient_accumulations_steps == 0:
        #     model.zero_grad()
    print('Epoch', epoch+1, 'Loss/train', epoch_loss/dataset_length)
    

Epoch 1 Loss/train 0.2759139554141319
Epoch 2 Loss/train 0.2334301471710205
Epoch 3 Loss/train 0.2154408962759253
Epoch 4 Loss/train 0.1949857416218274
Epoch 5 Loss/train 0.18489217227452423
Epoch 6 Loss/train 0.17641302371678286
Epoch 7 Loss/train 0.1637298293309669
Epoch 8 Loss/train 0.15758033484628756
Epoch 9 Loss/train 0.1479023192843346
Epoch 10 Loss/train 0.1408114396545985
Epoch 11 Loss/train 0.13821955862110608
Epoch 12 Loss/train 0.13101107065808282
Epoch 13 Loss/train 0.1248355454778018
Epoch 14 Loss/train 0.11895513677433746
Epoch 15 Loss/train 0.11571758189429976
Epoch 16 Loss/train 0.10907639746796595
Epoch 17 Loss/train 0.10892662263079865
Epoch 18 Loss/train 0.10598223768684963
Epoch 19 Loss/train 0.09995983357298864
Epoch 20 Loss/train 0.09285547655739196


In [88]:
# get 2 instructions from the dataset
sentences = ['It was dated to the late Classical to early Hellenistic times.']
sentence_inputs = tokenizer([sentence for sentence in sentences], return_tensors="pt", padding=True)
sentence_inputs = sentence_inputs.to('cuda')

In [89]:
preds = model.generate(
              sentence_inputs['input_ids'],
              do_sample=False, 
              max_length=max_length, 
              num_beams = 8,
              num_beam_groups = 4,
              diversity_penalty = 2.0,
              early_stopping=True,
              num_return_sequences=4
              )

In [90]:
generated_questions = tokenizer.batch_decode(preds, skip_special_tokens=True, clean_up_tokenization_spaces=True)

In [91]:
generated_questions = list(set(generated_questions))

In [92]:
generated_questions

['When it was dated to?',
 'When it was dated?',
 'When did the time was it was dat?']

In [87]:
tokenizer.save_pretrained('./FineTunedQuestionGeneration');
model.save_pretrained('./FineTunedQuestionGeneration');