In [1]:
import torch
import pandas as pd
import numpy as np
import torch.optim as optim
from datasets import load_dataset
# This one is for paraphrasing
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
torch.backends.cuda.matmul.allow_tf32 = True

In [3]:
tokenizer = AutoTokenizer.from_pretrained("allenai/t5-small-squad2-question-generation")
model = AutoModelForSeq2SeqLM.from_pretrained("allenai/t5-small-squad2-question-generation")

'(ReadTimeoutError("HTTPSConnectionPool(host='huggingface.co', port=443): Read timed out. (read timeout=10)"), '(Request ID: 325523c6-545d-43ae-bfba-5a1dc366c414)')' thrown while requesting HEAD https://huggingface.co/allenai/t5-small-squad2-question-generation/resolve/main/tokenizer_config.json


In [None]:
model.to('cuda');

In [None]:
dataset = load_dataset("csv", data_files='./Question Generation Train.csv');
dataset = dataset['train'];

Found cached dataset csv (C:/Users/vmpletsos/.cache/huggingface/datasets/csv/default-1689b4499577ff6c/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317)
100%|██████████| 1/1 [00:00<00:00, 1000.31it/s]


In [None]:
# Shuffle the dataset
dataset = dataset.shuffle();

In [None]:
dataset

Dataset({
    features: ['passages', 'questions'],
    num_rows: 702
})

In [None]:
epochs = 20
max_length = 524
batch_size = 64

In [None]:
dataset_length = len(dataset)

In [68]:
optimizer = optim.Adam(
            params=model.parameters(),
            lr=2e-5,
            betas=(0.9, 0.999),
            eps=1e-08,
            amsgrad=False,
            weight_decay=1e-6
        )

In [69]:
for epoch in range(epochs):
    # Set the model to training mode
    model.train()
    epoch_loss = 0
    j = 0
    while j < dataset_length:
        if j + batch_size > dataset_length:
            passages = dataset[j:len(dataset)]['passages']
            questions = dataset[j:len(dataset)]['questions']
        else:
            passages = dataset[j:j + batch_size]['passages']
            questions = dataset[j:j + batch_size]['questions']
        j += batch_size 
        
        encoding = tokenizer(
            [original for original in passages],
            padding="longest",
            max_length=max_length,
            truncation=True,
            return_tensors="pt",
        )
        input_ids, attention_mask = encoding.input_ids, encoding.attention_mask
        
        target_encoding = tokenizer(
            questions,
            padding="longest",
            max_length=max_length,
            truncation=True,
            return_tensors="pt",
        )
        labels = target_encoding.input_ids
        # replace padding token id's of the labels by -100 so it's ignored by the loss
        labels[labels == tokenizer.pad_token_id] = -100
        
        # Zero the gradients
        optimizer.zero_grad()
        
        input_ids = input_ids.to('cuda')
        attention_mask = attention_mask.to('cuda')
        labels = labels.to('cuda')
        
        # forward pass
        loss = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels).loss
        epoch_loss += loss.item()
        loss.backward()
        # (loss / gradient_accumulations_steps).backward()
        
        optimizer.step()
        # # Backward pass and optimization step
        # if (j+1) % gradient_accumulations_steps == 0:
        #     model.zero_grad()
    print('Epoch', epoch+1, 'Loss/train', epoch_loss/dataset_length)
    

Epoch 1 Loss/train 0.05065114686915602
Epoch 2 Loss/train 0.05005071379921653
Epoch 3 Loss/train 0.049397751033480916
Epoch 4 Loss/train 0.04836491545624048
Epoch 5 Loss/train 0.04803230755489942
Epoch 6 Loss/train 0.04733248609951165
Epoch 7 Loss/train 0.04677258838306774
Epoch 8 Loss/train 0.04634768382545091
Epoch 9 Loss/train 0.04612297046918673
Epoch 10 Loss/train 0.04568388455074903


In [90]:
# get 2 instructions from the dataset
sentences = [
    'The most valuable findings there were dated from archaic times',
    'This building was probably used as the market of Hellenistic times'
    ]
sentence_inputs = tokenizer([sentence for sentence in sentences], return_tensors="pt", padding=True)
sentence_inputs = sentence_inputs.to('cuda')

In [91]:
preds = model.generate(
              sentence_inputs['input_ids'],
              do_sample=False, 
              max_length=max_length, 
              num_beams = 16,
              num_beam_groups = 4,
              diversity_penalty = 2.0,
              early_stopping=True,
              num_return_sequences=4
              )

In [92]:
generated_questions = tokenizer.batch_decode(preds, skip_special_tokens=True, clean_up_tokenization_spaces=True)

In [93]:
generated_questions = list(set(generated_questions))

In [94]:
generated_questions

['The most valuable findings were dated from what time?',
 'The most valuable findings were dated from what?',
 'What building was used as a market in Hellenistic times?',
 'What building was used as a market for Hellenistic time?',
 'What building was used as a market for Hellenistic times?',
 'What were the most valuable findings there?',
 'What building was used to be used as a market?',
 'The most valuable findings were dated from what times?']

In [75]:
tokenizer.save_pretrained('./FineTunedQuestionGeneration');
model.save_pretrained('./FineTunedQuestionGeneration');