In [1]:
import torch
import pandas as pd
import numpy as np
import torch.optim as optim
from datasets import load_dataset
# This one is for paraphrasing
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
torch.backends.cuda.matmul.allow_tf32 = True

In [3]:
tokenizer = AutoTokenizer.from_pretrained("allenai/t5-small-squad2-question-generation")
model = AutoModelForSeq2SeqLM.from_pretrained("allenai/t5-small-squad2-question-generation")

In [4]:
model.to('cuda');

In [5]:
dataset = load_dataset("csv", data_files='./Question Generation Train.csv');
dataset = dataset['train'];

Found cached dataset csv (C:/Users/vmpletsos/.cache/huggingface/datasets/csv/default-d7f5d0f0e208591f/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317)
100%|██████████| 1/1 [00:00<00:00, 999.12it/s]


In [6]:
# Shuffle the dataset
dataset = dataset.shuffle();

In [7]:
epochs = 10
max_length = 256
batch_size = 8
gradient_accumulations_steps = 4

In [8]:
dataset_length = len(dataset)

In [9]:
optimizer = optim.Adam(
            params=model.parameters(),
            lr=1e-6,
            betas=(0.9, 0.999),
            eps=1e-08,
            amsgrad=False,
            weight_decay=1e-5
        )

In [10]:
for epoch in range(epochs):
    # Set the model to training mode
    model.train()
    epoch_loss = 0
    j = 0
    while j < dataset_length:
        if j + batch_size > dataset_length:
            passages = dataset[j:len(dataset)]['passages']
            questions = dataset[j:len(dataset)]['questions']
        else:
            passages = dataset[j:j + batch_size]['passages']
            questions = dataset[j:j + batch_size]['questions']
        j += batch_size 
        
        encoding = tokenizer(
            [original for original in passages],
            padding="longest",
            max_length=max_length,
            truncation=True,
            return_tensors="pt",
        )
        input_ids, attention_mask = encoding.input_ids, encoding.attention_mask
        
        target_encoding = tokenizer(
            questions,
            padding="longest",
            max_length=max_length,
            truncation=True,
            return_tensors="pt",
        )
        labels = target_encoding.input_ids
        # replace padding token id's of the labels by -100 so it's ignored by the loss
        labels[labels == tokenizer.pad_token_id] = -100
        
        # Zero the gradients
        optimizer.zero_grad()
        
        input_ids = input_ids.to('cuda')
        attention_mask = attention_mask.to('cuda')
        labels = labels.to('cuda')
        
        # forward pass
        loss = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels).loss
        epoch_loss += loss.item()
        loss.backward()
        # (loss / gradient_accumulations_steps).backward()
        
        optimizer.step()
        # # Backward pass and optimization step
        # if (j+1) % gradient_accumulations_steps == 0:
        #     model.zero_grad()
    print('Epoch', epoch+1, 'Loss/train', epoch_loss/dataset_length)
    

Epoch 1 Loss/train 0.2889590206600371
Epoch 2 Loss/train 0.2902953757180108
Epoch 3 Loss/train 0.2819773195281861
Epoch 4 Loss/train 0.2791683276494344
Epoch 5 Loss/train 0.28172646628485787
Epoch 6 Loss/train 0.28290758341077776
Epoch 7 Loss/train 0.2774313035465422
Epoch 8 Loss/train 0.2747422721650865
Epoch 9 Loss/train 0.28128238519032794
Epoch 10 Loss/train 0.27715115320114864


In [22]:
# get 2 instructions from the dataset
sentences = ['The deities Apollo and Demetra were worshipped there.']
sentence_inputs = tokenizer([sentence for sentence in sentences], return_tensors="pt", padding=True)
sentence_inputs = sentence_inputs.to('cuda')

In [23]:
preds = model.generate(
              sentence_inputs['input_ids'],
              do_sample=False, 
              max_length=max_length, 
              num_beams = 8,
              num_beam_groups = 4,
              diversity_penalty = 2.0,
              early_stopping=True,
              num_return_sequences=4
              )

In [27]:
generated_questions = tokenizer.batch_decode(preds, skip_special_tokens=True, clean_up_tokenization_spaces=True)

In [28]:
generated_questions = list(set(generated_questions))

In [29]:
generated_questions

['Apollo and Demetra worshiped there.',
 'Where were the deities Apollo and Demetra worshiped?',
 'Where were the deities Apollo and Demetra worshipped?',
 'Where were Apollo and Demetra worshipped?']

In [21]:
tokenizer.save_pretrained('./FineTunedQuestionGeneration');
model.save_pretrained('./FineTunedQuestionGeneration');

In [35]:
text = 'Hello there from 12 BC ancient temple.'

In [36]:
text

'Hello there from 12 BC ancient temple.'