In [1]:
import json

with open('train-v2.0.json') as f:
    data = json.load(f)

In [2]:
def prepare_data(data):
    articles = []
    
    for article in data["data"]:
        for paragraph in article["paragraphs"]:
            for qa in paragraph["qas"]:
                question = qa["question"]

                if not qa["is_impossible"]:
                  answer = qa["answers"][0]["text"]
                
                inputs = {"context": paragraph["context"], "question": question, "answer": answer}

            
                articles.append(inputs)

    return articles

In [3]:
import pandas as pd
data = prepare_data(data)

# Create a Dataframe
data = pd.DataFrame(data)

In [4]:
import torch
from tqdm import tqdm
from torch.optim import adam
#import evaluate
from torch.utils.data import Dataset,DataLoader,RandomSampler
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from transformers import  T5ForConditionalGeneration,T5TokenizerFast

In [5]:
TOKENIZER = T5TokenizerFast.from_pretrained('t5-base')
MODEL = T5ForConditionalGeneration.from_pretrained('t5-base',return_dict=True)
OPTIMIZER = torch.optim.Adam(MODEL.parameters(),lr = 0.00001)
Q_LEN = 512
T_LEN = 128
BATCH_SIZE = 4

DEVICE = torch.device("mps" if torch.backends.mps.is_available() else "cpu")

MODEL = MODEL.to(DEVICE)

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [6]:
# Importing the Dataset class from PyTorch
from torch.utils.data import Dataset

# Defining a new class QA_Dataset that inherits from the Dataset class
class QA_Dataset(Dataset):
    # The constructor method for the class
    def __init__(self, tokenizer, dataframe, q_len, t_len):
        self.tokenizer = tokenizer  # The tokenizer to be used
        self.q_len = q_len  # The maximum length for the questions
        self.t_len = t_len  # The maximum length for the answers
        self.data = dataframe  # The dataframe containing the data
        self.questions = self.data["question"]  # The questions from the dataframe
        self.context = self.data["context"]  # The context from the dataframe
        self.answer = self.data['answer']  # The answers from the dataframe

    # Method to get the length of the dataset
    def __len__(self):
        return len(self.questions)  # Returns the number of questions in the dataset

    # Method to get a specific item from the dataset
    def __getitem__(self, idx):
        question = self.questions[idx]  # The question at the given index
        context = self.context[idx]  # The context at the given index
        answer = self.answer[idx]  # The answer at the given index

        # Tokenizing the question and context with the given maximum length and padding
        question_tokenized = self.tokenizer(question, context, max_length=self.q_len, padding="max_length",
                                                    truncation=True, pad_to_max_length=True, add_special_tokens=True)
        # Tokenizing the answer with the given maximum length and padding
        answer_tokenized = self.tokenizer(answer, max_length=self.t_len, padding="max_length", 
                                          truncation=True, pad_to_max_length=True, add_special_tokens=True)

        # Creating a tensor from the tokenized answer's input ids
        labels = torch.tensor(answer_tokenized["input_ids"], dtype=torch.long)
        # Replacing all 0s in the labels tensor with -100
        labels[labels == 0] = -100

        # Returning a dictionary with the tokenized question's input ids, attention mask, labels, and the tokenized answer's attention mask
        return {
            "input_ids": torch.tensor(question_tokenized["input_ids"], dtype=torch.long),
            "attention_mask": torch.tensor(question_tokenized["attention_mask"], dtype=torch.long),
            "labels": labels,
            "decoder_attention_mask": torch.tensor(answer_tokenized["attention_mask"], dtype=torch.long)
        }


In [7]:
# Dataloader

train_data, val_data = train_test_split(data.iloc[:1000], test_size=0.2, random_state=42)

train_sampler = RandomSampler(train_data.index)
val_sampler = RandomSampler(val_data.index)

qa_dataset = QA_Dataset(TOKENIZER, data, Q_LEN, T_LEN)

train_loader = DataLoader(qa_dataset, batch_size=BATCH_SIZE, sampler=train_sampler)
val_loader = DataLoader(qa_dataset, batch_size=BATCH_SIZE, sampler=val_sampler)

In [8]:
# Initializing the training and validation loss to 0
train_loss = 0
val_loss = 0

# Initializing the count of training and validation batches to 0
train_batch_count = 0
val_batch_count = 0

# Looping over the epochs
for epoch in range(2):
    # Setting the model to training mode
    MODEL.train()
    
    # Looping over the training data loader
    for batch in tqdm(train_loader, desc="Training batches"):
        # Moving the batch data to the device
        input_ids = batch["input_ids"].to(DEVICE)
        attention_mask = batch["attention_mask"].to(DEVICE)
        labels = batch["labels"].to(DEVICE)
        decoder_attention_mask = batch["decoder_attention_mask"].to(DEVICE)

        # Forward pass through the model
        outputs = MODEL(
                          input_ids=input_ids,
                          attention_mask=attention_mask,
                          labels=labels,
                          decoder_attention_mask=decoder_attention_mask
                        )

        # Zeroing the gradients
        OPTIMIZER.zero_grad()
        
        # Backward pass to calculate the gradients
        outputs.loss.backward()
        
        # Updating the weights
        OPTIMIZER.step()
        
        # Accumulating the training loss
        train_loss += outputs.loss.item()
        
        # Incrementing the count of training batches
        train_batch_count += 1
    
    # Setting the model to evaluation mode
    MODEL.eval()
    
    # Looping over the validation data loader
    for batch in tqdm(val_loader, desc="Validation batches"):
        # Moving the batch data to the device
        input_ids = batch["input_ids"].to(DEVICE)
        attention_mask = batch["attention_mask"].to(DEVICE)
        labels = batch["labels"].to(DEVICE)
        decoder_attention_mask = batch["decoder_attention_mask"].to(DEVICE)

        # Forward pass through the model
        outputs = MODEL(
                          input_ids=input_ids,
                          attention_mask=attention_mask,
                          labels=labels,
                          decoder_attention_mask=decoder_attention_mask
                        )

        # Zeroing the gradients
        OPTIMIZER.zero_grad()
        
        # Backward pass to calculate the gradients
        outputs.loss.backward()
        
        # Updating the weights
        OPTIMIZER.step()
        
        # Accumulating the validation loss
        val_loss += outputs.loss.item()
        
        # Incrementing the count of validation batches
        val_batch_count += 1
        
    # Printing the average training and validation loss for the epoch
    print(f"{epoch+1}/{2} -> Train loss: {train_loss / train_batch_count}\tValidation loss: {val_loss/val_batch_count}")

Training batches: 100%|███████████████████████| 200/200 [06:22<00:00,  1.91s/it]
Validation batches: 100%|███████████████████████| 50/50 [01:28<00:00,  1.77s/it]


1/2 -> Train loss: 0.11002866379916668	Validation loss: 0.048757441826164725


Training batches: 100%|███████████████████████| 200/200 [06:35<00:00,  1.98s/it]
Validation batches: 100%|███████████████████████| 50/50 [01:29<00:00,  1.79s/it]

2/2 -> Train loss: 0.07740694308420643	Validation loss: 0.03301481024478562





In [9]:
import evaluate

def predict_answer(context, question, ref_answer=None):
    inputs = TOKENIZER(question, context, max_length=Q_LEN, padding="max_length", truncation=True, add_special_tokens=True)
    
    input_ids = torch.tensor(inputs["input_ids"], dtype=torch.long).to(DEVICE).unsqueeze(0)
    attention_mask = torch.tensor(inputs["attention_mask"], dtype=torch.long).to(DEVICE).unsqueeze(0)

    outputs = MODEL.generate(input_ids=input_ids, attention_mask=attention_mask)
  
    predicted_answer = TOKENIZER.decode(outputs.flatten(), skip_special_tokens=True)
    
    if ref_answer:
        # Load the Bleu metric
        bleu = evaluate.load("google_bleu")
        score = bleu.compute(predictions=[predicted_answer], 
                            references=[ref_answer])
    
        print("Context: \n", context)
        print("\n")
        print("Question: \n", question)
        return {
            "Reference Answer: ": ref_answer, 
            "Predicted Answer: ": predicted_answer, 
            "BLEU Score: ": score
        }
    else:
        return predicted_answer

In [12]:
context = data.iloc[0]['context']
answer = data.iloc[0]['answer']
question = data.iloc[0]['question']

predict_answer(context, question, answer)

Context: 
 Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny's Child. Managed by her father, Mathew Knowles, the group became one of the world's best-selling girl groups of all time. Their hiatus saw the release of Beyoncé's debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles "Crazy in Love" and "Baby Boy".


Question: 
 When did Beyonce start becoming popular?


{'Reference Answer: ': 'in the late 1990s',
 'Predicted Answer: ': 'late 1990s',
 'BLEU Score: ': {'google_bleu': 0.3}}

In [14]:
context = data.iloc[100]['context']
answer = data.iloc[100]['answer']
question = data.iloc[100]['question']

predict_answer(context, question, answer)

Context: 
 The remaining band members recorded "Independent Women Part I", which appeared on the soundtrack to the 2000 film, Charlie's Angels. It became their best-charting single, topping the U.S. Billboard Hot 100 chart for eleven consecutive weeks. In early 2001, while Destiny's Child was completing their third album, Beyoncé landed a major role in the MTV made-for-television film, Carmen: A Hip Hopera, starring alongside American actor Mekhi Phifer. Set in Philadelphia, the film is a modern interpretation of the 19th century opera Carmen by French composer Georges Bizet. When the third album Survivor was released in May 2001, Luckett and Roberson filed a lawsuit claiming that the songs were aimed at them. The album debuted at number one on the U.S. Billboard 200, with first-week sales of 663,000 copies sold. The album spawned other number-one hits, "Bootylicious" and the title track, "Survivor", the latter of which earned the group a Grammy Award for Best R&B Performance by a Duo 

{'Reference Answer: ': 'eleven',
 'Predicted Answer: ': 'eleven',
 'BLEU Score: ': {'google_bleu': 1.0}}