In [1]:
import csv
import math
import os
import sys

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

In [2]:
from transformers import T5Tokenizer, T5ForConditionalGeneration, T5Config,  AutoTokenizer, AutoModelWithLMHead

In [3]:
# Prefer the GPU when CUDA is usable; otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print('Using device:', device)

Using device: cuda


In [4]:
# Print the version of the `python3` executable found on the shell PATH.
!python3 --version

Python 3.8.8


In [5]:
# Read the training and dev CSVs from the user's home directory.
train = pd.read_csv('~/qg_dataset/hotpot_train_fullcontext_v1.1.csv')
# train = train[40001:]
# Keep only the dev rows from position 2001 onward; their original index
# labels are retained (the frame is not re-indexed).
valid = pd.read_csv('~/qg_dataset/hotpot_dev_distractor_fullcontext_v1.csv').iloc[2001:]

train.head()

Unnamed: 0.1,Unnamed: 0,question,text
0,5a7a06935542990198eaf050,Which magazine was started first Arthur's Maga...,<answer> Arthur's Magazine <context> Arthur's ...
1,5a879ab05542996e4f30887e,The Oberoi family is part of a hotel company t...,<answer> Delhi <context> The Oberoi family is ...
2,5a8d7341554299441c6b9fe5,Musician and satirist Allie Goertz wrote a son...,<answer> President Richard Nixon <context> All...
3,5a82171f5542990a1d231f4a,What nationality was James Henry Miller's wife?,<answer> American <context> Margaret Peggy See...
4,5a84dd955542997b5ce3ff79,Cadmium Chloride is slightly soluble in this c...,<answer> alcohol <context> Cadmium chloride is...


In [6]:
# Name of the pretrained checkpoint whose tokenizer is loaded below.
PRETRAINED_MODEL = 't5-base'
# NOTE(review): DIR is not referenced in any of the visible cells — confirm it is still needed.
DIR = "question_generator/"
# Batch size handed to the train and validation DataLoaders.
BATCH_SIZE = 1
# Passed as `max_length` (with truncation enabled) to every tokenizer call in this notebook.
SEQ_LENGTH = 600

tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_MODEL)

# Register the '<answer>' / '<context>' markers that appear in the `text` column
# as additional special tokens of the tokenizer.
# NOTE(review): a model used with this tokenizer presumably needs embeddings sized
# for the enlarged vocabulary — confirm for the checkpoint being loaded.
tokenizer.add_special_tokens(
    {'additional_special_tokens': ['<answer>', '<context>']}
)
class QGDataset(Dataset):
    """Dataset of (input text, question) pairs that are tokenized on access.

    Every item is built from one row of the wrapped DataFrame. The row's first
    column is skipped; among the remaining columns a ``text`` column (model
    input) and a ``question`` column (generation target) must be present.
    """

    def __init__(self, csv):
        """Wrap an already-loaded DataFrame (``csv`` is a DataFrame, not a path)."""
        self.df = csv

    def __len__(self):
        """Return the number of rows, i.e. the number of examples."""
        return len(self.df)

    def __getitem__(self, idx):
        """Tokenize row ``idx`` and return ``(encoded_text, encoded_question)``.

        Both encodings are truncated to ``SEQ_LENGTH`` tokens and moved to the
        module-level ``device``. ``input_ids``/``attention_mask`` of the text
        and ``input_ids`` of the question are 1-D; the question's
        ``attention_mask`` keeps the leading batch axis returned by the
        tokenizer, exactly as before.

        NOTE(review): moving tensors to ``device`` inside ``__getitem__``
        presumably prevents using DataLoader worker processes with CUDA —
        confirm before raising ``num_workers``.
        """
        if torch.is_tensor(idx):
            idx = idx.tolist()
        row = self.df.iloc[idx, 1:]

        encoded_text = tokenizer(
            row['text'],
            padding=True,
            max_length=SEQ_LENGTH,
            truncation=True,
            return_tensors="pt"
        )
        # squeeze(0) removes only the leading batch axis. A bare squeeze()
        # would also collapse a sequence of length 1 (e.g. an empty string
        # that encodes to a single token) into a 0-dim tensor.
        encoded_text['input_ids'] = encoded_text['input_ids'].squeeze(0)
        encoded_text['attention_mask'] = encoded_text['attention_mask'].squeeze(0)

        encoded_question = tokenizer(
            row['question'],
            padding=True,
            max_length=SEQ_LENGTH,
            truncation=True,
            return_tensors='pt'
        )
        encoded_question['input_ids'] = encoded_question['input_ids'].squeeze(0)

        return (encoded_text.to(device), encoded_question.to(device))

# Wrap both frames, then build the loaders: training batches are shuffled,
# validation batches keep the frame's row order.
train_set, valid_set = QGDataset(train), QGDataset(valid)
train_loader = DataLoader(train_set, shuffle=True, batch_size=BATCH_SIZE)
valid_loader = DataLoader(valid_set, shuffle=False, batch_size=BATCH_SIZE)

In [7]:
# Exploratory check (disabled): histogram of the tokenized length of each `text` entry
# import matplotlib.pyplot as plt
# import seaborn as sns

# token_lens = []
# for txt in train.text:
#   tokens = tokenizer.encode(txt, max_length=1024)
#   token_lens.append(len(tokens))

# sns.distplot(token_lens)
# plt.xlim([0, 1024]);
# plt.xlabel('Token lengths');

In [8]:
# Filesystem locations for saved model files (neither constant is read in the visible cells).
# NOTE(review): both constants hold the same path — confirm whether TEMP_SAVE_PATH
# was meant to point at a separate temporary file.
SAVED_MODEL_PATH = "/home2/samyak.ja/qg_dataset/model_hotpot.pth"
TEMP_SAVE_PATH = "/home2/samyak.ja/qg_dataset/model_hotpot.pth"

def evaluate(eval_model, data_loader):
    """Return the mean per-batch loss of ``eval_model`` over ``data_loader``.

    The model is switched to eval mode (and left in it) and the whole pass
    runs with gradient tracking disabled. Each batch must unpack into
    ``(data, target)`` mappings; ``data`` supplies ``input_ids`` and
    ``attention_mask``, while ``target['input_ids']`` is passed through
    ``mask_label_padding`` and used as labels. The loss is read from element 0
    of the model output.

    Raises:
        ZeroDivisionError: if ``data_loader`` yields no batches.
    """
    eval_model.eval()
    batch_losses = []
    with torch.no_grad():
        for inputs, targets in data_loader:
            labels = mask_label_padding(targets['input_ids'])
            result = eval_model(
                input_ids=inputs['input_ids'],
                attention_mask=inputs['attention_mask'],
                labels=labels,
            )
            batch_losses.append(result[0].item())
    # Float start value keeps the running sum identical to a 0.-initialised accumulator.
    return sum(batch_losses, 0.) / len(data_loader)

def mask_label_padding(labels):
    """Return a copy of ``labels`` with padding positions replaced by -100.

    Positions equal to ``tokenizer.pad_token_id`` are set to -100, the default
    ``ignore_index`` of PyTorch's cross-entropy loss, so they do not contribute
    to the loss. The input tensor is left unmodified; callers must use the
    returned tensor.

    Args:
        labels: Integer tensor of target token ids.

    Returns:
        A new tensor with the same shape and dtype as ``labels``.
    """
    MASK_ID = -100
    # masked_fill is out-of-place, so the caller's tensor keeps its pad ids.
    return labels.masked_fill(labels == tokenizer.pad_token_id, MASK_ID)

def load(path, map_location=None):
    """Deserialize an object written by ``torch.save`` from ``path``.

    Args:
        path: Location of the saved file.
        map_location: Optional device remapping forwarded to ``torch.load``
            (e.g. ``'cpu'`` to open a GPU-saved checkpoint on a machine
            without CUDA). ``None`` keeps ``torch.load``'s default behaviour.

    Returns:
        Whatever object was stored in the file.
    """
    # SECURITY: torch.load relies on pickle and can execute arbitrary code
    # while unpickling — only load files from trusted sources.
    return torch.load(path, map_location=map_location)

def print_line():
    """Print a 60-character horizontal rule of dashes to stdout."""
    width = 60
    rule = ''.ljust(width, '-')
    print(rule)

In [8]:
# Load the saved model straight onto the selected device (a checkpoint saved on a
# different GPU, or opened on a CPU-only machine, would otherwise fail or end up
# on the wrong device) and switch it to eval mode so dropout is disabled during
# generation. `.eval()` returns the module itself.
# SECURITY: torch.load unpickles the file — only load checkpoints you trust.
md2 = torch.load("/home2/samyak.ja/qg_dataset/model_hotpot_last.pth", map_location=device).eval()

In [9]:
def inference(review_text, model, device, **generate_kwargs):
    """Generate text for a single input string and return it decoded.

    Args:
        review_text: Raw input string; it is tokenized with the module-level
            ``tokenizer`` and truncated to ``SEQ_LENGTH`` tokens.
        model: Object exposing ``generate`` (and, for torch modules,
            ``training``/``eval``).
        device: Device the encoded inputs are moved to.
            NOTE(review): presumably must match the device the model lives on.
        **generate_kwargs: Optional keyword arguments forwarded unchanged to
            ``model.generate`` (for example ``max_length`` or ``num_beams``).
            When omitted, the model's own generation defaults apply.

    Returns:
        The first generated sequence, decoded with special tokens skipped.
    """
    # Generation should not run with dropout active; a model restored in
    # training mode is switched to eval mode before use.
    if getattr(model, 'training', False):
        model.eval()

    encoded_text = tokenizer(
        review_text,
        padding=True,
        max_length=SEQ_LENGTH,
        truncation=True,
        return_tensors="pt"
    ).to(device)

    with torch.no_grad():
        # Pass the attention mask explicitly rather than letting generate() infer it.
        output = model.generate(
            encoded_text['input_ids'],
            attention_mask=encoded_text['attention_mask'],
            **generate_kwargs
        )
    return tokenizer.decode(output[0], skip_special_tokens=True)

In [10]:
# Write one "question,predicted" row per validation example.
file_output = "/home2/samyak.ja/qg_dataset/predicted_full_context.csv"

# csv.writer quotes any field that contains a comma, quote or newline; joining
# the two strings with "," by hand produced rows with a variable number of
# columns whenever a question or prediction contained a comma.
# The `with` block closes the file even if inference raises part-way through.
with open(file_output, 'w', newline='') as csvfile:
    writer = csv.writer(csvfile, lineterminator="\n")
    writer.writerow(["question", "predicted"])

    for i in range(len(valid)):
        # Positional columns 1 and 2 of `valid`: presumably the reference
        # question and the model input text — verify against the CSV schema.
        test_output = valid.iloc[i, 1]
        review_text = valid.iloc[i, 2]
        predicted = inference(review_text, md2, device)
        writer.writerow([test_output, predicted])

        # Lightweight progress indicator.
        if i % 500 == 0:
            print(i)

0
500
1000
1500
2000
2500
3000
3500
4000
4500
5000
