In [1]:
import pandas as pd

In [2]:
# Load the question/answer pairs from a CSV file located next to the notebook.
df = pd.read_csv('./100_Unique_QA_Dataset.csv')

In [3]:
# Preview the first rows to confirm the file was parsed as expected.
df.head()

Unnamed: 0,question,answer
0,What is the capital of France?,Paris
1,What is the capital of Germany?,Berlin
2,Who wrote 'To Kill a Mockingbird'?,Harper-Lee
3,What is the largest planet in our solar system?,Jupiter
4,What is the boiling point of water in Celsius?,100


In [6]:
# Normalise a sentence and break it into word tokens.
def token(text):
    """Lowercase ``text``, strip every '?' and "'" character, and split on whitespace.

    Returns the list of remaining word strings (empty if nothing is left).
    Other punctuation, such as '-', is kept as part of the word.
    """
    cleaned = text.lower()
    for unwanted in ('?', "'"):
        cleaned = cleaned.replace(unwanted, '')
    return cleaned.split()

In [7]:
# Vocabulary: maps each token string to an integer index.
# Index 0 is reserved for '<UNK>', the entry used for tokens not in the vocabulary.
vocab = {'<UNK>':0}

In [18]:
def buildvocab(row):
    """Add every token from one row's question and answer to the global ``vocab``.

    ``row`` must support ``row['question']`` and ``row['answer']``. Tokens that
    are already present keep their index; each new token receives
    ``len(vocab)`` at the moment it is inserted. Returns None, since the
    function is called only for its side effect on ``vocab``.
    """
    words = token(row['question']) + token(row['answer'])
    for word in words:
        if word in vocab:
            continue
        vocab[word] = len(vocab)

In [19]:
# Run buildvocab on every row. It fills `vocab` as a side effect and returns
# None, so the Series of None values displayed below carries no information.
df.apply(buildvocab, axis=1)

0     None
1     None
2     None
3     None
4     None
      ... 
85    None
86    None
87    None
88    None
89    None
Length: 90, dtype: object

In [26]:
def texttoindices(text,vocab):
    """Encode ``text`` as a list of vocabulary indices.

    The text is tokenised with ``token``. Each token found in ``vocab`` is
    replaced by its index; any other token is replaced by ``vocab['<UNK>']``.
    """
    return [
        vocab[word] if word in vocab else vocab['<UNK>']
        for word in token(text)
    ]

In [28]:
# Sanity check: 'ert' is not in the vocabulary, so it should map to index 0 ('<UNK>').
texttoindices('what is ert',vocab)

[1, 2, 0]

In [29]:
import torch
from torch.utils.data import DataLoader,Dataset

In [30]:
class QAdataset(Dataset):
    """Dataset that yields one (question, answer) pair of index tensors per DataFrame row."""

    def __init__(self, df, vocab):
        # Only references are stored; rows are encoded lazily in __getitem__.
        self.vocab = vocab
        self.df = df

    def __len__(self):
        """Return the number of rows in the wrapped DataFrame."""
        return self.df.shape[0]

    def __getitem__(self, idx):
        """Return ``(question_tensor, answer_tensor)`` for the row at position ``idx``.

        Both tensors hold the vocabulary indices of the row's tokens and are
        not padded, so their lengths vary from row to row.
        """
        row = self.df.iloc[idx]
        question_ids = texttoindices(row['question'], self.vocab)
        answer_ids = texttoindices(row['answer'], self.vocab)
        return torch.tensor(question_ids), torch.tensor(answer_ids)

In [31]:
# Wrap the DataFrame so that rows are served as (question, answer) index tensors.
dataset = QAdataset(df,vocab)

In [32]:
# One sample per batch, visited in a new random order each epoch.
# NOTE(review): batch_size=1 is presumably required because the dataset returns
# unpadded tensors of different lengths and no collate function is supplied; confirm.
dataloader = DataLoader(dataset,batch_size=1,shuffle=True)

In [35]:
import torch.nn as nn

In [45]:
class RNN(nn.Module):
    """Single-layer RNN that maps a tokenised question to logits over the vocabulary.

    Args:
        vocab_size: number of vocabulary entries; sizes both the embedding
            table and the output layer.
        embedding_dim: size of each token embedding (default 50).
        hidden_size: size of the RNN hidden state (default 64).
    """

    def __init__(self, vocab_size, embedding_dim=50, hidden_size=64):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, embedding_dim=embedding_dim)
        # batch_first=True: the RNN consumes input shaped (batch, seq_len, embedding_dim).
        self.rnn = nn.RNN(embedding_dim, hidden_size, batch_first=True)
        self.out = nn.Linear(hidden_size, vocab_size)

    def forward(self, question):
        """Return logits of shape (batch, vocab_size).

        ``question`` is an integer tensor of token indices shaped (batch, seq_len).
        """
        embedded = self.embed(question)
        # nn.RNN returns (per-step outputs, final hidden state); only the final state is used.
        _, final = self.rnn(embedded)
        # final is (num_layers=1, batch, hidden_size); drop the layer axis so the
        # linear layer produces (batch, vocab_size).
        return self.out(final.squeeze(0))


In [46]:
# Training hyperparameters.
lr = 0.001  # learning rate passed to the optimizer
epochs = 20  # number of full passes over the dataloader

In [47]:
# The model's embedding table and output layer are sized to the current vocabulary,
# so the vocabulary must be fully built before this cell runs.
model = RNN(len(vocab))

In [48]:
# Cross-entropy loss over the vocabulary logits; Adam updates every model parameter using `lr`.
criteri = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(),lr=lr)

In [50]:
# Train `model` for `epochs` passes over `dataloader`, one optimizer step per batch.
for epoch in range(epochs):
    totalloss =0  # running sum of the per-batch losses for this epoch
    for q,ans in dataloader:
        optimizer.zero_grad()  # clear gradients left over from the previous step
        outputs = model(q)
        # ans[0] removes the leading batch axis of the answer tensor before it is used as the target.
        loss = criteri(outputs,ans[0])
        loss.backward()
        optimizer.step()
        totalloss += loss.item()
    # NOTE: despite the "total loss" label, the printed value is totalloss divided by
    # the number of batches, i.e. the mean loss per batch for the epoch.
    print(f'epoch {epoch}, total loss: {totalloss/len(dataloader)}')

epoch 0, total loss: 0.11130448633597957
epoch 1, total loss: 0.09779223824540774
epoch 2, total loss: 0.0864603472666608
epoch 3, total loss: 0.0771029078712066
epoch 4, total loss: 0.0691293389019039
epoch 5, total loss: 0.06223727423283789
epoch 6, total loss: 0.056163932445148626
epoch 7, total loss: 0.05108973539123932
epoch 8, total loss: 0.046619675784475276
epoch 9, total loss: 0.04274657995750507
epoch 10, total loss: 0.039196566575103335
epoch 11, total loss: 0.03606103153692351
epoch 12, total loss: 0.033253708937101896
epoch 13, total loss: 0.030773752513859005
epoch 14, total loss: 0.02850962748958005
epoch 15, total loss: 0.026466217761238416
epoch 16, total loss: 0.024630223411238855
epoch 17, total loss: 0.022986217971063323
epoch 18, total loss: 0.021413168062766393
epoch 19, total loss: 0.02004377781930897


In [59]:
def predict(model,question,th=0.5):
    """Print the model's most likely answer token for ``question``.

    Args:
        model: called with a (1, seq_len) tensor of token indices; expected to
            return logits of shape (1, vocab_size).
        question: raw question text, encoded with ``texttoindices`` and the
            global ``vocab``.
        th: minimum softmax probability required to report an answer.

    Prints 'no answer' (and nothing else) when the question has no tokens or
    when the best probability is below ``th``. Otherwise prints the vocabulary
    token with the highest probability. Returns None.
    """
    q = texttoindices(question,vocab)
    if not q:
        # An empty index list cannot be fed through the embedding layer.
        print('no answer')
        return
    qtensor = torch.tensor(q).unsqueeze(0)  # add the batch axis: (seq_len,) -> (1, seq_len)
    with torch.no_grad():  # inference only, so no autograd graph is needed
        outputs = model(qtensor)  # raw logits
        prob = torch.nn.functional.softmax(outputs, dim=1)  # logits -> probabilities
    value ,index = torch.max(prob,dim=1)  # best probability and its vocabulary index
    if value.item() < th:
        print('no answer')
        return  # stop here; otherwise a low-confidence token would also be printed
    # The dict preserves insertion order, so position i in the keys is the token with index i.
    print(list(vocab.keys())[index.item()])


In [62]:
# Try the trained model on a free-form question, using the default 0.5 threshold.
predict(model,'what is capital of india')

delhi
