In [17]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [18]:
# Load the question/answer CSV into a DataFrame.
# NOTE(review): absolute "/content/..." path — presumably a Colab upload location; adjust when running elsewhere.
df=pd.read_csv("/content/100_Unique_QA_Dataset.csv")

In [19]:
df

Unnamed: 0,question,answer
0,What is the capital of France?,Paris
1,What is the capital of Germany?,Berlin
2,Who wrote 'To Kill a Mockingbird'?,Harper-Lee
3,What is the largest planet in our solar system?,Jupiter
4,What is the boiling point of water in Celsius?,100
...,...,...
85,Who directed the movie 'Titanic'?,JamesCameron
86,Which superhero is also known as the Dark Knight?,Batman
87,What is the capital of Brazil?,Brasilia
88,Which fruit is known as the king of fruits?,Mango


In [20]:
#tokenize
def tokenize(text):
    """Split text into lowercase, whitespace-separated tokens.

    Question marks and apostrophes are deleted before splitting; every
    other character (including other punctuation such as hyphens) is kept.
    """
    lowered = text.lower()
    # Characters that are removed outright rather than treated as separators.
    for unwanted in ("?", "'"):
        lowered = lowered.replace(unwanted, "")
    return lowered.split()

In [21]:
tokenize("What is the capital of France?")

['what', 'is', 'the', 'capital', 'of', 'france']

In [28]:
#vocab
# Token -> integer index mapping; the initial entry maps "<UNK>" to 0.
vocab = {"<UNK>": 0}


def build_vocab(row):
    """Add every unseen token from one row to the module-level ``vocab``.

    ``row`` is indexed by the keys "question" and "answer"; both values are
    split with ``tokenize``. Each new token receives the vocabulary size at
    the moment it is first seen, so indexes are assigned in insertion order.
    Returns None; ``vocab`` is mutated in place.
    """
    # Tokenize both fields up front so nothing is added if either one fails.
    tokens = tokenize(row["question"]) + tokenize(row["answer"])
    for token in tokens:
        if token in vocab:
            continue
        vocab[token] = len(vocab)

In [None]:
# Run build_vocab on every row (axis=1) purely for its side effect of filling `vocab`;
# build_vocab has no return value, so the result of apply is not used.
df.apply(build_vocab,axis=1)

In [None]:
vocab

In [31]:
# Convert each token of a text to its numerical vocabulary index
def text_to_indexes(text,vocab):
    """Map each token of ``text`` to its integer id in ``vocab``.

    The text is split with ``tokenize``; a token missing from ``vocab`` is
    replaced by the id stored under "<UNK>". Returns the ids as a list, in
    token order.
    """
    return [
        vocab[token] if token in vocab else vocab['<UNK>']
        for token in tokenize(text)
    ]


In [32]:
import torch
from torch.utils.data import Dataset,DataLoader

In [33]:
class CustomDataset(Dataset):
    """Dataset yielding one (question, answer) pair of index tensors per row."""

    def __init__(self,df,vocab):
        """Keep references to the source frame and the token-to-index mapping."""
        self.df, self.vocab = df, vocab

    def __len__(self):
        """Number of rows in the wrapped frame."""
        return self.df.shape[0]

    def __getitem__(self,index):
        """Return the encoded question and answer of the row at position ``index``."""
        row = self.df.iloc[index]
        # Encode the question first, then the answer, each via text_to_indexes.
        encoded = [
            text_to_indexes(row[column], self.vocab)
            for column in ("question", "answer")
        ]
        return torch.tensor(encoded[0]), torch.tensor(encoded[1])

In [34]:
# Wrap the full DataFrame and the shared vocabulary; every row is used (no train/test split is applied here).
dataset=CustomDataset(df,vocab)

In [37]:
# NOTE(review): batch_size=1 is presumably required because each question tensor has its own
# length and no padding / collate_fn is configured; shuffle=True reorders samples every epoch.
dataloader=DataLoader(dataset,batch_size=1,shuffle=True)

In [None]:
# for question, answer in dataloader:
#     print(question,answer)

In [39]:
import torch.nn as nn
class SimpleRNN(nn.Module):
    """Single-layer RNN that maps a tokenized question to scores over the vocabulary.

    Args:
        vocab_size: number of entries in the vocabulary; used both as the
            embedding table size and as the number of output classes.
        embedding_dim: size of each token embedding (default 50).
        hidden_size: size of the RNN hidden state (default 64).
    """
    def __init__(self,vocab_size,embedding_dim=50,hidden_size=64):
        super().__init__()
        self.embedding=nn.Embedding(vocab_size,embedding_dim=embedding_dim)
        self.rnn=nn.RNN(embedding_dim,hidden_size,batch_first=True)
        self.linear=nn.Linear(hidden_size,vocab_size)

    def forward(self,question):
        """Return unnormalized class scores of shape (batch, vocab_size).

        Args:
            question: integer tensor of token ids, shape (batch, seq_len).
        """
        embedded=self.embedding(question)  # (batch, seq_len, embedding_dim)
        # nn.RNN returns (per-step outputs, final hidden state); only the
        # final state, shape (1, batch, hidden_size), feeds the classifier.
        _,final=self.rnn(embedded)
        # Drop the leading layer axis (size 1 for this single-layer RNN).
        output=self.linear(final.squeeze(0))
        return output

In [44]:
# Training configuration: Adam learning rate and number of full passes over the dataloader.
lr=0.001
epochs=20
# The model's embedding table and output layer are both sized to the vocabulary.
model=SimpleRNN(len(vocab))
optimizer=torch.optim.Adam(model.parameters(),lr=lr)

In [45]:
# Cross-entropy over the vocabulary; it consumes the raw scores the model emits (forward applies no softmax).
criterion=nn.CrossEntropyLoss()

In [51]:
# Train for `epochs` passes over the dataloader, updating the weights after every sample.
for epoch in range(epochs):
    total_loss=0
    for x,y in dataloader:
        out=model(x)
        # y[0] drops the batch axis added by the DataLoader so the target lines up with `out`.
        # NOTE(review): the shapes only agree when the answer tokenizes to exactly one token — confirm for the dataset.
        loss=criterion(out,y[0])
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss+=loss.item()
    # Sum (not mean) of the per-sample losses for this epoch.
    print(total_loss)

522.4060192108154
456.23117780685425
379.06891441345215
314.69608879089355
261.60399889945984
212.80555629730225
168.1522206068039
130.03252637386322
100.07526537775993
75.88354751467705
58.37764164805412
45.437573701143265
36.24743765592575
29.78953191637993
24.05648773908615
20.054137349128723
16.86733713746071
14.312653914093971
12.39391229301691
10.722581394016743


In [69]:
def predict_model(model,text,threshold=0.5):
    """Print the model's answer to ``text``, or "I don't know" when unsure.

    Args:
        model: trained network mapping a (1, seq_len) tensor of token ids to
            scores of shape (1, vocab_size).
        text: the question as a plain string.
        threshold: minimum softmax probability the top prediction must reach
            to be reported.

    Returns:
        None; the result is printed.
    """
    question=text_to_indexes(text,vocab)

    # Text with no tokens (e.g. "" or "?") would produce an empty float
    # tensor, which the embedding layer rejects; report it as unknown.
    if not question:
        print("I don't know")
        return

    question_tensor=torch.tensor(question).unsqueeze(0)  # add the batch axis

    # Inference only: no gradients are needed, so skip autograd tracking.
    with torch.no_grad():
        outputs=model(question_tensor)
        probs=torch.nn.functional.softmax(outputs,dim=1)

    value,index=torch.max(probs,1)

    if value<threshold:
        print("I don't know")
    else:
        # vocab indexes were assigned in insertion order, so a token's
        # position in the key list equals its index.
        print(list(vocab.keys())[index])

In [71]:
predict_model(model,"what is the capital city of france")

paris
