In [37]:
import pandas as pd

# Load the question/answer pairs; the file name has no directory part, so it is
# resolved against the notebook's current working directory.
df=pd.read_csv("100_Unique_QA_Dataset.csv")

# Bare last expression so the notebook renders the frame.
df

Unnamed: 0,question,answer
0,What is the capital of France?,Paris
1,What is the capital of Germany?,Berlin
2,Who wrote 'To Kill a Mockingbird'?,Harper-Lee
3,What is the largest planet in our solar system?,Jupiter
4,What is the boiling point of water in Celsius?,100
...,...,...
85,Who directed the movie 'Titanic'?,JamesCameron
86,Which superhero is also known as the Dark Knight?,Batman
87,What is the capital of Brazil?,Brasilia
88,Which fruit is known as the king of fruits?,Mango


In [38]:
# tokenization
def tokenization(text):
    """Split ``text`` into lowercase, whitespace-separated tokens.

    Question marks and apostrophes are deleted (not replaced by a space)
    before splitting; every other character, including hyphens and digits,
    stays part of its token.

    Args:
        text: the string to tokenize.

    Returns:
        A list of token strings; empty when nothing but whitespace remains.
    """
    cleaned=text.lower()
    for unwanted in ("?","'"):
        cleaned=cleaned.replace(unwanted,"")
    return cleaned.split()
# vocab: maps each token to an integer id. Id 0 is reserved for '<UNK>', the
# entry text_to_indices falls back to for tokens that are not in the mapping.
vocab={'<UNK>':0}
def build_vocab(row):
    """Register the tokens of one row's question and answer in the global ``vocab``.

    Each token not seen before receives the next free id (the size of
    ``vocab`` at the moment it is added); known tokens keep their id.
    The row's combined token list is printed as a trace.

    Args:
        row: mapping-like object with ``'question'`` and ``'answer'`` entries.

    Returns:
        None; the function works by mutating ``vocab``.
    """
    row_tokens=tokenization(row['question'])+tokenization(row['answer'])
    for word in row_tokens:
        # setdefault only inserts when the token is missing, and len(vocab) is
        # evaluated before the insertion, so ids stay dense and stable.
        vocab.setdefault(word,len(vocab))
    print(row_tokens)
# Run build_vocab on every row (axis=1) purely for its side effect on vocab;
# the Series that apply returns only holds build_vocab's None results.
df.apply(build_vocab,axis=1)

['what', 'is', 'the', 'capital', 'of', 'france', 'paris']
['what', 'is', 'the', 'capital', 'of', 'germany', 'berlin']
['who', 'wrote', 'to', 'kill', 'a', 'mockingbird', 'harper-lee']
['what', 'is', 'the', 'largest', 'planet', 'in', 'our', 'solar', 'system', 'jupiter']
['what', 'is', 'the', 'boiling', 'point', 'of', 'water', 'in', 'celsius', '100']
['who', 'painted', 'the', 'mona', 'lisa', 'leonardo-da-vinci']
['what', 'is', 'the', 'square', 'root', 'of', '64', '8']
['what', 'is', 'the', 'chemical', 'symbol', 'for', 'gold', 'au']
['which', 'year', 'did', 'world', 'war', 'ii', 'end', '1945']
['what', 'is', 'the', 'longest', 'river', 'in', 'the', 'world', 'nile']
['what', 'is', 'the', 'capital', 'of', 'japan', 'tokyo']
['who', 'developed', 'the', 'theory', 'of', 'relativity', 'albert-einstein']
['what', 'is', 'the', 'freezing', 'point', 'of', 'water', 'in', 'fahrenheit', '32']
['which', 'planet', 'is', 'known', 'as', 'the', 'red', 'planet', 'mars']
['who', 'is', 'the', 'author', 'of', '19

0     None
1     None
2     None
3     None
4     None
      ... 
85    None
86    None
87    None
88    None
89    None
Length: 90, dtype: object

In [39]:
#convert words to numerical indices
def text_to_indices(text,vocab):
    """Translate ``text`` into the list of vocabulary ids of its tokens.

    The text is split with ``tokenization``; a token that is missing from
    ``vocab`` is represented by the id stored under ``'<UNK>'``.

    Args:
        text: the string to encode.
        vocab: token -> id mapping containing an ``'<UNK>'`` entry.

    Returns:
        A list of ids, one per token, in token order.
    """
    return [
        vocab[word] if word in vocab else vocab['<UNK>']
        for word in tokenization(text)
    ]

In [40]:
import torch
from torch.utils.data import Dataset,DataLoader

In [41]:
class QAdataset(Dataset):
    """Map-style dataset yielding (question ids, answer ids) tensor pairs.

    Each item is produced on demand by running ``text_to_indices`` over the
    ``'question'`` and ``'answer'`` fields of one row of ``df``.
    """

    def __init__(self,df,vocab):
        """Keep references to the source frame and the token-to-id mapping.

        Args:
            df: frame with ``'question'`` and ``'answer'`` columns.
            vocab: token -> id mapping handed to ``text_to_indices``.
        """
        self.df=df
        self.vocab=vocab

    def __len__(self):
        """Return the number of rows in the frame."""
        return self.df.shape[0]

    def __getitem__(self,index):
        """Return the id tensors for the row at position ``index``.

        Returns:
            ``(question, answer)``: two 1-D ``torch.long`` tensors.
        """
        # Look the row up once instead of once per column.
        row=self.df.iloc[index]
        numerical_question=text_to_indices(row['question'],self.vocab)
        numerical_answer=text_to_indices(row['answer'],self.vocab)

        # Pin the dtype: torch.tensor([]) would otherwise fall back to the
        # default float dtype, which an embedding lookup does not accept.
        return (
            torch.tensor(numerical_question,dtype=torch.long),
            torch.tensor(numerical_answer,dtype=torch.long),
        )

In [42]:
ds=QAdataset(df,vocab)
# batch_size=1: every item is a tensor whose length depends on its question and
# no padding/collate function is supplied here — presumably why items are not
# batched together (TODO confirm). shuffle=True reorders the rows each epoch.
dataloader=DataLoader(ds,batch_size=1,shuffle=True)

In [43]:
# Sanity check: print every (question, answer) batch. ans[0] drops the leading
# batch dimension that the DataLoader adds, as the training loop does later.
for que,ans in dataloader:
    print(que,ans[0])

tensor([[10, 11, 12, 13, 14, 15]]) tensor([16])
tensor([[  1,   2,   3,  33,  34,   5, 245]]) tensor([246])
tensor([[ 42,  18, 118,   3, 186, 187]]) tensor([188])
tensor([[42, 86, 87, 88, 89, 39, 90]]) tensor([91])
tensor([[ 42, 125,   2,  62,  63,   3, 126, 127]]) tensor([128])
tensor([[  1,   2,   3, 234,   5, 235]]) tensor([131])
tensor([[10,  2,  3, 66,  5, 67]]) tensor([68])
tensor([[  1,   2,   3, 122, 123,  19,   3,  45]]) tensor([124])
tensor([[ 42, 137,   2, 138,  39, 139]]) tensor([53])
tensor([[ 42, 137, 118,   3, 247,   5, 248]]) tensor([249])
tensor([[ 42, 318,   2,  62,  63,   3, 319,   5, 320]]) tensor([321])
tensor([[ 10, 140,   3, 141, 270,  93, 271,   5,   3, 272]]) tensor([273])
tensor([[1, 2, 3, 4, 5, 8]]) tensor([9])
tensor([[ 42, 117, 118,   3, 119,  94, 120]]) tensor([121])
tensor([[ 1,  2,  3,  4,  5, 99]]) tensor([100])
tensor([[  1,   2,   3, 146,  86,  19, 192, 193]]) tensor([194])
tensor([[10, 55,  3, 56,  5, 57]]) tensor([58])
tensor([[  1,   2,   3,  37,  

In [44]:
import torch.nn as nn

In [45]:
class SimpleRNN(nn.Module):
    """Embedding -> single-layer RNN -> linear layer scoring every vocabulary id.

    The final hidden state of the RNN is the only summary of the question that
    reaches the output layer.
    """

    def __init__(self,vocab_size,embedding_dim=50,hidden_size=64):
        """Build the three layers.

        Args:
            vocab_size: number of token ids; sizes both the embedding table
                and the output layer.
            embedding_dim: width of the token embeddings (default 50).
            hidden_size: width of the RNN hidden state (default 64).
        """
        super().__init__()
        self.embedding=nn.Embedding(vocab_size,embedding_dim=embedding_dim)
        # The RNN input width must equal the embedding width, so both are
        # driven by the same parameter instead of two separate literals.
        self.rnn=nn.RNN(embedding_dim,hidden_size,batch_first=True)
        self.fc=nn.Linear(hidden_size,vocab_size)

    def forward(self,que):
        """Return vocabulary scores for a batch of token-id sequences.

        Args:
            que: integer tensor of token ids; the notebook passes shape
                (batch, seq_len), matching ``batch_first=True``.

        Returns:
            Tensor of unnormalised scores with ``vocab_size`` entries per
            sequence.
        """
        embedded_question=self.embedding(que)
        # nn.RNN returns (per-step outputs, final hidden state); only the
        # final state is used.
        _,final=self.rnn(embedded_question)
        # final carries a leading num_layers dimension of size 1; drop it.
        output=self.fc(final.squeeze(0))
        return output

In [56]:
lr=0.01  # learning rate handed to the Adam optimizer
epochs=200  # number of full passes over the dataloader in the training loop

In [57]:
# The vocabulary size fixes both the embedding table and the output layer.
# NOTE(review): no random seed is set in the visible cells, so the initial
# weights — and therefore the printed losses — will differ from run to run.
model=SimpleRNN(len(vocab))

In [58]:
# Cross-entropy compares the model's vocabulary-sized scores with the target
# token id; Adam updates every model parameter using the configured lr.
criterion=nn.CrossEntropyLoss()
optimizer=torch.optim.Adam(model.parameters(),lr=lr)

In [59]:
# training loop
# predict() puts the model into eval mode and nothing switches it back, so make
# the mode explicit: re-running this cell after a prediction then still trains
# in training mode. (No numeric effect for the current layers; it matters as
# soon as dropout or normalisation layers are added.)
model.train()
for e in range(epochs):
    # Sum (not mean) of the per-batch losses over this epoch.
    total_loss=0
    for que,ans in dataloader:
        # Gradients accumulate across backward() calls, so clear them first.
        optimizer.zero_grad()
        #forward pass
        op=model(que)
        #loss
        # ans[0] drops the batch dimension added by the DataLoader so the
        # target lines up with op — assumes batch_size=1 and a one-token
        # answer (TODO confirm for answers that tokenize to several words).
        loss=criterion(op,ans[0])

        loss.backward()

        optimizer.step()

        total_loss+=loss.item()
    print(f"Epochs {e+1}, Loss {total_loss}")

Epochs 1, Loss 537.5186033248901
Epochs 2, Loss 318.5814647078514
Epochs 3, Loss 145.1672183573246
Epochs 4, Loss 72.75360015314072
Epochs 5, Loss 30.282578042708337
Epochs 6, Loss 34.658514665439725
Epochs 7, Loss 27.32634341204539
Epochs 8, Loss 25.37221501278691
Epochs 9, Loss 22.72959983209148
Epochs 10, Loss 12.712916251271963
Epochs 11, Loss 11.264159947866574
Epochs 12, Loss 17.20179022871889
Epochs 13, Loss 14.901815988821909
Epochs 14, Loss 14.542612265329808
Epochs 15, Loss 26.707070981385186
Epochs 16, Loss 19.35438282089308
Epochs 17, Loss 13.750045066786697
Epochs 18, Loss 11.03483790656901
Epochs 19, Loss 15.09694543943624
Epochs 20, Loss 13.173345675546443
Epochs 21, Loss 5.6671391656273045
Epochs 22, Loss 7.7220409766887315
Epochs 23, Loss 1.770843015779974
Epochs 24, Loss 3.2108779834525194
Epochs 25, Loss 10.172223000088707
Epochs 26, Loss 8.744661755568814
Epochs 27, Loss 6.42975227496936
Epochs 28, Loss 3.129518974048551
Epochs 29, Loss 2.3823428476171102
Epochs 30,

In [63]:
def predict(model,que,vocab,threshold=0.5):
    """Print the model's answer to ``que``, or "I don't know" when unsure.

    The question is encoded with ``text_to_indices``, run through ``model`` as
    a batch of one, and the highest softmax probability is compared with
    ``threshold``.

    Args:
        model: callable module mapping a (1, seq_len) id tensor to
            (1, vocab_size) scores. It is switched to eval mode and left there.
        que: the question text.
        vocab: token -> id mapping used for encoding and for decoding the
            predicted id back to a token.
        threshold: minimum probability required to print an answer.

    Returns:
        None; the result is printed.
    """
    model.eval()

    num_que=text_to_indices(que,vocab)
    if not num_que:
        # Nothing survived tokenization (e.g. "" or "?"). An empty list would
        # become a float tensor and fail inside the embedding lookup, so
        # answer the same way as a low-confidence prediction.
        print("I don't know")
        return

    # Explicit integer dtype plus a leading batch dimension of 1.
    que_tensor=torch.tensor(num_que,dtype=torch.long).unsqueeze(0)
    with torch.no_grad():
        op = model(que_tensor)
    probs=torch.nn.functional.softmax(op,dim=1)
    value,index=torch.max(probs,dim=1)

    confidence = value.item()

    if confidence < threshold:
        print("I don't know")
        return

    predicted_index = index.item()
    # Invert the mapping only when an answer is actually going to be printed.
    reverse_vocab = {v: k for k, v in vocab.items()}
    # Get the predicted word using the reverse vocabulary
    predicted_word = reverse_vocab.get(predicted_index, '<UNK>')
    print(f"Predicted Answer: {predicted_word}")
    print(f"Confidence: {confidence:.2f}")
# Smoke test with a question from the dataset; the extra space before '?' and
# the lowercase 'france' tokenize to the same tokens as the original row.
predict(model,"What is the capital of france ?",vocab)

Predicted Answer: paris
Confidence: 1.00
