In [2]:
import pandas as pd

df=pd.read_csv("100_Unique_QA_Dataset.csv")

df

Unnamed: 0,question,answer
0,What is the capital of France?,Paris
1,What is the capital of Germany?,Berlin
2,Who wrote 'To Kill a Mockingbird'?,Harper-Lee
3,What is the largest planet in our solar system?,Jupiter
4,What is the boiling point of water in Celsius?,100
...,...,...
85,Who directed the movie 'Titanic'?,JamesCameron
86,Which superhero is also known as the Dark Knight?,Batman
87,What is the capital of Brazil?,Brasilia
88,Which fruit is known as the king of fruits?,Mango


In [20]:
# tokenization
def tokenization(text):
    text=text.lower()
    text=text.replace("?","")
    text=text.replace("'","")
    return text.split()
#vocab
vocab={'<UNK>':0}
def build_vocab(row):
    tokenized_question=tokenization(row['question'])
    tokenized_answer=tokenization(row['answer'])

    merged_token=tokenized_question+tokenized_answer
    for token in merged_token:
        if token not in vocab:
            vocab[token]=len(vocab)
    print(merged_token)
df.apply(build_vocab,axis=1)

['what', 'is', 'the', 'capital', 'of', 'france', 'paris']
['what', 'is', 'the', 'capital', 'of', 'germany', 'berlin']
['who', 'wrote', 'to', 'kill', 'a', 'mockingbird', 'harper-lee']
['what', 'is', 'the', 'largest', 'planet', 'in', 'our', 'solar', 'system', 'jupiter']
['what', 'is', 'the', 'boiling', 'point', 'of', 'water', 'in', 'celsius', '100']
['who', 'painted', 'the', 'mona', 'lisa', 'leonardo-da-vinci']
['what', 'is', 'the', 'square', 'root', 'of', '64', '8']
['what', 'is', 'the', 'chemical', 'symbol', 'for', 'gold', 'au']
['which', 'year', 'did', 'world', 'war', 'ii', 'end', '1945']
['what', 'is', 'the', 'longest', 'river', 'in', 'the', 'world', 'nile']
['what', 'is', 'the', 'capital', 'of', 'japan', 'tokyo']
['who', 'developed', 'the', 'theory', 'of', 'relativity', 'albert-einstein']
['what', 'is', 'the', 'freezing', 'point', 'of', 'water', 'in', 'fahrenheit', '32']
['which', 'planet', 'is', 'known', 'as', 'the', 'red', 'planet', 'mars']
['who', 'is', 'the', 'author', 'of', '19

0     None
1     None
2     None
3     None
4     None
      ... 
85    None
86    None
87    None
88    None
89    None
Length: 90, dtype: object

In [21]:
#convert words to numerical indices
def text_to_indices(text,vocab):
    indexed_text=[]
    for token in tokenization(text):
        if token in vocab:
            indexed_text.append(vocab[token])
        else:
            indexed_text.append(vocab['<UNK>'])
    return indexed_text

In [22]:
import torch
from torch.utils.data import Dataset,DataLoader

In [26]:
class QAdataset(Dataset):
    def __init__(self,df,vocab):
        self.df=df
        self.vocab=vocab
    def __len__(self):
        return self.df.shape[0]
    def __getitem__(self,index):
        numerical_question=text_to_indices(self.df.iloc[index]['question'],self.vocab)
        numerical_answer=text_to_indices(self.df.iloc[index]['answer'],self.vocab)

        return torch.tensor(numerical_question),torch.tensor(numerical_answer)

In [31]:
ds=QAdataset(df,vocab)
dataloader=DataLoader(ds,batch_size=1,shuffle=True)

In [43]:
for que,ans in dataloader:
    print(que,ans[0])

tensor([[ 10,  11, 157, 158, 159]]) tensor([160])
tensor([[ 78,  79, 129,  81,  19,   3,  21,  22]]) tensor([36])
tensor([[ 1,  2,  3, 17, 18, 19, 20, 21, 22]]) tensor([23])
tensor([[78, 79, 80, 81, 82, 83, 84]]) tensor([85])
tensor([[ 78,  79, 288,  81,  19,  14, 289]]) tensor([85])
tensor([[  1,   2,   3, 212,   5,  14, 213, 214]]) tensor([215])
tensor([[ 42, 137,   2,  62,  39,   3, 322, 323]]) tensor([6])
tensor([[ 42,   2,   3, 210, 137, 168, 211, 169]]) tensor([113])
tensor([[10, 11, 12, 13, 14, 15]]) tensor([16])
tensor([[ 1,  2,  3, 37, 38, 39, 40]]) tensor([41])
tensor([[ 42,  18, 118,   3, 186, 187]]) tensor([188])
tensor([[ 42,  18,   2,   3, 281,  12,   3, 282]]) tensor([205])
tensor([[ 1,  2,  3, 69,  5,  3, 70, 71]]) tensor([72])
tensor([[ 1,  2,  3, 69,  5, 53]]) tensor([260])
tensor([[10, 29,  3, 30, 31]]) tensor([32])
tensor([[  1,   2,   3, 180, 181, 182, 183]]) tensor([184])
tensor([[ 10,  75,   3, 296,  19, 297]]) tensor([298])
tensor([[ 42, 107,   2, 108,  19, 109]

In [44]:
import torch.nn as nn

In [51]:
class SimpleRNN(nn.Module):
    def __init__(self,vocab_size):
        super().__init__()
        self.embedding=nn.Embedding(vocab_size,embedding_dim=50)
        self.rnn=nn.RNN(50,64,batch_first=True)
        self.fc=nn.Linear(64,vocab_size)

    def forward(self,que):
        embedded_question=self.embedding(que)
        hidden,final=self.rnn(embedded_question)
        output=self.fc(final.squeeze(0))
        return output

In [52]:
lr=0.01
epochs=100

In [53]:
model=SimpleRNN(len(vocab))

In [54]:
criterion=nn.CrossEntropyLoss()
optimizer=torch.optim.Adam(model.parameters(),lr=lr)

In [55]:
# training loop
for e in range(epochs):
    total_loss=0
    for que,ans in dataloader:
        optimizer.zero_grad()
        #forward pass
        op=model(que)
        #loss
        loss=criterion(op,ans[0])

        loss.backward()

        optimizer.step()
        
        total_loss+=loss.item()
    print(f"Epochs {e+1}, Loss {total_loss}")

Epochs 1, Loss 534.3003540039062
Epochs 2, Loss 359.413573384285
Epochs 3, Loss 174.77671975642443
Epochs 4, Loss 62.9094940405339
Epochs 5, Loss 41.13317562290467
Epochs 6, Loss 25.0282109095715
Epochs 7, Loss 16.576856184285134
Epochs 8, Loss 13.005655823741108
Epochs 9, Loss 16.494099577423185
Epochs 10, Loss 9.585796236526221
Epochs 11, Loss 3.589154765708372
Epochs 12, Loss 5.945767092285678
Epochs 13, Loss 3.7736706319265068
Epochs 14, Loss 1.6536175541114062
Epochs 15, Loss 2.0943391039036214
Epochs 16, Loss 19.738310769491363
Epochs 17, Loss 27.153645448586758
Epochs 18, Loss 35.223345285630785
Epochs 19, Loss 16.977714795826614
Epochs 20, Loss 6.916762895261854
Epochs 21, Loss 3.7355947054784338
Epochs 22, Loss 9.933460412274144
Epochs 23, Loss 7.232449907827686
Epochs 24, Loss 10.692850092476874
Epochs 25, Loss 13.066044719358615
Epochs 26, Loss 2.5339125235332176
Epochs 27, Loss 6.508925389054639
Epochs 28, Loss 3.277987102213956
Epochs 29, Loss 4.535110365737637
Epochs 30, 

In [60]:
def predict(model,que,threshold=0.5):
    num_que=text_to_indices(que,vocab)
    que_tensor=torch.tensor(num_que)
    op=model(que_tensor)
    probs=torch.nn.functional.softmax(op,dim=1)
    value,index=torch.max(probs,dim=1)

    if value<threshold:
        print("I don't know")
    list(vocab.key()[index])

In [61]:
predict(model,"What is the france current capital?")

IndexError: Dimension out of range (expected to be in range of [-1, 0], but got 1)