In [45]:
import pandas as pd
# Load the question/answer pairs into a DataFrame. The path is absolute under /content,
# so it has to be adjusted when the notebook runs in an environment without that directory.
df=pd.read_csv('/content/100_Unique_QA_Dataset.csv')

In [46]:
df.head()

Unnamed: 0,question,answer
0,What is the capital of France?,Paris
1,What is the capital of Germany?,Berlin
2,Who wrote 'To Kill a Mockingbird'?,Harper-Lee
3,What is the largest planet in our solar system?,Jupiter
4,What is the boiling point of water in Celsius?,100


In [47]:
# tokenize
def tokenize(text):
  """Lower-case `text` and split it into whitespace-separated tokens.

  Question marks and apostrophes are replaced by spaces before splitting,
  so "Who wrote 'It'?" becomes ['who', 'wrote', 'it']. Other punctuation
  (for example hyphens) is kept as part of the token.
  """
  cleaned = text.lower().replace('?', ' ').replace("'", " ")
  return cleaned.split()

In [48]:
# Vocabulary: maps each unique token to an integer index.
# Index 0 is reserved for '<UNK>', the entry used for words that are not in the vocabulary.
vocab = {'<UNK>': 0}

In [49]:
def build_vocab(row):
  """Register every token of one question/answer row in the global `vocab`.

  `row` must support lookup by the keys 'question' and 'answer'. A token
  that is not yet known receives the next free index (the current size of
  the vocabulary); known tokens keep their index. Returns None.
  """
  # Tokenize both fields up front, then insert in question-then-answer order.
  row_tokens = [*tokenize(row['question']), *tokenize(row['answer'])]
  for word in row_tokens:
    # len(vocab) is evaluated before the insertion, so indices stay contiguous.
    vocab.setdefault(word, len(vocab))

In [50]:
# Run build_vocab on every row purely for its side effect of filling `vocab`.
# build_vocab returns nothing, so the Series displayed below holds no useful values.
df.apply(build_vocab,axis=1)

Unnamed: 0,0
0,
1,
2,
3,
4,
...,...
85,
86,
87,
88,


In [51]:
len(vocab)

324

In [52]:
# convert words to numerical indices
def text_to_indices(text,vocab):
  """Translate `text` into a list of vocabulary indices.

  The text is split with `tokenize`; every token found in `vocab` is
  replaced by its index, and every other token by the index stored under
  '<UNK>'.
  """
  return [
      vocab[word] if word in vocab else vocab['<UNK>']
      for word in tokenize(text)
  ]

In [53]:
# Quick check: 'aniket' is not in the vocabulary, so it maps to the '<UNK>' index 0.
text_to_indices('what is Aniket',vocab)

[1, 2, 0]

In [54]:
import torch
from torch.utils.data import Dataset,DataLoader

In [55]:
class QADataset(Dataset):
  """Dataset over a question/answer DataFrame that yields index tensors.

  Each item is a pair `(question_ids, answer_ids)` obtained by running
  `text_to_indices` on the row's 'question' and 'answer' columns with the
  vocabulary given at construction time.
  """

  def __init__(self,df,vocab):
    self.vocab=vocab
    self.df=df

  def __len__(self):
    # One sample per DataFrame row.
    return len(self.df)

  def __getitem__(self,index):
    # Positional lookup: `index` counts rows and ignores the frame's index labels.
    row=self.df.iloc[index]
    question_ids=text_to_indices(row['question'],self.vocab)
    answer_ids=text_to_indices(row['answer'],self.vocab)
    return torch.tensor(question_ids),torch.tensor(answer_ids)



In [56]:
# Wrap the DataFrame in the Dataset and inspect the first (question, answer) pair of index tensors.
dataset=QADataset(df,vocab)
dataset[0]

(tensor([1, 2, 3, 4, 5, 6]), tensor([7]))

In [57]:
# One question per step, visited in random order each epoch.
# NOTE(review): questions differ in length and the visible code applies no padding,
# which is presumably why batch_size stays at 1 — confirm before raising it.
dataloader=DataLoader(dataset,batch_size=1,shuffle=True)


In [58]:
# Inspect what the loader yields: `question` keeps its leading batch axis,
# while answer[0] drops it so the answer prints as a 1-D tensor.
for question,answer in dataloader:
  print(question,answer[0])

tensor([[10, 55,  3, 56,  5, 57]]) tensor([58])
tensor([[  1,   2,   3, 212,   5,  14, 213, 214]]) tensor([215])
tensor([[10, 75, 76]]) tensor([77])
tensor([[ 1,  2,  3, 92, 93, 94]]) tensor([95])
tensor([[ 42, 137, 118,   3, 247,   5, 248]]) tensor([249])
tensor([[ 1,  2,  3, 69,  5, 53]]) tensor([260])
tensor([[  1,   2,   3,  33,  34,   5, 245]]) tensor([246])
tensor([[  1,   2,   3, 146, 147,  19, 148]]) tensor([149])
tensor([[ 10,  75, 208]]) tensor([209])
tensor([[ 42, 290, 291, 118, 292, 158, 293, 294]]) tensor([295])
tensor([[ 10,  11, 189, 158, 190]]) tensor([191])
tensor([[ 10, 140,   3, 141, 171,   5,   3,  70, 172]]) tensor([173])
tensor([[ 1,  2,  3, 33, 34,  5, 35]]) tensor([36])
tensor([[ 42, 174,   2,  62,  39, 175, 176,  12, 177, 178]]) tensor([179])
tensor([[ 1,  2,  3, 17, 18, 19, 20, 21, 22]]) tensor([23])
tensor([[42, 18,  2, 62, 63,  3, 64, 18]]) tensor([65])
tensor([[  1,   2,   3,   4,   5, 206]]) tensor([207])
tensor([[  1,   2,   3, 103,   5, 104,  19, 105]]) 

In [59]:
# rnn architecture
import torch.nn as nn
class SimpleRNN(nn.Module):
  """Single-layer RNN that scores every vocabulary word as the answer to a question.

  Args:
    vocab_size: number of vocabulary entries; used both as the size of the
      embedding table and as the number of output classes.
    embedding_dim: length of each word embedding vector (default 50).
    hidden_size: length of the RNN hidden state (default 64).
  """

  def __init__(self,vocab_size,embedding_dim=50,hidden_size=64):
    super().__init__()
    # Each word index is turned into an `embedding_dim`-long vector.
    self.embedding=nn.Embedding(vocab_size,embedding_dim=embedding_dim)
    self.rnn=nn.RNN(embedding_dim,hidden_size,batch_first=True)
    self.fc=nn.Linear(hidden_size,vocab_size)

  def forward(self,question):
    """Map word indices of shape (batch, seq_len) to logits of shape (batch, vocab_size)."""
    embedded_question=self.embedding(question)
    # nn.RNN returns (outputs for every step, final hidden state); only the final state is used.
    _,final=self.rnn(embedded_question)
    # `final` is (1, batch, hidden_size) for this single-layer RNN; drop the layer axis.
    output=self.fc(final.squeeze(0))
    return output

In [60]:
# Training hyperparameters: optimizer step size and number of full passes over the dataset.
learning_rate=0.001
epochs=20


In [61]:
# The model is sized by the vocabulary: one embedding row and one output class per entry.
model=SimpleRNN(len(vocab))


In [62]:
# Train the model to classify each question into the vocabulary index of its answer.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

for epoch in range(epochs):
    # Sum (not mean) of the per-sample losses seen during this epoch.
    total_loss = 0
    for question, answer in dataloader:
        # Clear gradients left over from the previous step.
        optimizer.zero_grad()
        # Forward pass: logits over the vocabulary for this question.
        output = model(question)
        # answer[0] drops the batch axis added by the DataLoader.
        # NOTE(review): with batch_size=1 the logits have a single row, so this appears
        # to rely on every answer tokenizing to exactly one token — confirm for new data.
        loss = criterion(output, answer[0])
        # Backward pass
        loss.backward()
        # Update weights
        optimizer.step()
        total_loss += loss.item()

    print(f'epoch: {epoch+1}, loss: {total_loss:.4f}')


epoch: 1, loss: 527.4991
epoch: 2, loss: 457.2763
epoch: 3, loss: 379.3481
epoch: 4, loss: 320.1532
epoch: 5, loss: 269.1102
epoch: 6, loss: 220.4661
epoch: 7, loss: 175.8781
epoch: 8, loss: 137.3491
epoch: 9, loss: 105.7649
epoch: 10, loss: 80.6221
epoch: 11, loss: 61.6003
epoch: 12, loss: 48.2320
epoch: 13, loss: 37.9235
epoch: 14, loss: 30.9395
epoch: 15, loss: 25.3254
epoch: 16, loss: 20.9970
epoch: 17, loss: 17.6967
epoch: 18, loss: 15.1873
epoch: 19, loss: 13.1378
epoch: 20, loss: 11.3303


In [72]:
def predict(model,question,threshold=0.5):
  """Print the model's answer to `question`, or 'i dont know' when it is unsure.

  Args:
    model: callable that maps a (1, seq_len) tensor of word indices to
      (1, vocab_size) logits.
    question: question text; converted to indices with `text_to_indices`
      and the global `vocab`.
    threshold: minimum softmax probability the best-scoring word must reach
      to be printed as the answer.

  Returns:
    None. Exactly one line is printed: the predicted word, or
    'i dont know' when the question has no tokens or the best probability
    is below `threshold`.
  """
  # convert the question to word indices
  numerical_question=text_to_indices(question,vocab)
  if not numerical_question:
    # Nothing to feed the model (empty or punctuation-only question); an
    # empty list would also become a float tensor that the embedding rejects.
    print('i dont know')
    return
  # add the batch axis: (seq_len,) -> (1, seq_len)
  question_tensor=torch.tensor(numerical_question).unsqueeze(0)
  # inference only, so gradient tracking is not needed
  with torch.no_grad():
    output=model(question_tensor)
  # convert logits to probabilities
  probs=torch.nn.functional.softmax(output,dim=1)

  # best-scoring word and its probability
  value,index=torch.max(probs,dim=1)
  if value.item()<threshold:
    print('i dont know')
    # Stop here so the low-confidence word is not printed as well.
    return
  # vocab indices were assigned in insertion order, so position == index
  print(list(vocab.keys())[index.item()])



In [73]:
predict(model, 'What is the capital of France?')

paris
