In [5]:
def tokenize(text):
  text = text.lower()
  text = text.replace('?','')
  text = text.replace("'","")
  return text.split()

In [6]:
import pandas as pd
df = pd.read_csv('/content/100_Unique_QA_Dataset.csv' )
df.head()

Unnamed: 0,question,answer
0,What is the capital of France?,Paris
1,What is the capital of Germany?,Berlin
2,Who wrote 'To Kill a Mockingbird'?,Harper-Lee
3,What is the largest planet in our solar system?,Jupiter
4,What is the boiling point of water in Celsius?,100


In [7]:
vocab = {'<UNK>' : 0}

In [8]:
def build(row):
  print(row['question'] , row['answer'])
  tokenized_question = tokenize(row['question'])
  tokenized_answer = tokenize(row['answer'])
  merged_tokenized = tokenized_question + tokenized_answer
  for token in merged_tokenized:
    if token not in vocab:
      vocab[token] = len(vocab)
  print(merged_tokenized)


In [9]:
df.apply(build , axis=1)

What is the capital of France? Paris
['what', 'is', 'the', 'capital', 'of', 'france', 'paris']
What is the capital of Germany? Berlin
['what', 'is', 'the', 'capital', 'of', 'germany', 'berlin']
Who wrote 'To Kill a Mockingbird'? Harper-Lee
['who', 'wrote', 'to', 'kill', 'a', 'mockingbird', 'harper-lee']
What is the largest planet in our solar system? Jupiter
['what', 'is', 'the', 'largest', 'planet', 'in', 'our', 'solar', 'system', 'jupiter']
What is the boiling point of water in Celsius? 100
['what', 'is', 'the', 'boiling', 'point', 'of', 'water', 'in', 'celsius', '100']
Who painted the Mona Lisa? Leonardo-da-Vinci
['who', 'painted', 'the', 'mona', 'lisa', 'leonardo-da-vinci']
What is the square root of 64? 8
['what', 'is', 'the', 'square', 'root', 'of', '64', '8']
What is the chemical symbol for gold? Au
['what', 'is', 'the', 'chemical', 'symbol', 'for', 'gold', 'au']
Which year did World War II end? 1945
['which', 'year', 'did', 'world', 'war', 'ii', 'end', '1945']
What is the longe

Unnamed: 0,0
0,
1,
2,
3,
4,
...,...
85,
86,
87,
88,


In [44]:
vocab

{'<UNK>': 0,
 'what': 1,
 'is': 2,
 'the': 3,
 'capital': 4,
 'of': 5,
 'france': 6,
 'paris': 7,
 'germany': 8,
 'berlin': 9,
 'who': 10,
 'wrote': 11,
 'to': 12,
 'kill': 13,
 'a': 14,
 'mockingbird': 15,
 'harper-lee': 16,
 'largest': 17,
 'planet': 18,
 'in': 19,
 'our': 20,
 'solar': 21,
 'system': 22,
 'jupiter': 23,
 'boiling': 24,
 'point': 25,
 'water': 26,
 'celsius': 27,
 '100': 28,
 'painted': 29,
 'mona': 30,
 'lisa': 31,
 'leonardo-da-vinci': 32,
 'square': 33,
 'root': 34,
 '64': 35,
 '8': 36,
 'chemical': 37,
 'symbol': 38,
 'for': 39,
 'gold': 40,
 'au': 41,
 'which': 42,
 'year': 43,
 'did': 44,
 'world': 45,
 'war': 46,
 'ii': 47,
 'end': 48,
 '1945': 49,
 'longest': 50,
 'river': 51,
 'nile': 52,
 'japan': 53,
 'tokyo': 54,
 'developed': 55,
 'theory': 56,
 'relativity': 57,
 'albert-einstein': 58,
 'freezing': 59,
 'fahrenheit': 60,
 '32': 61,
 'known': 62,
 'as': 63,
 'red': 64,
 'mars': 65,
 'author': 66,
 '1984': 67,
 'george-orwell': 68,
 'currency': 69,
 'unit

In [11]:
def text_indices(text , vocab):
  indexed_text = []

  for token in tokenize(text):
    if token in vocab:
      indexed_text.append(vocab[token])
    else:
      indexed_text.append(vocab['<UNK>'])
  return indexed_text

In [12]:
text_indices('what is the capital of india' , vocab)

[1, 2, 3, 4, 5, 73]

In [14]:
import torch
from torch.utils.data import Dataset, DataLoader

In [15]:
class customdataset(Dataset):
  def __init__(self , df , vocab):
    self.df = df
    self.vocab = vocab
  def __len__(self):
    return self.df.shape[0]
  def __getitem__(self , idx):
    numerical_question =  text_indices(self.df.iloc[idx]['question'] , self.vocab)
    numerical_answer = text_indices(self.df.iloc[idx]['answer'] , self.vocab)
    return torch.tensor(numerical_question) , torch.tensor(numerical_answer)

In [16]:
dataset = customdataset(df , vocab)

In [17]:
dataset[1]

(tensor([1, 2, 3, 4, 5, 8]), tensor([9]))

In [18]:
dataloader = DataLoader(dataset , batch_size = 1 , shuffle = True)

In [19]:
for question , answer in dataloader:
  print(question , answer)

tensor([[ 78,  79, 129,  81,  19,   3,  21,  22]]) tensor([[36]])
tensor([[ 10,  96,   3, 104, 239]]) tensor([[240]])
tensor([[10, 11, 12, 13, 14, 15]]) tensor([[16]])
tensor([[ 42, 255,   2, 256,  83, 257, 258]]) tensor([[259]])
tensor([[  1,   2,   3, 212,   5,  14, 213, 214]]) tensor([[215]])
tensor([[  1,   2,   3,  92, 137,  19,   3,  45]]) tensor([[185]])
tensor([[  1,   2,   3, 146,  86,  19, 192, 193]]) tensor([[194]])
tensor([[1, 2, 3, 4, 5, 8]]) tensor([[9]])
tensor([[42, 18,  2, 62, 63,  3, 64, 18]]) tensor([[65]])
tensor([[ 1,  2,  3, 69,  5,  3, 70, 71]]) tensor([[72]])
tensor([[ 42,   2,   3, 274, 211, 275]]) tensor([[276]])
tensor([[ 42, 125,   2,  62,  63,   3, 126, 127]]) tensor([[128]])
tensor([[ 42, 290, 291, 118, 292, 158, 293, 294]]) tensor([[295]])
tensor([[ 42, 137, 118,   3, 247,   5, 248]]) tensor([[249]])
tensor([[  1,   2,   3, 163, 164, 165,  83,  84]]) tensor([[166]])
tensor([[ 10,  11, 189, 158, 190]]) tensor([[191]])
tensor([[10, 96,  3, 97]]) tensor([[98

In [21]:
import torch.nn as nn

In [38]:
class myrrn(nn.Module):
  def __init__(self , vocab_size):
    super().__init__()
    self.embedding = nn.Embedding(vocab_size ,embedding_dim=50 )
    self.rnn = nn.RNN(50 , 64 , batch_first=True)
    self.fc = nn.Linear(64 , vocab_size)

  def forward(self , question):
    embedded_question = self.embedding(question)
    hidden , final = self.rnn(embedded_question)
    output = self.fc(final.squeeze(0))
    return output

In [39]:
model = myrrn(len(vocab))
epoch = 25
learning_rate = 0.1
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters() , lr = learning_rate)

In [41]:
for epoch in range(epochs):
  total_loss = 0
  for question , answer in dataloader:
    optimizer.zero_grad()

    output = model(question)

    loss = criterion(output , answer[0])

    loss.backward()

    optimizer.step()

  total_loss += loss.item()
  print(f"epochs {epoch+1} , loss : {total_loss}")

epochs 1 , loss : 18.672677993774414
epochs 2 , loss : 26.353595733642578
epochs 3 , loss : 39.13314437866211
epochs 4 , loss : 17.811885833740234
epochs 5 , loss : 19.35320281982422
epochs 6 , loss : 30.85173988342285
epochs 7 , loss : 8.11328411102295
epochs 8 , loss : 0.00039414744242094457
epochs 9 , loss : 7.507776260375977
epochs 10 , loss : 8.34461570775602e-06
epochs 11 , loss : 0.0
epochs 12 , loss : 0.0
epochs 13 , loss : 9.651334762573242
epochs 14 , loss : 1.0167295932769775
epochs 15 , loss : 0.0
epochs 16 , loss : 0.0
epochs 17 , loss : 0.0
epochs 18 , loss : 0.00019035911827813834
epochs 19 , loss : 0.0
epochs 20 , loss : 0.0
epochs 21 , loss : 0.0
epochs 22 , loss : 0.0
epochs 23 , loss : 0.06441496312618256
epochs 24 , loss : 66.85706329345703
epochs 25 , loss : 0.0


In [42]:
def predictor(model , question , threshold = 0.5):
  numerical_question = text_indices(question , vocab)

  question_tensor = torch.tensor(numerical_question).unsqueeze(0)

  output = model(question_tensor)

  prob = nn.functional.softmax(output , dim = 1)

  value , index = torch.max(prob , dim=1)

  if value < threshold:
    print('i dont know')
  else:
    print(list(vocab.keys())[index])



In [43]:
predictor(model , "what is capital of france")

paris
