In [235]:
import pandas as pd

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F

# Prefer the GPU when PyTorch can see one, otherwise fall back to the CPU.
# NOTE(review): none of the visible cells move the model or tensors to `device`.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [236]:
# Show which device was selected.
device

device(type='cpu')

In [237]:
# Load the question/answer pairs.
# NOTE(review): hard-coded absolute path; presumably a Colab upload location — confirm before running elsewhere.
df = pd.read_csv('/content/100_Unique_QA_Dataset.csv')

In [238]:
# Preview the first rows: one 'question' and one 'answer' column.
df.head()

Unnamed: 0,question,answer
0,What is the capital of France?,Paris
1,What is the capital of Germany?,Berlin
2,Who wrote 'To Kill a Mockingbird'?,Harper-Lee
3,What is the largest planet in our solar system?,Jupiter
4,What is the boiling point of water in Celsius?,100


In [239]:
# (rows, columns) of the loaded frame.
df.shape

(90, 2)

In [240]:
def Tokenize(text):
  """Lower-case ``text``, drop ``?`` and ``'``, and split it into word tokens.

  Splitting happens on runs of whitespace, so repeated, leading or trailing
  spaces (as well as tabs/newlines) never produce empty-string tokens.

  Args:
    text: The question or answer string to tokenize.

  Returns:
    A list of lower-cased tokens; empty when ``text`` contains no
    non-whitespace characters after cleaning.
  """
  text = text.lower()
  text = text.replace('?', '')
  text = text.replace("'", '')
  # split() with no separator collapses whitespace runs; split(' ') would
  # return '' tokens for inputs such as "a  b" or "a ".
  return text.split()

In [241]:
# Quick check of the tokenizer on a sample sentence.
Tokenize("What are you doing?")

['what', 'are', 'you', 'doing']

In [242]:
def GetVocab(df):
  """Build a token -> integer-id mapping from the 'question'/'answer' columns.

  Id 0 is reserved for '<UNK>'. Rows are visited in order; within a row the
  answer's tokens are registered before the question's, and every token not
  seen before receives the next free id.
  """
  token_to_id = {'<UNK>': 0}
  for question, answer in zip(df['question'], df['answer']):
    question_tokens = Tokenize(question)
    answer_tokens = Tokenize(answer)
    # Answer tokens first, then question tokens, so ids follow that order.
    for token in answer_tokens + question_tokens:
      token_to_id.setdefault(token, len(token_to_id))
  return token_to_id

In [243]:
# Build the vocabulary from every question and answer in the frame.
vocab = GetVocab(df)
# The vocabulary size is passed to the model below as `vocab_size`.
vocab_size = len(vocab)
print("Vocabulary size = ", vocab_size)

Vocabulary size =  324


In [244]:
# Inspect the token -> id mapping ('<UNK>' is id 0).
vocab

{'<UNK>': 0,
 'paris': 1,
 'what': 2,
 'is': 3,
 'the': 4,
 'capital': 5,
 'of': 6,
 'france': 7,
 'berlin': 8,
 'germany': 9,
 'harper-lee': 10,
 'who': 11,
 'wrote': 12,
 'to': 13,
 'kill': 14,
 'a': 15,
 'mockingbird': 16,
 'jupiter': 17,
 'largest': 18,
 'planet': 19,
 'in': 20,
 'our': 21,
 'solar': 22,
 'system': 23,
 '100': 24,
 'boiling': 25,
 'point': 26,
 'water': 27,
 'celsius': 28,
 'leonardo-da-vinci': 29,
 'painted': 30,
 'mona': 31,
 'lisa': 32,
 '8': 33,
 'square': 34,
 'root': 35,
 '64': 36,
 'au': 37,
 'chemical': 38,
 'symbol': 39,
 'for': 40,
 'gold': 41,
 '1945': 42,
 'which': 43,
 'year': 44,
 'did': 45,
 'world': 46,
 'war': 47,
 'ii': 48,
 'end': 49,
 'nile': 50,
 'longest': 51,
 'river': 52,
 'tokyo': 53,
 'japan': 54,
 'albert-einstein': 55,
 'developed': 56,
 'theory': 57,
 'relativity': 58,
 '32': 59,
 'freezing': 60,
 'fahrenheit': 61,
 'mars': 62,
 'known': 63,
 'as': 64,
 'red': 65,
 'george-orwell': 66,
 'author': 67,
 '1984': 68,
 'pound': 69,
 'currenc

In [245]:
def TextToIndex(text, vocab):
  """Encode ``text`` as a list of vocabulary ids, one per token.

  Tokens that are missing from ``vocab`` are replaced by the id stored
  under '<UNK>'.
  """
  return [
    vocab[token] if token in vocab else vocab['<UNK>']
    for token in Tokenize(text)
  ]

In [246]:
# Sanity check: words that are not in the vocabulary come back as 0 ('<UNK>').
TextToIndex("What are you doing?", vocab)

[2, 82, 0, 0]

In [247]:
class CustomDataset(Dataset):
  """Dataset of (question, answer) pairs encoded as vocabulary-id tensors."""

  def __init__(self, df, vocab):
    # Only references are stored; rows are encoded lazily in __getitem__.
    self.vocab = vocab
    self.df = df

  def __len__(self):
    """Number of rows in the wrapped frame."""
    return len(self.df)

  def __getitem__(self, idx):
    """Return row ``idx`` as a ``(question_ids, answer_ids)`` pair of tensors."""
    row = self.df.iloc[idx]
    question_ids = TextToIndex(row['question'], self.vocab)
    answer_ids = TextToIndex(row['answer'], self.vocab)
    return torch.tensor(question_ids), torch.tensor(answer_ids)

In [248]:
# Wrap the frame so each row is served as a pair of id tensors.
data = CustomDataset(df, vocab)

In [249]:
# One sample per batch: the dataset returns tensors whose length depends on the
# number of tokens, and no padding/collate function is supplied here.
batch_size = 1
train_loader = DataLoader(data, batch_size = batch_size, shuffle = True)

In [278]:
class RNNNetwork(nn.Module):
  """Two stacked vanilla RNNs that map a question to logits over the vocabulary.

  Args:
    vocab_size: Number of vocabulary entries; sizes both the embedding table
      and the output layer.
    embedding_dim: Width of the token embeddings (default 50).
    hidden_dim1: Hidden size of the first RNN (default 64).
    hidden_dim2: Hidden size of the second RNN (default 32).
  """

  def __init__(self, vocab_size, embedding_dim=50, hidden_dim1=64, hidden_dim2=32):
    super().__init__()
    self.embeddings = nn.Embedding(vocab_size, embedding_dim)
    # batch_first=True: inputs are laid out as (batch, seq_len, features).
    # Without it nn.RNN treats dim 1 as the batch, so the final hidden state
    # would hold one row per token instead of one row per question.
    self.rnn1 = nn.RNN(embedding_dim, hidden_dim1, batch_first=True)
    self.rnn2 = nn.RNN(hidden_dim1, hidden_dim2, batch_first=True)
    self.fc = nn.Linear(hidden_dim2, vocab_size)

  def forward(self, X):
    """Return logits of shape (batch, vocab_size) for token ids ``X`` of shape (batch, seq_len)."""
    x = self.embeddings(X)
    hidden, _ = self.rnn1(x)
    _, final = self.rnn2(hidden)
    # `final` is (num_layers=1, batch, hidden_dim2); drop the layer axis.
    result = self.fc(final.squeeze(0))

    return result

In [279]:
# Seed PyTorch's global RNG so weight initialisation (and the shuffled batch
# order drawn afterwards) is reproducible on Restart & Run All.
torch.manual_seed(42)
model = RNNNetwork(vocab_size=vocab_size)

In [280]:
# Training hyper-parameters: number of passes over the data and Adam's step size.
epoches = 50
lr = 0.01

In [281]:
# Cross-entropy over the vocabulary logits; Adam updates every model parameter.
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr = lr)

In [282]:
# Pull a single batch for inspection; `ques`/`ans` are rebound by the training loop below.
ques, ans = next(iter(train_loader))

In [283]:
# Mean loss of every epoch. Created once, before the loop, so the whole history
# is kept (re-creating the list inside the loop kept only the last epoch).
losses = []
for epoch in range(epoches):
  total_loss = 0
  for ques, ans in train_loader:
    # Clear gradients left over from the previous step.
    optimizer.zero_grad()
    y = model(ques)
    # The DataLoader adds a leading batch axis; ans[0] is the 1-D target for the
    # single sample in the batch. y has one row, so this assumes batch_size == 1
    # and one-token answers.
    loss = loss_fn(y, ans[0])
    loss.backward()
    optimizer.step()
    total_loss += loss.item()
  epoch_loss = total_loss / len(train_loader)
  losses.append(epoch_loss)
  print(f"For epoch {epoch} loss is = {epoch_loss}")

For epoch 0 loss is = 6.055722533331977
For epoch 1 loss is = 5.104307360119289
For epoch 2 loss is = 4.527994441986084
For epoch 3 loss is = 3.7568561183081735
For epoch 4 loss is = 3.0363816605673897
For epoch 5 loss is = 2.3815154559082456
For epoch 6 loss is = 2.0227703785730733
For epoch 7 loss is = 1.3425842677553494
For epoch 8 loss is = 1.0798106542891925
For epoch 9 loss is = 0.7939035400748253
For epoch 10 loss is = 0.6001515850838688
For epoch 11 loss is = 0.5237902238654594
For epoch 12 loss is = 0.3624399169037739
For epoch 13 loss is = 0.2601043551332421
For epoch 14 loss is = 0.2889236038136813
For epoch 15 loss is = 0.21791919148009684
For epoch 16 loss is = 0.1445754893331064
For epoch 17 loss is = 0.1153852140944865
For epoch 18 loss is = 0.1297211481879155
For epoch 19 loss is = 0.10257399565436774
For epoch 20 loss is = 0.10855210337580906
For epoch 21 loss is = 0.1079491542548769
For epoch 22 loss is = 0.054477929970663454
For epoch 23 loss is = 0.05477425753552881

In [284]:
def predict(model, ques, vocab, threshold = 0.5):
  """Print the model's answer to ``ques``, or "I don't know" when it is unsure.

  Args:
    model: Network mapping a (1, seq_len) tensor of token ids to logits over
      the vocabulary.
    ques: Question text.
    vocab: Token -> id mapping used to encode the question and decode the answer.
    threshold: Minimum softmax probability required to print the answer.
  """
  numerical_ques = torch.tensor(TextToIndex(ques, vocab)).unsqueeze(0)
  # Inference only: do not record operations for autograd.
  with torch.no_grad():
    y = model(numerical_ques)
  y_cap = F.softmax(y, dim = 1)
  prob, idx = torch.max(y_cap, dim = 1)

  if prob.item() >= threshold:
    # Decode by the stored id rather than by position in the dict, so the
    # result does not depend on insertion order matching the ids.
    idx_to_word = {index: word for word, index in vocab.items()}
    print(idx_to_word[idx.item()])
  else:
    print("I don't know")

In [285]:
# Ask the trained model the question stored under label 3 of the frame.
print(df['question'][3])
predict(model, df['question'][3], vocab)

What is the largest planet in our solar system?
jupiter


In [286]:
# Ask the trained model the question stored under label 4 of the frame.
print(df['question'][4])
predict(model, df['question'][4], vocab)

What is the boiling point of water in Celsius?
100


In [287]:
# Ask the trained model the question stored under label 6 of the frame.
print(df['question'][6])
predict(model, df['question'][6], vocab)

What is the square root of 64?
8
