In [2]:
import pandas as pd
df = pd.read_csv('100_Unique_QA_Dataset.csv')
df.head(10)

Unnamed: 0,question,answer
0,What is the capital of France?,Paris
1,What is the capital of Germany?,Berlin
2,Who wrote 'To Kill a Mockingbird'?,Harper-Lee
3,What is the largest planet in our solar system?,Jupiter
4,What is the boiling point of water in Celsius?,100
5,Who painted the Mona Lisa?,Leonardo-da-Vinci
6,What is the square root of 64?,8
7,What is the chemical symbol for gold?,Au
8,Which year did World War II end?,1945
9,What is the longest river in the world?,Nile


In [9]:
# converting the english text into numbers

# tokenize -> seperating each and ever word ,so that we can allot tokens
def tokenize(text):
    text = text.lower()  # converted all the alphabets into lower letters
    text = text.replace('?' , '') # replaced ? with nothing
    text = text.replace(" ' " , "") # replaced ' with nothing
    return text.split()

In [10]:
tokenize("What is the capital of Germany?")

['what', 'is', 'the', 'capital', 'of', 'germany']

In [11]:
# vocabulary -> how many unique words are there is our dataset , and providing each unique word "index"
vocab = {'<UNK>' :0} # unknown toekn -> if in future there are some words which are not in our dataset , we will replace the respective word with the unknown token
         

In [18]:
def build_vocab(row): 
    print(row['question'] , row['answer'] )  # untokized question and answer

In [21]:
def build_vocab(row):
  tokenized_question = tokenize(row['question']) # tokenized question stored in tokenized_question
  tokenized_answer = tokenize(row['answer']) # tokenized answer stored in tokenized_answer

  merged_tokens = tokenized_question + tokenized_answer # merged the two ,  in a single list all will get printed

  for token in merged_tokens:

    if token not in vocab:      # checking that is the token available in the dictionary 
      vocab[token] = len(vocab)  # if not available , we will add the vocab


In [22]:
df.apply(build_vocab , axis = 1)

0     None
1     None
2     None
3     None
4     None
      ... 
85    None
86    None
87    None
88    None
89    None
Length: 90, dtype: object

In [23]:
vocab

{'<UNK>': 0,
 'what': 1,
 'is': 2,
 'the': 3,
 'capital': 4,
 'of': 5,
 'france': 6,
 'paris': 7,
 'germany': 8,
 'berlin': 9,
 'who': 10,
 'wrote': 11,
 "'to": 12,
 'kill': 13,
 'a': 14,
 "mockingbird'": 15,
 'harper-lee': 16,
 'largest': 17,
 'planet': 18,
 'in': 19,
 'our': 20,
 'solar': 21,
 'system': 22,
 'jupiter': 23,
 'boiling': 24,
 'point': 25,
 'water': 26,
 'celsius': 27,
 '100': 28,
 'painted': 29,
 'mona': 30,
 'lisa': 31,
 'leonardo-da-vinci': 32,
 'square': 33,
 'root': 34,
 '64': 35,
 '8': 36,
 'chemical': 37,
 'symbol': 38,
 'for': 39,
 'gold': 40,
 'au': 41,
 'which': 42,
 'year': 43,
 'did': 44,
 'world': 45,
 'war': 46,
 'ii': 47,
 'end': 48,
 '1945': 49,
 'longest': 50,
 'river': 51,
 'nile': 52,
 'japan': 53,
 'tokyo': 54,
 'developed': 55,
 'theory': 56,
 'relativity': 57,
 'albert-einstein': 58,
 'freezing': 59,
 'fahrenheit': 60,
 '32': 61,
 'known': 62,
 'as': 63,
 'red': 64,
 'mars': 65,
 'author': 66,
 "'1984'": 67,
 'george-orwell': 68,
 'currency': 69,
 '

In [24]:
len(vocab)

326

In [25]:
# convert words  tp numerical indices -> eg - "What" , will be replaced by its token number
def text_to_indices(text, vocab):

  indexed_text = []

  for token in tokenize(text):

    if token in vocab:
      indexed_text.append(vocab[token])
    else:
      indexed_text.append(vocab['<UNK>'])

  return indexed_text

In [28]:
text_to_indices("Who is Sarvambh" , vocab) # who = 10 , is = 2 , sarvambh -> is unknown as it is not present in the dictionary hence replaced by unknown - 0

[10, 2, 0]

In [27]:
vocab

{'<UNK>': 0,
 'what': 1,
 'is': 2,
 'the': 3,
 'capital': 4,
 'of': 5,
 'france': 6,
 'paris': 7,
 'germany': 8,
 'berlin': 9,
 'who': 10,
 'wrote': 11,
 "'to": 12,
 'kill': 13,
 'a': 14,
 "mockingbird'": 15,
 'harper-lee': 16,
 'largest': 17,
 'planet': 18,
 'in': 19,
 'our': 20,
 'solar': 21,
 'system': 22,
 'jupiter': 23,
 'boiling': 24,
 'point': 25,
 'water': 26,
 'celsius': 27,
 '100': 28,
 'painted': 29,
 'mona': 30,
 'lisa': 31,
 'leonardo-da-vinci': 32,
 'square': 33,
 'root': 34,
 '64': 35,
 '8': 36,
 'chemical': 37,
 'symbol': 38,
 'for': 39,
 'gold': 40,
 'au': 41,
 'which': 42,
 'year': 43,
 'did': 44,
 'world': 45,
 'war': 46,
 'ii': 47,
 'end': 48,
 '1945': 49,
 'longest': 50,
 'river': 51,
 'nile': 52,
 'japan': 53,
 'tokyo': 54,
 'developed': 55,
 'theory': 56,
 'relativity': 57,
 'albert-einstein': 58,
 'freezing': 59,
 'fahrenheit': 60,
 '32': 61,
 'known': 62,
 'as': 63,
 'red': 64,
 'mars': 65,
 'author': 66,
 "'1984'": 67,
 'george-orwell': 68,
 'currency': 69,
 '

In [None]:
# now we have to go to our each row and questions and answer , and convert it into numerical indices
# we will taek hellp of data set and data loaders classes

In [29]:
import torch
from torch.utils.data import Dataset , DataLoader

In [34]:
class QADataset(Dataset):

  def __init__(self, df, vocab):
    self.df = df
    self.vocab = vocab

  def __len__(self):
    return self.df.shape[0]

  def __getitem__(self, index):

    numerical_question = text_to_indices(self.df.iloc[index]['question'], self.vocab) # converted the whole question text into respecctive indices and stored it in numerical_question
    numerical_answer = text_to_indices(self.df.iloc[index]['answer'], self.vocab) # converted the whole answer text into respective indices adn stored it in numerical_answer

    return torch.tensor(numerical_question), torch.tensor(numerical_answer) # converetd the respectiev indices into tensor

In [35]:
dataset = QADataset(df, vocab)

In [37]:
dataloader = DataLoader(dataset, batch_size = 1 , shuffle = True)

In [38]:
for question , answer in dataloader:
    print(question , answer)

tensor([[  1,   2,   3,  33,  34,   5, 247]]) tensor([[248]])
tensor([[  1,   2,   3,  92, 137,  19,   3,  45]]) tensor([[186]])
tensor([[10, 29,  3, 30, 31]]) tensor([[32]])
tensor([[ 78,  79, 129,  81,  19,   3,  21,  22]]) tensor([[36]])
tensor([[42, 86, 87, 88, 89, 39, 90]]) tensor([[91]])
tensor([[ 1,  2,  3, 24, 25,  5, 26, 19, 27]]) tensor([[28]])
tensor([[ 10,  11, 158, 159, 160]]) tensor([[161]])
tensor([[ 78,  79, 196,  81,  19,   3, 197, 198, 199]]) tensor([[200]])
tensor([[ 1,  2,  3, 69,  5, 53]]) tensor([[262]])
tensor([[ 42, 265, 266,  14, 267, 268, 159, 269]]) tensor([[270]])
tensor([[ 42, 292, 293, 118, 294, 159, 295, 296]]) tensor([[297]])
tensor([[  1,   2,   3,  37,  38,  39, 162]]) tensor([[163]])
tensor([[ 42, 217, 118, 218, 219,  19,  14, 220,  43]]) tensor([[221]])
tensor([[  1,   2,   3,   4,   5, 288]]) tensor([[289]])
tensor([[78, 79, 80, 81, 82, 83, 84]]) tensor([[85]])
tensor([[ 1,  2,  3,  4,  5, 99]]) tensor([[100]])
tensor([[ 42,  18,   2,   3, 283, 143,

In [None]:
# buiding the RNN ARCHITECTURE MODEL

In [40]:
import torch.nn as nn

In [43]:
class SimpleRNN(nn.Module):

  def __init__(self, vocab_size):
    super().__init__()
    self.embedding = nn.Embedding(vocab_size, embedding_dim=50) # converting your each word into a 50 dimensional layer  
    self.rnn = nn.RNN(50, 64, batch_first=True)
    self.fc = nn.Linear(64, vocab_size)

  def forward(self, question):
    embedded_question = self.embedding(question)
    hidden, final = self.rnn(embedded_question)
    output = self.fc(final.squeeze(0))

    return output

In [44]:
dataset[0][0]

tensor([1, 2, 3, 4, 5, 6])

In [45]:
x = nn.Embedding(324 , embedding_dim = 50)

In [47]:
a = x(dataset[0][0])
a

tensor([[ 1.2503e+00, -1.0524e+00,  5.6339e-01, -2.4573e-01, -1.1826e+00,
         -7.0119e-02,  8.4112e-01,  1.3265e+00,  1.7870e-01, -3.3997e-01,
          1.9113e+00, -3.1746e-01, -3.1415e+00, -8.2214e-01,  2.1212e-01,
         -1.0891e+00,  8.8661e-01,  1.6221e+00, -4.2221e-01,  9.7907e-01,
          1.8672e+00,  4.1413e-04, -5.6157e-01, -1.2633e+00,  2.7772e-01,
          9.5701e-01, -9.0328e-01,  1.6230e-01, -3.3102e-01, -1.1106e+00,
          4.4981e-02,  1.2929e+00,  1.9248e+00,  7.9192e-01,  1.2147e+00,
         -3.7983e-01,  1.8594e+00, -1.2367e-01, -1.3795e+00, -8.0557e-02,
         -1.1671e+00, -1.4057e+00,  3.5313e-02, -1.3959e+00,  3.9498e-01,
         -6.3847e-01,  2.0553e-01,  1.0085e+00, -2.8827e-01,  8.7322e-01],
        [-7.2541e-01,  7.8185e-01,  4.2582e-01, -3.5482e-01, -1.4095e-01,
         -1.0340e-01, -1.7049e+00, -1.4701e+00,  1.2551e+00,  2.0202e+00,
          8.1504e-01, -1.0414e+00, -1.9095e+00, -2.3607e+00, -2.4665e-01,
         -4.7142e-01,  8.1917e-01, -9

In [48]:
 y = nn.RNN(50 , 64)

In [49]:
y(a) # tuple ke andar 2D tensor 

(tensor([[ 0.4502, -0.4295,  0.3240, -0.4273, -0.3014, -0.5953, -0.1974,  0.1112,
           0.6696, -0.3774, -0.1458,  0.0313, -0.5461,  0.3412,  0.1190, -0.6218,
          -0.6943,  0.3040, -0.4488, -0.3985, -0.4769, -0.0148,  0.0476,  0.0398,
           0.1206,  0.0245,  0.2245, -0.5463, -0.5081, -0.7515,  0.6875, -0.2168,
          -0.4771, -0.3542,  0.2023,  0.3320,  0.6957,  0.3494,  0.7697,  0.4478,
          -0.2752, -0.3144, -0.4444,  0.3991,  0.1023, -0.0470,  0.4776,  0.4911,
           0.0946, -0.4103, -0.3804,  0.0035,  0.3420,  0.7257,  0.2220,  0.0363,
          -0.5749, -0.1331, -0.2146,  0.0432, -0.3537, -0.1544, -0.8328, -0.0790],
         [-0.5564,  0.2084, -0.7508, -0.2657,  0.2320, -0.7379, -0.1366,  0.7019,
          -0.3935,  0.6444,  0.4816,  0.4393, -0.1858,  0.2224,  0.0222, -0.0561,
           0.0157,  0.4280, -0.0966, -0.5466,  0.5270, -0.3998, -0.1918,  0.2335,
          -0.4434,  0.4260, -0.2151, -0.3599,  0.1738, -0.5136, -0.0836, -0.5972,
          -0.06

In [50]:
y(a)[0].shape  # hidden state output

torch.Size([6, 64])

In [51]:
b = y(a)[1].shape  #final state output stored in b

torch.Size([1, 64])

In [57]:
learning_rate = 0.001
epochs = 20

In [58]:
model = SimpleRNN(len(vocab))

In [60]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [62]:
# training loop

for epoch in range(epochs):  # 20 times the loop will get executed

  total_loss = 0  # for measuring loss

  for question, answer in dataloader:  

    optimizer.zero_grad()

    # forward pass
    output = model(question)

    # loss -> output shape (1,324) - (1)
    loss = criterion(output, answer[0])

    # gradients
    loss.backward()

    # update
    optimizer.step()

    total_loss = total_loss + loss.item()

  print(f"Epoch: {epoch+1}, Loss: {total_loss:4f}")

Epoch: 1, Loss: 10.719632
Epoch: 2, Loss: 9.530342
Epoch: 3, Loss: 8.322802
Epoch: 4, Loss: 7.495620
Epoch: 5, Loss: 6.774483
Epoch: 6, Loss: 6.005592
Epoch: 7, Loss: 5.424946
Epoch: 8, Loss: 4.887863
Epoch: 9, Loss: 4.451186
Epoch: 10, Loss: 4.067106
Epoch: 11, Loss: 3.691444
Epoch: 12, Loss: 3.418797
Epoch: 13, Loss: 3.140523
Epoch: 14, Loss: 2.897867
Epoch: 15, Loss: 2.681110
Epoch: 16, Loss: 2.478679
Epoch: 17, Loss: 2.299879
Epoch: 18, Loss: 2.142529
Epoch: 19, Loss: 1.998504
Epoch: 20, Loss: 1.864453


In [63]:
def predict(model, question, threshold=0.5):

  # convert question to numbers
  numerical_question = text_to_indices(question, vocab)

  # tensor
  question_tensor = torch.tensor(numerical_question).unsqueeze(0)

  # send to model
  output = model(question_tensor)

  # convert logits to probs
  probs = torch.nn.functional.softmax(output, dim=1)

  # find index of max prob
  value, index = torch.max(probs, dim=1)

  if value < threshold:
    print("I don't know")

  print(list(vocab.keys())[index])

In [70]:
predict(model, "Who painted the Mona Lisa?")

leonardo-da-vinci


In [71]:
list(vocab.keys())[7]

'paris'