<a href="https://colab.research.google.com/github/Shaileshps21/pytorch-/blob/main/pytorch_10_rnn_qa_system.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset , DataLoader
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [3]:
# Load the question/answer pairs; the relative path is resolved against the current working directory.
df = pd.read_csv('100_Unique_QA_Dataset.csv')
# Bare last expression so the notebook renders the frame.
df

Unnamed: 0,question,answer
0,What is the capital of France?,Paris
1,What is the capital of Germany?,Berlin
2,Who wrote 'To Kill a Mockingbird'?,Harper-Lee
3,What is the largest planet in our solar system?,Jupiter
4,What is the boiling point of water in Celsius?,100
...,...,...
85,Who directed the movie 'Titanic'?,JamesCameron
86,Which superhero is also known as the Dark Knight?,Batman
87,What is the capital of Brazil?,Brasilia
88,Which fruit is known as the king of fruits?,Mango


In [4]:
# tokenize
def tokenize(text):
  """Lower-case `text`, remove every '?' and apostrophe, and split on whitespace.

  Returns the list of resulting tokens (empty when nothing is left).
  """
  # Translation table that maps nothing and deletes the two punctuation marks.
  strip_table = str.maketrans("", "", "?'")
  return text.lower().translate(strip_table).split()

In [5]:
# Sanity check: the question mark is stripped and the text is split on whitespace.
tokenize('what is the capital of france?')

['what', 'is', 'the', 'capital', 'of', 'france']

In [6]:
# vocabulary
def build_vocab(row):
    """Add every unseen token of one question/answer row to the global `vocab`.

    `row` must support `row['question']` and `row['answer']`. Each new token is
    assigned the next free integer id (the current size of `vocab`). The merged
    token list is printed, and nothing is returned.
    """
    merged_tokens = tokenize(row['question']) + tokenize(row['answer'])
    print(merged_tokens)

    for token in merged_tokens:
        # setdefault leaves existing ids untouched; len(vocab) is read before any insert
        vocab.setdefault(token, len(vocab))

In [7]:
# Start from the unknown-token entry (id 0); re-running this cell rebuilds the vocabulary from scratch.
vocab = {'<UNK>': 0}
# apply is used only for build_vocab's side effect on `vocab`, once per row (axis = 1).
df.apply(build_vocab , axis = 1)

['what', 'is', 'the', 'capital', 'of', 'france', 'paris']
['what', 'is', 'the', 'capital', 'of', 'germany', 'berlin']
['who', 'wrote', 'to', 'kill', 'a', 'mockingbird', 'harper-lee']
['what', 'is', 'the', 'largest', 'planet', 'in', 'our', 'solar', 'system', 'jupiter']
['what', 'is', 'the', 'boiling', 'point', 'of', 'water', 'in', 'celsius', '100']
['who', 'painted', 'the', 'mona', 'lisa', 'leonardo-da-vinci']
['what', 'is', 'the', 'square', 'root', 'of', '64', '8']
['what', 'is', 'the', 'chemical', 'symbol', 'for', 'gold', 'au']
['which', 'year', 'did', 'world', 'war', 'ii', 'end', '1945']
['what', 'is', 'the', 'longest', 'river', 'in', 'the', 'world', 'nile']
['what', 'is', 'the', 'capital', 'of', 'japan', 'tokyo']
['who', 'developed', 'the', 'theory', 'of', 'relativity', 'albert-einstein']
['what', 'is', 'the', 'freezing', 'point', 'of', 'water', 'in', 'fahrenheit', '32']
['which', 'planet', 'is', 'known', 'as', 'the', 'red', 'planet', 'mars']
['who', 'is', 'the', 'author', 'of', '19

Unnamed: 0,0
0,
1,
2,
3,
4,
...,...
85,
86,
87,
88,


In [8]:
# Inspect the token -> id mapping filled in by build_vocab.
vocab

{'<UNK>': 0,
 'what': 1,
 'is': 2,
 'the': 3,
 'capital': 4,
 'of': 5,
 'france': 6,
 'paris': 7,
 'germany': 8,
 'berlin': 9,
 'who': 10,
 'wrote': 11,
 'to': 12,
 'kill': 13,
 'a': 14,
 'mockingbird': 15,
 'harper-lee': 16,
 'largest': 17,
 'planet': 18,
 'in': 19,
 'our': 20,
 'solar': 21,
 'system': 22,
 'jupiter': 23,
 'boiling': 24,
 'point': 25,
 'water': 26,
 'celsius': 27,
 '100': 28,
 'painted': 29,
 'mona': 30,
 'lisa': 31,
 'leonardo-da-vinci': 32,
 'square': 33,
 'root': 34,
 '64': 35,
 '8': 36,
 'chemical': 37,
 'symbol': 38,
 'for': 39,
 'gold': 40,
 'au': 41,
 'which': 42,
 'year': 43,
 'did': 44,
 'world': 45,
 'war': 46,
 'ii': 47,
 'end': 48,
 '1945': 49,
 'longest': 50,
 'river': 51,
 'nile': 52,
 'japan': 53,
 'tokyo': 54,
 'developed': 55,
 'theory': 56,
 'relativity': 57,
 'albert-einstein': 58,
 'freezing': 59,
 'fahrenheit': 60,
 '32': 61,
 'known': 62,
 'as': 63,
 'red': 64,
 'mars': 65,
 'author': 66,
 '1984': 67,
 'george-orwell': 68,
 'currency': 69,
 'unit

In [9]:
# convert words to numerical indices -- needed because nn.Embedding looks vectors up by integer id, not by string

def text_to_indices(text, vocab):
  """Translate `text` into a list of vocabulary ids, one per token.

  The text is split with `tokenize`; any token missing from `vocab` is
  replaced by the id stored under '<UNK>'.
  """
  return [
      vocab[token] if token in vocab else vocab['<UNK>']
      for token in tokenize(text)
  ]

In [10]:
# Quick check: words already in the vocabulary map to their ids, regardless of case.
text_to_indices("What is France" , vocab)

[1, 2, 6]

In [11]:
# Display the vocabulary again for reference (text_to_indices only reads it).
vocab

{'<UNK>': 0,
 'what': 1,
 'is': 2,
 'the': 3,
 'capital': 4,
 'of': 5,
 'france': 6,
 'paris': 7,
 'germany': 8,
 'berlin': 9,
 'who': 10,
 'wrote': 11,
 'to': 12,
 'kill': 13,
 'a': 14,
 'mockingbird': 15,
 'harper-lee': 16,
 'largest': 17,
 'planet': 18,
 'in': 19,
 'our': 20,
 'solar': 21,
 'system': 22,
 'jupiter': 23,
 'boiling': 24,
 'point': 25,
 'water': 26,
 'celsius': 27,
 '100': 28,
 'painted': 29,
 'mona': 30,
 'lisa': 31,
 'leonardo-da-vinci': 32,
 'square': 33,
 'root': 34,
 '64': 35,
 '8': 36,
 'chemical': 37,
 'symbol': 38,
 'for': 39,
 'gold': 40,
 'au': 41,
 'which': 42,
 'year': 43,
 'did': 44,
 'world': 45,
 'war': 46,
 'ii': 47,
 'end': 48,
 '1945': 49,
 'longest': 50,
 'river': 51,
 'nile': 52,
 'japan': 53,
 'tokyo': 54,
 'developed': 55,
 'theory': 56,
 'relativity': 57,
 'albert-einstein': 58,
 'freezing': 59,
 'fahrenheit': 60,
 '32': 61,
 'known': 62,
 'as': 63,
 'red': 64,
 'mars': 65,
 'author': 66,
 '1984': 67,
 'george-orwell': 68,
 'currency': 69,
 'unit

In [12]:
class QADataset(Dataset):
  """Dataset that serves each row of a question/answer frame as two id tensors."""

  def __init__(self , df , vocab):
    # df: frame with 'question' and 'answer' columns; vocab: token -> id mapping
    self.df = df
    self.vocab = vocab

  def __len__(self):
    """Number of rows in the underlying frame."""
    return len(self.df)

  def __getitem__(self, index):
    """Return (question_ids, answer_ids) tensors for the row at position `index`."""
    row = self.df.iloc[index]
    question_ids = text_to_indices(row['question'], self.vocab)
    answer_ids = text_to_indices(row['answer'], self.vocab)
    return torch.tensor(question_ids), torch.tensor(answer_ids)

In [13]:
# Dataset over the full frame; text is encoded on access with the vocabulary built earlier.
dataset = QADataset(df, vocab)

In [14]:
# batch_size=1: each batch holds a single question, so questions of different lengths never have to be stacked.
# shuffle=True: rows are visited in a new random order on every pass.
dataloader = DataLoader(dataset, batch_size=1 , shuffle=True)

In [15]:
# Iterate once over the loader to inspect the (question, answer) id tensors it yields.
for question, answer in dataloader:
  print(question , answer)

tensor([[ 78,  79, 150, 151,  14, 152, 153]]) tensor([[154]])
tensor([[  1,   2,   3, 234,   5, 235]]) tensor([[131]])
tensor([[ 1,  2,  3, 50, 51, 19,  3, 45]]) tensor([[52]])
tensor([[ 42, 312,   2, 313,  62,  63,   3, 314, 315]]) tensor([[316]])
tensor([[  1,   2,   3,  37, 133,   5,  26]]) tensor([[134]])
tensor([[ 42,  18,   2,   3, 281,  12,   3, 282]]) tensor([[205]])
tensor([[ 10,  11, 189, 158, 190]]) tensor([[191]])
tensor([[78, 79, 80, 81, 82, 83, 84]]) tensor([[85]])
tensor([[10, 96,  3, 97]]) tensor([[98]])
tensor([[ 10, 308,   3, 309, 310]]) tensor([[311]])
tensor([[ 1,  2,  3, 92, 93, 94]]) tensor([[95]])
tensor([[  1,   2,   3, 103,   5, 104,  19, 105]]) tensor([[106]])
tensor([[ 10,  96,   3, 104, 239]]) tensor([[240]])
tensor([[  1,   2,   3, 141, 117,  83,   3, 277, 278]]) tensor([[121]])
tensor([[ 78,  79, 288,  81,  19,  14, 289]]) tensor([[85]])
tensor([[ 42, 107,   2, 108,  19, 109]]) tensor([[110]])
tensor([[ 1,  2,  3, 59, 25,  5, 26, 19, 60]]) tensor([[61]])
t

In [16]:
# simple rnn architecture
class simpleRNN(nn.Module):
  """Embedding -> single-layer RNN -> linear classifier over the vocabulary.

  Args:
    vocab_size: number of embedding rows and of output logits.
    embedding_dim: width of each token embedding (default 50, the previous fixed value).
    hidden_size: width of the RNN hidden state (default 64, the previous fixed value).
  """

  def __init__(self, vocab_size, embedding_dim=50, hidden_size=64):
    super().__init__()
    self.embedding = nn.Embedding(vocab_size, embedding_dim=embedding_dim)
    # batch_first=True: tensors are laid out as (batch, seq, feature)
    self.rnn = nn.RNN(embedding_dim, hidden_size, batch_first=True)
    self.fc = nn.Linear(hidden_size, vocab_size)

  def forward(self, question):
    """Return (batch, vocab_size) logits for a (batch, seq) tensor of token ids."""
    embedded_question = self.embedding(question)
    output, _ = self.rnn(embedded_question)
    # Only the last time step feeds the classifier.
    return self.fc(output[:, -1, :])

In [17]:
# demonstration of the first layer, i.e. the embedding layer
print(dataset[0])      # (question ids, answer ids) for the first row
print(dataset[0][0])   # question ids only
print(dataset[0][1])   # answer ids only

# Size the table from the vocabulary, as the model does, instead of hard-coding 324
x = nn.Embedding(len(vocab), embedding_dim = 50)
a = x(dataset[0][0])   # one 50-dim vector per question token

print(a)

(tensor([1, 2, 3, 4, 5, 6]), tensor([7]))
tensor([1, 2, 3, 4, 5, 6])
tensor([7])
tensor([[ 1.9510,  0.3835, -0.6742,  1.0170,  0.3757, -0.0587,  0.7233, -0.0578,
          0.4467, -0.4602,  1.4304, -1.1047,  0.7228,  0.0426, -0.8533,  0.0822,
          1.8872, -0.3111, -0.1372, -0.9485, -1.2271, -1.8420, -1.4401,  0.2846,
         -1.3248,  1.9686, -0.7289,  1.3958,  0.0195,  0.8718, -1.6931,  0.0696,
          1.6488, -0.1942,  0.9614, -0.4591,  1.3682, -0.8834,  0.9048,  1.0803,
         -0.8558,  0.5434,  0.7124,  0.6957,  0.5596, -0.2251, -1.4040, -0.8118,
          1.1291,  1.3819],
        [ 1.2057,  0.7242,  0.4766,  1.8232, -0.1568, -0.6864, -0.3358,  1.4507,
         -0.2730, -0.7884, -0.2710, -0.6521, -0.9552,  0.4701,  1.9696, -0.0051,
          0.9150,  0.7413,  0.9815,  0.7858, -0.8250,  1.1154, -0.1230,  0.2708,
         -0.0479, -1.0042, -1.3345,  2.1161, -0.1875,  0.1872, -0.4203, -0.7981,
         -1.0904,  0.0945, -0.0390,  0.5773, -0.1296,  1.0336, -1.3871, -1.2780,


In [18]:
# demonstration of the second layer, i.e. the rnn layer
y = nn.RNN(50 ,64)
# Run the layer once and reuse the result; it returns the pair (output, hidden).
output, hidden = y(a)
print((output, hidden))
print("-----------------------------------------------------------------------")
print(output) # this prints the output tensor itself (one row per time step), not its shape
print("-----------------------------------------------------------------------")
print(hidden) # this prints the final hidden-state tensor itself, not its shape

b = hidden

(tensor([[-0.6204, -0.0203, -0.2887,  0.1660,  0.1322,  0.3559, -0.8063,  0.3843,
         -0.5506, -0.5945, -0.2307,  0.6002,  0.9458, -0.1357,  0.8379, -0.3584,
         -0.4037, -0.1949, -0.5161,  0.0563, -0.1205,  0.1823,  0.4058,  0.5339,
         -0.6748,  0.4014, -0.3009, -0.2334, -0.1077, -0.3784, -0.2661, -0.1139,
          0.4789, -0.1183, -0.6610,  0.2185, -0.6982, -0.1785,  0.7554,  0.2535,
         -0.3437, -0.5492, -0.4923,  0.6360,  0.2342,  0.4381,  0.3909, -0.5424,
         -0.2362, -0.0358,  0.1849, -0.1653,  0.1854, -0.0224,  0.6396, -0.1813,
          0.2941,  0.2918, -0.6228,  0.2380, -0.0316,  0.0291, -0.3072, -0.6183],
        [ 0.7398, -0.0709,  0.0421, -0.4208,  0.3960, -0.0672, -0.5374, -0.3949,
          0.6273,  0.3572,  0.2788,  0.1407,  0.3216, -0.0853,  0.3703, -0.0172,
          0.7241,  0.6509, -0.3759,  0.2308, -0.6632,  0.6344, -0.0020, -0.3236,
          0.5333,  0.0982,  0.2089,  0.7473, -0.3545, -0.2656, -0.0194,  0.1881,
          0.2953, -0.3096,

In [19]:
# demonstration of the third layer, i.e. the linear layer
# Output width follows the vocabulary size rather than a hard-coded 324
z = nn.Linear(64, len(vocab))
z(b).shape

torch.Size([1, 324])

In [20]:
# Shape walkthrough of the three layers; useful for debugging when the model fails
x = nn.Embedding(len(vocab), embedding_dim=50)
y = nn.RNN(50, 64, batch_first=True)
z = nn.Linear(64, len(vocab))

# unsqueeze(0) adds the batch axis for any question length (reshape(1,6) only fit 6 tokens)
a = dataset[0][0].unsqueeze(0)
print("shape of a:", a.shape)
b = x(a)
print("shape of b:", b.shape)
c, d = y(b)
print("shape of c:", c.shape)
print("shape of d:", d.shape)

# d carries a leading axis of size 1 in front of (batch, hidden); drop it before the linear layer
e = z(d.squeeze(0))

print("shape of e:", e.shape)

shape of a: torch.Size([1, 6])
shape of b: torch.Size([1, 6, 50])
shape of c: torch.Size([1, 6, 64])
shape of d: torch.Size([1, 1, 64])
shape of e: torch.Size([1, 324])


In [21]:
# MODEL initialization
# NOTE(review): no random seed is set in the visible cells, so weights and shuffle order presumably vary per run.
learning_rate = 0.001
epochs = 30
# The loss receives the model's raw logits together with integer target ids (see the training loop).
criterion = nn.CrossEntropyLoss()

# One output logit per vocabulary entry.
model = simpleRNN(len(vocab))
optimizer = optim.Adam(model.parameters() , lr =learning_rate)

In [22]:
# Train for `epochs` passes over the data, taking one optimizer step per batch.
for epoch in range(epochs):
  epoch_loss = 0

  for question_ids, answer_ids in dataloader:
    # Drop the second axis so the target is one class id per batch item.
    target = answer_ids.squeeze(1)

    optimizer.zero_grad()

    logits = model(question_ids)
    batch_loss = criterion(logits, target)

    batch_loss.backward()
    optimizer.step()

    epoch_loss += batch_loss.item()

  # Reports the summed (not averaged) loss of the epoch.
  print(f"epochs{epoch+1} , loss{epoch_loss}")

epochs1 , loss526.6085948944092
epochs2 , loss461.8344326019287
epochs3 , loss385.1834497451782
epochs4 , loss318.1324818134308
epochs5 , loss265.6491975784302
epochs6 , loss217.5273550748825
epochs7 , loss173.4023202061653
epochs8 , loss135.26015710830688
epochs9 , loss104.60088697075844
epochs10 , loss79.9649957716465
epochs11 , loss60.964148074388504
epochs12 , loss47.721647784113884
epochs13 , loss37.636601984500885
epochs14 , loss30.14390578120947
epochs15 , loss24.787516459822655
epochs16 , loss20.47341265529394
epochs17 , loss17.176109820604324
epochs18 , loss14.542703792452812
epochs19 , loss12.534499067813158
epochs20 , loss10.898029550909996
epochs21 , loss9.512839451432228
epochs22 , loss8.454972725361586
epochs23 , loss7.474194344133139
epochs24 , loss6.669783689081669
epochs25 , loss6.035321770235896
epochs26 , loss5.436522055417299
epochs27 , loss4.934913970530033
epochs28 , loss4.506783422082663
epochs29 , loss4.121497560292482
epochs30 , loss3.7738420516252518


In [23]:
def predict(model, question, threshold=0.5):
  """Print the model's one-word answer to `question`, or "I don't know".

  Args:
    model: callable mapping a (1, seq) tensor of token ids to (1, vocab) logits.
    question: raw question text; encoded with `text_to_indices` and the
      module-level `vocab`.
    threshold: minimum softmax probability required to trust the top answer.

  Prints exactly one line and returns None.
  """
  # convert question to numbers
  numerical_question = text_to_indices(question, vocab)

  # nothing to feed the model (empty or punctuation-only question)
  if not numerical_question:
    print("I don't know")
    return

  # tensor with a leading batch axis of size 1
  question_tensor = torch.tensor(numerical_question).unsqueeze(0)

  # inference only: no autograd graph is needed
  with torch.no_grad():
    output = model(question_tensor)

  # convert logits to probs
  probs = torch.nn.functional.softmax(output, dim=1)

  # most likely vocabulary id and its probability
  value, index = torch.max(probs, dim=1)

  # low confidence: say so and stop, instead of also printing the guess
  if value.item() < threshold:
    print("I don't know")
    return

  # ids were assigned in insertion order, so the id is the position in the key list
  print(list(vocab.keys())[index.item()])

In [24]:
# Note: this query omits the word "planet" that the dataset's version of the question contains.
predict(model, "What is the largest in our solar system?")

jupiter


In [25]:
# Reverse lookup: ids were assigned in insertion order, so position 8 in the key list is the token with id 8.
list(vocab.keys())[8]

'germany'