In [1]:
import pandas as pd

# Load the question/answer pairs into a DataFrame.
# NOTE(review): absolute "/content/..." path, presumably a Colab upload location; adjust when running elsewhere.
df = pd.read_csv("/content/100_Unique_QA_Dataset.csv")

# Preview the first rows (columns: question, answer).
df.head()

Unnamed: 0,question,answer
0,What is the capital of France?,Paris
1,What is the capital of Germany?,Berlin
2,Who wrote 'To Kill a Mockingbird'?,Harper-Lee
3,What is the largest planet in our solar system?,Jupiter
4,What is the boiling point of water in Celsius?,100


In [2]:
# tokenize
def tokenize(text):
  """Lower-case `text`, drop '?' and apostrophes, and split it on whitespace.

  Returns the resulting list of word tokens.
  """
  cleaned = text.lower()
  for unwanted in ("?", "'"):
    cleaned = cleaned.replace(unwanted, "")
  return cleaned.split()

In [3]:
# Quick check of the tokenizer on one question from the dataset.
tokenize("What is the boiling point of water in Celsius?")

['what', 'is', 'the', 'boiling', 'point', 'of', 'water', 'in', 'celsius']

In [4]:
# create vocabulary of unique words

# token -> integer id; id 0 is reserved for the unknown-word placeholder
vocab = {'<UNK>':0}

def build_vocab(row):
  """Register every new token from one row's question and answer in `vocab`.

  `row` must support `row['question']` and `row['answer']`. Each token not
  yet in the module-level `vocab` receives the next free id (the current
  dictionary size). Nothing is returned; `vocab` is updated in place.
  """
  row_tokens = tokenize(row['question']) + tokenize(row['answer'])
  for word in row_tokens:
    # setdefault leaves existing ids untouched; len(vocab) is evaluated before the insert
    vocab.setdefault(word, len(vocab))


In [5]:
# Run build_vocab on every row purely for its side effect of filling `vocab`;
# the displayed result is empty because build_vocab has no return value.
df.apply(build_vocab,axis=1)

Unnamed: 0,0
0,
1,
2,
3,
4,
...,...
85,
86,
87,
88,


In [6]:
# Inspect the token -> id mapping that was just built.
vocab

{'<UNK>': 0,
 'what': 1,
 'is': 2,
 'the': 3,
 'capital': 4,
 'of': 5,
 'france': 6,
 'paris': 7,
 'germany': 8,
 'berlin': 9,
 'who': 10,
 'wrote': 11,
 'to': 12,
 'kill': 13,
 'a': 14,
 'mockingbird': 15,
 'harper-lee': 16,
 'largest': 17,
 'planet': 18,
 'in': 19,
 'our': 20,
 'solar': 21,
 'system': 22,
 'jupiter': 23,
 'boiling': 24,
 'point': 25,
 'water': 26,
 'celsius': 27,
 '100': 28,
 'painted': 29,
 'mona': 30,
 'lisa': 31,
 'leonardo-da-vinci': 32,
 'square': 33,
 'root': 34,
 '64': 35,
 '8': 36,
 'chemical': 37,
 'symbol': 38,
 'for': 39,
 'gold': 40,
 'au': 41,
 'which': 42,
 'year': 43,
 'did': 44,
 'world': 45,
 'war': 46,
 'ii': 47,
 'end': 48,
 '1945': 49,
 'longest': 50,
 'river': 51,
 'nile': 52,
 'japan': 53,
 'tokyo': 54,
 'developed': 55,
 'theory': 56,
 'relativity': 57,
 'albert-einstein': 58,
 'freezing': 59,
 'fahrenheit': 60,
 '32': 61,
 'known': 62,
 'as': 63,
 'red': 64,
 'mars': 65,
 'author': 66,
 '1984': 67,
 'george-orwell': 68,
 'currency': 69,
 'unit

In [7]:
# Vocabulary size, including the '<UNK>' entry.
len(vocab)

324

In [8]:
# convert words into numerical indices

def text_to_indices(text,vocab):
  """Translate `text` into a list of vocabulary ids.

  The text is split with `tokenize`; each token is replaced by its id in
  `vocab`, and tokens missing from `vocab` are replaced by the id stored
  under '<UNK>'.
  """
  return [
    vocab[word] if word in vocab else vocab['<UNK>']
    for word in tokenize(text)
  ]




In [9]:
# "my" and "name" are not in the vocabulary, so both map to the '<UNK>' id 0.
text_to_indices("what is my name?",vocab)

[1, 2, 0, 0]

In [10]:
# load the libraries
import torch
from torch.utils.data import Dataset,DataLoader


In [11]:
class QADataset(Dataset):
  """Dataset yielding one (question ids, answer ids) tensor pair per DataFrame row."""

  def __init__(self,df,vocab):
    # Only references are stored; the text is encoded lazily in __getitem__.
    self.vocab = vocab
    self.df = df

  def __len__(self):
    # One sample per DataFrame row.
    row_count = self.df.shape[0]
    return row_count

  def __getitem__(self, index):
    # Fetch the row once, then encode both text columns with the shared vocab.
    row = self.df.iloc[index]
    question_ids = text_to_indices(row['question'], self.vocab)
    answer_ids = text_to_indices(row['answer'], self.vocab)

    return torch.tensor(question_ids), torch.tensor(answer_ids)


In [12]:
# Wrap the DataFrame and the vocabulary in a QADataset.
dataset = QADataset(df,vocab)

In [13]:
# First sample: (encoded question ids, encoded answer ids).
dataset[0]

(tensor([1, 2, 3, 4, 5, 6]), tensor([7]))

In [14]:
# One sample per batch, visited in a shuffled order.
# NOTE(review): batch_size=1 is presumably required because the questions have different lengths and are not padded; confirm.
dataloader = DataLoader(dataset,batch_size=1, shuffle=True)

In [15]:
# Walk the loader once to look at the batched question and answer tensors.
for question,answer in dataloader:
  print(question,answer)
# dtype of the last question batch left over from the loop
print(question.dtype)

tensor([[ 42, 125,   2,  62,  63,   3, 126, 127]]) tensor([[128]])
tensor([[ 10, 140,   3, 141, 142,  12, 143,  83,   3, 144]]) tensor([[145]])
tensor([[ 42, 167,   2,   3,  17, 168, 169]]) tensor([[170]])
tensor([[  1,   2,   3,  37, 133,   5,  26]]) tensor([[134]])
tensor([[ 1,  2,  3, 37, 38, 39, 40]]) tensor([[41]])
tensor([[ 42, 174,   2,  62,  39, 175, 176,  12, 177, 178]]) tensor([[179]])
tensor([[ 42, 290, 291, 118, 292, 158, 293, 294]]) tensor([[295]])
tensor([[  1,   2,   3, 103,   5, 104,  19, 105]]) tensor([[106]])
tensor([[10, 29,  3, 30, 31]]) tensor([[32]])
tensor([[ 1,  2,  3, 69,  5, 53]]) tensor([[260]])
tensor([[ 42, 117, 118,   3, 119,  94, 120]]) tensor([[121]])
tensor([[ 42, 312,   2, 313,  62,  63,   3, 314, 315]]) tensor([[316]])
tensor([[ 10,  11, 157, 158, 159]]) tensor([[160]])
tensor([[ 42, 250, 251, 118, 252, 253]]) tensor([[254]])
tensor([[  1,   2,   3,   4,   5, 206]]) tensor([[207]])
tensor([[ 42, 137,   2, 138,  39, 139]]) tensor([[53]])
tensor([[  1, 

In [16]:
import torch.nn as nn

In [35]:
class SimpleRNN(nn.Module):
  """Embedding -> single-layer RNN -> linear classifier over the vocabulary.

  Args:
    vocab_size: number of distinct token ids; also the number of output classes.
    embedding_dim: size of each token embedding (default 50).
    hidden_size: size of the RNN hidden state (default 64).
  """

  def __init__(self,vocab_size,embedding_dim=50,hidden_size=64):
    super().__init__()
    self.embedding = nn.Embedding(vocab_size,embedding_dim=embedding_dim)
    self.rnn = nn.RNN(embedding_dim,hidden_size,batch_first=True)
    self.fc = nn.Linear(hidden_size, vocab_size)

  def forward(self,question):
    """Return one row of vocabulary logits per question.

    Args:
      question: integer tensor of token ids shaped (batch, seq_len).

    Returns:
      Float tensor of shape (batch, vocab_size) holding unnormalised scores.
    """
    embedded_question = self.embedding(question)  # (batch, seq_len, embedding_dim)
    # nn.RNN returns (per-step outputs, final hidden state); only the latter is used.
    _, final_hidden = self.rnn(embedded_question)  # (1, batch, hidden_size)
    # Drop the leading num_layers axis so the classifier sees (batch, hidden_size).
    output = self.fc(final_hidden.squeeze(0))

    return output




In [36]:
# Optimizer step size and number of full passes over the dataset.
learning_rate = 0.001
epochs = 20

In [37]:
# The vocabulary size sets both the embedding table size and the number of output logits.
model = SimpleRNN(len(vocab))

In [38]:
# Answer prediction is treated as classification over the vocabulary.
criterion = nn.CrossEntropyLoss()
# Adam updates every parameter of the model with the configured learning rate.
optimizer = torch.optim.Adam(model.parameters(),lr=learning_rate)

In [39]:
# training loop

for epoch in range(epochs):

  # running sum of the per-sample losses for this epoch
  total_loss = 0

  for question,answer in dataloader:
    # clear the gradients left over from the previous step
    optimizer.zero_grad()

    # forward pass: one row of vocabulary logits for the single question in the batch
    output = model(question)

    # loss: answer[0] drops the batch axis, leaving the target token id(s) of this sample
    loss = criterion(output,answer[0])

    # gradients
    loss.backward()

    # update
    optimizer.step()

    total_loss += loss.item()

  # ".4f" rounds to four decimals (a bare "4f" would only set a minimum field width)
  print(f"Epoch : {epoch+1} Loss: {total_loss:.4f}")

Epoch : 1 Loss: 527.021381
Epoch : 2 Loss: 455.630792
Epoch : 3 Loss: 375.358722
Epoch : 4 Loss: 313.652239
Epoch : 5 Loss: 261.495797
Epoch : 6 Loss: 211.871649
Epoch : 7 Loss: 168.846631
Epoch : 8 Loss: 131.131964
Epoch : 9 Loss: 101.332696
Epoch : 10 Loss: 77.565926
Epoch : 11 Loss: 59.935839
Epoch : 12 Loss: 46.751218
Epoch : 13 Loss: 37.331668
Epoch : 14 Loss: 30.348253
Epoch : 15 Loss: 24.935162
Epoch : 16 Loss: 21.037449
Epoch : 17 Loss: 17.801924
Epoch : 18 Loss: 15.275354
Epoch : 19 Loss: 13.078550
Epoch : 20 Loss: 11.339377


In [63]:
def predict(model,question,threshold=0.5):
  """Print the model's answer to `question`, or "I don't know." if unsure.

  Args:
    model: trained network mapping a (1, seq_len) tensor of token ids to
      (1, vocab_size) logits.
    question: raw question text; encoded with `text_to_indices` and the
      module-level `vocab`.
    threshold: minimum softmax probability the best token must reach for it
      to be printed.

  Returns:
    None. The result is printed.
  """
  # convert question into numerical indices
  numerical_question = text_to_indices(question,vocab)

  # Nothing to feed the model (empty or punctuation-only input): an empty
  # tensor would be float-typed and rejected by the embedding layer.
  if not numerical_question:
    print("I don't know.")
    return

  # convert numerical indices into a tensor of shape (batch=1, seq_len)
  tensor_question = torch.tensor(numerical_question,dtype=torch.long).unsqueeze(0)

  # inference only: skip autograd bookkeeping
  with torch.no_grad():
    # send to model
    output = model(tensor_question)

    # convert logits into probabilities
    probs = torch.nn.functional.softmax(output,dim=1)

  # find the most probable token id and its probability as plain Python numbers
  value,index = torch.max(probs,dim=1)
  best_prob = value.item()
  best_id = index.item()

  if best_prob < threshold:
    print("I don't know.")
    return

  # vocab maps token -> id, so search it for the predicted id instead of
  # relying on the dictionary's insertion order
  answer = next((token for token,token_id in vocab.items() if token_id == best_id), '<UNK>')
  print(answer)




In [68]:
# Try a paraphrase of the dataset question "What is the capital of Germany?".
predict(model,"what is the capital city of Germany?")


berlin


## Under the hood: stepping through the model layer by layer


In [17]:
# Token ids of the first question.
dataset[0][0]

tensor([1, 2, 3, 4, 5, 6])

In [18]:
# dtype of the token-id tensor
dataset[0][0].dtype

torch.int64

In [19]:
# Stand-alone embedding table: 324 rows (the vocabulary size) of 50-dimensional vectors.
x= nn.Embedding(324,embedding_dim=50)

In [20]:
# Look up one embedding vector per token id of the first question.
x(dataset[0][0])

tensor([[ 1.0227e+00,  5.8167e-01, -1.5518e-01,  1.0700e+00,  1.5804e+00,
          1.1146e+00, -1.3937e+00,  1.2640e+00,  1.8745e+00, -3.2424e-01,
         -7.4317e-01, -1.5174e+00,  1.2204e+00,  1.3354e+00,  7.0115e-02,
          5.3638e-01,  1.1693e+00,  1.7551e+00, -2.7504e-01, -7.6048e-01,
         -5.8774e-01,  1.4547e+00,  5.2815e-01, -5.7867e-01,  9.7347e-02,
          1.3665e-01,  6.7627e-01, -1.0797e+00,  1.9258e+00,  1.7303e-01,
         -9.9373e-02,  6.2178e-01, -3.1118e-01,  7.1450e-02, -2.6328e-01,
         -4.0624e-01, -6.0257e-01,  2.3777e+00,  8.4220e-01,  6.8520e-03,
         -5.2696e-01, -3.7836e-01,  4.5372e-01, -1.7085e-01,  4.3422e-01,
          7.1812e-01, -7.5967e-01, -1.8148e-01,  8.0685e-01, -7.5485e-01],
        [-1.6511e+00,  6.9044e-01, -1.3407e+00,  1.5776e+00,  5.6589e-01,
          3.5308e-01, -6.4965e-02,  6.7706e-01, -1.9979e-01,  1.2708e-01,
         -1.2504e+00,  1.1434e+00,  2.1973e+00,  8.6476e-01, -5.8159e-01,
         -4.8683e-01,  9.3527e-01,  1

In [21]:
# dtype of the embedding output
x(dataset[0][0]).dtype

torch.float32

In [22]:
# Shape is (number of tokens, embedding_dim).
x(dataset[0][0]).shape

torch.Size([6, 50])

In [23]:
# Keep the embedded first question as input for the RNN cells that follow.
a = x(dataset[0][0])

In [24]:
# Stand-alone RNN: 50-dimensional inputs, 64-dimensional hidden state.
y = nn.RNN(50,64)

In [25]:
# The RNN returns a tuple: (hidden state at every step, final hidden state).
y(a)

(tensor([[-2.9986e-01,  2.8012e-01,  5.4653e-02,  4.6459e-01,  3.4977e-01,
           3.2933e-02, -3.9721e-01, -3.7181e-02, -2.7648e-01, -7.4611e-01,
          -6.2870e-01,  6.9327e-01, -2.4400e-03, -2.9774e-01, -8.9080e-02,
           1.4464e-01, -8.2299e-01, -4.7901e-01,  2.6214e-01, -2.0204e-01,
          -3.2281e-02, -2.7066e-01,  1.4277e-01, -5.1513e-01,  5.3059e-01,
           6.1029e-01,  1.8763e-01,  9.2579e-02,  4.9404e-01, -3.9382e-01,
           7.4287e-01,  4.1594e-01,  4.8616e-01,  1.9959e-01, -1.3971e-03,
          -4.1455e-01, -2.4286e-01, -1.6374e-01, -3.8591e-01, -5.5040e-01,
           7.0835e-01,  6.4380e-01, -3.8123e-01, -1.1455e-02, -3.2625e-01,
           5.8380e-01, -2.0497e-01,  2.5512e-01,  5.3148e-01, -8.1370e-02,
           4.6605e-01, -5.9193e-01, -4.5505e-01,  7.0953e-01, -8.0320e-01,
           8.3807e-01, -6.7895e-01, -1.7854e-02,  6.6715e-01,  2.4510e-01,
          -5.7820e-02,  6.4231e-01, -4.1449e-02,  4.7408e-01],
         [-1.7760e-01, -7.3312e-01, -

In [26]:
# NOTE: despite its name, this variable holds the *shape* of the per-step hidden states.
hidden_layer = y(a)[0].shape
hidden_layer

torch.Size([6, 64])

In [27]:
# dtype of the per-step hidden states
y(a)[0].dtype

torch.float32

In [28]:

# NOTE: this is the *shape* of the RNN's final hidden state, not an output layer.
output_layer = y(a)[1].shape
output_layer

torch.Size([1, 64])

In [29]:
# final hidden state of the RNN (second element of the returned tuple)
b= y(a)[1]

In [30]:
# dtype of the final hidden state
y(a)[1].dtype

torch.float32

In [31]:
# Stand-alone classifier: 64-dimensional hidden state -> 324 scores, one per vocabulary entry.
z = nn.Linear(64, 324)

In [32]:
# Project the final hidden state to one score per vocabulary entry.
z(b)

tensor([[-0.0785,  0.2492,  0.3124, -0.0185, -0.2593,  0.4270,  0.0530, -0.1348,
         -0.1274,  0.0412,  0.0343,  0.0168, -0.1478, -0.0023,  0.2606, -0.5019,
         -0.0326, -0.0477,  0.0409, -0.3552,  0.3978, -0.2420,  0.0414,  0.2874,
          0.4015, -0.1583,  0.0491,  0.5096,  0.0619,  0.0607, -0.0291, -0.0541,
         -0.1620,  0.3719,  0.0065, -0.1140,  0.1641,  0.1338,  0.4676,  0.4500,
          0.6068,  0.1663,  0.0343, -0.2084, -0.1477,  0.0315, -0.2391, -0.2228,
         -0.0645,  0.3941, -0.1124,  0.0438,  0.3537,  0.3052, -0.0517,  0.0258,
         -0.0252, -0.0081, -0.1417,  0.2815,  0.3740, -0.1534, -0.0287, -0.2183,
         -0.2070, -0.7825,  0.1407,  0.2414,  0.0582, -0.2169,  0.5208,  0.4127,
          0.0412, -0.5791, -0.0385,  0.4260, -0.2290,  0.0545,  0.0388,  0.0045,
          0.1172, -0.0688,  0.2020,  0.5101, -0.0015,  0.0686, -0.0532, -0.3311,
          0.0840,  0.1906, -0.1158, -0.0408,  0.3688,  0.0138, -0.3350,  0.1260,
          0.1210, -0.1397,  

In [33]:
# A single row of 324 scores.
z(b).shape

torch.Size([1, 324])

In [34]:
# dtype of the scores
z(b).dtype

torch.float32