In [248]:
import pandas as pd
df = pd.read_csv("100_Unique_QA_Dataset.csv")
df.head()

Unnamed: 0,question,answer
0,What is the capital of France?,Paris
1,What is the capital of Germany?,Berlin
2,Who wrote 'To Kill a Mockingbird'?,Harper-Lee
3,What is the largest planet in our solar system?,Jupiter
4,What is the boiling point of water in Celsius?,100


# Converting our data set into Numbers

In [249]:
# Basic tokenizer
def tokenize(text):
    text = text.lower()
    text = text.replace('?', '')
    text = text.replace("'", "")
    return text.split()

In [250]:
tokenize("What is the capital of France?")

['what', 'is', 'the', 'capital', 'of', 'france']

vocab is a dictionary.

<UNK> means “unknown word”.

Index 0 is reserved for unknown words

In [251]:
#Vocabulory - staring mein empty hai
vocab = {'<UNK>':0}

Take question words -- Take answer words --- Combine them

For every word: if it’s new, assign a new number

Example: <UNK> already has 0

first new word maybe "what" gets 1 ---- next "is" gets 2 ------- "the" gets 3 …etc

In [291]:
def build_vocab(row):
    tokenized_question = tokenize(row['question'])
    tokenized_answer =tokenize(row['answer'])
    # print(tokenized_question, tokenized_answer)

    # merge question + answer into one vector
    merged_tokens = tokenized_question + tokenized_answer
    print(merged_tokens)

    for token in merged_tokens:
        if token not in vocab:
            vocab[token] = len(vocab)

In [292]:
df.apply(build_vocab, axis=1)

['what', 'is', 'the', 'capital', 'of', 'france', 'paris']
['what', 'is', 'the', 'capital', 'of', 'germany', 'berlin']
['who', 'wrote', 'to', 'kill', 'a', 'mockingbird', 'harper-lee']
['what', 'is', 'the', 'largest', 'planet', 'in', 'our', 'solar', 'system', 'jupiter']
['what', 'is', 'the', 'boiling', 'point', 'of', 'water', 'in', 'celsius', '100']
['who', 'painted', 'the', 'mona', 'lisa', 'leonardo-da-vinci']
['what', 'is', 'the', 'square', 'root', 'of', '64', '8']
['what', 'is', 'the', 'chemical', 'symbol', 'for', 'gold', 'au']
['which', 'year', 'did', 'world', 'war', 'ii', 'end', '1945']
['what', 'is', 'the', 'longest', 'river', 'in', 'the', 'world', 'nile']
['what', 'is', 'the', 'capital', 'of', 'japan', 'tokyo']
['who', 'developed', 'the', 'theory', 'of', 'relativity', 'albert-einstein']
['what', 'is', 'the', 'freezing', 'point', 'of', 'water', 'in', 'fahrenheit', '32']
['which', 'planet', 'is', 'known', 'as', 'the', 'red', 'planet', 'mars']
['who', 'is', 'the', 'author', 'of', '19

0     None
1     None
2     None
3     None
4     None
      ... 
85    None
86    None
87    None
88    None
89    None
Length: 90, dtype: object

In [297]:
len(vocab)

324

In [301]:
# now assign that vocab to index 
def text_to_indices(text, vocab):
    indexed_text = []
    for token in tokenize(text):
        if token in vocab:
            indexed_text.append(vocab[token])
        else:
            indexed_text.append(vocab['<UNK>'])

    return indexed_text

In [302]:
text_to_indices("What is campux rajia is my friend", vocab)

[1, 2, 0, 0, 2, 0, 0]

In [303]:
text_to_indices("'what is the capital of australia canberra", vocab)

[1, 2, 3, 4, 5, 99, 100]

In [261]:
import torch 
from torch.utils.data import Dataset, DataLoader

In [262]:
class QADataset(Dataset):
    def __init__(self, df, vocab):
        self.df = df
        self.vocab = vocab

    def __len__(self):
        return self.df.shape[0]
    
    def __getitem__(self, index):
      numerical_question  = text_to_indices(self.df.iloc[index]['question'], self.vocab)
      numerical_answer  = text_to_indices(self.df.iloc[index]['answer'], self.vocab)
       
     # converting into tensor
      return torch.tensor(numerical_question), torch.tensor( numerical_answer)

In [263]:
dataset = QADataset(df, vocab)

In [264]:
for i in range(len(dataset)):
    print(dataset[i])

(tensor([1, 2, 3, 4, 5, 6]), tensor([7]))
(tensor([1, 2, 3, 4, 5, 8]), tensor([9]))
(tensor([10, 11, 12, 13, 14, 15]), tensor([16]))
(tensor([ 1,  2,  3, 17, 18, 19, 20, 21, 22]), tensor([23]))
(tensor([ 1,  2,  3, 24, 25,  5, 26, 19, 27]), tensor([28]))
(tensor([10, 29,  3, 30, 31]), tensor([32]))
(tensor([ 1,  2,  3, 33, 34,  5, 35]), tensor([36]))
(tensor([ 1,  2,  3, 37, 38, 39, 40]), tensor([41]))
(tensor([42, 43, 44, 45, 46, 47, 48]), tensor([49]))
(tensor([ 1,  2,  3, 50, 51, 19,  3, 45]), tensor([52]))
(tensor([ 1,  2,  3,  4,  5, 53]), tensor([54]))
(tensor([10, 55,  3, 56,  5, 57]), tensor([58]))
(tensor([ 1,  2,  3, 59, 25,  5, 26, 19, 60]), tensor([61]))
(tensor([42, 18,  2, 62, 63,  3, 64, 18]), tensor([65]))
(tensor([10,  2,  3, 66,  5, 67]), tensor([68]))
(tensor([ 1,  2,  3, 69,  5,  3, 70, 71]), tensor([72]))
(tensor([ 1,  2,  3,  4,  5, 73]), tensor([74]))
(tensor([10, 75, 76]), tensor([77]))
(tensor([78, 79, 80, 81, 82, 83, 84]), tensor([85]))
(tensor([42, 86, 87, 88

In [265]:
dataloader = DataLoader(dataset, batch_size=1, shuffle=True)

In [266]:
for question, answer in dataloader:
    # print(question[0], answer[0])

    # Humare dataloasder mein jp answer hai wo 2d vector hai - but it should be in 1d vector. -- print(question, answer)
    print(question, answer[0])

tensor([[ 42, 117, 118,   3, 119,  94, 120]]) tensor([121])
tensor([[ 42, 263, 264,  14, 265, 266, 158, 267]]) tensor([268])
tensor([[ 78,  79, 150, 151,  14, 152, 153]]) tensor([154])
tensor([[ 42, 255,   2, 256,  83, 257, 258]]) tensor([259])
tensor([[1, 2, 3, 4, 5, 6]]) tensor([7])
tensor([[  1,   2,   3, 234,   5, 235]]) tensor([131])
tensor([[78, 79, 80, 81, 82, 83, 84]]) tensor([85])
tensor([[ 10,  96,   3, 104, 239]]) tensor([240])
tensor([[  1,   2,   3, 212,   5,  14, 213, 214]]) tensor([215])
tensor([[ 1,  2,  3,  4,  5, 53]]) tensor([54])
tensor([[ 10, 308,   3, 309, 310]]) tensor([311])
tensor([[  1,   2,   3, 163, 164, 165,  83,  84]]) tensor([166])
tensor([[ 42,   2,   3, 210, 137, 168, 211, 169]]) tensor([113])
tensor([[ 78,  79, 261, 151,  14, 262, 153]]) tensor([36])
tensor([[  1,   2,   3,  33,  34,   5, 245]]) tensor([246])
tensor([[  1,   2,   3,   4,   5, 109]]) tensor([317])
tensor([[ 10, 140,   3, 141, 171,   5,   3,  70, 172]]) tensor([173])
tensor([[ 42, 137, 1

# RNN - Architecture

In [267]:
import torch.nn as nn
import torch.optim as optim

In [268]:
class SimpleRNN(nn.Module):

    def __init__(self, vocab_size):

       super().__init__()
       self.embedding = nn.Embedding(vocab_size, embedding_dim=50)
       self.rnn = nn.RNN(50, 64, batch_first=True)
       self.fc = nn.Linear(64, vocab_size)

    def forward(self,question):
        embedded_question = self.embedding(question)
        hidden, final = self.rnn(embedded_question)
        output = self.fc(final.squeeze(0))
        return output

In [269]:
x = nn.Embedding(324, embedding_dim=50)

In [270]:
a = x(dataset[0][0])

In [271]:
a

tensor([[ 1.4620e+00,  1.9749e-01,  1.5101e+00,  6.3097e-02,  1.3513e+00,
          2.3065e+00, -1.1935e+00, -6.5576e-01, -3.6879e-01, -3.4551e-02,
         -2.0489e-01, -4.8504e-01, -1.3403e+00,  1.5944e+00,  5.2382e-01,
          5.1665e-01,  1.4921e+00,  1.4194e+00,  1.0114e+00, -9.3792e-01,
         -2.2628e+00,  1.1561e+00, -5.9504e-01,  1.7646e+00, -1.5969e+00,
         -6.1401e-01,  1.1085e+00,  2.8678e-01, -1.0663e+00,  3.9252e-01,
         -1.8657e+00, -1.2627e+00,  1.3352e+00,  1.5979e+00, -5.4800e-01,
         -4.1947e-01, -1.8943e+00,  8.1749e-01, -1.7506e-01,  4.9213e-01,
         -1.0683e+00,  1.3749e+00, -9.3097e-01,  1.4183e-01, -6.1944e-01,
         -1.0037e+00,  6.0973e-01,  7.5424e-01, -1.0513e+00, -7.2630e-01],
        [-1.7000e+00, -7.0641e-01, -1.2578e+00, -4.8259e-01,  1.0410e+00,
         -8.2163e-02, -2.2141e-01, -4.2804e-01,  4.1868e-01,  4.2446e-01,
         -5.6893e-01,  6.0598e-01,  4.1341e-01, -9.4245e-01, -7.2824e-01,
         -9.2631e-01, -2.1766e-01, -1

In [272]:
y = nn.RNN(50, 64)

In [273]:
y(a)

(tensor([[ 0.6748, -0.0978,  0.7745, -0.7282, -0.2032, -0.4205,  0.6376,  0.4010,
           0.7763,  0.0304, -0.0314, -0.5680,  0.1892,  0.6928,  0.7287,  0.4384,
          -0.1925,  0.4610,  0.1900, -0.1199,  0.7262,  0.8446, -0.1629, -0.2603,
           0.1100, -0.0192, -0.4366,  0.8797,  0.7424,  0.6088, -0.5715,  0.0332,
           0.6566,  0.1976, -0.1022,  0.7170,  0.3550,  0.2337, -0.1382, -0.2252,
           0.4143, -0.5490,  0.3431, -0.6302, -0.4066, -0.3680, -0.3326, -0.1669,
           0.3796, -0.4306,  0.4659,  0.4090, -0.1525,  0.2867,  0.6836,  0.0056,
           0.0965,  0.0213,  0.1343, -0.0308,  0.6674,  0.2255, -0.0410, -0.6681],
         [ 0.2422, -0.0967, -0.6369,  0.3543, -0.5496,  0.0021, -0.1403,  0.2546,
           0.0281, -0.1655,  0.4442, -0.3394,  0.2801,  0.4661, -0.0466, -0.3867,
          -0.7884, -0.2370, -0.3057, -0.1617, -0.5798,  0.2397,  0.3771,  0.1724,
           0.4158,  0.5562,  0.1317, -0.2974,  0.7072,  0.0274, -0.0723, -0.4747,
          -0.10

In [274]:
b = y(a)[0]

In [275]:
z = nn.Linear(64, 324)

In [276]:
z(b)

tensor([[ 0.1470, -0.3861,  0.1322,  ...,  0.5383, -0.3491, -0.0231],
        [-0.2705,  0.1819,  0.1271,  ...,  0.1125,  0.0487, -0.2165],
        [-0.5856,  0.1106,  0.0258,  ..., -0.2900, -0.1779,  0.5624],
        [-0.1366, -0.4581,  0.1893,  ...,  0.2446, -0.3306,  0.1017],
        [ 0.5694, -0.3378,  0.0104,  ...,  0.0870,  0.4658,  0.2102],
        [-0.3030, -0.5053, -0.1261,  ...,  0.2058,  0.1786, -0.0897]],
       grad_fn=<AddmmBackward0>)

In [277]:
z(b).shape

torch.Size([6, 324])

In [278]:
learning_rate = 0.001
epochs = 60


In [279]:
model = SimpleRNN(len(vocab))

In [280]:
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Training Loop

In [281]:
for epoch in range (epochs):
  
  total_loss = 0

  for question , answer in dataloader:

    optimizer.zero_grad()

    # forward pss
    output = model(question)
    # print(output.shape)
    # loss => output shape(1, 324) -- anser(1)
    loss = criterion(output, answer[0])

    # Gradients
    loss.backward()

    # Update gradiednt
    optimizer.step()

    total_loss = total_loss + loss.item()

print(f"Epoch : {epoch+1}, Loss: {total_loss:4f}")

    

Epoch : 60, Loss: 0.542569


In [282]:
def predict(model, question, threshold=0.5):

    # convert question to numbers
    numerical_question = text_to_indices(question, vocab)

    # convert to tensor
    question_tensor = torch.tensor(numerical_question).unsqueeze(0)

    # send to model
    output = model(question_tensor)

    #convert logits  to probability
    prob = torch.nn.functional.softmax(output, dim=1)
    #print(prob)

    #print(question_tensor.shape)
    

    # find the index of max vocab
    value , index = torch.max(prob, dim=1)
    if value < threshold:
        print("I do not know")
    print(list(vocab.keys())[index])

    

In [283]:
predict(model, "What is the capital of Germany ")

berlin


In [284]:
output.shape

torch.Size([1, 324])