In [2]:
import pandas as pd
df = pd.read_csv("100_Unique_QA_Dataset.csv")
df.head()

Unnamed: 0,question,answer
0,What is the capital of France?,Paris
1,What is the capital of Germany?,Berlin
2,Who wrote 'To Kill a Mockingbird'?,Harper-Lee
3,What is the largest planet in our solar system?,Jupiter
4,What is the boiling point of water in Celsius?,100


# Converting our data set into Numbers

In [3]:
# Basic tokenizer
def tokenize(text):
    text = text.lower()
    text = text.replace('?', '')
    text = text.replace("'", "")
    return text.split()

In [4]:
tokenize("What is the capital of France?")

['what', 'is', 'the', 'capital', 'of', 'france']

vocab is a dictionary.

<UNK> means “unknown word”.

Index 0 is reserved for unknown words

In [5]:
#Vocabulory
vocab = {'<UNK>': 0}

Take question words -- Take answer words --- Combine them

For every word: if it’s new, assign a new number

Example:

<UNK> already has 0

first new word maybe "what" gets 1 ---- next "is" gets 2 ------- "the" gets 3 …etc

In [6]:
def build_vocab(row):
    tokenized_question = tokenize(row['question'])
    tokenized_answer =  tokenize(row['answer'])

    # merged token
    merged_tokens = tokenized_question + tokenized_answer

    # print(tokenized_question, tokenized_answer)
    #print(merged_tokens)

    for token in merged_tokens:
        if token not in vocab:
            vocab[token] = len(vocab)
            

In [7]:
vocab

{'<UNK>': 0}

In [8]:
df.apply(build_vocab, axis=1)

0     None
1     None
2     None
3     None
4     None
      ... 
85    None
86    None
87    None
88    None
89    None
Length: 90, dtype: object

In [9]:
len(vocab)

324

In [10]:
# Convert words into numberical indices

def text_to_indices(text, vocab):
    # created new variable
    indexed_text = []
    for token in tokenize(text):
        if token in vocab:
            indexed_text.append(vocab[token])
        else:
            indexed_text.append(vocab['<UNK>'])

    return indexed_text

In [11]:
text_to_indices("What is campux rajia is my friend", vocab)

[1, 2, 0, 0, 2, 0, 0]

In [12]:
vocab

{'<UNK>': 0,
 'what': 1,
 'is': 2,
 'the': 3,
 'capital': 4,
 'of': 5,
 'france': 6,
 'paris': 7,
 'germany': 8,
 'berlin': 9,
 'who': 10,
 'wrote': 11,
 'to': 12,
 'kill': 13,
 'a': 14,
 'mockingbird': 15,
 'harper-lee': 16,
 'largest': 17,
 'planet': 18,
 'in': 19,
 'our': 20,
 'solar': 21,
 'system': 22,
 'jupiter': 23,
 'boiling': 24,
 'point': 25,
 'water': 26,
 'celsius': 27,
 '100': 28,
 'painted': 29,
 'mona': 30,
 'lisa': 31,
 'leonardo-da-vinci': 32,
 'square': 33,
 'root': 34,
 '64': 35,
 '8': 36,
 'chemical': 37,
 'symbol': 38,
 'for': 39,
 'gold': 40,
 'au': 41,
 'which': 42,
 'year': 43,
 'did': 44,
 'world': 45,
 'war': 46,
 'ii': 47,
 'end': 48,
 '1945': 49,
 'longest': 50,
 'river': 51,
 'nile': 52,
 'japan': 53,
 'tokyo': 54,
 'developed': 55,
 'theory': 56,
 'relativity': 57,
 'albert-einstein': 58,
 'freezing': 59,
 'fahrenheit': 60,
 '32': 61,
 'known': 62,
 'as': 63,
 'red': 64,
 'mars': 65,
 'author': 66,
 '1984': 67,
 'george-orwell': 68,
 'currency': 69,
 'unit

In [13]:
len(vocab)

324

In [14]:
import torch 
from torch.utils.data import Dataset, DataLoader

In [15]:
class QADataset(Dataset):
    
    def __init__(self, df, vocab):
      self.df = df
      self.vocab = vocab
        
    def __len__(self):
      return self.df.shape[0]
    
    def __getitem__(self,index):
      numerical_question  = text_to_indices(self.df.iloc[index]['question'], self.vocab)
      numerical_answer  = text_to_indices(self.df.iloc[index]['answer'], self.vocab)

     # converting into tensor
      return torch.tensor(numerical_question), torch.tensor( numerical_answer)

In [16]:
dataset = QADataset(df, vocab)


In [17]:
for i in range(len(dataset)):
    print(dataset[i])

(tensor([1, 2, 3, 4, 5, 6]), tensor([7]))
(tensor([1, 2, 3, 4, 5, 8]), tensor([9]))
(tensor([10, 11, 12, 13, 14, 15]), tensor([16]))
(tensor([ 1,  2,  3, 17, 18, 19, 20, 21, 22]), tensor([23]))
(tensor([ 1,  2,  3, 24, 25,  5, 26, 19, 27]), tensor([28]))
(tensor([10, 29,  3, 30, 31]), tensor([32]))
(tensor([ 1,  2,  3, 33, 34,  5, 35]), tensor([36]))
(tensor([ 1,  2,  3, 37, 38, 39, 40]), tensor([41]))
(tensor([42, 43, 44, 45, 46, 47, 48]), tensor([49]))
(tensor([ 1,  2,  3, 50, 51, 19,  3, 45]), tensor([52]))
(tensor([ 1,  2,  3,  4,  5, 53]), tensor([54]))
(tensor([10, 55,  3, 56,  5, 57]), tensor([58]))
(tensor([ 1,  2,  3, 59, 25,  5, 26, 19, 60]), tensor([61]))
(tensor([42, 18,  2, 62, 63,  3, 64, 18]), tensor([65]))
(tensor([10,  2,  3, 66,  5, 67]), tensor([68]))
(tensor([ 1,  2,  3, 69,  5,  3, 70, 71]), tensor([72]))
(tensor([ 1,  2,  3,  4,  5, 73]), tensor([74]))
(tensor([10, 75, 76]), tensor([77]))
(tensor([78, 79, 80, 81, 82, 83, 84]), tensor([85]))
(tensor([42, 86, 87, 88

In [18]:
dataloader = DataLoader(dataset, batch_size=1, shuffle=True)

In [19]:
for question, answer in dataloader:
    # print(question[0], answer[0])

    # Humare dataloasder mein jp answer hai wo 2d vector hai - but it should be in 1d vector. -- print(question, answer)
    print(question, answer[0])

tensor([[ 10,  96,   3, 104, 239]]) tensor([240])
tensor([[ 10,  75, 208]]) tensor([209])
tensor([[ 10, 140,   3, 141, 270,  93, 271,   5,   3, 272]]) tensor([273])
tensor([[  1,   2,   3, 221,   5, 222, 223, 224]]) tensor([225])
tensor([[ 1,  2,  3, 37, 38, 39, 40]]) tensor([41])
tensor([[10,  2,  3, 66,  5, 67]]) tensor([68])
tensor([[ 42, 290, 291, 118, 292, 158, 293, 294]]) tensor([295])
tensor([[ 42, 137,   2, 138,  39, 139]]) tensor([53])
tensor([[42, 86, 87, 88, 89, 39, 90]]) tensor([91])
tensor([[ 1,  2,  3, 92, 93, 94]]) tensor([95])
tensor([[ 1,  2,  3, 33, 34,  5, 35]]) tensor([36])
tensor([[  1,   2,   3, 141, 117,  83,   3, 277, 278]]) tensor([121])
tensor([[  1,   2,   3, 180, 181, 182, 183]]) tensor([184])
tensor([[ 42, 101,   2,   3,  17]]) tensor([102])
tensor([[ 42, 107,   2, 108,  19, 109]]) tensor([110])
tensor([[ 10, 308,   3, 309, 310]]) tensor([311])
tensor([[  1,   2,   3, 212,   5,  14, 213, 214]]) tensor([215])
tensor([[ 78,  79, 195,  81,  19,   3, 196, 197, 

# RNN - Architecture

In [20]:
import torch.nn as nn
import torch.optim as optim

In [21]:
class SimpleRNN(nn.Module):

    def __init__(self, vocab_size):

       super().__init__()
       self.embedding = nn.Embedding(vocab_size, embedding_dim=50)
       self.rnn = nn.RNN(50, 64, batch_first=True)
       self.fc = nn.Linear(64, vocab_size)

    def forward(self,question):
        embedded_question = self.embedding(question)
        hidden, final = self.rnn(embedded_question)
        output = self.fc(final.squeeze(0))
        return output

In [22]:
x = nn.Embedding(324, embedding_dim=50)

In [23]:
a = x(dataset[0][0])

In [24]:
a

tensor([[-2.1377e+00, -2.3164e-01, -8.8577e-01, -8.6302e-01,  2.2780e+00,
          2.7805e+00, -8.3664e-02, -5.8515e-01,  1.1412e+00, -2.2256e-01,
          7.1594e-01,  1.0472e+00,  1.4826e+00, -2.7891e-03,  5.0627e-01,
         -6.8261e-01, -1.3826e+00,  2.0083e-04, -9.4972e-01, -1.7567e+00,
         -1.5990e-01,  2.2789e-01, -6.1776e-01,  5.8863e-02,  3.9875e-02,
         -3.3638e-01, -5.3282e-01, -1.2845e+00,  9.0589e-01,  1.7616e+00,
          1.6406e+00, -1.4741e-01, -1.7932e+00,  4.6426e-01, -8.9609e-01,
         -1.0795e+00,  3.9572e-01,  2.2667e+00, -2.7296e+00,  2.9900e-01,
         -3.8807e-01, -5.1904e-01, -6.9885e-01,  8.3333e-01, -1.1020e+00,
          9.0552e-01,  3.5715e-01, -2.6679e-01, -4.2496e-01, -3.5951e-01],
        [-2.5776e+00, -6.3284e-02, -4.0788e-01, -9.7818e-01,  1.2438e+00,
         -1.9443e+00, -1.9628e+00,  4.1606e-01, -7.0972e-01,  6.6521e-01,
         -7.2000e-01, -8.8278e-01, -7.0644e-02, -9.0595e-01,  2.8802e-01,
          1.8994e+00, -1.5065e+00,  7

In [25]:
y = nn.RNN(50, 64)

In [26]:
y(a)

(tensor([[ 0.1386, -0.2766,  0.1318,  0.2453,  0.7182,  0.8117,  0.3529, -0.4893,
           0.2263, -0.5002, -0.2636,  0.4373,  0.1393,  0.2294,  0.4198,  0.4396,
           0.2102, -0.0686, -0.0739,  0.8098,  0.2750,  0.1615, -0.2914, -0.1062,
           0.4976,  0.7097,  0.5979, -0.4809, -0.6143, -0.1559,  0.0713, -0.8890,
           0.7270, -0.5106,  0.1279, -0.0895, -0.8361,  0.0243,  0.2923,  0.2762,
           0.0418,  0.1982, -0.3737, -0.8336, -0.2464, -0.3864,  0.1566, -0.7609,
           0.7502,  0.5404, -0.8228,  0.6680, -0.7115, -0.0777, -0.7044,  0.3961,
           0.6666, -0.2729,  0.0625,  0.4266, -0.7011,  0.4458,  0.5942, -0.4845],
         [ 0.6006, -0.5777,  0.2199, -0.7692,  0.3685,  0.0424,  0.4157, -0.0308,
           0.7157, -0.8236,  0.1730, -0.0174, -0.3709, -0.1975, -0.3046,  0.5046,
          -0.1960, -0.7509, -0.3142, -0.2283, -0.3182,  0.1742, -0.5549, -0.8890,
           0.1665, -0.7583,  0.6946, -0.5918,  0.5680,  0.2320,  0.1225, -0.0395,
           0.33

In [27]:
b = y(a)[0]

In [28]:
z = nn.Linear(64, 324)

In [29]:
z(b)

tensor([[-0.1146,  0.1744,  0.3651,  ...,  0.3316,  0.4422,  0.5981],
        [ 0.1557, -0.2842,  0.3395,  ..., -0.1038, -0.1145, -0.2139],
        [ 0.3209,  0.0754, -0.1577,  ...,  0.4042, -0.2524,  0.0015],
        [ 0.2569,  0.0050,  0.0618,  ..., -0.2015,  0.1246,  0.2310],
        [-0.0104,  0.2626,  0.3486,  ...,  0.2806, -0.0347, -0.4466],
        [-0.1433,  0.0167, -0.2590,  ...,  0.0577, -0.0924,  0.1097]],
       grad_fn=<AddmmBackward0>)

In [30]:
z(b).shape

torch.Size([6, 324])

In [31]:
learning_rate = 0.001
epochs = 20


In [32]:
model = SimpleRNN(len(vocab))

In [33]:
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Training Loop

In [34]:
for epoch in range (epochs):
  
  total_loss = 0

  for question , answer in dataloader:

    optimizer.zero_grad()

    # forward pss
    output = model(question)
    print(output.shape)
    # loss => output shape(1, 324) -- anser(1)
    loss = criterion(output, answer[0])

    # Gradients
    loss.backward()

    # Update gradiednt
    optimizer.step()

    total_loss = total_loss + loss.item()

print(f"Epoch : {epoch+1}, Loss: {total_loss:4f}")

    

torch.Size([1, 324])
torch.Size([1, 324])
torch.Size([1, 324])
torch.Size([1, 324])
torch.Size([1, 324])
torch.Size([1, 324])
torch.Size([1, 324])
torch.Size([1, 324])
torch.Size([1, 324])
torch.Size([1, 324])
torch.Size([1, 324])
torch.Size([1, 324])
torch.Size([1, 324])
torch.Size([1, 324])
torch.Size([1, 324])
torch.Size([1, 324])
torch.Size([1, 324])
torch.Size([1, 324])
torch.Size([1, 324])
torch.Size([1, 324])
torch.Size([1, 324])
torch.Size([1, 324])
torch.Size([1, 324])
torch.Size([1, 324])
torch.Size([1, 324])
torch.Size([1, 324])
torch.Size([1, 324])
torch.Size([1, 324])
torch.Size([1, 324])
torch.Size([1, 324])
torch.Size([1, 324])
torch.Size([1, 324])
torch.Size([1, 324])
torch.Size([1, 324])
torch.Size([1, 324])
torch.Size([1, 324])
torch.Size([1, 324])
torch.Size([1, 324])
torch.Size([1, 324])
torch.Size([1, 324])
torch.Size([1, 324])
torch.Size([1, 324])
torch.Size([1, 324])
torch.Size([1, 324])
torch.Size([1, 324])
torch.Size([1, 324])
torch.Size([1, 324])
torch.Size([1

In [35]:
def predict(model, question, threshold=0.5):

    # convert question to numbers
    numerical_question = text_to_indices(question, vocab)

    # convert to tensor
    question_tensor = torch.tensor(numerical_question).unsqueeze(0)

    # send to model
    output = model(question_tensor)

    #convert logits  to probability
    prob = torch.nn.functional.softmax(output, dim=1)
    #print(prob)

    #print(question_tensor.shape)
    

    # find the index of max vocab
    value , index = torch.max(prob, dim=1)
    if value < threshold:
        print("I do not know")
    print(list(vocab.keys())[index])

    

In [36]:
predict(model, "Who is  Rajia")

I do not know
armstrong


In [37]:
output.shape

torch.Size([1, 324])