In [1]:
import numpy as np
import pandas as pd

In [3]:
# Location of the question/answer CSV. The `/content/...` prefix looks like a
# Google Colab upload path -- point DATA_PATH elsewhere when running locally.
DATA_PATH = '/content/100_Unique_QA_Dataset.csv'

# Load the dataset; later cells read its 'question' and 'answer' columns.
df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0,question,answer
0,What is the capital of France?,Paris
1,What is the capital of Germany?,Berlin
2,Who wrote 'To Kill a Mockingbird'?,Harper-Lee
3,What is the largest planet in our solar system?,Jupiter
4,What is the boiling point of water in Celsius?,100


In [30]:
# Global vocabulary: maps each token (str) to its integer index.
# Index 0 is reserved for "<UNK>", the fallback used for tokens that are not in the vocabulary.
vocabulary = {"<UNK>": 0}

# Word-level tokenizer with light normalisation
def tokenize(doc: str) -> list:
    """Split a text into lowercase, whitespace-separated tokens.

    Question marks and apostrophes are deleted (not replaced by a space)
    before splitting, so "It's" becomes the single token "its".
    """
    strip_table = str.maketrans('', '', "?'")
    normalised = doc.lower().translate(strip_table)
    return normalised.split()

# Building Vocabulary
def build_vocab(doc: list, vocab: dict = None) -> None:
    """Register every unseen token of `doc` in a vocabulary, in place.

    Args:
        doc: Iterable of tokens (e.g. the list returned by `tokenize`).
            Passing a raw string would register its individual characters.
        vocab: Token -> index mapping to extend. Defaults to the module-level
            `vocabulary`, which keeps the original one-argument call working.

    Each new token receives the next free index, i.e. the size of the
    vocabulary at the moment it is added.
    """
    if vocab is None:
        vocab = vocabulary
    for word in doc:
        if word not in vocab:
            vocab[word] = len(vocab)

# Label Encoding each word
def text_to_index(doc: list, vocab: dict = None) -> list:
    """Encode a sequence of tokens as a list of vocabulary indices.

    Args:
        doc: Iterable of tokens (e.g. the list returned by `tokenize`).
        vocab: Token -> index mapping to look tokens up in. Defaults to the
            module-level `vocabulary`, which keeps the original one-argument
            call working.

    Returns:
        One integer per token; tokens missing from the vocabulary are mapped
        to the index stored under "<UNK>".
    """
    if vocab is None:
        vocab = vocabulary
    # "<UNK>" is only looked up when an unknown token is actually encountered.
    return [vocab[word] if word in vocab else vocab["<UNK>"] for word in doc]

In [31]:
import torch
from torch.utils.data import Dataset, DataLoader

class CustomDataset(Dataset):
    """Question/answer pairs encoded as tensors of vocabulary indices.

    Construction tokenizes the 'question' and 'answer' columns of `df`,
    registers every token via `build_vocab` (which grows the shared
    vocabulary as a side effect) and stores each text as a list of integer
    indices.
    """

    def __init__(self, df):
        # Questions: tokenize -> grow the vocabulary -> encode as indices
        self.questions = df['question'].apply(tokenize)
        self.questions.apply(build_vocab)
        self.questions = self.questions.apply(text_to_index)

        # Answers: same pipeline, sharing the same vocabulary
        self.answers = df['answer'].apply(tokenize)
        self.answers.apply(build_vocab)
        self.answers = self.answers.apply(text_to_index)

    def __len__(self):
        """Number of question/answer pairs."""
        return len(self.questions)

    def __getitem__(self, index):
        """Return the `index`-th (question, answer) pair as two index tensors.

        Lookup is positional (`.iloc`): a DataLoader asks for positions
        0..len-1, which only coincide with the Series labels when the
        DataFrame has a default RangeIndex. Label-based lookup breaks on a
        filtered or shuffled frame.
        """
        question = torch.tensor(self.questions.iloc[index])
        answer = torch.tensor(self.answers.iloc[index])
        return question, answer

In [32]:
# Build the dataset; constructing it also fills the global `vocabulary`.
obj = CustomDataset(df)

In [33]:
# Number of question/answer pairs in the dataset
len(obj)

90

In [34]:
# One sample: (encoded question, encoded answer), each a tensor of vocabulary indices
obj[10]

(tensor([ 1,  2,  3,  4,  5, 43]), tensor([255]))

In [36]:
# Vocabulary size after encoding all questions and answers (includes "<UNK>")
len(vocabulary)

324

In [37]:
# Dataloader object
# batch_size = 1: the encoded questions have different lengths and are not padded,
# so (presumably) they could not be stacked into a larger batch by the default collation.
dataloader = DataLoader(dataset = obj, batch_size = 1, shuffle = True)

In [38]:
from torch import nn
from torch.nn import Module, Linear, Embedding, RNN

class CustomModel(Module):
    """Embedding -> single-layer RNN -> linear classifier over the vocabulary.

    Args:
        vocabulary_size: Number of entries in the vocabulary. Used both for
            the embedding table and for the number of output logits, so the
            model stays consistent when the vocabulary changes.
        embedding_dim: Size of each token embedding (default 50).
        hidden_size: Size of the RNN hidden state (default 64).
    """

    def __init__(self, vocabulary_size, embedding_dim = 50, hidden_size = 64):
        super().__init__()
        self.embedding = Embedding(num_embeddings = vocabulary_size, embedding_dim = embedding_dim)
        self.rnn = RNN(input_size = embedding_dim, hidden_size = hidden_size, num_layers = 1, batch_first = True)
        # One logit per vocabulary entry (previously hard-coded to 324).
        self.linear = Linear(in_features = hidden_size, out_features = vocabulary_size)

    def forward(self, X_train):
        """Map token indices to answer logits.

        Args:
            X_train: Integer tensor of token indices, shaped
                (batch, seq_len) because the RNN is batch-first.

        Returns:
            Logits of shape (batch, vocabulary_size).
        """
        x = self.embedding(X_train)
        # The RNN returns (all-step outputs, final hidden state); keep the latter.
        x = self.rnn(x)[1]
        # Index 0 selects the only layer's hidden state.
        x = self.linear(x[0])
        return x

In [40]:
# Parameters
learning_rate = 0.01
epochs = 20

# Reproducibility: seed torch's global RNG so the weight initialisation below
# (and, when cells are run in order, the DataLoader shuffling) is repeatable.
SEED = 42
torch.manual_seed(SEED)

# Model -- sized from the vocabulary built while constructing the dataset
model = CustomModel(len(vocabulary))

# Loss and optimizer
# NOTE: `crition` is a misspelling of "criterion"; the name is kept because
# the training loop refers to it.
crition = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr = learning_rate)

In [41]:
# training loop

for epoch in range(epochs):
    # Sum of the per-sample losses over this epoch
    total_loss = 0

    for question, answer in dataloader:
        # Clear gradients left over from the previous step
        optimizer.zero_grad()

        # forward pass -> one row of logits per question in the batch
        output = model(question)

        # loss: the batch dimension of `answer` is dropped (e.g. [[234]] -> [234])
        # so the target holds one class index for the single row of logits
        loss = crition(output, answer[0])

        # gradients
        loss.backward()

        # update
        optimizer.step()

        total_loss += loss.item()

    # `.4f` prints four decimal places; the previous `4f` only set a minimum width of 4
    print(f"Epoch: {epoch+1}, Loss: {total_loss:.4f}")

Epoch: 1, Loss: 550.339101
Epoch: 2, Loss: 387.654267
Epoch: 3, Loss: 181.307964
Epoch: 4, Loss: 66.726079
Epoch: 5, Loss: 39.919988
Epoch: 6, Loss: 36.187008
Epoch: 7, Loss: 21.481358
Epoch: 8, Loss: 22.725643
Epoch: 9, Loss: 15.668339
Epoch: 10, Loss: 20.682343
Epoch: 11, Loss: 11.082262
Epoch: 12, Loss: 10.561101
Epoch: 13, Loss: 10.492138
Epoch: 14, Loss: 6.626725
Epoch: 15, Loss: 3.017596
Epoch: 16, Loss: 0.951603
Epoch: 17, Loss: 0.590405
Epoch: 18, Loss: 0.482855
Epoch: 19, Loss: 0.417959
Epoch: 20, Loss: 0.373680


In [55]:
def predict(model, question, threshold=0.5):
    """Print the model's one-word answer to `question`, or "I don't know".

    Args:
        model: Trained model mapping a 1-D tensor of token indices to one
            logit per vocabulary entry.
        question: Raw question text.
        threshold: Minimum softmax probability the best answer must reach;
            below it "I don't know" is printed instead of the answer.

    Returns:
        None. The result is printed.
    """
    # convert question to numbers
    tokens = tokenize(question)
    encoded = text_to_index(tokens)

    # Nothing to feed the model (e.g. "" or "??"): an empty tensor would crash the embedding
    if not encoded:
        print("I don't know")
        return

    # tensor
    question_tensor = torch.tensor(encoded)

    # send to model -- inference only, so skip gradient tracking
    with torch.no_grad():
        output = model(question_tensor)

    # convert logits to probs
    probs = torch.nn.functional.softmax(output.reshape(1, -1), dim=1)

    # find the most probable vocabulary index and its probability
    value, index = torch.max(probs, dim=1)

    if value.item() < threshold:
        print("I don't know")
        # Stop here: previously execution fell through and the low-confidence guess was printed as well
        return

    # Dict insertion order matches the assigned indices, so position == index
    print(list(vocabulary.keys())[index.item()])

# Quick sanity check on a single question
predict(model = model, question = "Who invented the light bulb?")

edison


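> `Note`: Why have we not combined all the individual layers into a single `Sequential` model? Because `nn.RNN` returns a tuple `(output, hidden state)` rather than a single tensor, so its result cannot be passed straight on to the `Linear` layer; the cells below inspect the shapes involved.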

In [59]:
x = nn.Embedding(324, embedding_dim = 50)
y = nn.RNN(50, 64) # batch_first defaults to False -> a [1, 6, 50] input is read as (seq_len = 1, batch = 6, features = 50)
z = nn.Linear(64, 324)

a = obj[0][0].reshape(1, 6) # 1 document x 6 tokens (the length of the document)
print("shape of a:", a.shape)

b = x(a) # Embedding shape -> [1, 6, 50] - 1 document, 6 tokens, each represented by 50 numbers
print("shape of b:", b.shape)

c, d = y(b) # c = output at every time step, d = final hidden state -> same shape here because the RNN sees only 1 time step
print("shape of c:", c.shape)
print("shape of d:", d.shape)

e = z(d[0]) # Linear layer -> gives (6, 324), one row per "batch" entry, instead of the (1, 324) we need
print("shape of e:", e.shape)

shape of a: torch.Size([1, 6])
shape of b: torch.Size([1, 6, 50])
shape of c: torch.Size([1, 6, 64])
shape of d: torch.Size([1, 6, 64])
shape of e: torch.Size([6, 324])


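> This is because `nn.RNN` defaults to `batch_first=False` and therefore expects its input as `(no. of words, batch size, features)`: our `[1, 6, 50]` tensor is read as 1 word with a batch size of 6. Passing `batch_first=True` swaps the first two axes back to `(batch size, no. of words, features)`.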

In [62]:
x = nn.Embedding(324, embedding_dim = 50)
y = nn.RNN(50, 64, batch_first=True) # batch_first=True -> a [1, 6, 50] input is read as (batch = 1, seq_len = 6, features = 50)
z = nn.Linear(64, 324)

a = obj[0][0].reshape(1, 6) # 1 document x 6 tokens (the length of the document)
print("shape of a:", a.shape)

b = x(a) # Embedding shape -> [1, 6, 50] - 1 document, 6 tokens, each represented by 50 numbers
print("shape of b:", b.shape)

c, d = y(b) # c = output at every time step, d = final hidden state -> their shapes now differ
print("shape of c:", c.shape)
print("shape of d:", d.shape) # [1, 1, 64]: a single hidden state for the single document

e = z(d.squeeze(0)) # Linear layer - output -> need (1, 324) dimension - squeeze(0) removes the 0th dimension
print("shape of e:", e.shape)

shape of a: torch.Size([1, 6])
shape of b: torch.Size([1, 6, 50])
shape of c: torch.Size([1, 6, 64])
shape of d: torch.Size([1, 1, 64])
shape of e: torch.Size([1, 324])


In [None]:
# Fresh, untrained embedding layer, used only to inspect the output for one question
x = Embedding(num_embeddings = 324, embedding_dim = 50)
x(obj[10][0]) # Here each word is represented by a 50-dimensional vector

tensor([[-1.0461e+00, -4.6219e-01,  6.6059e-01, -3.3177e-01,  5.2045e-01,
         -5.5609e-01, -1.4675e+00,  2.1445e-01, -9.0545e-01, -1.0786e+00,
          8.1360e-01,  9.0473e-01, -4.4284e-01, -1.5623e+00,  1.0710e+00,
          1.2653e+00,  2.5241e+00,  1.6951e+00, -1.5642e+00,  1.8392e+00,
         -5.0250e-01, -6.3285e-02, -1.1376e+00,  5.1754e-01,  4.6575e-01,
         -2.1461e-02,  6.4183e-01, -3.2768e-01,  4.5768e-01, -1.2816e+00,
         -1.2318e+00, -6.5893e-01,  1.3353e+00,  1.2324e+00,  2.4504e+00,
         -2.5420e-02, -7.0056e-01, -1.0624e+00, -1.1190e+00,  1.4185e+00,
          2.8599e-01, -2.9091e-01, -1.3036e-01, -1.1756e+00, -2.3197e-01,
         -1.6586e-01,  1.2974e+00, -1.0773e+00,  9.9650e-01, -1.3436e+00],
        [-8.2386e-01,  1.3841e-01,  1.6780e+00,  2.2201e+00,  9.5350e-01,
         -1.5310e-01,  1.0092e-01, -5.8397e-01,  1.0043e+00, -1.5778e+00,
          1.5574e+00, -7.7024e-02,  4.8643e-01,  4.9589e-05, -4.1732e-01,
          2.8395e-01,  1.0458e+00, -1

In [None]:
# Encoded question of sample 10: one vocabulary index per token
obj[10][0]

tensor([ 1,  2,  3,  4,  5, 43])

In [None]:
# Fresh, untrained RNN layer, used only to inspect what the layer returns
rnn = RNN(input_size = 50, hidden_size = 64, num_layers = 1)
rnn(x(obj[10][0]))[0] # First return value: the output for our current datapoint at every time step

tensor([[-0.1480,  0.1682, -0.3153,  0.8464,  0.2124,  0.6890,  0.5377,  0.3913,
          0.3764,  0.3233,  0.0711, -0.3419,  0.4632, -0.6993, -0.5980, -0.4139,
          0.3966, -0.6877, -0.4740,  0.0493,  0.2442,  0.1008, -0.2329, -0.2301,
          0.1859,  0.7681, -0.2649,  0.5325,  0.7416,  0.5057,  0.6203, -0.0668,
          0.4639, -0.1839, -0.0997,  0.0238, -0.8411, -0.3240,  0.5799, -0.7168,
          0.2370, -0.5877, -0.4104, -0.3649, -0.2777, -0.3271, -0.0061,  0.6702,
         -0.4394, -0.4092, -0.0328,  0.0395,  0.4109,  0.7653, -0.0898,  0.7611,
         -0.4284,  0.3215, -0.7439, -0.1941, -0.1959, -0.6177,  0.5429, -0.1177],
        [-0.3532,  0.1987, -0.1227,  0.7762,  0.3996, -0.2972, -0.1271,  0.2906,
         -0.6666, -0.4046, -0.8322,  0.3710,  0.6560,  0.5631,  0.0033, -0.1289,
         -0.0480, -0.6461, -0.1639, -0.2048, -0.7219,  0.1504, -0.0874,  0.7774,
         -0.6056,  0.4217,  0.1112,  0.7114, -0.3372, -0.0852,  0.5182,  0.6798,
          0.2826, -0.7445, 

In [None]:
rnn(x(obj[10][0]))[1] # Second return value: the final hidden state, i.e. what the RNN holds after unrolling over every time step

tensor([[-0.4779,  0.3596,  0.0401,  0.5537,  0.0101, -0.7158,  0.2944,  0.1263,
         -0.8846, -0.1536,  0.0157, -0.4048, -0.2939, -0.3868,  0.5018,  0.6671,
         -0.0726, -0.3498,  0.4542,  0.8057,  0.2766, -0.1354,  0.1344, -0.3239,
         -0.3390, -0.1868, -0.1251,  0.4409,  0.3459,  0.0890, -0.1926, -0.7374,
         -0.1043,  0.0051, -0.6688,  0.4271,  0.1331, -0.6063,  0.5489,  0.7169,
         -0.5324, -0.1256,  0.6161,  0.0746, -0.1097,  0.8607,  0.6436,  0.0758,
          0.2875,  0.6023,  0.6642, -0.4370, -0.0475, -0.5442, -0.8497, -0.6466,
          0.5463, -0.5586, -0.7424,  0.5772,  0.3330, -0.5952,  0.4884,  0.1198]],
       grad_fn=<SqueezeBackward1>)

In [None]:
# Fresh linear layer: maps the 64-dimensional final hidden state to 324 scores
l = Linear(in_features = 64, out_features = 324)
l(rnn(x(obj[10][0]))[1])

tensor([[-0.1188, -0.0173, -0.0659,  0.0729, -0.3191,  0.0257, -0.1517, -0.3821,
         -0.4242, -0.0771,  0.3054,  0.5617,  0.0383, -0.2151,  0.3662,  0.1489,
          0.0211, -0.0884, -0.1222, -0.4123, -0.0690,  0.0085, -0.7261, -0.3389,
         -0.2797,  0.1057, -0.2410, -0.0140, -0.0295, -0.4859, -0.1396,  0.3064,
         -0.3079,  0.2448,  0.0621, -0.3337,  0.0326,  0.0070,  0.0747,  0.4105,
         -0.3255,  0.0709,  0.2476, -0.0197,  0.1185, -0.3247,  0.1210, -0.0838,
          0.0635, -0.0466, -0.2170,  0.1870,  0.1054, -0.4091,  0.2169, -0.2377,
         -0.1413, -0.2101, -0.1087,  0.0397,  0.4070,  0.2792,  0.6470, -0.2522,
          0.2529,  0.0482, -0.0805, -0.3199, -0.5706,  0.1963,  0.1735, -0.2893,
         -0.2278,  0.2025,  0.3049, -0.1097,  0.2029,  0.2665,  0.3520, -0.4406,
          0.1413, -0.1050,  0.3179,  0.1808, -0.2725, -0.2645, -0.4522,  0.0594,
         -0.4049,  0.1337, -0.0314, -0.4866,  0.2711, -0.0842,  0.2940,  0.3305,
          0.1066,  0.1426, -