In [27]:
# Importing necessary libraries
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from torch.nn import Module, Embedding, RNN, Linear

In [7]:
# Loading the dataset
# The path is relative, so it resolves against the kernel's working directory.
# Later cells read the "question" and "answer" columns of this frame.
df = pd.read_csv(filepath_or_buffer="data/question answers.csv")
df.head()

Unnamed: 0,question,answer
0,What is the capital of France?,Paris
1,What is the capital of Germany?,Berlin
2,Who wrote 'To Kill a Mockingbird'?,Harper-Lee
3,What is the largest planet in our solar system?,Jupiter
4,What is the boiling point of water in Celsius?,100


In [8]:
# Tokenizing the questions
def preprocess(text: str) -> list:
    """Normalise a piece of text and split it into word tokens.

    The text is lower-cased, question marks and apostrophes are removed, and
    the remainder is split on whitespace.

    Args:
        text: Raw question or answer string.

    Returns:
        List of lower-cased tokens. Consecutive, leading or trailing
        whitespace never yields empty-string tokens; blank input gives [].
    """
    # Lowercasing the words
    text = text.lower()

    # Removing the punctuation marks
    text = text.replace("?", "").replace("'", "")

    # Tokenizing: split() with no separator collapses runs of whitespace,
    # whereas split(" ") would emit "" tokens for double or trailing spaces
    return text.split()

# Sanity check: the stray apostrophe and the trailing "?" should both be removed
preprocess(text="What is the capital 'of france?")

['what', 'is', 'the', 'capital', 'of', 'france']

In [9]:
# Vocabulary building
# Token -> integer id lookup; id 0 is reserved for out-of-vocabulary tokens
vocab = {'<UNK>': 0}

def build_vocab(row):
    """Register every unseen token of one row's question and answer in `vocab`.

    A new token receives the next free id, i.e. the size of `vocab` at the
    moment it is first seen. Mutates the module-level `vocab`; returns None.
    """
    question_tokens = preprocess(row['question'])
    answer_tokens = preprocess(row['answer'])

    for word in question_tokens + answer_tokens:
        # setdefault leaves tokens that already have an id untouched
        vocab.setdefault(word, len(vocab))

# Run build_vocab over every row of the DataFrame
df.apply(build_vocab, axis=1)
print(len(vocab))

324


In [10]:
# convert words to numerical indices
def text_to_indices(text, vocab):
    """Map each token of `text` to its integer id in `vocab`.

    Args:
        text: Raw string; it is tokenized with `preprocess`.
        vocab: Mapping from token to integer id. It must contain '<UNK>'
            whenever a token of `text` may be missing from it.

    Returns:
        List with one id per token; tokens absent from `vocab` map to
        vocab['<UNK>'].
    """
    return [
        vocab[word] if word in vocab else vocab['<UNK>']
        for word in preprocess(text)
    ]

# Tokens that are not in the vocabulary should come back as the <UNK> id (0)
text_to_indices("My name is dexter", vocab)

[0, 0, 2, 0]

In [11]:
# Building custom dataset class
class CustomDataset(Dataset):
    """Dataset yielding (question, answer) pairs as tensors of vocabulary ids."""

    def __init__(self, df, vocab):
        """Keep references to the source DataFrame and the token -> id mapping."""
        super().__init__()

        self.vocab = vocab
        self.df = df

    def __len__(self):
        """Number of rows (question/answer pairs) in the DataFrame."""
        return len(self.df)

    def __getitem__(self, index):
        """Return the row at position `index` as (question_ids, answer_ids) tensors."""
        # Positional lookup, independent of the DataFrame's index labels
        row = self.df.iloc[index]

        # Encode both text fields with the shared vocabulary
        question_ids = text_to_indices(text=row["question"], vocab=self.vocab)
        answer_ids = text_to_indices(text=row["answer"], vocab=self.vocab)

        return torch.tensor(question_ids), torch.tensor(answer_ids)

In [12]:
# Building Dataloader
dataset = CustomDataset(df=df, vocab=vocab)
# batch_size=1: each item is an un-padded tensor with one id per token, and no
# collate_fn is supplied here to pad items of different lengths into one batch
dataloader = DataLoader(dataset=dataset, batch_size=1, shuffle=True)

In [None]:
# Building Model
class SimpleRNN(Module):
    """Embedding -> single-layer RNN -> linear classifier over the vocabulary.

    Args:
        vocab_size: Number of rows in the embedding table and number of output
            classes. When omitted it defaults to ``len(vocab)`` of the
            notebook-level ``vocab``, so ``SimpleRNN()`` keeps working.
        embedding_dim: Size of each token embedding.
        hidden_size: Size of the RNN hidden state.
    """

    def __init__(self, vocab_size=None, embedding_dim=50, hidden_size=64):
        super().__init__()

        if vocab_size is None:
            # Backward-compatible default: size the model from the global vocab
            vocab_size = len(vocab)

        self.embedding = Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)
        self.rnn = RNN(input_size=embedding_dim, hidden_size=hidden_size, batch_first=True)
        self.linear = Linear(in_features=hidden_size, out_features=vocab_size)

    def forward(self, question):
        """Score every vocabulary entry for a batch of token-id sequences.

        Args:
            question: Integer tensor of token ids, shape (batch, seq_len).

        Returns:
            Logits of shape (batch, vocab_size).
        """
        embedded_question = self.embedding(question)
        # The RNN returns (per-step outputs, final hidden state); only the final
        # hidden state, shape (1, batch, hidden_size), is used
        _, final = self.rnn(embedded_question)
        # Drop the leading layer axis -> (batch, hidden_size)
        output = self.linear(final.squeeze(0))

        return output

In [29]:
# Defining Parameters
learning_rate = 0.001  # step size passed to the optimizer
epochs = 25  # number of full passes over the dataloader

In [31]:
# Defining model, loss function and optimizer
# Seed torch's global RNG so weight initialisation and the DataLoader's shuffle
# order are repeatable across Restart & Run All
seed = 42
torch.manual_seed(seed)

model = SimpleRNN()
# CrossEntropyLoss takes raw logits and integer class ids as targets
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=learning_rate)

In [32]:
# training loop

for epoch in range(epochs):
    # Sum of the per-sample losses seen during this epoch
    total_loss = 0

    for question, answer in dataloader:
        # Clear gradients left over from the previous step
        optimizer.zero_grad()

        # forward pass -> logits of shape (1, vocab_size) with batch_size=1
        output = model(question)

        # answer arrives as (1, n_answer_tokens); answer[0] is the 1-D target.
        # Its length matches the logits' batch size of 1 only for single-token
        # answers.
        loss = criterion(output, answer[0])

        # gradients
        loss.backward()

        # update
        optimizer.step()

        total_loss += loss.item()

    # ".4f" = four decimal places (the previous "4f" only set a minimum width)
    print(f"Epoch: {epoch+1}, Loss: {total_loss:.4f}")

Epoch: 1, Loss: 524.682529
Epoch: 2, Loss: 457.171937
Epoch: 3, Loss: 381.006697
Epoch: 4, Loss: 313.138003
Epoch: 5, Loss: 258.231454
Epoch: 6, Loss: 210.173810
Epoch: 7, Loss: 167.543902
Epoch: 8, Loss: 130.256757
Epoch: 9, Loss: 100.577890
Epoch: 10, Loss: 77.077694
Epoch: 11, Loss: 60.055889
Epoch: 12, Loss: 47.544352
Epoch: 13, Loss: 38.144438
Epoch: 14, Loss: 31.158295
Epoch: 15, Loss: 25.737061
Epoch: 16, Loss: 21.823089
Epoch: 17, Loss: 18.492960
Epoch: 18, Loss: 15.638866
Epoch: 19, Loss: 13.722632
Epoch: 20, Loss: 11.730864
Epoch: 21, Loss: 10.253676
Epoch: 22, Loss: 9.088438
Epoch: 23, Loss: 8.022047
Epoch: 24, Loss: 7.087289
Epoch: 25, Loss: 6.310489


In [35]:
def predict(model, question, threshold=0.5):
    """Print the model's single-token answer to `question`.

    Args:
        model: Trained network mapping a (1, seq_len) tensor of token ids to
            logits over the vocabulary.
        question: Raw question text.
        threshold: Minimum softmax probability required to accept the top
            prediction.

    Prints "I don't know" when the question yields no tokens or the top
    probability is below `threshold`; otherwise prints the predicted token.
    Returns None.
    """
    # convert question to numbers
    numerical_question = text_to_indices(question, vocab)

    # Nothing to feed the model: an empty list would become a float tensor,
    # which the embedding layer rejects
    if not numerical_question:
        print("I don't know")
        return

    # tensor with a leading batch axis: (1, seq_len)
    question_tensor = torch.tensor(numerical_question, dtype=torch.long).unsqueeze(0)

    # send to model; inference only, so skip autograd bookkeeping
    with torch.no_grad():
        output = model(question_tensor)

    # convert logits to probs
    probs = torch.nn.functional.softmax(output, dim=1)

    # find index of max prob
    value, index = torch.max(probs, dim=1)

    # Low confidence: say so and stop, instead of also printing the guess
    if value.item() < threshold:
        print("I don't know")
        return

    # vocab preserves insertion order, so position in the key list == token id
    print(list(vocab.keys())[index.item()])

In [36]:
# Try a question that appears verbatim in the dataset
predict(model, "What is the largest planet in our solar system?")

jupiter


In [19]:
# Scratch walk-through of the tensor shapes at each stage of SimpleRNN, using
# fresh (untrained) layers. Sizes come from the vocabulary and the sample
# itself, so the cell keeps working if the dataset changes.
x = Embedding(len(vocab), embedding_dim=50)
y = RNN(50, 64, batch_first=True)
z = Linear(64, len(vocab))

# First question with a batch axis added: (1, seq_len)
a = dataset[0][0].reshape(1, -1)
print("shape of a:", a.shape)

# Token embeddings: (1, seq_len, 50)
b = x(a)
print("shape of b:", b.shape)

# c = output at every time step, d = final hidden state
c, d = y(b)
print("shape of c:", c.shape)
print("shape of d:", d.shape)

# Drop the leading layer axis of d before the linear layer
e = z(d.squeeze(0))

print("shape of e:", e.shape)

shape of a: torch.Size([1, 6])
shape of b: torch.Size([1, 6, 50])
shape of c: torch.Size([1, 6, 64])
shape of d: torch.Size([1, 1, 64])
shape of e: torch.Size([1, 324])


In [26]:
# The RNN output at the last time step should equal the final hidden state;
# -1 selects the last step whatever the sequence length is
c[0][-1] == d

tensor([[[True, True, True, True, True, True, True, True, True, True, True,
          True, True, True, True, True, True, True, True, True, True, True,
          True, True, True, True, True, True, True, True, True, True, True,
          True, True, True, True, True, True, True, True, True, True, True,
          True, True, True, True, True, True, True, True, True, True, True,
          True, True, True, True, True, True, True, True, True]]])