In [106]:
import numpy as np
import pandas as pd

In [107]:
# Location of the QA corpus, kept in one named constant so the notebook can be
# pointed at another copy without hunting through the cells.
# NOTE(review): this looks like a Colab-style absolute path — adjust when
# running elsewhere.
QA_CSV_PATH = '/content/100_Unique_QA_Dataset.csv'

# Later cells read the `question` and `answer` columns of this frame.
df = pd.read_csv(QA_CSV_PATH)
df.head()

Unnamed: 0,question,answer
0,What is the capital of France?,Paris
1,What is the capital of Germany?,Berlin
2,Who wrote 'To Kill a Mockingbird'?,Harper-Lee
3,What is the largest planet in our solar system?,Jupiter
4,What is the boiling point of water in Celsius?,100


> ## `Data Loading`

In [108]:
import torch
from torch.utils.data import Dataset, DataLoader

class Preprocessing:
    """Whitespace tokenizer plus a growing word -> integer-id vocabulary.

    The vocabulary starts with the single entry ``"<UNK>": 0``. Every new
    word passed through :meth:`build_vocab` receives the next free id.
    Calling :meth:`preprocess` several times on the same instance keeps
    extending the same vocabulary, so all processed columns share one id space.
    """

    def __init__(self):
        # Id 0 is the fallback used by text_to_index for out-of-vocabulary words.
        self.vocabulary = {"<UNK>": 0}

    def tokenize(self, doc: str) -> list:
        """Split a raw string into lower-cased tokens.

        Only ``?`` and ``'`` are stripped; any other punctuation stays
        attached to its token.

        Args:
            doc: Raw text.

        Returns:
            List of whitespace-separated tokens.
        """
        doc = doc.lower().replace('?', '').replace("'", '')
        return doc.split()

    def build_vocab(self, doc: list) -> None:
        """Add every not-yet-known token of ``doc`` to the vocabulary.

        Args:
            doc: A list of tokens (the output of :meth:`tokenize`), not a raw
                string — a string would be iterated character by character.
        """
        for word in doc:
            # len() is evaluated before insertion, so a new word gets the next
            # free id while a known word keeps its existing one.
            self.vocabulary.setdefault(word, len(self.vocabulary))

    def text_to_index(self, doc: list) -> list:
        """Translate a token list into vocabulary ids.

        Args:
            doc: A list of tokens.

        Returns:
            List of integer ids; unknown tokens map to the ``"<UNK>"`` id.
        """
        unknown_id = self.vocabulary["<UNK>"]
        return [self.vocabulary.get(word, unknown_id) for word in doc]

    def preprocess(self, data):
        """Tokenize, register and index-encode a column of strings.

        Args:
            data: pandas Series of raw strings.

        Returns:
            Tuple ``(encoded, vocabulary)``: ``encoded`` is a Series of id
            lists with the same index as ``data``; ``vocabulary`` is the live
            ``self.vocabulary`` dict (not a copy).
        """
        # Tokenize
        tokenized = data.apply(self.tokenize)

        # Build vocabulary — explicit loop, since this step exists only for
        # its side effect on self.vocabulary.
        for tokens in tokenized:
            self.build_vocab(tokens)

        # Encode text to index
        encoded = tokenized.apply(self.text_to_index)

        return encoded, self.vocabulary

class CustomDataset(Dataset):
    """Map-style dataset of index-encoded (question, answer) pairs.

    Both columns are run through the same ``Preprocessing`` instance, so
    questions and answers share a single vocabulary (exposed as
    ``self.vocabulary``).

    Args:
        data: Frame with a ``question`` and an ``answer`` column of strings.
    """

    def __init__(self, data):
        self.preprocessing = Preprocessing()
        # The first returned vocabulary is dropped on purpose: it is the same
        # dict object that the second call extends and returns.
        self.questions, _ = self.preprocessing.preprocess(data['question'])
        self.answers, self.vocabulary = self.preprocessing.preprocess(data['answer'])

    def __len__(self):
        """Number of (question, answer) pairs."""
        return len(self.questions)

    def __getitem__(self, index):
        """Return the ``index``-th pair as two 1-D ``torch.long`` tensors.

        Lookup is positional (``.iloc``), so the dataset also works when the
        source frame does not carry a default 0..n-1 index (e.g. after
        filtering, shuffling or a train/test split).
        """
        # dtype is pinned so an empty token list still yields an integer
        # tensor instead of torch's default float one.
        question = torch.tensor(self.questions.iloc[index], dtype=torch.long)
        answer = torch.tensor(self.answers.iloc[index], dtype=torch.long)
        return question, answer

In [109]:
# Encode the whole frame, then spot-check one (question, answer) tensor pair.
obj = CustomDataset(df)
obj[10]

(tensor([ 1,  2,  3,  4,  5, 43]), tensor([255]))

In [110]:
# Size of the vocabulary shared by questions and answers (includes "<UNK>").
len(obj.vocabulary)

324

In [111]:
# One sample per batch, drawn in shuffled order.
dataloader = DataLoader(
    dataset=obj,
    batch_size=1,
    shuffle=True,
)

> ## `Model Architecture`

In [112]:
from torch import nn
from torch.nn import Module, Linear, Embedding, RNN

class CustomModel(Module):
    def __init__(self, vocabulary_size):
        super().__init__()
        self.embedding = Embedding(num_embeddings = vocabulary_size, embedding_dim = 50)
        self.rnn = RNN(input_size = 50, hidden_size = 64, num_layers = 1)
        self.linear = Linear(in_features = 64, out_features = 324)