In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

In [2]:
# check for GPU
# Use CUDA when torch reports it as available, otherwise fall back to the CPU.
# The bare `device` expression displays the selected device as the cell output.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [3]:
# load dataset
# Read the question/answer CSV from the Kaggle input directory and preview the first rows.
# NOTE(review): the path is absolute and Kaggle-specific; adjust it when running elsewhere.
df = pd.read_csv('/kaggle/input/question-answer-dataset/100_Unique_QA_Dataset.csv')
df.head()

Unnamed: 0,question,answer
0,What is the capital of France?,Paris
1,What is the capital of Germany?,Berlin
2,Who wrote 'To Kill a Mockingbird'?,Harper-Lee
3,What is the largest planet in our solar system?,Jupiter
4,What is the boiling point of water in Celsius?,100


In [67]:
# Inspect the dataset size as (rows, columns).
df.shape

(90, 2)

In [4]:
# tokenize
def tokenize(text):
    """Split ``text`` into lowercase whitespace-separated tokens.

    Question marks and single quotes are removed before splitting, so
    ``"Who wrote 'To Kill a Mockingbird'?"`` yields clean word tokens.

    Parameters
    ----------
    text : str
        Raw question or answer text.

    Returns
    -------
    list[str]
        The tokens; an empty list for empty or whitespace-only input.
    """
    text = text.lower() # convert to lower case
    text = text.replace('?', '') # replace marks
    # str.replace returns a new string, so the result must be assigned back;
    # otherwise the quotes are never actually removed.
    text = text.replace("'", '') # replace inverted commas
    return text.split()

In [5]:
# Sanity-check the tokenizer on a sample question.
tokenize('What is the capital of France?')

['what', 'is', 'the', 'capital', 'of', 'france']

In [6]:
# Vocab formation
# Token -> integer index mapping; index 0 is reserved for the unknown-token marker.
vocab = {'<UNK>': 0}

In [7]:
# build vocab
def build_vocab(row):
    """Register every unseen token from one row in the global ``vocab``.

    The row's ``'question'`` and ``'answer'`` fields are tokenized, and each
    token not yet present is assigned the next free index (the current size
    of ``vocab``). Returns ``None``; the function works by side effect.
    """
    tokens = [*tokenize(row['question']), *tokenize(row['answer'])]

    for tok in tokens:
        # setdefault only inserts when the token is new; len(vocab) is read
        # before the insertion, so the new token gets the next free index.
        vocab.setdefault(tok, len(vocab))

In [8]:
# Run build_vocab on every row; apply is used only for its side effect on `vocab`.
# build_vocab returns None, so the displayed Series of None values carries no information.
df.apply(build_vocab, axis = 1)

0     None
1     None
2     None
3     None
4     None
      ... 
85    None
86    None
87    None
88    None
89    None
Length: 90, dtype: object

In [9]:
# Vocabulary size, counting the '<UNK>' entry.
len(vocab)

326

In [10]:
# numerical indices
def text_to_indices(text, vocab):
    """Encode ``text`` as a list of vocabulary indices.

    The text is tokenized with ``tokenize``; each token found in ``vocab`` is
    replaced by its index, and every other token by the ``'<UNK>'`` index.
    """
    return [
        vocab[tok] if tok in vocab else vocab['<UNK>']
        for tok in tokenize(text)
    ]

In [11]:
# Sanity-check the encoding; words absent from vocab map to the '<UNK>' index (0).
text_to_indices('What is Machine learning', vocab)

[1, 2, 0, 0]

In [12]:
# Custom Dataset Class
class CustomDataset(Dataset):
    """Dataset of (question, answer) pairs encoded as tensors of vocabulary indices.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``'question'`` and ``'answer'`` text columns.
    vocab : dict
        Token -> index mapping handed to ``text_to_indices``.
    """
    def __init__(self, df, vocab):
        self.df = df
        self.vocab = vocab

    def __len__(self):
        """Return the number of rows in the frame."""
        return self.df.shape[0]

    def __getitem__(self, index):
        """Return ``(question_indices, answer_indices)`` tensors for the row at position ``index``."""
        row = self.df.iloc[index]
        # Encode with the vocab given to this instance rather than whatever
        # global `vocab` happens to exist when the item is fetched.
        questions = text_to_indices(row['question'], self.vocab)
        answers = text_to_indices(row['answer'], self.vocab)

        return torch.tensor(questions), torch.tensor(answers)

In [13]:
# CustomDataset Object
# Wrap the full frame and the vocabulary so each row can be fetched as index tensors.
dataset = CustomDataset(df, vocab)

In [14]:
# DataLoader Object
# Yields one shuffled (question, answer) pair per step.
# NOTE(review): batch_size = 1 is presumably required because the encoded questions have
# different lengths and no padding / collate_fn is defined — confirm before raising it.
dataloader = DataLoader(dataset, batch_size = 1, shuffle = True)

In [38]:
# RNN architecture
class RnnModel(nn.Module):
    """Single-layer RNN mapping an encoded question to logits over the vocabulary.

    Parameters
    ----------
    vocab_size : int
        Number of vocabulary entries; sizes both the embedding table and the
        output layer.
    embedding_dim : int, default 50
        Width of the token embeddings fed to the RNN.
    hidden_size : int, default 64
        Width of the RNN hidden state fed to the output layer.
    """
    def __init__(self, vocab_size, embedding_dim = 50, hidden_size = 64):
        super().__init__()
        # The sizes are threaded through all three layers so they cannot drift apart.
        self.embedding = nn.Embedding(vocab_size, embedding_dim = embedding_dim)
        self.rnn = nn.RNN(embedding_dim, hidden_size, batch_first = True)
        self.fc = nn.Linear(hidden_size, vocab_size)

    def forward(self, x):
        """Return logits computed from the RNN's final hidden state.

        ``x`` holds token indices; with ``batch_first = True`` the RNN reads
        batched input as (batch, sequence).
        """
        embedded_question = self.embedding(x)
        _, hidden = self.rnn(embedded_question)
        # hidden has a leading layer axis; take the last (here: only) layer's state.
        last_hidden = hidden[-1]
        logits = self.fc(last_hidden)
        return logits

In [45]:
# define learning_rate, epochs
# learning_rate is handed to the Adam optimizer; epochs is the number of full passes
# the training loop makes over the dataloader.
learning_rate = 0.001
epochs = 30

In [46]:
# define model, loss and optimizer
# The model's embedding and output layers are sized by len(vocab); it is moved to
# `device` before the optimizer is built over its parameters.
model = RnnModel(len(vocab)).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr = learning_rate)

In [47]:
# training loop
# One optimizer step per (question, answer) pair; the mean loss over the
# dataloader is reported at the end of every epoch.
for epoch in range(epochs):
    total_loss = 0
    for question, answer in dataloader:
        # Move the pair to the training device; squeeze(1) drops the token axis of
        # the answer so it is a 1-D tensor of class indices for the loss.
        question, answer = question.to(device), answer.to(device).long().squeeze(1)

        # Clear gradients left over from the previous step before the new pass.
        optimizer.zero_grad()
        output = model(question)
        loss = criterion(output, answer)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
    avg_loss = total_loss / len(dataloader)
    print(f'Epoch: {epoch + 1}, Loss: {avg_loss}') 

Epoch: 1, Loss: 5.819894658194648
Epoch: 2, Loss: 5.011327992545234
Epoch: 3, Loss: 4.123075302441915
Epoch: 4, Loss: 3.4610490136676364
Epoch: 5, Loss: 2.8939709226290384
Epoch: 6, Loss: 2.364231224854787
Epoch: 7, Loss: 1.8939841707547507
Epoch: 8, Loss: 1.4779869059721629
Epoch: 9, Loss: 1.136036115884781
Epoch: 10, Loss: 0.8736880951457553
Epoch: 11, Loss: 0.6812333583831787
Epoch: 12, Loss: 0.535385994778739
Epoch: 13, Loss: 0.43422271990113787
Epoch: 14, Loss: 0.3555778374274572
Epoch: 15, Loss: 0.29581576221519046
Epoch: 16, Loss: 0.2537651984228028
Epoch: 17, Loss: 0.21894522913628153
Epoch: 18, Loss: 0.19117710466186205
Epoch: 19, Loss: 0.17016102034184669
Epoch: 20, Loss: 0.1472072305364741
Epoch: 21, Loss: 0.12827739214731587
Epoch: 22, Loss: 0.11486126031312678
Epoch: 23, Loss: 0.10116860179437531
Epoch: 24, Loss: 0.08966449089348316
Epoch: 25, Loss: 0.07930262655847603
Epoch: 26, Loss: 0.0704835097822878
Epoch: 27, Loss: 0.06310173101309273
Epoch: 28, Loss: 0.0567378566082

In [61]:
# prediction
def predict(model, question, threshold = 0.5):
    """Print the model's answer to ``question``, or "I don't know" when unsure.

    Parameters
    ----------
    model : nn.Module
        Trained model returning one row of vocabulary logits per question.
    question : str
        Raw question text; it is encoded with ``text_to_indices`` and the
        global ``vocab``.
    threshold : float, default 0.5
        Minimum softmax probability the top prediction needs in order to be
        printed as an answer.

    Returns
    -------
    None
        The result is printed, not returned.
    """
    numeric_question = text_to_indices(question, vocab) # convert to numbers
    if not numeric_question:
        # No tokens to feed the model (empty or punctuation-only question).
        print("I don't know")
        return

    numeric_question = torch.tensor(numeric_question).unsqueeze(0) # add batch axis
    numeric_question = numeric_question.to(device)
    model = model.to(device) # make sure model and input share a device
    with torch.no_grad():
        output = model(numeric_question)
        probs = torch.softmax(output, dim = 1) # calculate probability
        value, index = torch.max(probs, dim = 1)

    if value.item() < threshold:
        print("I don't know")
        # Stop here: without this return the low-confidence answer was printed as well.
        return

    # vocab preserves insertion order and indices were assigned in that order,
    # so the key at position `index` is the predicted token.
    print(list(vocab.keys())[index.item()])

In [62]:
# Try the trained model on a question from the dataset.
predict(model, 'What is the capital of France')

paris


In [64]:
# eval mode
# Switch the model to evaluation mode; the displayed repr lists its layers.
model.eval()

RnnModel(
  (embedding): Embedding(326, 50)
  (rnn): RNN(50, 64, batch_first=True)
  (fc): Linear(in_features=64, out_features=326, bias=True)
)

In [66]:
# Evaluation
# NOTE: `dataloader` is the same loader the training loop iterates, so the number
# reported here is accuracy on the training examples.
with torch.no_grad():
    total = 0
    correct = 0
    for question, answer in dataloader:
        question = question.to(device)
        answer = answer.to(device).long().squeeze(1)
        output = model(question)
        # Index of the highest logit per row is the predicted vocabulary entry.
        predicted = torch.max(output, dim = 1).indices
        total += answer.size(0)
        correct += (predicted == answer).sum().item()
    accuracy = correct / total
print(f'Accuracy: {accuracy}')
        

Accuracy: 1.0


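# Accuracy is 1.0 because the model is evaluated on the same 90 examples it was trained on (it has memorized this small dataset); this is training accuracy, not a measure of generalization.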