In [1]:
import torch

In [2]:
import pandas as pd

In [3]:
# Load the question/answer pairs; later cells read the 'question' and 'answer' columns.
data = pd.read_csv('qa_dataset.csv')

In [4]:
def tokenize(text,question = True):
    """Split a question or an answer string into a list of tokens.

    Args:
        text: The string to tokenize.
        question: When True (default) the text is treated as a question;
            when False it is treated as an answer.

    Returns:
        A list of string tokens.
        * Question: lower-cased, every '?' removed, split on whitespace.
        * Answer: every space character removed (so a multi-word answer
          collapses into one token, case preserved), then split on any
          remaining whitespace.
    """
    if question:
        # Normalise case and drop question marks before splitting into words.
        return text.lower().replace('?','').split()
    # Answers: glue the words together so the answer is a single target token.
    return text.replace(' ','').split()

In [5]:
import random
# Sanity check: tokenize the question stored under row label 81.
tokenize(data['question'][81])

['what', 'is', 'the', 'capital', 'of', 'canada']

In [6]:
# Build the token -> index vocabulary from every question and answer.
# Index 0 is reserved for '<unk>'; each new token gets the next free index,
# so indices follow first-seen (insertion) order.
vocab = {'<unk>':0}
for q,a in zip(data['question'],data['answer']):
    tokenized_q = tokenize(q)
    # Coerce the answer to str before tokenizing so a non-string answer
    # (e.g. a number) yields a token list instead of a bare str, which
    # could not be concatenated with the question's token list.
    tokenized_a = tokenize(str(a),False)
    for word in tokenized_q+tokenized_a :
        if word not in vocab:
            vocab[word] = len(vocab)

In [7]:
def text_to_indices(text,vocab,question=True):
    """Convert a question or answer into a list of vocabulary indices.

    Args:
        text: The question string, or the answer value when question=False.
            Answers are coerced with str() before tokenizing so non-string
            values are accepted.
        vocab: Mapping of token -> integer index. Must contain '<unk>' if
            any token of `text` is missing from it.
        question: True to tokenize as a question, False as an answer.

    Returns:
        A list of ints, one per token; unknown tokens map to vocab['<unk>'].
    """
    if question:
        tokenized_text = tokenize(text)
    else:
        # Tokenize the str-coerced value (not the raw one) so that numeric
        # answers do not fail inside tokenize.
        tokenized_text = tokenize(str(text),False)
    # '<unk>' is looked up lazily, only when a token is actually unknown.
    return [vocab[t] if t in vocab else vocab['<unk>'] for t in tokenized_text]

In [8]:
# Sanity check: show a raw question next to its vocabulary indices.
data['question'][109],text_to_indices(data['question'][109],vocab)

('What is the capital of Nigeria?', [15, 204, 3, 150, 40, 300])

In [9]:
from torch.utils.data import Dataset, DataLoader

In [10]:
class Custom_Dataset(Dataset):
    """Map-style dataset yielding (question_indices, answer_indices) tensors.

    Args:
        data: DataFrame with 'question' and 'answer' columns.
        vocab: Token -> index mapping handed to text_to_indices.
    """
    def __init__(self,data,vocab):
        super().__init__()
        self.data = data
        self.vocab = vocab

    def __len__(self):
        """Return the number of rows (samples) in the DataFrame."""
        return self.data.shape[0]

    def __getitem__(self,index):
        """Return the encoded (question, answer) pair at position `index`.

        Returns:
            A tuple of two 1-D tensors holding the vocabulary indices of the
            question tokens and of the answer tokens.
        """
        # .iloc is positional, so the lookup stays correct even when the
        # DataFrame's index is not the default 0..n-1 range (e.g. after
        # filtering or shuffling rows without reset_index).
        question = self.data['question'].iloc[index]
        answer = self.data['answer'].iloc[index]
        text_to_id_question = text_to_indices(question,self.vocab)
        text_to_id_answer = text_to_indices(answer,self.vocab, False)
        return torch.tensor(text_to_id_question) , torch.tensor(text_to_id_answer)

In [12]:
# Wrap the DataFrame in the Dataset and inspect one (question, answer) tensor pair.
dataset = Custom_Dataset(data,vocab)
dataset[109]

(tensor([ 15, 204,   3, 150,  40, 300]), tensor([301]))

In [13]:
# DataLoader: one sample per batch, reshuffled each epoch.
# NOTE(review): batch_size=1 presumably avoids having to pad questions of
# different lengths into one batch — confirm before raising it.
dataloader = DataLoader(dataset=dataset, batch_size=1, shuffle=True)
dataloader,len(dataloader)

(<torch.utils.data.dataloader.DataLoader at 0x249d8775590>, 200)

In [14]:
# RNN architecture: embedding dim = 60, hidden layers = 1, hidden units = 128, output units = vocab_size
# Each index in the input tensor is embedded into a vector of length 60
import torch.nn as nn

In [15]:
class simpleRNN(nn.Module):
    """Embedding -> single RNN -> linear head producing vocabulary logits.

    Args:
        embedding: Size of each token embedding vector.
        hidden: Size of the RNN hidden state.
        vocab_size: Number of vocabulary entries; used both as the embedding
            table size and as the number of output logits.
    """
    def __init__(self,embedding,hidden,vocab_size):
        super().__init__()
        self.vocab = vocab_size
        # Layers are kept as separate attributes (not nn.Sequential) because
        # the RNN returns a pair, of which only the second item is used.
        self.embd = nn.Embedding(self.vocab, embedding)
        self.rnn = nn.RNN(input_size=embedding, hidden_size=hidden, batch_first=True)
        self.out = nn.Linear(in_features=hidden, out_features=self.vocab)

    def forward(self,text):
        """Return logits computed from the RNN's final hidden state.

        Args:
            text: Tensor of token indices fed to the embedding layer.
        """
        embedded = self.embd(text)
        _, last_hidden = self.rnn(embedded)
        # Drop the leading dimension of the final hidden state before the head.
        logits = self.out(last_hidden.squeeze(dim=0))
        return logits

In [16]:
# Seed before constructing the model so its initial weights are reproducible.
torch.manual_seed(42)
# Embedding size 60, hidden size 128, one output logit per vocabulary entry.
rnn_model = simpleRNN(60,128,len(vocab))

In [22]:
# Cross-entropy over the vocabulary logits; Adam updates every model parameter.
loss_fun = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params = rnn_model.parameters(), lr = 0.001)

In [23]:
# Train for a fixed number of epochs, printing the mean per-batch loss.
epochs = 20
rnn_model.train()
for epoch in range(epochs):
    epoch_loss = 0.0
    # Distinct loop names keep the global `data` DataFrame intact, so the
    # earlier cells that read data['question'] can still be re-run.
    for question_ids, answer_ids in dataloader:
        optimizer.zero_grad()
        pred = rnn_model(question_ids)
        # Drop the batch dimension of the target so it is 1-D, one class
        # index per row of `pred`.
        loss = loss_fun(pred,answer_ids.squeeze(0))
        loss.backward()
        optimizer.step()
        # Accumulate a plain float: adding the loss tensor itself would keep
        # every batch's autograd history alive for the whole epoch.
        epoch_loss += loss.item()
    print(f'Epoch : {epoch+1} | train_loss : {epoch_loss/len(dataloader)}')

Epoch : 1 | train_loss : 6.222723007202148
Epoch : 2 | train_loss : 4.649641513824463
Epoch : 3 | train_loss : 3.255985975265503
Epoch : 4 | train_loss : 2.0758237838745117
Epoch : 5 | train_loss : 1.1657018661499023
Epoch : 6 | train_loss : 0.6397266387939453
Epoch : 7 | train_loss : 0.37333792448043823
Epoch : 8 | train_loss : 0.22717109322547913
Epoch : 9 | train_loss : 0.1427544355392456
Epoch : 10 | train_loss : 0.09989340603351593
Epoch : 11 | train_loss : 0.07350881397724152
Epoch : 12 | train_loss : 0.05490255728363991
Epoch : 13 | train_loss : 0.04388480260968208
Epoch : 14 | train_loss : 0.03585062175989151
Epoch : 15 | train_loss : 0.02969992160797119
Epoch : 16 | train_loss : 0.024778805673122406
Epoch : 17 | train_loss : 0.021046902984380722
Epoch : 18 | train_loss : 0.01791607402265072
Epoch : 19 | train_loss : 0.01544689666479826
Epoch : 20 | train_loss : 0.013357547111809254


In [54]:
def predict(question,threshold = 0.1):
    """Answer a question with the trained model, or decline if unsure.

    Uses the notebook-level `vocab` and `rnn_model`.

    Args:
        question: The question string.
        threshold: Minimum softmax probability required to return an answer.

    Returns:
        The vocabulary token with the highest predicted probability, or
        'No idea...' when that probability is below `threshold` or the
        question contains no tokens.

    Side effects:
        Prints the maximum predicted probability.
    """
    token_ids = text_to_indices(question,vocab)
    if not token_ids:
        # An empty index list would become a float tensor, which the
        # embedding layer cannot consume; there is nothing to predict from.
        return 'No idea...'
    tensor_input = torch.tensor(token_ids).unsqueeze(dim=0)
    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        pred_logits = rnn_model(tensor_input)
        pred_probs = nn.functional.softmax(pred_logits,dim=1)
        max_prob , prediction = torch.max(pred_probs,dim=1)
    max_prob = max_prob.item()
    print(f'max probability : {max_prob}')
    if max_prob < threshold:
        return 'No idea...'
    # Vocabulary indices were assigned in insertion order, so the position in
    # the key list equals the token's index.
    return list(vocab.keys())[prediction.item()]

In [55]:
# Try the trained model on a sample question.
predict("Organ that filters blood?")

max probability : 0.9454696774482727


'Kidney'