In [None]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv("D:/Intern/DataSets/100_Unique_QA_Dataset.csv")
df.head()

Unnamed: 0,question,answer
0,What is the capital of France?,Paris
1,What is the capital of Germany?,Berlin
2,Who wrote 'To Kill a Mockingbird'?,Harper-Lee
3,What is the largest planet in our solar system?,Jupiter
4,What is the boiling point of water in Celsius?,100


In [27]:
df.shape

(90, 2)

In [3]:
# Tokenize
def tokenize(text):
    text = text.lower()
    text = text.replace("'", "")
    text = text.strip(",.?!;:")
    text = text.split()
    return text

In [4]:
# Vocab
vocab = {"<OOV>": 0}

def build_vocab(row):
	tokenized_que = tokenize(row["question"])
	tokenized_ans = tokenize(row["answer"])
 
	merged_tokens = tokenized_que + tokenized_ans
 
	for token in merged_tokens:
		if token not in vocab:
			vocab[token] = len(vocab)

In [5]:
df.apply(build_vocab, axis=1)

0     None
1     None
2     None
3     None
4     None
      ... 
85    None
86    None
87    None
88    None
89    None
Length: 90, dtype: object

In [6]:
vocab_size = len(vocab)
vocab_size

324

In [7]:
# text to sequence
def text_to_sequence(text, vocab):
	indexed_text = []
	
	for token in tokenize(text):
		if token in vocab:
			indexed_text.append(vocab[token])
		else:
			indexed_text.append(vocab["<OOV>"])
   
	return indexed_text

In [8]:
text_to_sequence("What is the capital of France?", vocab)


[1, 2, 3, 4, 5, 6]

In [9]:
class QADataset(Dataset):
	def __init__(self, df, vocab):
		self.df = df
		self.vocab = vocab

	def __len__(self):
		return self.df.shape[0]

	def __getitem__(self, index):
		seq_question = text_to_sequence(self.df.iloc[index]["question"], vocab)
		seq_answer = text_to_sequence(self.df.iloc[index]["answer"], vocab)

		return torch.tensor(seq_question), torch.tensor(seq_answer)

In [10]:
dataset = QADataset(df, vocab)

In [11]:
print(dataset[0]) # 1st question
print(dataset[1]) # 2nd question

(tensor([1, 2, 3, 4, 5, 6]), tensor([7]))
(tensor([1, 2, 3, 4, 5, 8]), tensor([9]))


In [12]:
dataloader = DataLoader(dataset, batch_size=1, shuffle=True)

In [13]:
for que, ans in dataloader:
	print(que)
	print(ans)
	break

tensor([[  1,   2,   3,   4,   5, 206]])
tensor([[207]])


In [14]:
class RNN(nn.Module):
    
    def __init__(self, vocab_size):
        super().__init__()
        # architecture
        self.embedding = nn.Embedding(vocab_size, embedding_dim=50)
        self.rnn = nn.RNN(50, 64, batch_first=True)
        self.fc = nn.Linear(64, vocab_size)
        
    def forward(self, x):
        embedded_text = self.embedding(x)
        _, final = self.rnn(embedded_text) # returns output of all time steps and last hidden state
        output = self.fc(final)
        return output

In [15]:
model = RNN(vocab_size)
criterian = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [16]:
epochs = 20

for epoch in range(epochs):
    total_loss = 0
    for que, ans in dataloader:
        
        optimizer.zero_grad()
        #foreward
        output = model(que) 
        #reshape output and target
        output = output.view(-1, vocab_size) # size =  (1, vocab_size)
        ans = ans.view(-1)
        #loss
        loss = criterian(output, ans)
		#backward
        loss.backward()
        # update
        optimizer.step()
        
        total_loss += loss.item()
    print(f"Epoch: {epoch}, Loss: {total_loss}")

Epoch: 0, Loss: 525.479838848114
Epoch: 1, Loss: 456.88055658340454
Epoch: 2, Loss: 380.80167055130005
Epoch: 3, Loss: 322.0150029659271
Epoch: 4, Loss: 272.20937633514404
Epoch: 5, Loss: 224.77428531646729
Epoch: 6, Loss: 181.2981674671173
Epoch: 7, Loss: 142.20161765813828
Epoch: 8, Loss: 109.85549718141556
Epoch: 9, Loss: 84.66838383674622
Epoch: 10, Loss: 64.31596612930298
Epoch: 11, Loss: 50.12574838101864
Epoch: 12, Loss: 39.58172160387039
Epoch: 13, Loss: 31.953929141163826
Epoch: 14, Loss: 26.58656220883131
Epoch: 15, Loss: 21.93062974512577
Epoch: 16, Loss: 18.60133097320795
Epoch: 17, Loss: 15.996194034814835
Epoch: 18, Loss: 13.676321387290955
Epoch: 19, Loss: 11.871712025254965


In [17]:
with torch.no_grad():
    for que, ans in dataloader:
        optput = model(que)
        output = output.view(-1, vocab_size)
        ans = ans.view(-1)
        loss = criterian(output, ans)
        print(f"Loss: {loss.item()}")
        break

Loss: 7.576819896697998


In [20]:
def predict(model, question, threshold=0.5):
    # convert que to num
	que = text_to_sequence(question, vocab)
	# tensor
	que = torch.tensor(que)
	# reshape
	que = que.view(1, -1) # (1, seq_len)
	
	# predict
	output = model(que)
	output = output.view(-1, vocab_size) # (1, vocab_size)
	
	# convert logits to prob
	prob = torch.nn.functional.softmax(output, dim=1)
 
	# find index from softmax prob
	value, index = torch.max(prob, dim=1)
 
	if value < threshold:
		print("I don't know")
	else:
		print("Answer:", list(vocab.keys())[index])

In [23]:
print(df.sample(5))

                                             question    answer
56  Which is the second-largest country by land area?    Canada
47      What is the longest-running animated TV show?  Simpsons
31              Which city is known as the Big Apple?   NewYork
43    What is the hardest natural substance on Earth?   Diamond
78            Which planet is the closest to the Sun?   Mercury


In [None]:
predict(model, "Where is the capital of France?")
predict(model, "Where is the capital of Germany?")

Answer: paris
Answer: berlin


In [26]:
predict(model, "What is the largest planet in our solar system?")
predict(model, "What is the longest-running animated TV show?")
predict(model, "Which city is known as the Big Apple?")


Answer: jupiter
Answer: simpsons
Answer: newyork
