In [1]:
import pandas as pd

# Load the question/answer pairs into a DataFrame.
# NOTE(review): this is an absolute, machine-specific Windows path, so the cell
# only runs on a machine where this exact file exists — consider a path that is
# relative to the notebook or taken from a config constant.
df = pd.read_csv(r"C:\Users\Shubham\Downloads\100_Unique_QA_Dataset (1).csv")

# Preview the first rows.
df.head()

Unnamed: 0,question,answer
0,What is the capital of France?,Paris
1,What is the capital of Germany?,Berlin
2,Who wrote 'To Kill a Mockingbird'?,Harper-Lee
3,What is the largest planet in our solar system?,Jupiter
4,What is the boiling point of water in Celsius?,100


In [3]:
def tokenize(text):
    """Lowercase ``text``, delete every ``?`` and ``'``, and split on whitespace.

    Returns the resulting list of tokens. Any other punctuation (for example a
    hyphen) stays attached to its word.
    """
    # One translation table removes both characters in a single pass.
    stripped = text.lower().translate(str.maketrans("", "", "?'"))
    return stripped.split()

In [4]:
# Sanity check: mixed case is lowercased and the trailing "?" is removed.
tokenize("What is the capital of FraNCE?")

['what', 'is', 'the', 'capital', 'of', 'france']

In [5]:
# Token -> integer id mapping shared by the whole notebook.
# Id 0 is reserved for '<UNK>', the id used for tokens not present in the mapping.
vocab = {'<UNK>':0}

In [8]:
def build_vocab(row):
    """Register every unseen token of one row in the global ``vocab``.

    ``row`` must support ``row['question']`` and ``row['answer']`` lookups whose
    values ``tokenize`` can process. Question tokens are handled before answer
    tokens, and each new token is given the next free id (the size of ``vocab``
    at that moment). ``vocab`` is mutated in place; nothing is returned.
    """
    # Tokenize both fields up front so nothing is inserted if either one fails.
    words = [*tokenize(row['question']), *tokenize(row['answer'])]

    for word in words:
        # setdefault keeps an existing id and only assigns on first sight.
        vocab.setdefault(word, len(vocab))

In [9]:
# Run build_vocab once per row purely for its side effect on the global `vocab`.
# build_vocab has no return statement, so the Series this expression evaluates to
# holds only None values.
df.apply(build_vocab,axis=1)

0     None
1     None
2     None
3     None
4     None
      ... 
85    None
86    None
87    None
88    None
89    None
Length: 90, dtype: object

In [10]:
# Vocabulary size, counting the '<UNK>' entry.
len(vocab)

324

In [13]:
def text_to_indices(text, vocab):
    """Encode ``text`` as a list of integer ids taken from ``vocab``.

    The text is split with ``tokenize``. A token that is a key of ``vocab`` maps
    to its stored id; any other token maps to the id stored under ``'<UNK>'``.
    """
    return [
        # '<UNK>' is only looked up when a token is actually missing.
        vocab[word] if word in vocab else vocab['<UNK>']
        for word in tokenize(text)
    ]

In [14]:
# "campusx" is not in the vocabulary, so it is encoded as the '<UNK>' id (0).
text_to_indices("What is campusx", vocab)

[1, 2, 0]

In [15]:
import torch
from torch.utils.data import Dataset, DataLoader



In [18]:
class QADataset(Dataset):
    """Map-style dataset yielding ``(question_ids, answer_ids)`` tensor pairs.

    Each sample is built on demand by encoding the ``'question'`` and
    ``'answer'`` columns of one DataFrame row with ``text_to_indices``.
    """

    def __init__(self, df, vocab):
        """Keep references to the source DataFrame and the token -> id mapping."""
        self.df = df
        self.vocab = vocab

    def __len__(self):
        """Return the number of rows; there is one sample per row."""
        return self.df.shape[0]

    def __getitem__(self, idx):
        """Return the encoded question and answer for the row at position ``idx``.

        Both elements are 1-D ``torch.long`` tensors whose length equals the
        number of tokens in the respective text.
        """
        # Fetch the row once rather than materialising it again for each column.
        row = self.df.iloc[idx]
        numerical_question = text_to_indices(row['question'], self.vocab)
        numerical_answer = text_to_indices(row['answer'], self.vocab)

        # Pin the dtype: torch.tensor([]) would otherwise fall back to float32,
        # and nn.Embedding only accepts integer indices.
        return (
            torch.tensor(numerical_question, dtype=torch.long),
            torch.tensor(numerical_answer, dtype=torch.long),
        )

In [19]:
# Wrap the DataFrame and vocabulary so samples can be served to a DataLoader.
dataset = QADataset(df, vocab)

In [20]:
# One sample per batch, in a freshly shuffled order on every pass.
# NOTE(review): samples have differing token counts and no collate_fn is given,
# so a larger batch_size would presumably fail to stack — confirm before raising it.
dataloader = DataLoader(dataset, batch_size=1, shuffle=True)

In [21]:
# Eyeball the batches: `question` keeps its leading batch dimension, while
# answer[0] selects the single sample's answer ids (the form used as the training target).
for question, answer in dataloader:
    print(question, answer[0])

tensor([[ 42, 101,   2,   3,  17]]) tensor([102])
tensor([[ 1,  2,  3, 69,  5, 53]]) tensor([260])
tensor([[  1,   2,   3, 212,   5,  14, 213, 214]]) tensor([215])
tensor([[  1,   2,   3,   4,   5, 113]]) tensor([114])
tensor([[ 42, 200,   2,  14, 201, 202, 203, 204]]) tensor([205])
tensor([[10, 55,  3, 56,  5, 57]]) tensor([58])
tensor([[ 10,  11, 189, 158, 190]]) tensor([191])
tensor([[ 42,   2,   3, 210, 137, 168, 211, 169]]) tensor([113])
tensor([[ 42,  86,  87, 241, 242,  19,  39, 243]]) tensor([244])
tensor([[  1,   2,   3, 122, 123,  19,   3,  45]]) tensor([124])
tensor([[ 10, 308,   3, 309, 310]]) tensor([311])
tensor([[  1,   2,   3,  33,  34,   5, 245]]) tensor([246])
tensor([[ 10,  29, 130, 131]]) tensor([132])
tensor([[ 10, 140,   3, 141, 270,  93, 271,   5,   3, 272]]) tensor([273])
tensor([[  1,   2,   3, 180, 181, 182, 183]]) tensor([184])
tensor([[  1,   2,   3, 103,   5, 104,  19, 105]]) tensor([106])
tensor([[ 42, 137, 118,   3, 247,   5, 248]]) tensor([249])
tensor([

In [22]:
import torch.nn as nn

In [24]:
class SimpleRNN(nn.Module):
    """Embedding -> single-layer RNN -> linear layer producing vocabulary logits.

    Args:
        vocab_size: Number of token ids; sizes both the embedding table and
            the output layer.
        embedding_dim: Width of each token embedding (default 50).
        hidden_size: Width of the RNN hidden state (default 64).
    """

    def __init__(self, vocab_size, embedding_dim=50, hidden_size=64):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim=embedding_dim)
        self.rnn = nn.RNN(embedding_dim, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, vocab_size)

    def forward(self, question):
        """Map a ``(batch, seq_len)`` tensor of token ids to ``(batch, vocab_size)`` logits."""
        embedded_question = self.embedding(question)

        # nn.RNN returns (per-step outputs, final hidden state); only the final
        # hidden state feeds the classifier.
        _, final = self.rnn(embedded_question)

        # `final` is (1, batch, hidden_size) for this single-layer RNN; drop the
        # leading layer axis before the linear projection.
        output = self.fc(final.squeeze(0))

        return output

In [30]:
# Stand-alone layers with the same sizes as SimpleRNN, used only to inspect the
# tensor shape after each stage of the forward pass.
# NOTE(review): 324 is a hard-coded vocabulary size; it must equal len(vocab).
x = nn.Embedding(324, embedding_dim=50)
y = nn.RNN(50, 64, batch_first=True)
z = nn.Linear(64, 324)

# First question with a batch dimension added. The (1, 6) shape is hard-coded to
# that sample's six tokens.
a = dataset[0][0].reshape(1,6)
print("shape of a:", a.shape)
print(a)
# Token ids -> embeddings: (1, 6, 50).
b = x(a)
print("shape of b:", b.shape)
# c holds the per-step outputs (1, 6, 64); d is the final hidden state (1, 1, 64).
c, d = y(b)
print("shape of c:", c.shape)
print("shape of d:", d.shape)
print(d)
# Drop the leading axis of d, then project to one logit per vocabulary entry: (1, 324).
e = z(d.squeeze(0))

print("shape of e:", e.shape)
print(e)

shape of a: torch.Size([1, 6])
tensor([[1, 2, 3, 4, 5, 6]])
shape of b: torch.Size([1, 6, 50])
shape of c: torch.Size([1, 6, 64])
shape of d: torch.Size([1, 1, 64])
tensor([[[-0.8057, -0.5245,  0.5298, -0.2934,  0.6493,  0.4695, -0.5646,
          -0.7875,  0.0994,  0.6195, -0.6238, -0.2444, -0.4075,  0.0876,
           0.7800, -0.0189,  0.0228,  0.3839, -0.5533,  0.5400,  0.0664,
           0.2689, -0.2004,  0.2209,  0.3738,  0.2443, -0.1477, -0.4467,
          -0.0965,  0.8675, -0.0134, -0.7055, -0.4149,  0.0583,  0.2238,
           0.0751, -0.7544, -0.8150,  0.0750, -0.4997,  0.7260,  0.8579,
          -0.0591, -0.9727, -0.2486,  0.2356, -0.3411,  0.6518,  0.7264,
          -0.3310, -0.5263, -0.4822, -0.1302, -0.5407, -0.2336, -0.3029,
           0.6174,  0.8679, -0.3832,  0.7346,  0.0817,  0.1723,  0.6434,
           0.2013]]], grad_fn=<StackBackward0>)
shape of e: torch.Size([1, 324])
tensor([[ 0.1210, -0.1566,  0.6815,  0.5042,  0.3324, -0.2641, -0.0181,  0.1975,
          0.2588

In [31]:
# Training hyperparameters: optimizer step size and number of passes over the data.
learning_rate = 0.001
epochs = 20

In [32]:
# Size the embedding table and output layer to the vocabulary built above.
model = SimpleRNN(len(vocab))

In [33]:
# Cross-entropy over the vocabulary logits: the answer token id is the target class.
criterion = nn.CrossEntropyLoss()
# Adam over all model parameters, using the learning rate defined above.
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [36]:
# Train with one optimizer step per (question, answer) sample.
for epoch in range(epochs):
    total_loss = 0
    for question, answer in dataloader:
        # Clear the gradients accumulated by the previous step.
        optimizer.zero_grad()

        y_pred = model(question)

        # answer[0] drops the batch axis added by the DataLoader. Its length only
        # matches y_pred's batch dimension when batch_size is 1 and the answer is
        # a single token.
        loss = criterion(y_pred, answer[0])

        loss.backward()

        optimizer.step()

        total_loss += loss.item()

    # Loss summed (not averaged) over the epoch. ".4f" prints four decimals; the
    # previous "4f" spec only set a minimum field width of 4.
    print(f"Epoch: {epoch+1}, Loss: {total_loss:.4f}")

Epoch: 1, Loss: 520.272993
Epoch: 2, Loss: 451.003444
Epoch: 3, Loss: 371.081353
Epoch: 4, Loss: 310.993535
Epoch: 5, Loss: 260.085326
Epoch: 6, Loss: 212.752802
Epoch: 7, Loss: 168.929069
Epoch: 8, Loss: 131.544700
Epoch: 9, Loss: 100.768411
Epoch: 10, Loss: 77.091209
Epoch: 11, Loss: 58.580523
Epoch: 12, Loss: 45.828790
Epoch: 13, Loss: 36.532852
Epoch: 14, Loss: 29.593724
Epoch: 15, Loss: 24.341857
Epoch: 16, Loss: 20.657253
Epoch: 17, Loss: 17.449731
Epoch: 18, Loss: 14.884026
Epoch: 19, Loss: 12.910392
Epoch: 20, Loss: 11.044856


In [58]:
def predict(model, question, threshold = 0.5):
    """Print the model's single-token answer to ``question``, or "I don't know".

    Args:
        model: Callable mapping a ``(1, seq_len)`` tensor of token ids to
            ``(1, vocab_size)`` logits.
        question: Raw question text; encoded with ``text_to_indices`` and the
            global ``vocab``.
        threshold: Minimum softmax probability the most likely token must reach
            for it to be printed.

    Returns:
        None. The answer (or "I don't know") is printed, not returned.
    """
    numerical_question = text_to_indices(question, vocab)

    if not numerical_question:
        # The question encodes to no tokens, so there is nothing to run the
        # model on; report that instead of passing an empty sequence along.
        print("I don't know")
        return

    # Explicit integer dtype plus a leading batch dimension of 1.
    question_tensor = torch.tensor(numerical_question, dtype=torch.long).unsqueeze(0)

    # Inference only: no gradients are needed.
    with torch.no_grad():
        output = model(question_tensor)
        probs = torch.nn.functional.softmax(output, dim=1)
        value, index = torch.max(probs, dim=1)

    if value.item() < threshold:
        print("I don't know")
    else:
        # Invert token -> id so the predicted id can be shown as a word.
        reverse_dict = {v:k for k,v in vocab.items()}
        print(reverse_dict[index.item()])

In [59]:
# Query phrased slightly differently from the dataset question
# ("What is the capital of France?"): an extra "the" and no question mark.
predict(model,"What is the capital of the France")

paris
