In [4]:
import pandas as pd
df=pd.read_csv('general_knowledge_qa.csv')

In [5]:
df.head()

Unnamed: 0,question,answer,question_type,image
0,How many days do we have in a week?,Seven,General Knowledge For Kids,
1,How many days are there in a normal year?,365 (not a leap year),General Knowledge For Kids,
2,How many colors are there in a rainbow?,7,General Knowledge For Kids,
3,Which animal is known as the ‘Ship of the Dese...,Camel,General Knowledge For Kids,
4,How many letters are there in the English alph...,26,General Knowledge For Kids,


In [6]:
#tokenize
def tokenize(text):
  text=text.lower()
  text.replace('?','')
  text=text.replace("'","")
  return text.split()

In [7]:
tokenize('How many colors are there in a rainbow?')

['how', 'many', 'colors', 'are', 'there', 'in', 'a', 'rainbow?']

In [8]:
#vocab
vocab={'<UNK>':0}

In [9]:
def build_vocab(row):
  tokenized_question=tokenize(row['question'])
  tokenized_answer=tokenize(row['answer'])
  merged_tokens=tokenized_question+tokenized_answer
  for token in merged_tokens:

    if token not in vocab:
      vocab[token]=len(vocab)

In [10]:
df.apply(build_vocab,axis=1)

0      None
1      None
2      None
3      None
4      None
       ... 
925    None
926    None
927    None
928    None
929    None
Length: 930, dtype: object

In [11]:
len(vocab)

2544

In [12]:
#convert words to  numerical indices
def text_to_indices(text,vocab):
  indexed_text=[]
  for token in tokenize(text):
    if token in vocab:
      indexed_text.append(vocab[token])
    else:
        indexed_text.append(vocab['<UNK>'])
  return indexed_text

In [13]:
text_to_indices("what is campusx",vocab)

[66, 24, 0]

In [14]:
import torch
from torch.utils.data import Dataset,DataLoader

In [15]:
class QADataset(Dataset):
  def __init__(self,df,vocab):
    self.df=df
    self.vocab=vocab
  def __len__(self):
    return self.df.shape[0]

  def __getitem__(self,index):
    numerical_question=text_to_indices(self.df.iloc[index]['question'],self.vocab)
    numerical_answer=text_to_indices(self.df.iloc[index]['answer'],self.vocab)
    return torch.tensor(numerical_question),torch.tensor(numerical_answer)


In [16]:
dataset=QADataset(df,vocab)

In [18]:
dataloader=DataLoader(dataset,batch_size=1,shuffle=True)

In [19]:
for question,answer in dataloader:
  print(question,answer[0])

tensor([[ 693,   11,   27, 1886,   29, 2529]]) tensor([1709,  399])
tensor([[66,  4, 67, 68,  8, 69, 70, 29, 71]]) tensor([72])
tensor([[ 136, 2107,   27, 2108, 2109,  288, 2110]]) tensor([1829,  719])
tensor([[ 693,   24,   27, 1886,   29, 1945, 1946,   27, 1596, 1947, 1948,  121,
         1949, 1950, 1616]]) tensor([1951])
tensor([[2139,   24, 2140,  913,   27, 2122, 2141,  913,   22, 1031,   29,   27,
         2142, 2143, 1495]]) tensor([2144])
tensor([[   8,  882, 1174,   24,   56,    8, 1156]]) tensor([1175])
tensor([[  66,   24,   27,  281,   29,    8,  204, 1347]]) tensor([1257])
tensor([[ 27, 491,  29,   8, 492,  24,  27, 244, 491,   7,  27, 493, 455, 288,
         456]]) tensor([455])
tensor([[ 66,  24, 909]]) tensor([903,  24,   8, 234, 343, 910, 140,  27, 849,  29, 684, 688])
tensor([[  22,   24,   27, 1830,  150,   29,  181,  241,  242]]) tensor([177])
tensor([[ 22,  24,  27, 244, 172,   7,  27,  74]]) tensor([245])
tensor([[1181,    4,    5,   68,  140, 1182, 1183, 1184]])

In [20]:
import torch.nn as nn


In [21]:
class SimpleRNN(nn.Module):

  def __init__(self,vocab_size):
    super().__init__()
    self.embedding=nn.Embedding(vocab_size,embedding_dim=50)
    self.rnn=nn.RNN(50,64,batch_first=True)
    self.fc=nn.Linear(64,vocab_size)
  def forward(self,question):
    embedded_question=self.embedding(question)
    hidden,final=self.rnn(embedded_question)
    output=self.fc(final.squeeze(0))
    return output


In [22]:
x=nn.Embedding(324,embedding_dim=50)
y=nn.RNN(50,64,batch_first=True)
z=nn.Linear(64,324)

a=dataset[0][0].reshape(1,9)
print("Shape of a:",a.shape)

b=x(a)
print("Shape of b:",b.shape)

c,d=y(b)
print("Shape of c:",c.shape)
print("Shape of d",d.shape)

e=z(d.squeeze(0))
print("Shape of e:",e.shape)


Shape of a: torch.Size([1, 9])
Shape of b: torch.Size([1, 9, 50])
Shape of c: torch.Size([1, 9, 64])
Shape of d torch.Size([1, 1, 64])
Shape of e: torch.Size([1, 324])


In [23]:
learning_rate=0.001
epochs=20

In [24]:
model=SimpleRNN(len(vocab))
criterion=nn.CrossEntropyLoss()
optimizer=torch.optim.Adam(model.parameters(),lr=learning_rate)

In [25]:
#training loop
for epoch in range(epochs):
  total_loss = 0
  for question, answer in dataloader:
    optimizer.zero_grad()

    #forward pass
    output = model(question)

    #loss->output shape(1,324)
    # The error occurs here because `answer` has a shape like (1, n) and squeezing it removes the batch dimension
    # when n is 1. We need to ensure the batch dimension is present for the loss function.
    # The target for CrossEntropyLoss should be a 1D tensor of class indices.
    # We take the first token of the answer sequence as the target.
    loss = criterion(output, answer[:, 0])

    #gradients
    loss.backward()
    #update
    optimizer.step()

    total_loss += loss.item()

  print(f"Epoch {epoch+1}, Average Loss: {total_loss/len(dataloader):.4f}")

Epoch 1, Average Loss: 7.4515
Epoch 2, Average Loss: 5.5021
Epoch 3, Average Loss: 4.3659
Epoch 4, Average Loss: 3.3992
Epoch 5, Average Loss: 2.5420
Epoch 6, Average Loss: 1.8212
Epoch 7, Average Loss: 1.2584
Epoch 8, Average Loss: 0.8627
Epoch 9, Average Loss: 0.5967
Epoch 10, Average Loss: 0.4273
Epoch 11, Average Loss: 0.3162
Epoch 12, Average Loss: 0.2416
Epoch 13, Average Loss: 0.1763
Epoch 14, Average Loss: 0.1367
Epoch 15, Average Loss: 0.0997
Epoch 16, Average Loss: 0.0741
Epoch 17, Average Loss: 0.0717
Epoch 18, Average Loss: 0.0587
Epoch 19, Average Loss: 0.0409
Epoch 20, Average Loss: 0.0310


In [None]:
def predict(model, question_text, vocab):
  # Tokenize and convert question to numerical indices
  numerical_question = text_to_indices(question_text, vocab)
  # Convert to a PyTorch tensor and reshape for the model
  question_tensor = torch.tensor(numerical_question).reshape(1, -1)

  # Get model output
  model.eval() # Set model to evaluation mode
  with torch.no_grad(): # Disable gradient calculation
    output = model(question_tensor)

  # Get the predicted answer index
  predicted_answer_idx = torch.argmax(output).item()

  # Convert the index back to a word
  # Invert the vocab dictionary to map indices back to words
  idx_to_word = {idx: word for word, idx in vocab.items()}
  predicted_answer_word = idx_to_word.get(predicted_answer_idx, '<UNK>')

  return predicted_answer_word

In [27]:
predict(model,"Where are the HeadQuarters of UNESCO?", vocab)

'paris,'

In [28]:
import torch

torch.save(model.state_dict(), "qa_model.pth")

In [30]:
import torch

torch.save(vocab, "vocab.pth")

In [31]:
import torch

vocab = torch.load("vocab.pth")
