In [1]:
import pandas as pd
df=pd.read_csv('/content/100_Unique_QA_Dataset.csv')

In [2]:
df.head()

Unnamed: 0,question,answer
0,What is the capital of France?,Paris
1,What is the capital of Germany?,Berlin
2,Who wrote 'To Kill a Mockingbird'?,Harper-Lee
3,What is the largest planet in our solar system?,Jupiter
4,What is the boiling point of water in Celsius?,100


In [3]:
#tokenize
def tokenize(text):
  text=text.lower()
  text.replace('?','')
  text=text.replace("'","")
  return text.split()







In [4]:
tokenize('What is the capital of Germany?')

['what', 'is', 'the', 'capital', 'of', 'germany?']

In [5]:
#vocab
vocab={'<UNK>':0}

In [6]:
def build_vocab(row):
  tokenized_question=tokenize(row['question'])
  tokenized_answer=tokenize(row['answer'])
  merged_tokens=tokenized_question+tokenized_answer
  for token in merged_tokens:

    if token not in vocab:
      vocab[token]=len(vocab)

In [7]:
df.apply(build_vocab,axis=1)

Unnamed: 0,0
0,
1,
2,
3,
4,
...,...
85,
86,
87,
88,


In [8]:
len(vocab)

336

In [9]:
#convert words to  numerical indices
def text_to_indices(text,vocab):
  indexed_text=[]
  for token in tokenize(text):
    if token in vocab:
      indexed_text.append(vocab[token])
    else:
        indexed_text.append(vocab['<UNK>'])
  return indexed_text

In [10]:
text_to_indices("what is campusx",vocab)

[1, 2, 0]

In [11]:
import torch
from torch.utils.data import Dataset,DataLoader

In [12]:
class QADataset(Dataset):
  def __init__(self,df,vocab):
    self.df=df
    self.vocab=vocab
  def __len__(self):
    return self.df.shape[0]

  def __getitem__(self,index):
    numerical_question=text_to_indices(self.df.iloc[index]['question'],self.vocab)
    numerical_answer=text_to_indices(self.df.iloc[index]['answer'],self.vocab)
    return torch.tensor(numerical_question),torch.tensor(numerical_answer)


In [13]:
dataset=QADataset(df,vocab)

In [14]:
dataloader=DataLoader(dataset,batch_size=1,shuffle=True)

In [15]:
for question,answer in dataloader:
  print(question,answer[0])

tensor([[ 10, 146,   3, 147, 177,   5,   3,  72, 178]]) tensor([179])
tensor([[ 42, 206,   2,  14, 207, 208, 209, 210]]) tensor([211])
tensor([[ 42, 120, 121,   3, 122, 123, 124]]) tensor([125])
tensor([[ 42,   2,   3, 216, 142, 174, 217, 175]]) tensor([218])
tensor([[ 42,  18, 121,   3, 192, 193]]) tensor([194])
tensor([[ 10,  77,   3, 307,  19, 308]]) tensor([309])
tensor([[42, 18,  2, 63, 64,  3, 65, 66]]) tensor([67])
tensor([[ 42, 273, 274,  14, 275, 276, 164, 277]]) tensor([278])
tensor([[  1,   2,   3,   4,   5, 246, 247]]) tensor([248])
tensor([[ 42, 265,   2, 266,  85, 267, 268]]) tensor([269])
tensor([[ 1,  2,  3, 71,  5, 54]]) tensor([270])
tensor([[  1,   2,   3, 147, 120,  85,   3, 288, 289]]) tensor([125])
tensor([[ 42, 310, 311, 121,  14, 312, 313, 164, 314, 315, 316, 317]]) tensor([318])
tensor([[ 10,  11, 163, 164, 165]]) tensor([166])
tensor([[ 42, 142,   2, 234,  12,   3, 235, 236]]) tensor([237])
tensor([[ 42, 142,   2,  63,  39,   3, 333, 334]]) tensor([335])
tenso

In [16]:
  import torch.nn as nn


In [17]:
class SimpleRNN(nn.Module):

  def __init__(self,vocab_size):
    super().__init__()
    self.embedding=nn.Embedding(vocab_size,embedding_dim=50)
    self.rnn=nn.RNN(50,64,batch_first=True)
    self.fc=nn.Linear(64,vocab_size)
  def forward(self,question):
    embedded_question=self.embedding(question)
    hidden,final=self.rnn(embedded_question)
    output=self.fc(final.squeeze(0))
    return output


In [18]:
x=nn.Embedding(324,embedding_dim=50)
y=nn.RNN(50,64,batch_first=True)
z=nn.Linear(64,324)

a=dataset[0][0].reshape(1,6)
print("Shape of a:",a.shape)

b=x(a)
print("Shape of b:",b.shape)

c,d=y(b)
print("Shape of c:",c.shape)
print("Shape of d",d.shape)

e=z(d.squeeze(0))
print("Shape of e:",e.shape)


Shape of a: torch.Size([1, 6])
Shape of b: torch.Size([1, 6, 50])
Shape of c: torch.Size([1, 6, 64])
Shape of d torch.Size([1, 1, 64])
Shape of e: torch.Size([1, 324])


In [19]:
learning_rate=0.001
epochs=20

In [20]:
model=SimpleRNN(len(vocab))
criterion=nn.CrossEntropyLoss()
optimizer=torch.optim.Adam(model.parameters(),lr=learning_rate)

In [21]:
#training loop
for epoch in range(epochs):
  total_loss=0
  for question,answer in dataloader:
    optimizer.zero_grad()

    #forward pass
    output=model(question)

    #loss->output shape(1,324)
    loss=criterion(output,answer[0])

    #gradients
    loss.backward()
    #update
    optimizer.step()

    total_loss=total_loss+loss.item()
    print(f"Epoch{epoch+1},Loss:{total_loss:4f}")

Epoch1,Loss:5.872055
Epoch1,Loss:11.932389
Epoch1,Loss:17.945444
Epoch1,Loss:23.948284
Epoch1,Loss:29.701139
Epoch1,Loss:35.665572
Epoch1,Loss:41.162794
Epoch1,Loss:46.693023
Epoch1,Loss:51.914065
Epoch1,Loss:57.707977
Epoch1,Loss:63.251677
Epoch1,Loss:68.819987
Epoch1,Loss:74.676688
Epoch1,Loss:79.898100
Epoch1,Loss:85.497821
Epoch1,Loss:91.487199
Epoch1,Loss:97.238551
Epoch1,Loss:103.109483
Epoch1,Loss:108.700855
Epoch1,Loss:114.370463
Epoch1,Loss:120.204878
Epoch1,Loss:126.170300
Epoch1,Loss:132.228161
Epoch1,Loss:138.138210
Epoch1,Loss:143.817849
Epoch1,Loss:150.220146
Epoch1,Loss:156.161940
Epoch1,Loss:162.063722
Epoch1,Loss:167.952170
Epoch1,Loss:173.902268
Epoch1,Loss:180.464134
Epoch1,Loss:186.568309
Epoch1,Loss:191.815709
Epoch1,Loss:197.490710
Epoch1,Loss:203.506672
Epoch1,Loss:209.089813
Epoch1,Loss:214.941636
Epoch1,Loss:220.851202
Epoch1,Loss:226.319348
Epoch1,Loss:232.530189
Epoch1,Loss:237.918977
Epoch1,Loss:243.664187
Epoch1,Loss:249.912785
Epoch1,Loss:256.070394
Epoch1

In [22]:
def predict(model,question,threshold=0.5):
  #convert question to numbers
  numerical_question=text_to_indices(question,vocab)

  #tensor
  question_tensor=torch.tensor(numerical_question).unsqueeze(0)

  #send to model
  output=model(question_tensor)

  #coert to logits to probs
  probs=torch.nn.functional.softmax(output,dim=1)

  #findindex of max prob
  value,index=torch.max(probs,dim=1)

  if value < threshold:
    print("i dont know")

  print(list(vocab.keys())[index])


In [24]:
predict(model,"What is the freezing point of water in Fahrenheit?")


32


NameError: name 'index' is not defined