In [1]:
import gdown

In [2]:
# Fetch the question/answer CSV from Google Drive and store it as data.csv
# in the current working directory.
file_id = '1X4Hcj72NK7J2JYvgjICFj0R1XwUq1w0a'
gdown.download(
  url=f'https://drive.google.com/uc?id={file_id}',
  output='data.csv',
  quiet=False,
)

Downloading...
From: https://drive.google.com/uc?id=1X4Hcj72NK7J2JYvgjICFj0R1XwUq1w0a
To: /content/data.csv
100%|██████████| 4.28k/4.28k [00:00<00:00, 8.49MB/s]


'data.csv'

In [3]:
import pandas as pd
# Load the downloaded CSV into a DataFrame and preview its first rows.
# Later cells read column 0 as the question text and column 1 as the answer text.
df=pd.read_csv('data.csv')
df.head()

Unnamed: 0,question,answer
0,What is the capital of France?,Paris
1,What is the capital of Germany?,Berlin
2,Who wrote 'To Kill a Mockingbird'?,Harper-Lee
3,What is the largest planet in our solar system?,Jupiter
4,What is the boiling point of water in Celsius?,100


In [4]:
def tokenize(text):
  """Split ``text`` into lowercase whitespace-separated tokens.

  Every "?" and "'" character is removed before splitting; all other
  punctuation (for example "-") is kept as part of its token.

  Args:
    text: The string to tokenize.

  Returns:
    A list of token strings (empty when ``text`` has no non-space content).
  """
  # One translate pass deletes both punctuation characters at once.
  cleaned = text.lower().translate(str.maketrans("", "", "?'"))
  return cleaned.split()

In [5]:
def vocab(df):
  """Build a token -> integer index mapping from a question/answer frame.

  Column 0 and column 1 of every row are tokenized with ``tokenize`` and each
  previously unseen token receives the next free index, in order of first
  appearance. Index 0 is reserved for the ``'<UNK>'`` placeholder.

  Args:
    df: DataFrame whose first two columns hold text.

  Returns:
    dict mapping each token string to its integer index.
  """
  vocab_dic = {'<UNK>': 0}
  for first_text, second_text in zip(df.iloc[:, 0], df.iloc[:, 1]):
    for token in tokenize(first_text) + tokenize(second_text):
      # setdefault only inserts when the token is new; the index is the
      # dictionary size measured before the insertion.
      vocab_dic.setdefault(token, len(vocab_dic))
  return vocab_dic

In [6]:
# Build the token -> index vocabulary from the loaded question/answer frame.
voc=vocab(df)

In [7]:
def text_to_index(text,voc):
  """Convert ``text`` into a list of vocabulary indices.

  The text is tokenized with ``tokenize``; tokens missing from ``voc`` are
  replaced by the index stored under ``'<UNK>'``.

  Args:
    text: The string to encode.
    voc: dict mapping token strings to integer indices.

  Returns:
    A list with one integer per token, in token order.
  """
  # The '<UNK>' entry is only looked up when a token is actually missing.
  return [
    voc[token] if token in voc else voc['<UNK>']
    for token in tokenize(text)
  ]

In [8]:
# Sanity check: a token that is not in the vocabulary should come back as the
# '<UNK>' index (0).
text_to_index('what is shehzail',voc)

[1, 2, 0]

In [13]:
import torch
from torch.utils.data import Dataset,DataLoader
import torch.nn as nn

In [10]:
class MyDataset(Dataset):
  """Dataset yielding (question, answer) index tensors for each DataFrame row.

  Column 0 of the frame is treated as the question text and column 1 as the
  answer text; both are encoded with ``text_to_index``.
  """

  def __init__(self,df,voc):
    """Keep references to the source frame and the token -> index vocabulary."""
    self.df, self.voc = df, voc

  def __len__(self):
    """Return the number of rows in the frame."""
    return len(self.df)

  def __getitem__(self,index):
    """Return the encoded question and answer of row ``index`` as 1-D tensors."""
    question, answer = (
      torch.tensor(text_to_index(self.df.iloc[index, column], self.voc))
      for column in (0, 1)
    )
    return question, answer

In [11]:
# Wrap the frame and vocabulary in the dataset, then inspect one encoded
# (question, answer) pair.
data = MyDataset(df=df, voc=voc)
data[5]

(tensor([10, 29,  3, 30, 31]), tensor([32]))

In [12]:
# One sample per batch: the encoded questions differ in length and no padding
# or custom collate function is supplied, so samples are not batched together.
dataloader = DataLoader(dataset=data, batch_size=1, shuffle=True)

In [32]:
class SimpleRNN(nn.Module):
  """Embedding -> single-layer RNN -> linear classifier over the vocabulary.

  The question is encoded token by token and the final hidden state of the
  RNN is projected to one logit per vocabulary entry.

  Args:
    voc_size: Number of entries in the vocabulary (embedding rows and number
      of output logits).
    embedding_dim: Size of each token embedding vector (default 50).
    hidden_size: Size of the RNN hidden state (default 64).
  """

  def __init__(self,voc_size,embedding_dim=50,hidden_size=64):
    super().__init__()
    self.embedding=nn.Embedding(voc_size,embedding_dim)
    self.rnn=nn.RNN(embedding_dim,hidden_size,batch_first=True)
    self.fc=nn.Linear(hidden_size,voc_size)

  def forward(self,question):
    """Return vocabulary logits for a batch of index sequences.

    Args:
      question: Integer tensor of token indices, shape (batch, seq_len).

    Returns:
      Tensor of logits with shape (batch, voc_size).
    """
    embedded=self.embedding(question)
    # nn.RNN returns (per-step outputs, final hidden state); only the final
    # hidden state, shaped (num_layers, batch, hidden), is used here.
    _,final_hidden=self.rnn(embedded)
    logits=self.fc(final_hidden)
    # Drop the leading num_layers axis (size 1 for this single-layer RNN).
    return logits.squeeze(0)


In [33]:
# Optimisation hyperparameters: Adam step size and number of passes over the data.
learningRate = 1e-3
epoches = 20

In [34]:
# Seed PyTorch's global RNG before anything stochastic happens so that weight
# initialisation (and the DataLoader shuffling that draws from the same RNG)
# is repeatable across kernel restarts.
SEED=42
torch.manual_seed(SEED)

# One output logit per vocabulary entry, trained with cross-entropy and Adam.
model=SimpleRNN(len(voc))
criterion=nn.CrossEntropyLoss()
optim=torch.optim.Adam(model.parameters(),lr=learningRate)

In [38]:
#training loop
# Make sure the model is in training mode: this cell can be re-run after the
# later model.eval() cell has been executed.
model.train()
for epoch in range(epoches):
  totalLoss=0.0  # sum of the per-sample losses over this epoch
  for question,answer in dataloader:
    optim.zero_grad()
    output=model(question)
    # With batch_size=1 the model yields one row of logits, and answer[0] is
    # the 1-D tensor of answer token indices. The loss call needs both to
    # have the same leading size, so this assumes every answer tokenizes to
    # exactly one token -- TODO confirm for new data.
    loss=criterion(output,answer[0])
    loss.backward()
    optim.step()
    totalLoss+=loss.item()
  # ".4f" prints four decimals; the previous "4f" only set a minimum width.
  print(f"epoch:{epoch} Loss:{totalLoss:.4f}")

epoch:0 Loss:9.588748
epoch:1 Loss:8.409181
epoch:2 Loss:7.481580
epoch:3 Loss:6.659763
epoch:4 Loss:5.982249
epoch:5 Loss:5.409978
epoch:6 Loss:4.875840
epoch:7 Loss:4.440406
epoch:8 Loss:4.057129
epoch:9 Loss:3.723705
epoch:10 Loss:3.412794
epoch:11 Loss:3.153276
epoch:12 Loss:2.910823
epoch:13 Loss:2.695714
epoch:14 Loss:2.500695
epoch:15 Loss:2.330232
epoch:16 Loss:2.166732
epoch:17 Loss:2.026704
epoch:18 Loss:1.887058
epoch:19 Loss:1.764694


In [41]:
# Switch the trained model to evaluation mode before running predictions.
model.eval()

SimpleRNN(
  (embedding): Embedding(324, 50)
  (rnn): RNN(50, 64, batch_first=True)
  (fc): Linear(in_features=64, out_features=324, bias=True)
)

In [67]:
def predict(text,threshold=0.5):
  """Print the model's answer to ``text``, or "I don't know" when unsure.

  The question is encoded with ``text_to_index`` using the global ``voc`` and
  scored by the global ``model``. The most probable vocabulary token is
  printed only when its softmax probability is strictly above ``threshold``.

  Args:
    text: The question string.
    threshold: Minimum probability required to print an answer (default 0.5).

  Returns:
    None. The result is printed.
  """
  indices=text_to_index(text,voc)
  if not indices:
    # Empty or punctuation-only input yields no tokens; an empty tensor would
    # be created as float and rejected by the embedding layer.
    print("I don't know")
    return
  with torch.no_grad():
    question=torch.tensor(indices,dtype=torch.long).unsqueeze(0)
    probabilities=torch.softmax(model(question),dim=-1)
    confidence,pred=torch.max(probabilities,dim=-1)
  if confidence.item()>threshold:
    # Reverse lookup by stored index rather than by dictionary position.
    index_to_token={index:token for token,index in voc.items()}
    print(index_to_token[int(pred)])
  else:
    print("I don't know")

In [73]:
# Try a rephrased question; tokenize strips the stray trailing apostrophe
# before the vocabulary lookup.
predict("who is the author of To Kill a Mockingbird'")

harper-lee
