In [1]:
import torch
import torch.nn as nn
from torch.utils.data  import Dataset,DataLoader
import numpy as np
import pandas as pd

In [2]:
device=torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cpu')

In [4]:
df=pd.read_csv("data_rnn.csv")
df.head()

Unnamed: 0,question,answer
0,What is the capital of France?,Paris
1,What is the capital of Germany?,Berlin
2,Who wrote 'To Kill a Mockingbird'?,Harper-Lee
3,What is the largest planet in our solar system?,Jupiter
4,What is the boiling point of water in Celsius?,100


In [5]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [11]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string
import re

In [13]:
stop_words=set(stopwords.words("english"))

In [12]:
def tokenization(text):
  text=text.lower()
  text = re.sub(f"[{re.escape(string.punctuation)}]", "", text)
  tokens=word_tokenize(text)
  tokens= [t for t in tokens if t not in string.punctuation]
  return tokens

In [14]:
tokenization("devil 'is' evil")

['devil', 'is', 'evil']

In [17]:
vocab={"<UNK>":0}

In [19]:
#creating vocab
def create_vocab(row):
  print(row['question'],row['answer'])
  tokened_question=tokenization(row['question'])
  tokened_answer=tokenization(row['answer'])
  merged_token=tokened_question+tokened_answer
  for token in merged_token:
    if token not in vocab:
      vocab[token]=len(vocab)

In [20]:
vocab

{'<UNK>': 0}

In [21]:
df.apply(create_vocab,axis=1)


What is the capital of France? Paris
What is the capital of Germany? Berlin
Who wrote 'To Kill a Mockingbird'? Harper-Lee
What is the largest planet in our solar system? Jupiter
What is the boiling point of water in Celsius? 100
Who painted the Mona Lisa? Leonardo-da-Vinci
What is the square root of 64? 8
What is the chemical symbol for gold? Au
Which year did World War II end? 1945
What is the longest river in the world? Nile
What is the capital of Japan? Tokyo
Who developed the theory of relativity? Albert-Einstein
What is the freezing point of water in Fahrenheit? 32
Which planet is known as the Red Planet? Mars
Who is the author of '1984'? George-Orwell
What is the currency of the United Kingdom? Pound
What is the capital of India? Delhi
Who discovered gravity? Newton
How many continents are there on Earth? 7
Which gas do plants use for photosynthesis? CO2
What is the smallest prime number? 2
Who invented the telephone? Alexander-Graham-Bell
What is the capital of Australia? Canber

Unnamed: 0,0
0,
1,
2,
3,
4,
...,...
85,
86,
87,
88,


In [22]:
len(vocab)

324

In [23]:
def text_to_indices(text,vocab):
  indexed_text=[]
  for token in tokenization(text):
    if token in vocab:
      indexed_text.append(vocab[token])
    else:
      indexed_text.append(vocab["<UNK>"])
  return indexed_text

In [24]:
text_to_indices("where is india",vocab)

[0, 2, 73]

In [25]:
class MyDataset(Dataset):
  def __init__(self,df,vocab):
    self.df=df
    self.vocab=vocab

  def __len__(self):
    return len(self.df)

  def __getitem__(self,index):
   num_q= text_to_indices(self.df.iloc[index]['question'],self.vocab)
   num_a=text_to_indices(self.df.iloc[index]['answer'],self.vocab)
   return torch.tensor(num_q),torch.tensor(num_a)

In [26]:
dataset=MyDataset(df,vocab)

In [27]:
dataset[0]

(tensor([1, 2, 3, 4, 5, 6]), tensor([7]))

In [28]:
dataloader=DataLoader(dataset,batch_size=1,shuffle=True)


In [30]:
class SimpleRNN(nn.Module):

  def __init__(self, vocab_size):
    super().__init__()
    self.embedding = nn.Embedding(vocab_size, embedding_dim=50)
    self.rnn = nn.RNN(50, 64, batch_first=True)
    self.fc = nn.Linear(64, vocab_size)

  def forward(self, question):
    embedded_question = self.embedding(question)
    hidden, final = self.rnn(embedded_question)
    output = self.fc(final.squeeze(0))

    return output

In [31]:
epochs =50
learning_rate=0.001
model = SimpleRNN(len(vocab))
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)






In [32]:
dataset[10]

(tensor([ 1,  2,  3,  4,  5, 53]), tensor([54]))

In [33]:
# training loop

for epoch in range(epochs):

  total_loss = 0

  for question, answer in dataloader:

    optimizer.zero_grad()

    # forward pass
    output = model(question)

    # loss -> output shape (1,324) - (1)
    loss = criterion(output, answer[0])

    # gradients
    loss.backward()

    # update
    optimizer.step()

    total_loss = total_loss + loss.item()

  print(f"Epoch: {epoch+1}, Loss: {total_loss:4f}")

Epoch: 1, Loss: 523.712223
Epoch: 2, Loss: 458.089159
Epoch: 3, Loss: 381.316672
Epoch: 4, Loss: 320.497842
Epoch: 5, Loss: 267.850753
Epoch: 6, Loss: 219.626464
Epoch: 7, Loss: 175.074743
Epoch: 8, Loss: 136.391486
Epoch: 9, Loss: 105.096021
Epoch: 10, Loss: 80.730891
Epoch: 11, Loss: 62.010368
Epoch: 12, Loss: 48.440881
Epoch: 13, Loss: 38.909123
Epoch: 14, Loss: 30.988472
Epoch: 15, Loss: 25.690768
Epoch: 16, Loss: 21.191086
Epoch: 17, Loss: 18.025884
Epoch: 18, Loss: 15.265665
Epoch: 19, Loss: 13.168659
Epoch: 20, Loss: 11.401785
Epoch: 21, Loss: 9.951072
Epoch: 22, Loss: 8.845055
Epoch: 23, Loss: 7.781860
Epoch: 24, Loss: 7.001473
Epoch: 25, Loss: 6.216836
Epoch: 26, Loss: 5.633423
Epoch: 27, Loss: 5.090116
Epoch: 28, Loss: 4.608593
Epoch: 29, Loss: 4.222181
Epoch: 30, Loss: 3.841997
Epoch: 31, Loss: 3.543508
Epoch: 32, Loss: 3.234192
Epoch: 33, Loss: 3.008668
Epoch: 34, Loss: 2.774799
Epoch: 35, Loss: 2.566243
Epoch: 36, Loss: 2.382213
Epoch: 37, Loss: 2.210644
Epoch: 38, Loss: 2

In [34]:
def predict(model, question, threshold=0.5):

  # convert question to numbers
  numerical_question = text_to_indices(question, vocab)

  # tensor
  question_tensor = torch.tensor(numerical_question).unsqueeze(0)

  # send to model
  output = model(question_tensor)

  # convert logits to probs
  probs = torch.nn.functional.softmax(output, dim=1)

  # find index of max prob
  value, index = torch.max(probs, dim=1)

  if value < threshold:
    print("I don't know")

  print(list(vocab.keys())[index])

In [37]:
predict(model,"capital of india")

delhi
