<a href="https://colab.research.google.com/github/Vinsmoke-R/Breast-cancer/blob/main/RNN_using_pytorch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import pandas as pd

In [3]:
# Load the question/answer pairs into a DataFrame.
# NOTE(review): the absolute '/content/...' path looks Colab-specific — adjust when running elsewhere.
df = pd.read_csv('/content/100_Unique_QA_Dataset.csv')
# Bare expression so the notebook renders the frame as the cell output.
df

Unnamed: 0,question,answer
0,What is the capital of France?,Paris
1,What is the capital of Germany?,Berlin
2,Who wrote 'To Kill a Mockingbird'?,Harper-Lee
3,What is the largest planet in our solar system?,Jupiter
4,What is the boiling point of water in Celsius?,100
...,...,...
85,Who directed the movie 'Titanic'?,JamesCameron
86,Which superhero is also known as the Dark Knight?,Batman
87,What is the capital of Brazil?,Brasilia
88,Which fruit is known as the king of fruits?,Mango


In [4]:
# Preview up to the first 50 rows of the dataset.
df.head(50)

Unnamed: 0,question,answer
0,What is the capital of France?,Paris
1,What is the capital of Germany?,Berlin
2,Who wrote 'To Kill a Mockingbird'?,Harper-Lee
3,What is the largest planet in our solar system?,Jupiter
4,What is the boiling point of water in Celsius?,100
5,Who painted the Mona Lisa?,Leonardo-da-Vinci
6,What is the square root of 64?,8
7,What is the chemical symbol for gold?,Au
8,Which year did World War II end?,1945
9,What is the longest river in the world?,Nile


In [5]:
# tokenize
def tokenize(text):
  """Split ``text`` into lowercase, whitespace-separated tokens.

  Question marks and apostrophes are stripped before splitting; every other
  character (hyphens, digits, remaining punctuation) is kept unchanged.
  """
  cleaned = text.lower()
  for unwanted in ('?', "'"):
    cleaned = cleaned.replace(unwanted, '')
  return cleaned.split()

In [6]:
# Sanity check: the output should be lowercase with the trailing '?' removed.
tokenize('Which superhero is also known as the Dark Knight?')

['which', 'superhero', 'is', 'also', 'known', 'as', 'the', 'dark', 'knight']

In [8]:
# vocab: dictionary mapping token -> integer id; id 0 is reserved for '<UNK>',
# the placeholder used for tokens that are not in the vocabulary.
vocab ={'<UNK>':0}

In [17]:
def build_vocab(row):
  """Register every token of one dataset row in the module-level ``vocab``.

  ``row`` must provide 'question' and 'answer' strings. Each token not yet
  present receives the next free integer id (the current size of ``vocab``).
  The function mutates ``vocab`` in place and returns None.
  """
  for token in tokenize(row['question']) + tokenize(row['answer']):
    # setdefault leaves existing ids untouched; len(vocab) is evaluated before insertion.
    vocab.setdefault(token, len(vocab))

In [18]:
# Run build_vocab on every row (axis=1) for its side effect on `vocab`;
# build_vocab returns None, so the displayed result is an empty column.
df.apply(build_vocab,axis=1)

Unnamed: 0,0
0,
1,
2,
3,
4,
...,...
85,
86,
87,
88,


In [23]:
# Inspect the token -> id mapping built above.
vocab

{'<UNK>': 0,
 'what': 1,
 'is': 2,
 'the': 3,
 'capital': 4,
 'of': 5,
 'france': 6,
 'paris': 7,
 'germany': 8,
 'berlin': 9,
 'who': 10,
 'wrote': 11,
 'to': 12,
 'kill': 13,
 'a': 14,
 'mockingbird': 15,
 'harper-lee': 16,
 'largest': 17,
 'planet': 18,
 'in': 19,
 'our': 20,
 'solar': 21,
 'system': 22,
 'jupiter': 23,
 'boiling': 24,
 'point': 25,
 'water': 26,
 'celsius': 27,
 '100': 28,
 'painted': 29,
 'mona': 30,
 'lisa': 31,
 'leonardo-da-vinci': 32,
 'square': 33,
 'root': 34,
 '64': 35,
 '8': 36,
 'chemical': 37,
 'symbol': 38,
 'for': 39,
 'gold': 40,
 'au': 41,
 'which': 42,
 'year': 43,
 'did': 44,
 'world': 45,
 'war': 46,
 'ii': 47,
 'end': 48,
 '1945': 49,
 'longest': 50,
 'river': 51,
 'nile': 52,
 'japan': 53,
 'tokyo': 54,
 'developed': 55,
 'theory': 56,
 'relativity': 57,
 'albert-einstein': 58,
 'freezing': 59,
 'fahrenheit': 60,
 '32': 61,
 'known': 62,
 'as': 63,
 'red': 64,
 'mars': 65,
 'author': 66,
 '1984': 67,
 'george-orwell': 68,
 'currency': 69,
 'unit

In [21]:
# convert words to numerical indices
def text_to_indices(text,vocab):
  """Encode ``text`` as a list of integer ids, one per token from ``tokenize``.

  Tokens found in ``vocab`` map to their stored id; any other token maps to
  the id stored under the '<UNK>' key.
  """
  return [
    vocab[token] if token in vocab else vocab['<UNK>']
    for token in tokenize(text)
  ]

In [28]:
# Tokens missing from the vocabulary are encoded as 0, the '<UNK>' id.
text_to_indices("What is her name", vocab)

[1, 2, 0, 0]

In [29]:
import torch
from torch.utils.data import Dataset,DataLoader

In [45]:
class QA_Dataset(Dataset):
  """Dataset yielding one (question, answer) pair of index tensors per DataFrame row."""

  def __init__(self,df,vocab):
    # Only references are stored; rows are encoded lazily in __getitem__.
    self.vocab = vocab
    self.df = df

  def __len__(self):
    # One sample per DataFrame row.
    return len(self.df)

  def __getitem__(self,index):
    # Positional row lookup, then encode both text columns with the shared vocab.
    row = self.df.iloc[index]
    question_ids, answer_ids = (
      text_to_indices(row[column], self.vocab) for column in ('question', 'answer')
    )
    return torch.tensor(question_ids), torch.tensor(answer_ids)

In [46]:
# Wrap the DataFrame and vocabulary so samples can be fetched by integer index.
dataset = QA_Dataset(df, vocab)

In [48]:
# Peek at the first sample: a (question ids, answer ids) pair of tensors.
dataset[0]

(tensor([1, 2, 3, 4, 5, 6]), tensor([7]))

In [49]:
# batch_size=1 yields one unpadded sample per batch; shuffle=True reorders samples on each pass.
dataloader = DataLoader(dataset, batch_size=1, shuffle=True)

In [78]:
# Walk the loader once to inspect the batches; answer[0] strips the leading
# batch axis that the DataLoader adds, while question keeps it.
for question, answer in dataloader:
  print(question, answer[0])

tensor([[ 42, 125,   2,  62,  63,   3, 126, 127]]) tensor([128])
tensor([[ 1,  2,  3, 69,  5, 53]]) tensor([260])
tensor([[10,  2,  3, 66,  5, 67]]) tensor([68])
tensor([[  1,   2,   3, 180, 181, 182, 183]]) tensor([184])
tensor([[10, 29,  3, 30, 31]]) tensor([32])
tensor([[ 10,  75, 208]]) tensor([209])
tensor([[ 1,  2,  3, 24, 25,  5, 26, 19, 27]]) tensor([28])
tensor([[ 42, 137, 118,   3, 247,   5, 248]]) tensor([249])
tensor([[  1,   2,   3, 146, 147,  19, 148]]) tensor([149])
tensor([[ 78,  79, 261, 151,  14, 262, 153]]) tensor([36])
tensor([[  1,   2,   3, 221,   5, 222, 223, 224]]) tensor([225])
tensor([[ 1,  2,  3, 92, 93, 94]]) tensor([95])
tensor([[  1,   2,   3,   4,   5, 135]]) tensor([136])
tensor([[1, 2, 3, 4, 5, 8]]) tensor([9])
tensor([[ 10, 140,   3, 141, 142,  12, 143,  83,   3, 144]]) tensor([145])
tensor([[ 10,  11, 157, 158, 159]]) tensor([160])
tensor([[ 1,  2,  3, 69,  5,  3, 70, 71]]) tensor([72])
tensor([[  1,   2,   3, 163, 164, 165,  83,  84]]) tensor([166])


In [52]:
import torch.nn as nn

In [87]:
class SimpleRNN(nn.Module):
  """Embedding -> single-layer RNN -> linear layer producing one logit per vocabulary entry.

  Args:
    vocab_size: number of embedding rows and number of output logits.
    embedding_dim: size of each token embedding (default 50).
    hidden_size: size of the RNN hidden state (default 64).
  """

  def __init__(self, vocab_size, embedding_dim=50, hidden_size=64):
    super().__init__()
    self.embedding = nn.Embedding(vocab_size, embedding_dim=embedding_dim)
    # batch_first=True: inputs are laid out as (batch, seq_len, embedding_dim).
    self.rnn = nn.RNN(embedding_dim, hidden_size, batch_first=True)
    self.fc = nn.Linear(hidden_size, vocab_size)

  def forward(self, question):
    """Return logits of shape (batch, vocab_size) for a (batch, seq_len) tensor of token ids."""
    embedded_question = self.embedding(question)
    # nn.RNN returns (per-step outputs, final hidden state); only the final state is used.
    _, final = self.rnn(embedded_question)
    # final is (num_layers, batch, hidden_size); with one layer, squeeze(0) drops that axis.
    output = self.fc(final.squeeze(0))

    return output

In [88]:
# Training hyperparameters: optimizer step size and number of passes over the data.
learning_rate = 0.001
epochs = 20

In [89]:
# Size the embedding table and the output layer to the vocabulary (including '<UNK>').
model = SimpleRNN(len(vocab))

In [90]:
# Cross-entropy over the vocabulary logits; Adam updates every model parameter.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr = learning_rate)

In [91]:
# training loop

for epoch in range(epochs):

  # Running sum of the per-batch losses for this epoch (not an average).
  total_loss = 0

  for question, answer in dataloader:

    # Clear gradients left over from the previous step.
    optimizer.zero_grad()

    # forward pass: logits with one row per sample in the batch
    output = model(question)

    # loss: answer[0] drops the batch axis the DataLoader added, leaving the
    # 1-D tensor of class ids that CrossEntropyLoss expects as target.
    # NOTE(review): this assumes every answer encodes to exactly one token — confirm for the full dataset.
    loss = criterion(output, answer[0])

    # gradient
    loss.backward()

    # update
    optimizer.step()

    total_loss += loss.item()

  # '.4f' gives four decimals; the previous '4f' spec only set a minimum width of 4.
  print(f"Epoch: {epoch+1}, Loss: {total_loss:.4f}")

Epoch: 1, Loss: 522.694790
Epoch: 2, Loss: 455.272223
Epoch: 3, Loss: 378.604574
Epoch: 4, Loss: 316.661185
Epoch: 5, Loss: 264.791490
Epoch: 6, Loss: 216.398038
Epoch: 7, Loss: 171.419124
Epoch: 8, Loss: 133.644891
Epoch: 9, Loss: 102.814267
Epoch: 10, Loss: 78.752524
Epoch: 11, Loss: 60.014153
Epoch: 12, Loss: 47.352984
Epoch: 13, Loss: 37.936381
Epoch: 14, Loss: 31.122657
Epoch: 15, Loss: 25.746814
Epoch: 16, Loss: 21.779649
Epoch: 17, Loss: 18.921846
Epoch: 18, Loss: 15.766016
Epoch: 19, Loss: 13.644386
Epoch: 20, Loss: 11.880323


In [123]:
def predict(model, question, threshold=0.5):
  """Return the vocabulary token the model predicts as the answer to ``question``.

  Args:
    model: callable mapping a (1, seq_len) tensor of token ids to (1, vocab_size) logits.
    question: raw question text; it is encoded with the module-level ``vocab``.
    threshold: minimum softmax probability required to accept the top prediction.

  Returns:
    The predicted token (str). When the question contains no tokens or the top
    probability is below ``threshold``, prints "I dont know" and returns None.
  """

  # convert question to numbers
  numerical_question = text_to_indices(question, vocab)

  # An empty question would become an empty float tensor, which nn.Embedding rejects.
  if not numerical_question:
    print("I dont know")
    return None

  # tensor with a leading batch axis of size 1
  question_tensor = torch.tensor(numerical_question).unsqueeze(0)

  # send to model; inference only, so skip autograd bookkeeping
  with torch.no_grad():
    output = model(question_tensor)

  # convert logits to probs
  probs = torch.nn.functional.softmax(output, dim=1)

  # find index of max prob
  value, index = torch.max(probs, dim=1)

  if value.item() < threshold:
    print("I dont know")
    return None

  # Ids were assigned as len(vocab) at insertion time, so an id is also the
  # position of its token in the dictionary's key order.
  return list(vocab.keys())[index.item()]



In [136]:
# Query the trained model with a fragment of a training question.
predict(model,"king of fruits")

'mango'

In [130]:
# Re-display the dataset to compare predictions against the stored answers.
df

Unnamed: 0,question,answer
0,What is the capital of France?,Paris
1,What is the capital of Germany?,Berlin
2,Who wrote 'To Kill a Mockingbird'?,Harper-Lee
3,What is the largest planet in our solar system?,Jupiter
4,What is the boiling point of water in Celsius?,100
...,...,...
85,Who directed the movie 'Titanic'?,JamesCameron
86,Which superhero is also known as the Dark Knight?,Batman
87,What is the capital of Brazil?,Brasilia
88,Which fruit is known as the king of fruits?,Mango
