In [1]:
import torch
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, Dataset
import torch.nn as nn

In [3]:
device= torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [45]:
df= pd.read_csv(r"/content/sample_data/100_Unique_QA_Dataset.csv")

In [46]:
df.iloc[7]

Unnamed: 0,7
question,What is the chemical symbol for gold?
answer,Au


In [58]:

def tokenize(a):
  txt=a.lower()
  txt=txt.replace("?","")
  txt=txt.replace("'","")
  return txt.split()

In [48]:
df['question_token'] = df['question'].apply(tokenize)
df['answer_token']=df['answer'].apply(tokenize)

In [57]:
df['merged']=df['question_token']+df['answer_token']
df

Unnamed: 0,question,answer,question_token,answer_token,merged
0,What is the capital of France?,Paris,"[what, is, the, capital, of, france]",[paris],"[what, is, the, capital, of, france, paris]"
1,What is the capital of Germany?,Berlin,"[what, is, the, capital, of, germany]",[berlin],"[what, is, the, capital, of, germany, berlin]"
2,Who wrote 'To Kill a Mockingbird'?,Harper-Lee,"[who, wrote, to, kill, a, mockingbird]",[harper-lee],"[who, wrote, to, kill, a, mockingbird, harper-..."
3,What is the largest planet in our solar system?,Jupiter,"[what, is, the, largest, planet, in, our, sola...",[jupiter],"[what, is, the, largest, planet, in, our, sola..."
4,What is the boiling point of water in Celsius?,100,"[what, is, the, boiling, point, of, water, in,...",[100],"[what, is, the, boiling, point, of, water, in,..."
...,...,...,...,...,...
85,Who directed the movie 'Titanic'?,JamesCameron,"[who, directed, the, movie, titanic]",[jamescameron],"[who, directed, the, movie, titanic, jamescame..."
86,Which superhero is also known as the Dark Knight?,Batman,"[which, superhero, is, also, known, as, the, d...",[batman],"[which, superhero, is, also, known, as, the, d..."
87,What is the capital of Brazil?,Brasilia,"[what, is, the, capital, of, brazil]",[brasilia],"[what, is, the, capital, of, brazil, brasilia]"
88,Which fruit is known as the king of fruits?,Mango,"[which, fruit, is, known, as, the, king, of, f...",[mango],"[which, fruit, is, known, as, the, king, of, f..."


In [63]:
vocab={"<un>":0}
for i in df['merged']:

  for j in i:
    if j not in vocab:
      vocab[j]=len(vocab)



In [64]:
def embedding(a,vocab):
  arr=tokenize(a)
  emb=[]
  for i in arr:
    if i not in vocab:
      emb.append(vocab["<un>"])
    else:
      emb.append(vocab[i])
  return emb

In [65]:
embedding(df.iloc[7]['question'],vocab)

[1, 2, 3, 37, 38, 39, 40]

In [95]:
class custom_dataset(Dataset):
  def __init__(self,df,vocab):
    self.df=df
    self.vocab=vocab
  def __len__(self):
    return len(self.df)
  def __getitem__(self,idx):
    question=torch.tensor(embedding(self.df.iloc[idx]['question'],self.vocab))
    answer=torch.tensor(embedding(self.df.iloc[idx]['answer'],self.vocab))
    return question,answer

In [96]:
data = custom_dataset(df,vocab)

In [99]:
dataloader = DataLoader(data, batch_size=1, shuffle=True)

In [109]:
for x,y in dataloader:
  print(x,y)
  break

tensor([[ 42, 255,   2, 256,  83, 257, 258]]) tensor([[259]])


In [110]:
vocab_len = len(vocab)
vocab_len

324

### IN THE MODEL CLASS WE CAN't USE SEQUENTIAL BECAUSE IT EXPECTS ONE OUTPUT OF EACH OF PREVIOUS LAYERS BUT RNN GIVES 2 OUTPUTS HIDDEN STATE AND OUTPUT OF HIDDEN LAYER BOTH

### ALSO WE CONVERTING INPUT TO BE 50 DIMENSIONAL NEURON SO EACH WORD WILL HAVE 50 DIMENSIONAL NEURON

In [122]:
class Model(nn.Module):
  def __init__(self,vocab_len):
    super(Model,self).__init__()
    self.embedding=nn.Embedding(vocab_len,50)
    self.rnn=nn.RNN(50,128,batch_first=True)
    self.fc=nn.Linear(128,vocab_len)
  def forward(self,x):
    x=self.embedding(x)
    x,_=self.rnn(x)
    x=self.fc(_.squeeze(0))
    return x

In [123]:
model = Model(vocab_len)


In [124]:

loss = nn.CrossEntropyLoss()
optim = torch.optim.Adam(model.parameters(), lr=0.001)

In [125]:
for i in range(100):
  net_loss=0
  for x,y in dataloader:
    output = model(x)

    optim.zero_grad()

    l= loss(output,y[0])

    l.backward()

    optim.step()
    net_loss+=l.item()
  print(f"loss after epoch:{i} is {net_loss/len(x)}")


loss after epoch:0 is 521.2499489784241
loss after epoch:1 is 400.56620836257935
loss after epoch:2 is 287.39556217193604
loss after epoch:3 is 199.5228806734085
loss after epoch:4 is 127.20970970392227
loss after epoch:5 is 77.0858274102211
loss after epoch:6 is 49.52288277447224
loss after epoch:7 is 31.34895572811365
loss after epoch:8 is 21.712792798876762
loss after epoch:9 is 15.331237435340881
loss after epoch:10 is 11.307553939521313
loss after epoch:11 is 8.871322572231293
loss after epoch:12 is 7.092687334865332
loss after epoch:13 is 5.799841038882732
loss after epoch:14 is 4.8737614545971155
loss after epoch:15 is 4.1662582121789455
loss after epoch:16 is 3.613013068214059
loss after epoch:17 is 3.131361050531268
loss after epoch:18 is 2.7560192178934813
loss after epoch:19 is 2.4514422304928303
loss after epoch:20 is 2.1889780880883336
loss after epoch:21 is 1.9664762672036886
loss after epoch:22 is 1.7726791761815548
loss after epoch:23 is 1.6049312315881252
loss after ep

In [143]:
ques = "what is symbol of gold?"

In [144]:

ques_embedding = embedding(ques,vocab)
ques_tensor = torch.tensor(ques_embedding).unsqueeze(0) # unsqueeze to match size as in dataloader x,y
pred =model(ques_tensor)
output = torch.argmax(pred,dim=1)
for i in vocab:

  if vocab[i]==output:
    print(i,end=" ")
    break

au 