In [1]:
import torch
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, Dataset
import torch.nn as nn

In [127]:
device= torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [174]:
df= pd.read_csv(r"/content/sample_data/100_Unique_QA_Dataset.csv")
df.shape

(90, 2)

In [4]:
df.iloc[7]

Unnamed: 0,7
question,What is the chemical symbol for gold?
answer,Au


In [5]:

def tokenize(a):
  txt=a.lower()
  txt=txt.replace("?","")
  txt=txt.replace("'","")
  return txt.split()

In [6]:
df['question_token'] = df['question'].apply(tokenize)
df['answer_token']=df['answer'].apply(tokenize)

In [7]:
df['merged']=df['question_token']+df['answer_token']
df

Unnamed: 0,question,answer,question_token,answer_token,merged
0,What is the capital of France?,Paris,"[what, is, the, capital, of, france]",[paris],"[what, is, the, capital, of, france, paris]"
1,What is the capital of Germany?,Berlin,"[what, is, the, capital, of, germany]",[berlin],"[what, is, the, capital, of, germany, berlin]"
2,Who wrote 'To Kill a Mockingbird'?,Harper-Lee,"[who, wrote, to, kill, a, mockingbird]",[harper-lee],"[who, wrote, to, kill, a, mockingbird, harper-..."
3,What is the largest planet in our solar system?,Jupiter,"[what, is, the, largest, planet, in, our, sola...",[jupiter],"[what, is, the, largest, planet, in, our, sola..."
4,What is the boiling point of water in Celsius?,100,"[what, is, the, boiling, point, of, water, in,...",[100],"[what, is, the, boiling, point, of, water, in,..."
...,...,...,...,...,...
85,Who directed the movie 'Titanic'?,JamesCameron,"[who, directed, the, movie, titanic]",[jamescameron],"[who, directed, the, movie, titanic, jamescame..."
86,Which superhero is also known as the Dark Knight?,Batman,"[which, superhero, is, also, known, as, the, d...",[batman],"[which, superhero, is, also, known, as, the, d..."
87,What is the capital of Brazil?,Brasilia,"[what, is, the, capital, of, brazil]",[brasilia],"[what, is, the, capital, of, brazil, brasilia]"
88,Which fruit is known as the king of fruits?,Mango,"[which, fruit, is, known, as, the, king, of, f...",[mango],"[which, fruit, is, known, as, the, king, of, f..."


In [8]:
vocab={"<un>":0}
for i in df['merged']:

  for j in i:
    if j not in vocab:
      vocab[j]=len(vocab)



In [35]:
def embedding(a,vocab):
  arr=tokenize(a)
  emb=[]
  for i in arr:
    if i not in vocab:
      emb.append(vocab["<un>"])
    else:
      emb.append(vocab[i])
  return emb

In [36]:
embedding(df.iloc[7]['question'],vocab)

[1, 2, 3, 37, 38, 39, 40]

In [100]:
class custom_dataset(Dataset):
  def __init__(self,df,vocab):
    self.df=df
    self.vocab=vocab
  def __len__(self):
    return len(self.df)
  def __getitem__(self,idx):
    question=torch.tensor(embedding(self.df.iloc[idx]['question'],self.vocab))
    answer=torch.tensor(embedding(self.df.iloc[idx]['answer'],self.vocab))
    return question,answer

In [101]:
vocab_len = len(vocab)
vocab_len
data = custom_dataset(df,vocab)


In [117]:
import torch.nn.functional as F

# Define the maximum sequence length
MAX_LENGTH = vocab_len # Pad all sequences to this length

# Custom collate function with fixed length and padding direction
def collate_fn(batch):
    # Unpack the batch into inputs (x) and labels (y), convert tuples to lists
    x, y = list(zip(*batch))  # Convert tuple to list

    # Convert each sequence to a tensor
    x_tensors = [torch.tensor(seq) for seq in x]

    # Pad each sequence to MAX_LENGTH
    x_padded = []
    for seq in x_tensors:
        seq_len = seq.size(0)  # Length of the current sequence
        if seq_len < MAX_LENGTH:
            # Calculate padding needed
            padding_size = MAX_LENGTH - seq_len

            # Right padding: pad at the end
            padded_seq = F.pad(seq, (padding_size,0), value=0)

            # For left padding, uncomment the line below instead
            # padded_seq = F.pad(seq, (padding_size, 0), value=0)

            x_padded.append(padded_seq)
        else:
            # If sequence is longer than MAX_LENGTH, truncate it
            x_padded.append(seq[:MAX_LENGTH])

    # Stack the padded sequences into a batch
    x_padded = torch.stack(x_padded)

    # Convert y to a tensor
    y = torch.tensor(y)

    return x_padded, y

# Create the DataLoader with the custom collate function and a larger batch size
dataloader = DataLoader(data, batch_size=4, shuffle=True, collate_fn=collate_fn)

In [118]:
dataloader = DataLoader(data, batch_size=2, shuffle=True,collate_fn=collate_fn)

In [120]:
x,y = next(iter(dataloader))
print(x)
print(x.shape,y.shape)

tensor([[  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0

  x_tensors = [torch.tensor(seq) for seq in x]


324

### IN THE MODEL CLASS WE CAN't USE SEQUENTIAL BECAUSE IT EXPECTS ONE OUTPUT OF EACH OF PREVIOUS LAYERS BUT RNN GIVES 2 OUTPUTS HIDDEN STATE AND OUTPUT OF HIDDEN LAYER BOTH

### ALSO WE CONVERTING INPUT TO BE 50 DIMENSIONAL NEURON SO EACH WORD WILL HAVE 50 DIMENSIONAL NEURON

In [121]:
embedding3 = nn.Embedding(vocab_len,50)
for i in range(10):
  x,y = next(iter(dataloader))
  print(x.shape,y.shape,end=" : ")

  print(embedding3(x).shape)

torch.Size([2, 324]) torch.Size([2]) : torch.Size([2, 324, 50])
torch.Size([2, 324]) torch.Size([2]) : torch.Size([2, 324, 50])
torch.Size([2, 324]) torch.Size([2]) : torch.Size([2, 324, 50])
torch.Size([2, 324]) torch.Size([2]) : torch.Size([2, 324, 50])
torch.Size([2, 324]) torch.Size([2]) : torch.Size([2, 324, 50])
torch.Size([2, 324]) torch.Size([2]) : torch.Size([2, 324, 50])
torch.Size([2, 324]) torch.Size([2]) : torch.Size([2, 324, 50])
torch.Size([2, 324]) torch.Size([2]) : torch.Size([2, 324, 50])
torch.Size([2, 324]) torch.Size([2]) : torch.Size([2, 324, 50])
torch.Size([2, 324]) torch.Size([2]) : torch.Size([2, 324, 50])


  x_tensors = [torch.tensor(seq) for seq in x]


In [122]:
class Model(nn.Module):
  def __init__(self,vocab_len):
    super(Model,self).__init__()
    self.embedding=nn.Embedding(vocab_len,50)
    self.rnn=nn.RNN(50,128,batch_first=True)
    self.fc=nn.Linear(128,vocab_len)
  def forward(self,x):
    x=self.embedding(x)
    x,_=self.rnn(x)
    x=self.fc(_.squeeze(0))
    return x

In [160]:
model = Model(vocab_len).to(device)


In [161]:

loss = nn.CrossEntropyLoss()
optim = torch.optim.Adam(model.parameters(), lr=0.001)

In [162]:
for i in range(1000):
  net_loss=0
  for x,y in dataloader:
    x,y=x.to(device),y.to(device)
    output = model(x)

    optim.zero_grad()

    l= loss(output,y)

    l.backward()

    optim.step()
    net_loss+=l.item()
  #print(f"loss after epoch:{i} is {net_loss/len(x)}")


  x_tensors = [torch.tensor(seq) for seq in x]


In [167]:
def model_output(ques):

  ques_embedding = torch.tensor(embedding(ques,vocab)).to(device)
  padding=torch.tensor([0]*(vocab_len-len(ques_embedding))).to(device)
  ques_tensor = torch.cat([padding,torch.tensor(ques_embedding)],dim=0) # unsqueeze to match size as in dataloader x,y
  pred =model(ques_tensor.unsqueeze(0)) # unsqueeze to add a dimension of batch
  output = torch.argmax(pred,dim=1)
  for i in vocab:

    if vocab[i]==output:
      return i
      break

In [172]:
print(model_output("what is the capital of US?"))

tokyo
