In [29]:
'''
Name   : Rupesh Garsondiya
gtihub : @Rupeshgarsondiya
Tpoic  : Tpoic Recurrent Neural Network (RNN) Using PyTorch
'''

'\nName   : Rupesh Garsondiya\ngtihub : @Rupeshgarsondiya\nTpoic  : Tpoic Recurrent Neural Network (RNN) Using PyTorch\n'

In [60]:
# import required library
import pandas as pd
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import DataLoader,Dataset
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt


In [61]:
# Read the data
df = pd.read_csv('/content/100_Unique_QA_Dataset.csv')
df.head()

Unnamed: 0,question,answer
0,What is the capital of France?,Paris
1,What is the capital of Germany?,Berlin
2,Who wrote 'To Kill a Mockingbird'?,Harper-Lee
3,What is the largest planet in our solar system?,Jupiter
4,What is the boiling point of water in Celsius?,100


# **Tokenization of the dataset**

In [62]:
def tokenize(text):
    """
    Tokenizes a given text by converting it to lowercase,
    removing specific punctuation ('?' and "'"),
    and splitting it into a list of words.

    Args:
        text (str): The input string to be tokenized.

    Returns:
        list: A list of words from the processed text.
    """

    # Convert text to lowercase
    text = text.lower()

    # Remove question marks
    text = text.replace("?", "")

    # Remove apostrophes
    text = text.replace("'", "")

    # Split text into a list of words and return
    return text.split()


In [63]:
tokenize(df['question'][0])

['what', 'is', 'the', 'capital', 'of', 'france']

# **Create a vocablaory**

In [94]:
# Create the vocabulary dictionary with a special token for unknown words
vocab = {'<UNK>': 0}

def build_vocab(raw):
    """
    Builds a vocabulary dictionary from the given raw text data.

    Args:
        raw (dict): A dictionary containing 'question' and 'answer' keys,
                    where values are text strings.

    Returns:
        None: Updates the global vocab dictionary in place.
    """

    # Tokenize the question and answer
    tokenized_question = tokenize(raw['question'])
    tokenized_answer = tokenize(raw['answer'])

    # Merge tokens from both question and answer
    merged_tokens = tokenized_question + tokenized_answer

    # Add new tokens to the vocabulary
    for token in merged_tokens:
        if token not in vocab:
            vocab[token] = len(vocab)







In [95]:
df.apply(build_vocab,axis=1)

Unnamed: 0,0
0,
1,
2,
3,
4,
...,...
85,
86,
87,
88,


In [96]:
print('='*20,'Vocab','='*20)
vocab



{'<UNK>': 0,
 'what': 1,
 'is': 2,
 'the': 3,
 'capital': 4,
 'of': 5,
 'france': 6,
 'paris': 7,
 'germany': 8,
 'berlin': 9,
 'who': 10,
 'wrote': 11,
 'to': 12,
 'kill': 13,
 'a': 14,
 'mockingbird': 15,
 'harper-lee': 16,
 'largest': 17,
 'planet': 18,
 'in': 19,
 'our': 20,
 'solar': 21,
 'system': 22,
 'jupiter': 23,
 'boiling': 24,
 'point': 25,
 'water': 26,
 'celsius': 27,
 '100': 28,
 'painted': 29,
 'mona': 30,
 'lisa': 31,
 'leonardo-da-vinci': 32,
 'square': 33,
 'root': 34,
 '64': 35,
 '8': 36,
 'chemical': 37,
 'symbol': 38,
 'for': 39,
 'gold': 40,
 'au': 41,
 'which': 42,
 'year': 43,
 'did': 44,
 'world': 45,
 'war': 46,
 'ii': 47,
 'end': 48,
 '1945': 49,
 'longest': 50,
 'river': 51,
 'nile': 52,
 'japan': 53,
 'tokyo': 54,
 'developed': 55,
 'theory': 56,
 'relativity': 57,
 'albert-einstein': 58,
 'freezing': 59,
 'fahrenheit': 60,
 '32': 61,
 'known': 62,
 'as': 63,
 'red': 64,
 'mars': 65,
 'author': 66,
 '1984': 67,
 'george-orwell': 68,
 'currency': 69,
 'unit

In [98]:
# Function to convert words in a text to their corresponding numerical indices based on a given vocabulary.

def text_to_indices(text, vocab):
    """
    Converts a given text into a list of numerical indices using a provided vocabulary.

    Args:
        text (str): The input text to be tokenized and converted.
        vocab (dict): A dictionary mapping words to their corresponding indices.
                      It should contain a special token '<UNK>' for unknown words.

    Returns:
        list: A list of numerical indices representing the words in the text.
    """

    indexed_text = []  # List to store the numerical indices of words in the text.

    for token in tokenize(text):  # Tokenize the input text.
        if token in vocab:  # Check if the token exists in the vocabulary.
            indexed_text.append(vocab[token])  # Append the corresponding index.
        else:
            indexed_text.append(vocab['<UNK>'])  # Use '<UNK>' index for unknown words.

    return indexed_text  # Return the list of numerical indices.


In [99]:
text_to_indices('how are you',vocab)

[78, 81, 0]

# **Dataloader class**

In [100]:
from torch.utils.data import Dataset
import torch

class QADataset(Dataset):
    """
    A custom PyTorch dataset for handling Question-Answer pairs.

    Attributes:
        df (pd.DataFrame): DataFrame containing questions and answers.
        vocab (dict): A dictionary mapping words to their corresponding indices.
    """

    def __init__(self, df, vocab):
        """
        Initializes the QADataset with a DataFrame and a vocabulary dictionary.

        Args:
            df (pd.DataFrame): A DataFrame containing 'question' and 'answer' columns.
            vocab (dict): A dictionary mapping words to indices for numerical representation.
        """
        self.df = df
        self.vocab = vocab

    def __len__(self):
        """
        Returns the total number of samples in the dataset.

        Returns:
            int: The number of rows in the DataFrame.
        """
        return len(self.df)

    def __getitem__(self, index):
        """
        Retrieves the numerical representation of a question-answer pair at the given index.

        Args:
            index (int): The index of the sample to fetch.

        Returns:
            tuple: A tuple containing:
                - torch.Tensor: Numerical representation of the question.
                - torch.Tensor: Numerical representation of the answer.
        """
        numerical_question = text_to_indices(self.df.iloc[index]['question'], self.vocab)
        numerical_answer = text_to_indices(self.df.iloc[index]['answer'], self.vocab)

        return torch.tensor(numerical_question), torch.tensor(numerical_answer)

# Potential issue:
# The code snippet contains an extra segment after `return indexed_text`,
# which seems to be misplaced. Ensure the function text_to_indices is defined properly.


In [70]:
dataset = QADataset(df,vocab)
dataset[10]

(tensor([ 1,  2,  3,  4,  5, 53]), tensor([54]))

In [71]:
dataloader = DataLoader(dataset,batch_size=1,shuffle=True) # if we use more than one batch then we need to use the padding beacuse length of the word is the diffrent

In [72]:
for question, answer in dataloader:

  print(question,answer)

tensor([[ 42, 216, 118, 217, 218,  19,  14, 219,  43]]) tensor([[220]])
tensor([[ 1,  2,  3, 33, 34,  5, 35]]) tensor([[36]])
tensor([[  1,   2,   3,   4,   5, 279]]) tensor([[280]])
tensor([[ 10,  75, 111]]) tensor([[112]])
tensor([[ 78,  79, 150, 151,  14, 152, 153]]) tensor([[154]])
tensor([[  1,   2,   3,  33,  34,   5, 245]]) tensor([[246]])
tensor([[ 42, 137, 118,   3, 247,   5, 248]]) tensor([[249]])
tensor([[ 42,  18,   2,   3, 281,  12,   3, 282]]) tensor([[205]])
tensor([[ 42, 137,   2, 138,  39, 139]]) tensor([[53]])
tensor([[ 1,  2,  3, 69,  5,  3, 70, 71]]) tensor([[72]])
tensor([[78, 79, 80, 81, 82, 83, 84]]) tensor([[85]])
tensor([[ 1,  2,  3,  4,  5, 99]]) tensor([[100]])
tensor([[ 42,   2,   3, 274, 211, 275]]) tensor([[276]])
tensor([[  1,   2,   3, 141, 117,  83,   3, 277, 278]]) tensor([[121]])
tensor([[10, 55,  3, 56,  5, 57]]) tensor([[58]])
tensor([[ 1,  2,  3, 50, 51, 19,  3, 45]]) tensor([[52]])
tensor([[ 1,  2,  3, 24, 25,  5, 26, 19, 27]]) tensor([[28]])
tens

# **Architechture**

In [110]:
import torch
import torch.nn as nn

class SimpleRNN(nn.Module):
    """
    A Simple Recurrent Neural Network (RNN) model for text processing.

    This model consists of an embedding layer, an RNN layer, and a fully connected layer.
    The output of the model is logits, which can be converted to probabilities using softmax.
    """

    def __init__(self, vocab_size):
        """
        Initializes the SimpleRNN model.

        Args:
            vocab_size (int): The size of the vocabulary.
        """
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim=50)
        self.rnn = nn.RNN(50, 64, batch_first=True)
        self.fc = nn.Linear(64, vocab_size)  # Output layer mapping to vocabulary size

        # Note: The final Linear layer outputs raw logits.
        # To get probabilities, apply softmax activation externally.

    def forward(self, x):
        """
        Defines the forward pass of the model.

        Args:
            x (torch.Tensor): Input tensor of token indices.

        Returns:
            torch.Tensor: Logits for each token in the vocabulary.
        """
        embedded_question = self.embedding(x)  # Convert input indices to embeddings

        hidden, final = self.rnn(embedded_question)
        output = self.fc(final.squeeze(0))

        return output

    def __getitem__(self, index):
        """
        Retrieves a numerical representation of a question-answer pair.

        Args:
            index (int): Index of the question-answer pair.

        Returns:
            tuple: (tensor of numerical question, tensor of numerical answer)
        """
        numerical_question = text_to_indices(self.df.iloc[index]['question'], self.vocab)
        numerical_answer = text_to_indices(self.df.iloc[index]['answer'], self.vocab)

        return torch.tensor(numerical_question), torch.tensor(numerical_answer)


def text_to_indices(text, vocab):
    """
    Converts a text string into a list of numerical indices based on the vocabulary.

    Args:
        text (str): The input text to convert.
        vocab (dict): A dictionary mapping words to indices.

    Returns:
        list: A list of indices representing the text.
    """
    indexed_text = []
    for token in text.split():
        if token in vocab:
            indexed_text.append(vocab[token])
        else:
            indexed_text.append(vocab['<UNK>'])  # Use <UNK> for unknown words

    return indexed_text


In [111]:
dataset[15]

(tensor([ 0,  2,  3, 69,  5,  3,  0,  0]), tensor([0]))

In [112]:
e = nn.Embedding(324,embedding_dim=50)
a = e(dataset[0][0])

In [113]:
y = nn.RNN(50,64)

In [114]:
"""
This script prints the hidden state output shape and values from a network.
It manually executes the forward pass and does not use a Sequential container
due to the feedback nature of the network.

Key Observations:
- The output consists of two parts: hidden state and the final output.
- The last hidden state and the final output are the same.
"""

# Print the shape of the hidden state output
print('=' * 27, ' Hidden State Output Shape ', '=' * 27)
print(y(a)[0].shape)  # First element of the output (tuple), representing hidden state shape
print('=' * 30, ' Hidden State Output Shape ', '=' * 25)
print(y(a)[1].shape)  # Second element of the output, representing the final output shape

# Print the hidden state output values
print('=' * 30, ' Hidden State Output ', '=' * 30)
print(y(a)[0])  # Hidden state output values
print('=' * 20, ' Hidden State Output ', '=' * 27)
output = y(a)[1]  # Storing the final output
output  # Displaying the final output

# Notes:
# - The feedback nature of this network requires manually writing the forward pass.
# - We do not use a Sequential container for this architecture.
# - The output of the last hidden layer and the final output are the same.


torch.Size([6, 64])
torch.Size([1, 64])
tensor([[-1.1723e-01,  3.3080e-02, -5.7309e-02,  5.4739e-01, -1.7886e-01,
          1.8129e-02, -8.1863e-03, -1.1049e-01, -5.7209e-01, -5.7035e-01,
         -4.6142e-01, -6.4173e-02,  2.4251e-01, -2.8623e-01,  1.3863e-01,
          3.8866e-02,  6.0703e-01, -1.8687e-01, -1.4840e-01,  4.2552e-01,
         -7.1063e-01,  7.8312e-01,  2.2086e-01,  5.2961e-01, -2.3210e-01,
          4.6716e-03,  4.0326e-01, -1.9379e-01, -5.6440e-01,  4.5493e-01,
          2.2584e-01,  9.4899e-02,  4.4309e-01,  1.2270e-01, -3.6648e-01,
         -3.1713e-01,  6.8412e-02,  7.6928e-02, -6.0199e-01,  5.1746e-01,
         -5.3260e-03,  8.1868e-01, -3.8868e-01, -1.8287e-01,  6.6645e-02,
         -5.9755e-02,  1.9545e-01,  2.5647e-01, -2.1768e-01, -1.9044e-01,
          4.7257e-01, -5.3656e-02,  4.6672e-01, -1.8141e-01, -1.8896e-01,
          2.3128e-01,  8.8005e-04, -1.4929e-01,  4.4917e-01, -4.9151e-01,
          7.5251e-01,  3.2529e-01,  5.0826e-01, -4.8637e-02],
        [-

tensor([[-0.6626, -0.0105, -0.0543,  0.1708,  0.1435, -0.1421, -0.2183, -0.0015,
         -0.4064, -0.4972, -0.7753, -0.1403,  0.0886, -0.6049, -0.1813, -0.4105,
          0.4316, -0.4847, -0.2312,  0.2866, -0.5737,  0.7020,  0.3530,  0.4802,
          0.2647, -0.0457,  0.6612, -0.5083, -0.5438, -0.1423, -0.5416, -0.0896,
          0.6976, -0.1885, -0.4412, -0.4281,  0.3632,  0.0774, -0.3258,  0.4978,
         -0.1353,  0.7229, -0.4491,  0.0798,  0.1572,  0.2171, -0.0037,  0.4278,
         -0.3789,  0.2931,  0.0483,  0.1324,  0.5577, -0.1947,  0.4666, -0.0600,
          0.0784, -0.1828,  0.6205, -0.3140,  0.8450,  0.5443,  0.6414,  0.1618]],
       grad_fn=<SqueezeBackward1>)

In [115]:
z = nn.Linear(64,324)



In [116]:
# create model object for training
model = SimpleRNN(len(vocab))



In [117]:
learning_rate = 0.001
epochs = 20

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(),lr=learning_rate)


# **Training Loop**

In [118]:
# Define the training loop
for epoch in range(epochs):

  total_loss = 0

  for question,answer in dataloader:

    optimizer.zero_grad()

    output = model(question)

    loss = criterion(output,answer[0])

    loss.backward()

    optimizer.step()

    total_loss += loss.item()

  print(f'Epoch {epoch+1}/{epochs}, Loss: {total_loss/len(dataloader)}') # This is an total loss of epochs


Epoch 1/20, Loss: 2.379574296043979
Epoch 2/20, Loss: 1.0077966603140036
Epoch 3/20, Loss: 0.8689027818540732
Epoch 4/20, Loss: 0.7714404246459404
Epoch 5/20, Loss: 0.6651000661568509
Epoch 6/20, Loss: 0.5837660975754261
Epoch 7/20, Loss: 0.48311413021551236
Epoch 8/20, Loss: 0.4130003226714002
Epoch 9/20, Loss: 0.35197212087611357
Epoch 10/20, Loss: 0.3141734646012386
Epoch 11/20, Loss: 0.2689311168984406
Epoch 12/20, Loss: 0.22747464135496154
Epoch 13/20, Loss: 0.19076458760537207
Epoch 14/20, Loss: 0.16739897926131056
Epoch 15/20, Loss: 0.14944415319090087
Epoch 16/20, Loss: 0.12765893431432132
Epoch 17/20, Loss: 0.11107314917414139
Epoch 18/20, Loss: 0.0947904456846623
Epoch 19/20, Loss: 0.08452641809255712
Epoch 20/20, Loss: 0.07773717862760855


# **Inference**

In [121]:
def predict(model, question, threshold=0.5):

  # convert question to numbers
  numerical_question = text_to_indices(question, vocab)

  # tensor
  question_tensor = torch.tensor(numerical_question).unsqueeze(0)

  # send to model
  output = model(question_tensor)

  # convert logits to probs
  probs = torch.nn.functional.softmax(output, dim=1)

  # find index of max prob
  value, index = torch.max(probs, dim=1)

  if value < threshold:
    print("I don't know")

  print(list(vocab.keys())[index])


In [123]:
predict(model,"What is the capital of France?")

<UNK>
