<a href="https://colab.research.google.com/github/Redcoder815/Deep_Learning_PyTorch/blob/main/28BidirectionalRNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import torch
from torch import nn
from torch.utils.data import TensorDataset, DataLoader
import collections
import random
import re
import torch.nn.functional as F

In [2]:
import requests

class TimeMachine():
    def __init__(self, batch_size = 2, num_steps = 10, num_train=10000, num_val=5000):
        self.batch_size = batch_size
        self.num_steps = num_steps
        self.num_train = num_train
        self.num_val = num_val
        corpus, self.vocab = self.build(self._download())
        array = torch.tensor([corpus[i:i+num_steps+1]
                            for i in range(len(corpus)-num_steps)])
        self.X, self.Y = array[:,:-1], array[:,1:]

    def _download(self):
        url = 'http://d2l-data.s3-accelerate.amazonaws.com/timemachine.txt'
        response = requests.get(url)
        response.raise_for_status() # Raise an exception for HTTP errors
        return response.text

    def _preprocess(self, text):
        return re.sub('[^A-Za-z]+', ' ', text).lower()

    def _tokenize(self, text):
        return list(text)

    def build(self, raw_text, vocab=None):
        tokens = self._tokenize(self._preprocess(raw_text))
        if vocab is None: vocab = Vocab(tokens)
        corpus = [vocab[token] for token in tokens]
        return corpus, vocab

    def get_tensorloader(self, data_arrays, train, i):
        # This is a placeholder, actual implementation might vary based on inheritance.
        features = data_arrays[0][i]
        labels = data_arrays[1][i]
        dataset = TensorDataset(features, labels)
        dataloader = DataLoader(dataset, self.batch_size, shuffle=train)
        return dataloader

    def get_dataloader(self, train):
        idx = slice(0, self.num_train) if train else slice(
            self.num_train, self.num_train + self.num_val)
        return self.get_tensorloader([self.X, self.Y], train, idx)

In [3]:
class Vocab:
    """Vocabulary for text."""
    def __init__(self, tokens=[], min_freq=0, reserved_tokens=[]):
        # Flatten a 2D list if needed
        if tokens and isinstance(tokens[0], list):
            tokens = [token for line in tokens for token in line]
        # Count token frequencies
        counter = collections.Counter(tokens)
        self.token_freqs = sorted(counter.items(), key=lambda x: x[1],
                                  reverse=True)
        # The list of unique tokens
        self.idx_to_token = list(sorted(set([''] + reserved_tokens + [
            token for token, freq in self.token_freqs if freq >= min_freq])))
        self.token_to_idx = {token: idx
                             for idx, token in enumerate(self.idx_to_token)}

    def __len__(self):
        return len(self.idx_to_token)

    def __getitem__(self, tokens):
        if not isinstance(tokens, (list, tuple)):
            return self.token_to_idx.get(tokens, self.unk)
        return [self.__getitem__(token) for token in tokens]

    def to_tokens(self, indices):
        if hasattr(indices, '__len__') and len(indices) > 1:
            return [self.idx_to_token[int(index)] for index in indices]
        return self.idx_to_token[indices]

    @property
    def unk(self):  # Index for the unknown token
        return self.token_to_idx['']


In [4]:
class RNNLMScratch(nn.Module):
    """The RNN-based language model implemented from scratch."""
    def __init__(self, rnn, vocab_size, lr=0.01):
        super().__init__()
        self.rnn = rnn # Store the RNN instance
        self.vocab_size = vocab_size
        self.loss = nn.CrossEntropyLoss() # Initialize the loss function
        self.init_params()

    def init_params(self):
        self.W_hq = nn.Parameter(
            torch.randn(
                self.rnn.num_hiddens, self.vocab_size) * self.rnn.sigma)
        self.b_q = nn.Parameter(torch.zeros(self.vocab_size))

    def training_step(self, batch):
        l = self.loss(self(*batch[:-1]), batch[-1])
        # self.plot('ppl', torch.exp(l), train=True) # Removed as self.plot is undefined
        return l

    def validation_step(self, batch):
        l = self.loss(self(*batch[:-1]), batch[-1])
        # self.plot('ppl', torch.exp(l), train=False) # Removed as self.plot is undefined

    def one_hot(self, X):
        # Output shape: (num_steps, batch_size, vocab_size)
        return F.one_hot(X.T, self.vocab_size).type(torch.float32)

    def output_layer(self, rnn_outputs):
        outputs = [torch.matmul(H, self.W_hq) + self.b_q for H in rnn_outputs]
        return torch.stack(outputs, 1)

    def forward(self, X, state=None):
        embs = self.one_hot(X)
        rnn_outputs, _ = self.rnn(embs, state)
        return self.output_layer(rnn_outputs)

    def predict(self, prefix, num_preds, vocab, device=None):
        state, outputs = None, [vocab[prefix[0]]]
        for i in range(len(prefix) + num_preds - 1):
            X = torch.tensor([[outputs[-1]]], device=device)
            embs = self.one_hot(X)
            rnn_outputs, state = self.rnn(embs, state)
            if i < len(prefix) - 1:  # Warm-up period
                outputs.append(vocab[prefix[i + 1]])
            else:  # Predict num_preds steps
                Y = self.output_layer(rnn_outputs)
                outputs.append(int(Y.argmax(axis=2).reshape(1)))
        return ''.join([vocab.idx_to_token[i] for i in outputs])

In [5]:
class RNNScratch(nn.Module):
    """The RNN model implemented from scratch."""
    def __init__(self, num_inputs, num_hiddens, sigma=0.01):
        super().__init__()
        self.num_hiddens = num_hiddens
        self.sigma = sigma # Store sigma as an instance attribute
        self.W_xh = nn.Parameter(
            torch.randn(num_inputs, num_hiddens) * sigma)
        self.W_hh = nn.Parameter(
            torch.randn(num_hiddens, num_hiddens) * sigma)
        self.b_h = nn.Parameter(torch.zeros(num_hiddens))

    def forward(self, inputs, state=None):
        if state is None:
            # Initial state with shape: (batch_size, num_hiddens)
            state = torch.zeros((inputs.shape[1], self.num_hiddens),
                              device=inputs.device)
        else:
            state, = state
        outputs = []
        for X in inputs:  # Shape of inputs: (num_steps, batch_size, num_inputs)
            state = torch.tanh(torch.matmul(X, self.W_xh) +
                             torch.matmul(state, self.W_hh) + self.b_h)
            outputs.append(state)
        return outputs, state

Let's break down torch.cat((f, b), -1) with an example.

torch.cat() is a PyTorch function used to concatenate (join) a sequence of tensors along a given dimension. The (f, b) part means you are concatenating two tensors, f and b.

The -1 specifies the dimension along which to concatenate. In PyTorch, -1 refers to the last dimension of the tensor. For example:

If you have a 1D tensor (a vector), -1 is the 0th dimension.
If you have a 2D tensor (a matrix), -1 is the 1st dimension (columns).
If you have a 3D tensor, -1 is the 2nd dimension.
In the context of the BiRNNScratch class, f represents the forward hidden state and b represents the backward hidden state. Both f and b would typically have the shape (batch_size, num_hiddens) for a single timestep.

Example:

Let's assume:

f is a tensor representing the forward hidden state for a batch of 2, with 64 hidden units: f = torch.randn(2, 64) (shape: [2, 64])
b is a tensor representing the backward hidden state for the same batch and number of hidden units: b = torch.randn(2, 64) (shape: [2, 64])
When you call torch.cat((f, b), -1):

Input Tensors: f and b both have shape [2, 64]. The dimensions are 0 (batch size) and 1 (hidden units).
Concatenation Dimension: -1 refers to the last dimension, which is dimension 1 (the hidden units dimension).
Operation: PyTorch will join f and b along dimension 1. The elements of b will be appended to the elements of f along this dimension.
Resulting Tensor:

The output tensor will have a shape of [2, 128]. The batch size (dimension 0) remains the same (2), but the last dimension (dimension 1) becomes the sum of the corresponding dimensions of f and b (64 + 64 = 128).

Essentially, it combines the forward and backward hidden states for each item in the batch, creating a richer representation that captures information from both directions of the sequence.

In [6]:
class BiRNNScratch(nn.Module):
    def __init__(self, num_inputs, num_hiddens, sigma=0.01):
        super().__init__()
        self.num_hiddens = num_hiddens # Initialize num_hiddens
        self.sigma = sigma # Store sigma as an instance attribute
        self.f_rnn = RNNScratch(num_inputs, num_hiddens, sigma)
        self.b_rnn = RNNScratch(num_inputs, num_hiddens, sigma)
        self.num_hiddens *= 2  # The output dimension will be doubled

    def forward(self, inputs, Hs=None):
        f_H, b_H = Hs if Hs is not None else (None, None)
        f_outputs, f_H = self.f_rnn(inputs, f_H)
        b_outputs, b_H = self.b_rnn(reversed(inputs), b_H)
        outputs = [torch.cat((f, b), -1) for f, b in zip(
            f_outputs, reversed(b_outputs))]
        return outputs, (f_H, b_H)

In [7]:
data = TimeMachine(batch_size=1024, num_steps=32)
rnn_block = BiRNNScratch(num_inputs=len(data.vocab),
                              num_hiddens=128)
model = RNNLMScratch(rnn_block, vocab_size=len(data.vocab), lr=0.001)
model

RNNLMScratch(
  (rnn): BiRNNScratch(
    (f_rnn): RNNScratch()
    (b_rnn): RNNScratch()
  )
  (loss): CrossEntropyLoss()
)

In [8]:
def train(model, data, num_epochs, lr):
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    # Check if a CUDA-enabled GPU is available, otherwise use the CPU
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)

    for epoch in range(num_epochs):
        ppl_total, num_tokens = 0, 0
        model.train() # Set the model to training mode
        for X, Y in data.get_dataloader(train=True):
            X, Y = X.to(device), Y.to(device)
            optimizer.zero_grad()
            output = model(X)
            # Ensure Y has the correct shape for CrossEntropyLoss (batch_size * sequence_length)
            output = output.reshape(-1, output.shape[-1])
            Y = Y.reshape(-1)
            loss = model.loss(output, Y)
            loss.backward()
            # Clip gradients to prevent exploding gradients
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1)
            optimizer.step()

            with torch.no_grad():
                ppl_total += torch.exp(loss) * Y.numel()
                num_tokens += Y.numel()

        print(f'Epoch {epoch + 1}, Perplexity: {ppl_total / num_tokens:.2f}')

In [9]:
lr = 0.001
num_epochs = 300
train(model, data, num_epochs, lr)

Epoch 1, Perplexity: 27.39
Epoch 2, Perplexity: 20.47
Epoch 3, Perplexity: 17.24
Epoch 4, Perplexity: 16.13
Epoch 5, Perplexity: 14.31
Epoch 6, Perplexity: 11.67
Epoch 7, Perplexity: 8.93
Epoch 8, Perplexity: 6.75
Epoch 9, Perplexity: 5.15
Epoch 10, Perplexity: 3.99
Epoch 11, Perplexity: 3.15
Epoch 12, Perplexity: 2.57
Epoch 13, Perplexity: 2.15
Epoch 14, Perplexity: 1.85
Epoch 15, Perplexity: 1.64
Epoch 16, Perplexity: 1.49
Epoch 17, Perplexity: 1.38
Epoch 18, Perplexity: 1.31
Epoch 19, Perplexity: 1.25
Epoch 20, Perplexity: 1.21
Epoch 21, Perplexity: 1.18
Epoch 22, Perplexity: 1.16
Epoch 23, Perplexity: 1.15
Epoch 24, Perplexity: 1.13
Epoch 25, Perplexity: 1.13
Epoch 26, Perplexity: 1.12
Epoch 27, Perplexity: 1.11
Epoch 28, Perplexity: 1.11
Epoch 29, Perplexity: 1.11
Epoch 30, Perplexity: 1.10
Epoch 31, Perplexity: 1.10
Epoch 32, Perplexity: 1.10
Epoch 33, Perplexity: 1.10
Epoch 34, Perplexity: 1.09
Epoch 35, Perplexity: 1.09
Epoch 36, Perplexity: 1.09
Epoch 37, Perplexity: 1.09
Epoc

In [10]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.predict('the', 20, data.vocab, device)

'thehehehehehehehehehehe'