<a href="https://colab.research.google.com/github/Redcoder815/Deep_Learning_PyTorch/blob/main/26GRU.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import torch
from torch import nn
from torch.utils.data import TensorDataset, DataLoader
import collections
import random
import re
import torch.nn.functional as F

In [2]:
import requests

class TimeMachine():
    def __init__(self, batch_size = 2, num_steps = 10, num_train=10000, num_val=5000):
        self.batch_size = batch_size
        self.num_steps = num_steps
        self.num_train = num_train
        self.num_val = num_val
        corpus, self.vocab = self.build(self._download())
        array = torch.tensor([corpus[i:i+num_steps+1]
                            for i in range(len(corpus)-num_steps)])
        self.X, self.Y = array[:,:-1], array[:,1:]

    def _download(self):
        url = 'http://d2l-data.s3-accelerate.amazonaws.com/timemachine.txt'
        response = requests.get(url)
        response.raise_for_status() # Raise an exception for HTTP errors
        return response.text

    def _preprocess(self, text):
        return re.sub('[^A-Za-z]+', ' ', text).lower()

    def _tokenize(self, text):
        return list(text)

    def build(self, raw_text, vocab=None):
        tokens = self._tokenize(self._preprocess(raw_text))
        if vocab is None: vocab = Vocab(tokens)
        corpus = [vocab[token] for token in tokens]
        return corpus, vocab

    def get_tensorloader(self, data_arrays, train, i):
        # This is a placeholder, actual implementation might vary based on inheritance.
        features = data_arrays[0][i]
        labels = data_arrays[1][i]
        dataset = TensorDataset(features, labels)
        dataloader = DataLoader(dataset, self.batch_size, shuffle=train)
        return dataloader

    def get_dataloader(self, train):
        idx = slice(0, self.num_train) if train else slice(
            self.num_train, self.num_train + self.num_val)
        return self.get_tensorloader([self.X, self.Y], train, idx)

In [3]:
class Vocab:
    """Vocabulary for text."""
    def __init__(self, tokens=[], min_freq=0, reserved_tokens=[]):
        # Flatten a 2D list if needed
        if tokens and isinstance(tokens[0], list):
            tokens = [token for line in tokens for token in line]
        # Count token frequencies
        counter = collections.Counter(tokens)
        self.token_freqs = sorted(counter.items(), key=lambda x: x[1],
                                  reverse=True)
        # The list of unique tokens
        self.idx_to_token = list(sorted(set([''] + reserved_tokens + [
            token for token, freq in self.token_freqs if freq >= min_freq])))
        self.token_to_idx = {token: idx
                             for idx, token in enumerate(self.idx_to_token)}

    def __len__(self):
        return len(self.idx_to_token)

    def __getitem__(self, tokens):
        if not isinstance(tokens, (list, tuple)):
            return self.token_to_idx.get(tokens, self.unk)
        return [self.__getitem__(token) for token in tokens]

    def to_tokens(self, indices):
        if hasattr(indices, '__len__') and len(indices) > 1:
            return [self.idx_to_token[int(index)] for index in indices]
        return self.idx_to_token[indices]

    @property
    def unk(self):  # Index for the unknown token
        return self.token_to_idx['']

In [4]:
class RNNLMScratch(nn.Module):
    """The RNN-based language model implemented from scratch."""
    def __init__(self, rnn, vocab_size, lr=0.01):
        super().__init__()
        self.rnn = rnn # Store the RNN instance
        self.vocab_size = vocab_size
        self.loss = nn.CrossEntropyLoss() # Initialize the loss function
        self.init_params()

    def init_params(self):
        self.W_hq = nn.Parameter(
            torch.randn(
                self.rnn.num_hiddens, self.vocab_size) * self.rnn.sigma)
        self.b_q = nn.Parameter(torch.zeros(self.vocab_size))

    def training_step(self, batch):
        l = self.loss(self(*batch[:-1]), batch[-1])
        # self.plot('ppl', torch.exp(l), train=True) # Removed as self.plot is undefined
        return l

    def validation_step(self, batch):
        l = self.loss(self(*batch[:-1]), batch[-1])
        # self.plot('ppl', torch.exp(l), train=False) # Removed as self.plot is undefined

    def one_hot(self, X):
        # Output shape: (num_steps, batch_size, vocab_size)
        return F.one_hot(X.T, self.vocab_size).type(torch.float32)

    def output_layer(self, rnn_outputs):
        outputs = [torch.matmul(H, self.W_hq) + self.b_q for H in rnn_outputs]
        return torch.stack(outputs, 1)

    def forward(self, X, state=None):
        embs = self.one_hot(X)
        rnn_outputs, _ = self.rnn(embs, state)
        return self.output_layer(rnn_outputs)

    def predict(self, prefix, num_preds, vocab, device=None):
        state, outputs = None, [vocab[prefix[0]]]
        for i in range(len(prefix) + num_preds - 1):
            X = torch.tensor([[outputs[-1]]], device=device)
            embs = self.one_hot(X)
            rnn_outputs, state = self.rnn(embs, state)
            if i < len(prefix) - 1:  # Warm-up period
                outputs.append(vocab[prefix[i + 1]])
            else:  # Predict num_preds steps
                Y = self.output_layer(rnn_outputs)
                outputs.append(int(Y.argmax(axis=2).reshape(1)))
        return ''.join([vocab.idx_to_token[i] for i in outputs])

In [5]:
class GRUScratch(nn.Module):
    def __init__(self, num_inputs, num_hiddens, sigma=0.01):
        super().__init__()
        self.num_hiddens = num_hiddens # Stored as instance variable
        self.sigma = sigma # Store sigma as an instance attribute

        init_weight = lambda *shape: nn.Parameter(torch.randn(*shape) * sigma)
        triple = lambda: (init_weight(num_inputs, num_hiddens),
                          init_weight(num_hiddens, num_hiddens),
                          nn.Parameter(torch.zeros(num_hiddens)))
        self.W_xz, self.W_hz, self.b_z = triple()  # Update gate
        self.W_xr, self.W_hr, self.b_r = triple()  # Reset gate
        self.W_xh, self.W_hh, self.b_h = triple()  # Candidate hidden state

    def forward(self, inputs, H=None):
        if H is None:
            # Initial state with shape: (batch_size, num_hiddens)
            H = torch.zeros((inputs.shape[1], self.num_hiddens),
                          device=inputs.device)
        outputs = []
        for X in inputs:
            Z = torch.sigmoid(torch.matmul(X, self.W_xz) +
                            torch.matmul(H, self.W_hz) + self.b_z)
            R = torch.sigmoid(torch.matmul(X, self.W_xr) +
                            torch.matmul(H, self.W_hr) + self.b_r)
            H_tilde = torch.tanh(torch.matmul(X, self.W_xh) +
                               torch.matmul(R * H, self.W_hh) + self.b_h)
            H = Z * H + (1 - Z) * H_tilde
            outputs.append(H)
        return outputs, H

In [6]:
data = TimeMachine(batch_size=1024, num_steps=32)
gru = GRUScratch(num_inputs=len(data.vocab), num_hiddens=32)
model = RNNLMScratch(gru, vocab_size=len(data.vocab), lr=0.01)

In [7]:
prefix = 'it has a time'
num_preds = 50
predicted_text = model.predict(prefix, num_preds, data.vocab, device=None)
print(predicted_text)

it has a timeukyacjalllntuzrrlllntuzrrlllntuzrrlllntuzrrlllntuz


Concise implementation

The swapaxes(0, 1) operation in the output_layer function is used to reorder the dimensions of the tensor. Let's break it down:

Tensor Shape Before swapaxes: After self.linear(hiddens) is applied, the tensor's shape is typically (sequence_length, batch_size, vocab_size). This means the first dimension represents the time steps (or sequence length), the second is the batch size, and the third is the vocabulary size (logits for each word).

What swapaxes(0, 1) Does: It swaps the dimension at index 0 with the dimension at index 1. So, (sequence_length, batch_size, vocab_size) becomes (batch_size, sequence_length, vocab_size).

Why it's Needed: This reordering is crucial for compatibility with PyTorch's nn.CrossEntropyLoss. When you flatten the target labels Y (which start as (batch_size, sequence_length)) into a 1D tensor (batch_size * sequence_length), the loss function expects the model's output (logits) to also be structured such that when it's flattened to (batch_size * sequence_length, vocab_size), the elements align correctly. By having the batch dimension first in (batch_size, sequence_length, vocab_size), the subsequent flattening output.reshape(-1, output.shape[-1]) correctly produces (batch_size * sequence_length, vocab_size), aligning perfectly with the flattened target labels. This ensures that the loss is calculated for the correct predictions corresponding to their respective targets within each sequence and batch.

In [8]:
class RNNLM(RNNLMScratch):
    """The RNN-based language model implemented with high-level APIs."""
    def __init__(self, rnn, vocab_size, lr=0.01):
        super().__init__(rnn, vocab_size, lr) # Call parent's __init__ to set lr

    def init_params(self):
        # For RNNLM, init_params is overridden by LazyLinear initialization
        # LazyLinear handles weight initialization automatically
        self.linear = nn.LazyLinear(self.vocab_size) # This should be here, not in __init__ for LazyLinear

    def output_layer(self, hiddens):
        return self.linear(hiddens).swapaxes(0, 1)

In [9]:
class GRU(nn.Module):
    def __init__(self, num_inputs, num_hiddens):
        super().__init__()
        # Removed self.save_hyperparameters() as it's not standard nn.Module
        self.num_hiddens = num_hiddens # Store num_hiddens as an instance variable
        self.rnn = nn.GRU(num_inputs, num_hiddens)

    def forward(self, inputs, state=None):
        # nn.GRU expects input shape (seq_len, batch_size, input_size)
        # and returns output (seq_len, batch_size, num_hiddens) and h_n (1, batch_size, num_hiddens)
        # The predict method sends (1, 1, vocab_size) as embs.
        # Permute to (batch_size, seq_len, input_size) for nn.GRU, then back.
        inputs = inputs.permute(1, 0, 2) # Change from (num_steps, batch_size, vocab_size) to (batch_size, num_steps, vocab_size) if needed
        outputs, state = self.rnn(inputs, state)
        outputs = outputs.permute(1, 0, 2) # Change back if needed
        return outputs, state

In [10]:
gru = GRU(num_inputs=len(data.vocab), num_hiddens=128)
model = RNNLM(gru, vocab_size=len(data.vocab), lr=0.01)

In [11]:
def train(model, data, num_epochs, lr):
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    # Check if a CUDA-enabled GPU is available, otherwise use the CPU
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)

    for epoch in range(num_epochs):
        ppl_total, num_tokens = 0, 0
        model.train() # Set the model to training mode
        for X, Y in data.get_dataloader(train=True):
            X, Y = X.to(device), Y.to(device)
            optimizer.zero_grad()
            output = model(X)
            # Ensure Y has the correct shape for CrossEntropyLoss (batch_size * sequence_length)
            output = output.reshape(-1, output.shape[-1])
            Y = Y.reshape(-1)
            loss = model.loss(output, Y)
            loss.backward()
            # Clip gradients to prevent exploding gradients
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1)
            optimizer.step()

            with torch.no_grad():
                ppl_total += torch.exp(loss) * Y.numel()
                num_tokens += Y.numel()

        print(f'Epoch {epoch + 1}, Perplexity: {ppl_total / num_tokens:.2f}')

In [12]:
lr = 0.001
num_epochs = 100
train(model, data, num_epochs, lr)

Epoch 1, Perplexity: 26.52
Epoch 2, Perplexity: 20.57
Epoch 3, Perplexity: 17.63
Epoch 4, Perplexity: 16.87
Epoch 5, Perplexity: 16.47
Epoch 6, Perplexity: 16.11
Epoch 7, Perplexity: 15.70
Epoch 8, Perplexity: 15.23
Epoch 9, Perplexity: 14.70
Epoch 10, Perplexity: 14.12
Epoch 11, Perplexity: 13.53
Epoch 12, Perplexity: 12.95
Epoch 13, Perplexity: 12.42
Epoch 14, Perplexity: 11.97
Epoch 15, Perplexity: 11.59
Epoch 16, Perplexity: 11.29
Epoch 17, Perplexity: 11.05
Epoch 18, Perplexity: 10.85
Epoch 19, Perplexity: 10.68
Epoch 20, Perplexity: 10.55
Epoch 21, Perplexity: 10.43
Epoch 22, Perplexity: 10.33
Epoch 23, Perplexity: 10.25
Epoch 24, Perplexity: 10.18
Epoch 25, Perplexity: 10.12
Epoch 26, Perplexity: 10.06
Epoch 27, Perplexity: 10.02
Epoch 28, Perplexity: 9.98
Epoch 29, Perplexity: 9.94
Epoch 30, Perplexity: 9.91
Epoch 31, Perplexity: 9.88
Epoch 32, Perplexity: 9.86
Epoch 33, Perplexity: 9.83
Epoch 34, Perplexity: 9.81
Epoch 35, Perplexity: 9.79
Epoch 36, Perplexity: 9.77
Epoch 37, 

In [13]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.predict('it has', 20, data.vocab, device)

'it has the the the the the'