<a href="https://colab.research.google.com/github/Redcoder815/Deep_Learning_PyTorch/blob/main/GatedRecurrentUnitNetworks.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from tensorflow.keras.datasets import imdb
from tensorflow.keras.preprocessing import sequence
import numpy as np

# Prefer the GPU whenever PyTorch reports one; otherwise everything runs on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {device}")

# IMDB reviews: `max_features` is passed to the loader as its `num_words` cap,
# and `max_len` is the fixed length the reviews are padded to further down.
max_features = 5000
max_len = 500

(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_features)

In [None]:
# Inputs: pad every review to `max_len` tokens, then store the token ids as
# int64 tensors (the index dtype nn.Embedding looks rows up with).
x_train = torch.tensor(sequence.pad_sequences(x_train, maxlen=max_len), dtype=torch.long)
x_test = torch.tensor(sequence.pad_sequences(x_test, maxlen=max_len), dtype=torch.long)

# Labels: float32 so they can be compared against the model's float outputs
# by the binary cross-entropy criterion.
y_train = torch.tensor(y_train, dtype=torch.float32)
y_test = torch.tensor(y_test, dtype=torch.float32)

In [None]:
# Mini-batch size shared by the training and test loaders.
batch_size = 64

# Pair each padded review with its label.
train_data = TensorDataset(x_train, y_train)
test_data = TensorDataset(x_test, y_test)

# Only the training split is reshuffled; the test split keeps its order.
train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size)
test_loader = DataLoader(test_data, shuffle=False, batch_size=batch_size)

In [None]:
class GRUModel(nn.Module):
    """Sequence classifier: embedding -> stacked GRU -> linear -> sigmoid.

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_dim: width of each token embedding (GRU input size).
        hidden_dim: GRU hidden-state width.
        output_dim: number of values produced per sequence.
        num_layers: number of stacked GRU layers.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, num_layers):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.gru = nn.GRU(
            input_size=embedding_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True,
        )
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Map token ids of shape (batch, seq_len) to values in (0, 1).

        Returns:
            Tensor of shape (batch, output_dim) holding sigmoid activations.
        """
        embedded = self.embedding(x)
        # The GRU is batch_first, so its output is (batch, seq_len, hidden_dim).
        step_outputs, _ = self.gru(embedded)
        # Only the final time step feeds the classification head.
        last_step = step_outputs[:, -1, :]
        return self.sigmoid(self.fc(last_step))

# Model hyperparameters
vocab_size = max_features  # same cap that was passed to the dataset loader
embedding_dim = 128
hidden_dim = 128
output_dim = 1             # a single probability per review
num_layers = 3

# Build the network on the selected device, then the loss and the optimizer.
gru_model = GRUModel(
    vocab_size=vocab_size,
    embedding_dim=embedding_dim,
    hidden_dim=hidden_dim,
    output_dim=output_dim,
    num_layers=num_layers,
)
gru_model = gru_model.to(device)
# The model already applies a sigmoid, so plain BCE (not the logits variant) is used.
criterion = nn.BCELoss()
optimizer = optim.Adam(gru_model.parameters(), lr=1e-3)

In [None]:
# Number of full passes over the training loader.
num_epochs = 5

for epoch in range(num_epochs):
    gru_model.train()
    total_loss = 0
    for inputs, targets in train_loader:
        inputs, targets = inputs.to(device), targets.to(device)
        optimizer.zero_grad()
        outputs = gru_model(inputs)
        # Drop only the trailing output axis. A bare squeeze() would also drop
        # the batch axis for a batch of one, leaving a 0-d prediction against
        # a 1-d target, which the BCE criterion rejects as a size mismatch.
        loss = criterion(outputs.squeeze(-1), targets)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    # Mean of the per-batch losses for this epoch.
    avg_loss = total_loss / len(train_loader)
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {avg_loss:.4f}')

In [None]:
# Measure test accuracy with the model in eval mode and autograd switched off.
gru_model.eval()
correct = 0
total = 0
with torch.no_grad():
    for inputs, targets in test_loader:
        inputs = inputs.to(device)
        targets = targets.to(device)
        outputs = gru_model(inputs)
        # Threshold the sigmoid outputs at 0.5 to obtain hard 0/1 predictions.
        predicted = (outputs.squeeze() >= 0.5).float()
        correct += predicted.eq(targets).sum().item()
        total += targets.size(0)

# Fraction of test samples whose predicted label equals the target.
accuracy = correct / total
print(f'Test Accuracy: {accuracy:.4f}')

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class GRUCell(nn.Module):
    """One GRU time step implemented with explicit parameter matrices.

    Parameters are named per gate: ``z`` (update), ``r`` (reset) and ``h``
    (candidate state). ``W_*`` act on the input, ``U_*`` on the previous
    hidden state, ``b_*`` are the biases.

    Args:
        input_size: number of features in each input vector.
        hidden_size: number of features in the hidden state.
    """

    def __init__(self, input_size, hidden_size):
        super().__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size

        def small_normal(rows, cols):
            # Weights start as N(0, 1) samples scaled down by 0.01.
            return nn.Parameter(torch.randn(rows, cols) * 0.01)

        def zero_bias():
            return nn.Parameter(torch.zeros(hidden_size))

        # Input-to-hidden weights, each (hidden_size, input_size).
        self.W_z = small_normal(hidden_size, input_size)
        self.W_r = small_normal(hidden_size, input_size)
        self.W_h = small_normal(hidden_size, input_size)

        # Hidden-to-hidden weights, each (hidden_size, hidden_size).
        self.U_z = small_normal(hidden_size, hidden_size)
        self.U_r = small_normal(hidden_size, hidden_size)
        self.U_h = small_normal(hidden_size, hidden_size)

        # Biases, each (hidden_size,).
        self.b_z = zero_bias()
        self.b_r = zero_bias()
        self.b_h = zero_bias()

    @staticmethod
    def _affine(x_part, h_part, w_in, w_hidden, bias):
        """Return ``x_part @ w_in.T + h_part @ w_hidden.T + bias``."""
        return x_part @ w_in.T + h_part @ w_hidden.T + bias

    def forward(self, x_t, h_prev):
        """Advance the hidden state by one step.

        Args:
            x_t: input at this step, shape (batch, input_size).
            h_prev: previous hidden state, shape (batch, hidden_size).

        Returns:
            New hidden state, shape (batch, hidden_size).
        """
        update = torch.sigmoid(self._affine(x_t, h_prev, self.W_z, self.U_z, self.b_z))
        reset = torch.sigmoid(self._affine(x_t, h_prev, self.W_r, self.U_r, self.b_r))

        # The reset gate scales the previous state before it enters the candidate.
        candidate = torch.tanh(
            self._affine(x_t, reset * h_prev, self.W_h, self.U_h, self.b_h)
        )

        # In this cell the update gate weights the candidate and its
        # complement weights the previous state.
        return (1 - update) * h_prev + update * candidate

In [None]:
class GRU(nn.Module):
    """Single-layer GRU that unrolls a ``GRUCell`` over a batch-first sequence.

    Args:
        input_size: number of features per time step.
        hidden_size: number of features in the hidden state.
    """

    def __init__(self, input_size, hidden_size):
        super().__init__()
        self.hidden_size = hidden_size
        self.cell = GRUCell(input_size, hidden_size)

    def forward(self, x, h0=None):
        """Run the cell over every time step of ``x``.

        Args:
            x: input of shape (batch, seq_len, input_size).
            h0: optional initial hidden state of shape (batch, hidden_size);
                zeros matching ``x``'s dtype and device are used when omitted.

        Returns:
            Tuple ``(outputs, h_last)``: ``outputs`` has shape
            (batch, seq_len, hidden_size) and ``h_last`` is the hidden state
            after the final step (the initial state if ``seq_len`` is 0).
        """
        batch_size, seq_len, _ = x.shape

        if h0 is None:
            # new_zeros inherits dtype as well as device from x, so a model
            # cast with .double()/.half() does not hit a dtype mismatch in
            # the cell's matrix products.
            h_t = x.new_zeros(batch_size, self.hidden_size)
        else:
            h_t = h0

        if seq_len == 0:
            # Nothing to unroll: return an empty output sequence rather than
            # failing while joining an empty list of states.
            return x.new_zeros(batch_size, 0, self.hidden_size), h_t

        outputs = []
        for t in range(seq_len):
            x_t = x[:, t, :]           # (batch, input_size)
            h_t = self.cell(x_t, h_t)  # (batch, hidden_size)
            outputs.append(h_t)

        # Stack along a new time axis -> (batch, seq_len, hidden_size).
        return torch.stack(outputs, dim=1), h_t

Regression example: train the from-scratch GRU to predict the running (cumulative) sum of a random sequence.

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim

def generate_data(batch_size, seq_len, input_size):
    """Create a random many-to-many regression batch.

    Args:
        batch_size: number of sequences.
        seq_len: number of time steps per sequence.
        input_size: number of features per time step.

    Returns:
        Tuple ``(x, y)`` of tensors shaped (batch_size, seq_len, input_size):
        ``x`` is standard-normal noise and ``y`` is its cumulative sum along
        the time axis.
    """
    inputs = torch.randn(batch_size, seq_len, input_size)
    running_sum = inputs.cumsum(dim=1)
    return inputs, running_sum

# Problem and model sizes for the cumulative-sum regression task.
input_size = 5
hidden_size = 16
seq_len = 20
batch_size = 32
epochs = 200

# Recurrent body plus a per-time-step linear head mapping back to input_size.
model = GRU(input_size, hidden_size)
output_layer = nn.Linear(hidden_size, input_size)

criterion = nn.MSELoss()
# The head is a separate module, so its parameters must be handed to the
# optimizer together with the GRU's; passing only model.parameters() would
# leave the head untrained.
optimizer = optim.Adam(
    [*model.parameters(), *output_layer.parameters()],
    lr=0.001,
)

# -----------------------------
# Training loop (a fresh random batch every iteration)
# -----------------------------
for epoch in range(epochs):
    x, y = generate_data(batch_size, seq_len, input_size)

    optimizer.zero_grad()

    # outputs: (batch, seq_len, hidden); the final state is not needed here.
    outputs, h_last = model(x)
    # preds: (batch, seq_len, input_size), one prediction per time step.
    preds = output_layer(outputs)

    loss = criterion(preds, y)
    loss.backward()
    optimizer.step()

    # Report on every 20th iteration.
    if epoch % 20 == 19:
        print(f"Epoch {epoch+1}/{epochs} | Loss: {loss.item():.6f}")


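Classification example: train the from-scratch GRU to predict whether the sum of all values in a random sequence is positive.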

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim

def generate_classification_data(batch_size, seq_len, input_size):
    """Create a random binary-classification batch.

    Args:
        batch_size: number of sequences.
        seq_len: number of time steps per sequence.
        input_size: number of features per time step.

    Returns:
        Tuple ``(x, y)``: ``x`` is standard-normal noise of shape
        (batch_size, seq_len, input_size); ``y`` is an int64 tensor of shape
        (batch_size,) that is 1 where the sum over all of a sequence's values
        is strictly positive and 0 otherwise.
    """
    features = torch.randn(batch_size, seq_len, input_size)
    # Collapse both the time and feature axes into one total per sequence.
    totals = features.sum(dim=(1, 2))
    labels = totals.gt(0).long()
    return features, labels

# Problem and model sizes for the sign-of-sum classification task.
input_size = 4
hidden_size = 32
num_classes = 2
seq_len = 15
batch_size = 64
epochs = 200

# Recurrent body plus a linear head that scores each class.
model = GRU(input_size, hidden_size)
classifier = nn.Linear(hidden_size, num_classes)

criterion = nn.CrossEntropyLoss()
# The head is a separate module, so its parameters must be handed to the
# optimizer together with the GRU's; passing only model.parameters() would
# leave the head untrained.
optimizer = optim.Adam(
    [*model.parameters(), *classifier.parameters()],
    lr=0.001,
)

for epoch in range(epochs):
    # A fresh random batch every iteration.
    x, y = generate_classification_data(batch_size, seq_len, input_size)

    optimizer.zero_grad()

    # outputs: (batch, seq_len, hidden)
    outputs, h_last = model(x)
    # Classify from the hidden state at the last time step.
    final_state = outputs[:, -1]

    # logits: (batch, num_classes)
    logits = classifier(final_state)

    loss = criterion(logits, y)
    loss.backward()
    optimizer.step()

    # Report loss and accuracy on the current batch every 20th iteration.
    if epoch % 20 == 19:
        pred = logits.argmax(dim=1)
        acc = (pred == y).float().mean().item()
        print(f"Epoch {epoch+1}/{epochs} | Loss: {loss.item():.4f} | Acc: {acc:.4f}")

print("Training complete.")

In [None]:
import torch
from torch import nn

In [None]:
class GRUScratch(nn.Module):
    """GRU layer written from scratch for time-major input.

    Args:
        num_inputs: number of features per time step.
        num_hiddens: number of features in the hidden state.
        sigma: scale applied to the standard-normal weight initialisation.
    """

    def __init__(self, num_inputs, num_hiddens, sigma=0.01):
        super().__init__()
        self.num_hiddens = num_hiddens  # read by forward() to size the initial state

        def init_weight(*shape):
            return nn.Parameter(torch.randn(*shape) * sigma)

        def triple():
            # (input-to-hidden weight, hidden-to-hidden weight, bias) for one gate.
            return (init_weight(num_inputs, num_hiddens),
                    init_weight(num_hiddens, num_hiddens),
                    nn.Parameter(torch.zeros(num_hiddens)))

        self.W_xz, self.W_hz, self.b_z = triple()  # Update gate
        self.W_xr, self.W_hr, self.b_r = triple()  # Reset gate
        self.W_xh, self.W_hh, self.b_h = triple()  # Candidate hidden state

    def forward(self, inputs, H=None):
        """Unroll the GRU over the leading (time) axis of ``inputs``.

        Args:
            inputs: tensor of shape (seq_len, batch_size, num_inputs).
            H: optional initial state of shape (batch_size, num_hiddens);
                zeros matching ``inputs``' dtype and device when omitted.

        Returns:
            Tuple ``(outputs, H)``: ``outputs`` is a list with one
            (batch_size, num_hiddens) state per time step and ``H`` is the
            state after the last step.
        """
        if H is None:
            # Initial state with shape: (batch_size, num_hiddens). new_zeros
            # copies dtype as well as device from the inputs, so a module
            # cast with .double()/.half() does not fail in the matmuls below.
            H = inputs.new_zeros((inputs.shape[1], self.num_hiddens))
        outputs = []
        for X in inputs:
            Z = torch.sigmoid(torch.matmul(X, self.W_xz) +
                              torch.matmul(H, self.W_hz) + self.b_z)
            R = torch.sigmoid(torch.matmul(X, self.W_xr) +
                              torch.matmul(H, self.W_hr) + self.b_r)
            H_tilde = torch.tanh(torch.matmul(X, self.W_xh) +
                                 torch.matmul(R * H, self.W_hh) + self.b_h)
            # Here Z weights the previous state and (1 - Z) the candidate.
            H = Z * H + (1 - Z) * H_tilde
            outputs.append(H)
        return outputs, H

In [None]:
class GRUModel(nn.Module):
    """Per-time-step regressor: from-scratch GRU followed by a linear head.

    Args:
        num_inputs: number of features per time step.
        num_hiddens: GRU hidden-state width.
    """

    def __init__(self, num_inputs, num_hiddens):
        super().__init__()
        self.rnn = GRUScratch(num_inputs, num_hiddens)
        self.fc = nn.Linear(num_hiddens, 1)

    def forward(self, X):
        """Predict one value per time step.

        Args:
            X: batch-first input of shape (batch, seq, input_size).

        Returns:
            Tensor of shape (batch, seq, 1).
        """
        # The recurrent layer iterates over the leading axis, so feed it
        # time-major data: (seq, batch, input).
        time_major = X.permute(1, 0, 2)
        step_states, _ = self.rnn(time_major)
        # List of per-step states -> (seq, batch, hidden) -> (batch, seq, hidden).
        hidden_seq = torch.stack(step_states).permute(1, 0, 2)
        return self.fc(hidden_seq)

In [None]:
import torch
from torch import nn
import random

def generate_data(num_sequences=2000, seq_len=20):
    """Build next-value prediction pairs from linear ramps.

    Each sequence starts at a uniform random value in [0, 1) and increases
    by 0.05 per step. The target is the same ramp shifted one step ahead.

    Args:
        num_sequences: number of ramps to generate.
        seq_len: number of time steps in each input/target sequence.

    Returns:
        Tuple ``(X, Y)`` of float tensors shaped (num_sequences, seq_len, 1).
    """
    ramps = []
    for _ in range(num_sequences):
        start = random.random()
        # One extra point so inputs and shifted targets both have seq_len steps.
        ramps.append([start + i*0.05 for i in range(seq_len+1)])

    inputs = [ramp[:-1] for ramp in ramps]   # every point except the last
    targets = [ramp[1:] for ramp in ramps]   # same ramp shifted by one step

    # Add a trailing feature axis: (batch, seq) -> (batch, seq, 1).
    X = torch.tensor(inputs).float().unsqueeze(-1)
    Y = torch.tensor(targets).float().unsqueeze(-1)
    return X, Y


# Build the full dataset once with the default sizes and show its shapes.
X, Y = generate_data()
print(X.shape, Y.shape)

In [None]:
# Scalar-input model trained to predict the next value of each ramp.
model = GRUModel(num_inputs=1, num_hiddens=32)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
loss_fn = nn.MSELoss()

batch_size = 32
num_epochs = 20

for epoch in range(num_epochs):
    # New random ordering of the samples for every epoch.
    perm = torch.randperm(X.size(0))
    total_loss = 0

    for i in range(0, X.size(0), batch_size):
        # Indices of the current mini-batch (the last one may be shorter).
        idx = perm[i:i+batch_size]
        xb = X[idx]
        yb = Y[idx]

        optimizer.zero_grad()
        pred = model(xb)
        loss = loss_fn(pred, yb)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # Note: this is the sum of the per-batch losses, not their mean.
    print(f"Epoch {epoch+1}, Loss: {total_loss:.4f}")