## Overall Project Structure


### We’re going to create a mini-projects:

- Seq2Seq with Attention for Machine Translation

#### project will cover:

- Dataset selection and preparation 

- Model building

- Training and evaluation

## Reusing this code for lstm and gru and setting  complete pipeline

In [8]:
import pandas as pd
import numpy as np

import torch
import torch.nn as nn #for loss fn and neural network
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

import math
from tqdm import tqdm


In [9]:
# some parameters

batch_size=64
embedding_dim = 100
hidden_dim = 64
num_layers = 2 
num_epochs = 10

In [10]:
#dataset 
class WordDataset(Dataset):
    def __init__(self, text, word2idx=None, idx2word=None, seq_len=5):
        words = text.strip().split()
        self.seq_len = seq_len

        # Only build vocab if not provided
        if word2idx is None or idx2word is None:
            vocab = sorted(set(words))
            self.word2idx = {word: idx for idx, word in enumerate(vocab)}
            self.idx2word = {idx: word for word, idx in self.word2idx.items()}
        else:
            self.word2idx = word2idx
            self.idx2word = idx2word

        # Make sure all words exist in vocab
        self.data = [self.word2idx[word] for word in words if word in self.word2idx]

    def __len__(self):
        return len(self.data) - self.seq_len

    def __getitem__(self, idx):
        x = torch.tensor(self.data[idx:idx + self.seq_len])
        y = torch.tensor(self.data[idx + 1:idx + self.seq_len + 1])
        return x, y
# Build vocab from training data only
base_dataset = WordDataset(' '.join(train_lines[:50000]))
word2idx = base_dataset.word2idx
idx2word = base_dataset.idx2word

# Pass vocab to other splits
train_dataset = base_dataset
val_dataset = WordDataset(' '.join(val_lines), word2idx, idx2word)
test_dataset = WordDataset(' '.join(test_lines), word2idx, idx2word)

train_dataloader = DataLoader(train_dataset, batch_size=batch_size*20, shuffle=True, num_workers=4)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True)


In [15]:
vocab_size = len(word2idx)

In [16]:


class UniversalRNNModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_layers,
                 mode='rnn', dropout=0.1,
                 device=torch.device("cuda" if torch.cuda.is_available() else "cpu")):
        super(UniversalRNNModel, self).__init__()
        
        assert mode in ['rnn', 'lstm', 'gru'], "Mode must be one of: 'rnn', 'lstm', 'gru'"
        self.mode = mode
        self.device = device
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.norm = nn.LayerNorm(embedding_dim)
        self.dropout = nn.Dropout(p=dropout)
        self.relu = nn.ReLU()

        rnn_class = {'rnn': nn.RNN, 'lstm': nn.LSTM, 'gru': nn.GRU}[mode]
        self.rnn = rnn_class(embedding_dim, hidden_dim, num_layers, batch_first=True)

        self.fc1 = nn.Linear(hidden_dim, 128)
        self.fc2 = nn.Linear(128, vocab_size)

        self.criterion = nn.CrossEntropyLoss()
        self.optimizer = optim.Adam(self.parameters(), lr=0.003)

    def init_hidden(self, batch_size):
        h0 = torch.zeros(self.num_layers, batch_size, self.hidden_dim).to(self.device)
        if self.mode == 'lstm':
            c0 = torch.zeros(self.num_layers, batch_size, self.hidden_dim).to(self.device)
            return (h0, c0)
        else:
            return h0

    def forward(self, x, hidden):
        x = self.embedding(x)
        out, hidden = self.rnn(self.norm(x), hidden)
        out = self.dropout(out)
        out = torch.tanh(out.reshape(-1, self.hidden_dim))
        out = self.fc2(self.relu(self.dropout(self.fc1(out))))
        out = out.reshape(x.shape[0], x.shape[1], -1)
        return out, hidden

    def train_step(self, loader):
        self.train()
        total_loss = 0
        prog_bar = tqdm(loader, desc="Training", leave=False)
    
        for x, y in prog_bar:
            x, y = x.to(self.device), y.to(self.device)
            hidden = self.init_hidden(x.size(0))
            self.optimizer.zero_grad()
    
            logits, _ = self(x, hidden)
            loss = self.criterion(logits.view(-1, logits.size(-1)), y.view(-1))
            loss.backward()
            self.optimizer.step()
    
            total_loss += loss.item()
            avg_loss = total_loss / (prog_bar.n + 1)
            prog_bar.set_postfix(loss=f"{avg_loss:.4f}")
    
        return total_loss / len(loader)

    def val_step(self, loader):
        self.eval()
        total_loss = 0
        total_tokens = 0
        correct_preds = 0
    
        prog_bar = tqdm(loader, desc="Validating", leave=False)
    
        with torch.no_grad():
            for x, y in prog_bar:
                x, y = x.to(self.device), y.to(self.device)
                hidden = self.init_hidden(x.size(0))
                logits, _ = self(x, hidden)
    
                loss = self.criterion(logits.view(-1, logits.size(-1)), y.view(-1))
                total_loss += loss.item() * x.size(0)
                total_tokens += x.numel()
    
                preds = torch.argmax(logits, dim=-1)
                correct_preds += (preds == y).sum().item()
    
                avg_loss = total_loss / (prog_bar.n * loader.batch_size + x.size(0))
                prog_bar.set_postfix(loss=f"{avg_loss:.4f}")
    
        avg_loss = total_loss / len(loader.dataset)
        perplexity = math.exp(avg_loss)
        accuracy = correct_preds / total_tokens
    
        return avg_loss, perplexity, accuracy

    def test_step(self, loader):
        return self.val_step(loader)

    def predict(self, x):
        self.eval()
        x = x.to(self.device)
        hidden = self.init_hidden(x.size(0))
        logits, _ = self(x, hidden)
        output = torch.argmax(logits, dim=-1)[:, 0]
        return output, torch.cat((x, output.unsqueeze(1)), dim=1)


In [23]:
def train_model(model, train_loader, val_loader, epochs=5):
    for epoch in range(1, epochs + 1):
        print(f"\n🌀 Epoch {epoch}/{epochs}")
        train_loss = model.train_step(train_loader)
        val_loss, ppl, acc = model.val_step(val_loader)
        
        print(f"📉 Train Loss: {train_loss:.4f} | 🔍 Val Loss: {val_loss:.4f}")
        print(f"🧠 Val Loss: {val_loss:.4f} | 🤯 Perplexity: {ppl:.2f} | 🎯 Accuracy: {acc*100:.2f}%")


In [24]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

Using device: cuda


In [25]:
model_rnn = UniversalRNNModel(vocab_size, embedding_dim, hidden_dim, num_layers, mode='rnn').to(device)
model_lstm = UniversalRNNModel(vocab_size, embedding_dim, hidden_dim, num_layers, mode='lstm').to(device)
model_gru = UniversalRNNModel(vocab_size, embedding_dim, hidden_dim, num_layers, mode='gru').to(device)


In [27]:
def generate_text(model, word2idx, idx2word, seed_text, max_length=50, device=None):
    """
    Generate text using a trained SimpleRNN model.

    Args:
        model (SimpleRNN): Trained model.
        word2idx (dict): Mapping from word to index.
        idx2word (dict): Mapping from index to word.
        seed_text (str): Seed input text.
        max_length (int): Total length of the generated sequence.
        device: Device to run the model on.

    Returns:
        str: Generated text sequence.
    """
    model.eval()
    device = device or model.device
    model.to(device)

    words = seed_text.strip().split()
    input_ids = [word2idx.get(w, word2idx[list(word2idx.keys())[0]]) for w in words]
    input_tensor = torch.tensor(input_ids, dtype=torch.long).unsqueeze(0).to(device)  # shape: (1, seq_len)

    hidden = model.init_hidden(1)
    generated = input_tensor.clone()

    for _ in range(max_length - len(input_ids)):
        logits, hidden = model(generated, hidden)
        next_token_logits = logits[:, -1, :]  # shape: (1, vocab_size)
        next_token = torch.argmax(next_token_logits, dim=-1)  # shape: (1,)
        generated = torch.cat((generated, next_token.unsqueeze(1)), dim=1)

    generated_words = [idx2word[idx] for idx in generated[0].tolist()]
    return ' '.join(generated_words)


In [31]:
# text = generate_text(model_rnn, word2idx = word2idx, idx2word = idx2word, seed_text="who had also worked on the", max_length=30)
# print(text)
text = generate_text(model_lstm, word2idx = word2idx, idx2word = idx2word, seed_text="who had also worked on the", max_length=30)
print(text)
text = generate_text(model_gru, word2idx = word2idx, idx2word = idx2word, seed_text="who had also worked on the", max_length=30)
print(text)


who had also worked on the album . The song was released in the United States . The first two @-@ year @-@ old Sammo Hung , and the first
who had also worked on the album 's first episode of the song " The " " " The " " " " " " " " " " "
