# Assignment: Text-to-Python Code Generation Using Seq2Seq RNN Models

## Objective
Implement and compare three RNN architectures for code generation:
1. **Vanilla RNN Seq2Seq** - Baseline with fixed-length context
2. **LSTM Seq2Seq** - Improved long-term dependency handling
3. **LSTM with Attention** - Remove context bottleneck

## Dataset
- **Source**: CodeSearchNet Python (Hugging Face)
- **Input**: Natural language docstrings (max 50 tokens)
- **Output**: Python function code (max 80 tokens)
- **Split**: 10,000 training | 1,500 validation | 1,500 test

## Configuration
- Embedding Dimension: 256
- Hidden Dimension: 256
- Batch Size: 64
- Epochs: 20
- Learning Rate: 0.001
- Teacher Forcing Ratio: 0.5
- Loss: Cross-Entropy with padding ignored

In [None]:
!pip install datasets transformers torch torchvision torchaudio nltk sacrebleu matplotlib pandas numpy tqdm -q

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from datasets import load_dataset
from collections import Counter
import re
from tqdm.auto import tqdm
import random
import json
import os
import warnings
warnings.filterwarnings('ignore')

# Seed every random number generator used below with one shared value
SEED = 42
for _seed_rng in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
    _seed_rng(SEED)
torch.backends.cudnn.deterministic = True

# Pick the GPU when one is visible, otherwise fall back to the CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f"Device: {device}")
if device.type == 'cuda':
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    _total_gb = torch.cuda.get_device_properties(0).total_memory / 1e9
    print(f"GPU Memory: {_total_gb:.2f} GB")

## 1. Configuration

In [None]:
# Global Configuration: every hyperparameter read by the cells below
CONFIG = {
    # Dataset
    'TRAIN_SIZE': 10000,
    'VAL_SIZE': 1500,
    'TEST_SIZE': 1500,

    # Sequence lengths
    'MAX_DOCSTRING_LEN': 50,
    'MAX_CODE_LEN': 80,

    # Architecture
    'EMBEDDING_DIM': 256,
    'HIDDEN_DIM': 256,
    'NUM_LAYERS': 2,
    'DROPOUT': 0.3,
    'BIDIRECTIONAL': True,

    # Training
    'BATCH_SIZE': 64,
    'EPOCHS': 20,
    'LEARNING_RATE': 0.001,
    'TEACHER_FORCING_RATIO': 0.5,
    'VOCAB_SIZE': 5000,
    'GRADIENT_CLIP': 1.0,

    # Advanced optimization
    'WARMUP_STEPS': 1000,
    'EARLY_STOPPING_PATIENCE': 3,
    'BEAM_SIZE': 3,
    'SCHEDULED_SAMPLING': True,
}

print("Configuration:")
for key, value in CONFIG.items():
    print(f"  {key}: {value}")

# Save config to JSON for reproducibility.
# The output directory must exist before the file is opened; on a fresh
# checkout nothing has created 'models/' yet at this point in the notebook.
os.makedirs('models', exist_ok=True)
with open('models/config.json', 'w') as f:
    json.dump(CONFIG, f, indent=2)
print("\n✓ Configuration saved to models/config.json")

## 2. Load and Prepare CodeSearchNet Dataset

In [None]:
print("Loading CodeSearchNet Python dataset...")
dataset = load_dataset("Nan-Do/code-search-net-python", split='train')
print(f"Total dataset size: {len(dataset)}")

# Drop rows where either the docstring or the code is missing or blank
print("Filtering dataset...")
dataset = dataset.filter(
    lambda row: all(
        field is not None and len(field.strip()) > 0
        for field in (row['docstring'], row['code'])
    )
)
print(f"Filtered dataset size: {len(dataset)}")

# Number of rows required to fill all three splits
total_needed = sum(CONFIG[key] for key in ('TRAIN_SIZE', 'VAL_SIZE', 'TEST_SIZE'))
print(f"Total samples needed: {total_needed}")

# Shuffle with the shared seed, then keep at most `total_needed` rows
dataset = dataset.shuffle(seed=SEED)
dataset = dataset.select(range(min(total_needed, len(dataset))))
print(f"Sampled {len(dataset)} examples")

# Whitespace-token length statistics for both fields
print("\nDataset Statistics:")
docstring_lengths = [len(row['docstring'].split()) for row in dataset]
code_lengths = [len(row['code'].split()) for row in dataset]
for _label, _lengths in (("Docstring", docstring_lengths), ("Code", code_lengths)):
    print(f"{_label} length - Mean: {np.mean(_lengths):.1f}, Max: {np.max(_lengths)}")

# Show a truncated preview of the first sampled row
print("\nSample example:")
sample = dataset[0]
print(f"Docstring: {sample['docstring'][:100]}...")
print(f"Code:\n{sample['code'][:200]}...")

## 3. Build Tokenizer and Vocabulary

In [None]:
class Tokenizer:
    """Lower-casing whitespace tokenizer with a frequency-capped vocabulary.

    Indices 0-3 are reserved for ``<PAD>``, ``<SOS>``, ``<EOS>`` and ``<UNK>``;
    ``build_vocab`` assigns the remaining indices to the most frequent tokens.
    """

    SPECIAL_TOKENS = ('<PAD>', '<SOS>', '<EOS>', '<UNK>')
    # Compiled once at class creation; matches the punctuation that is split
    # off into stand-alone tokens.
    _PUNCT_RE = re.compile(r'([\(\)\[\]\{\}:,\.=\+\-\*\/])')

    def __init__(self, vocab_size=5000):
        self.vocab_size = vocab_size
        self.word2idx = {token: idx for idx, token in enumerate(self.SPECIAL_TOKENS)}
        self.idx2word = {idx: token for token, idx in self.word2idx.items()}
        self.vocab_built = False

    def tokenize(self, text):
        """Lower-case ``text``, pad punctuation with spaces and split on whitespace."""
        return self._PUNCT_RE.sub(r' \1 ', text.lower()).split()

    def build_vocab(self, texts):
        """Assign indices 4.. to the ``vocab_size - 4`` most common tokens in ``texts``."""
        print(f"Building vocabulary from {len(texts)} texts...")
        counter = Counter()
        for text in tqdm(texts, desc="Building vocab"):
            counter.update(self.tokenize(text))

        first_free = len(self.SPECIAL_TOKENS)
        most_common = counter.most_common(self.vocab_size - first_free)
        for idx, (word, _) in enumerate(most_common, start=first_free):
            self.word2idx[word] = idx
            self.idx2word[idx] = word

        self.vocab_built = True
        print(f"Vocabulary size: {len(self.word2idx)}")

    def encode(self, text, max_len=None, add_special_tokens=True):
        """Convert ``text`` to a list of token indices.

        Args:
            text: Raw input string.
            max_len: Maximum length of the returned list, or ``None``/0 for no
                limit. When special tokens are added, two positions are kept
                for ``<SOS>``/``<EOS>``, so the result is never shorter than
                those two markers.
            add_special_tokens: Wrap the sequence in ``<SOS>`` ... ``<EOS>``.

        Returns:
            List of integer indices; out-of-vocabulary tokens map to ``<UNK>``.
        """
        tokens = self.tokenize(text)
        if max_len:
            # Only reserve room for the markers when they are actually added,
            # and clamp at zero so a tiny max_len cannot turn into a negative
            # slice bound (which would keep almost the whole sequence).
            reserved = 2 if add_special_tokens else 0
            tokens = tokens[:max(max_len - reserved, 0)]

        unk = self.word2idx['<UNK>']
        indices = [self.word2idx.get(token, unk) for token in tokens]

        if add_special_tokens:
            indices = [self.word2idx['<SOS>']] + indices + [self.word2idx['<EOS>']]

        return indices

    def decode(self, indices, skip_special_tokens=True):
        """Convert token indices back to a space-joined string.

        Unknown indices are skipped. With ``skip_special_tokens`` set, special
        tokens are dropped and decoding stops at the first ``<EOS>``.
        """
        tokens = []
        for idx in indices:
            # 0-dim tensors / NumPy scalars expose .item(); unwrap them so the
            # dictionary lookup sees a plain int (tensors hash by identity and
            # would otherwise never match a key).
            if hasattr(idx, 'item'):
                idx = idx.item()
            token = self.idx2word.get(idx)
            if token is None:
                continue
            if skip_special_tokens and token in self.SPECIAL_TOKENS:
                if token == '<EOS>':
                    break
                continue
            tokens.append(token)
        return ' '.join(tokens)

    def save(self, filepath):
        """Write the vocabulary and ``vocab_size`` to ``filepath`` as JSON."""
        data = {
            'word2idx': self.word2idx,
            # JSON object keys must be strings; load() converts them back
            'idx2word': {str(k): v for k, v in self.idx2word.items()},
            'vocab_size': self.vocab_size
        }
        with open(filepath, 'w') as f:
            json.dump(data, f)
        print(f"Tokenizer saved to {filepath}")

    @classmethod
    def load(cls, filepath):
        """Rebuild a tokenizer from a JSON file written by ``save``."""
        with open(filepath, 'r') as f:
            data = json.load(f)

        tokenizer = cls(vocab_size=data['vocab_size'])
        tokenizer.word2idx = data['word2idx']
        tokenizer.idx2word = {int(k): v for k, v in data['idx2word'].items()}
        tokenizer.vocab_built = True
        print(f"Tokenizer loaded from {filepath}")
        return tokenizer

# Build tokenizers
print("\n" + "="*60)
print("Building Tokenizers")
print("="*60)

src_tokenizer = Tokenizer(vocab_size=CONFIG['VOCAB_SIZE'])
tgt_tokenizer = Tokenizer(vocab_size=CONFIG['VOCAB_SIZE'])

# Fit the vocabularies on the training rows only (the first TRAIN_SIZE rows of
# the shuffled dataset, i.e. the rows later selected as the training split).
# Counting tokens from validation/test rows would leak held-out data into the
# model's vocabulary and make the evaluation optimistic.
vocab_rows = dataset.select(range(min(CONFIG['TRAIN_SIZE'], len(dataset))))

# Extract texts in a single pass over the rows
docstrings, codes = [], []
for item in vocab_rows:
    if item['docstring']:
        docstrings.append(item['docstring'])
    if item['code']:
        codes.append(item['code'])

# Build vocabularies
src_tokenizer.build_vocab(docstrings)
tgt_tokenizer.build_vocab(codes)

# Save tokenizers for later use
os.makedirs('models', exist_ok=True)
src_tokenizer.save('models/src_tokenizer.json')
tgt_tokenizer.save('models/tgt_tokenizer.json')

## 4. Create Dataset and DataLoaders

In [None]:
class CodeDataset(Dataset):
    """Pairs of (docstring, code) encoded on the fly as index tensors.

    Each item is a mapping with ``'docstring'`` and ``'code'`` keys; the source
    and target tokenizers turn them into ``torch.long`` tensors capped at
    ``max_src_len`` / ``max_tgt_len``.
    """

    def __init__(self, data, src_tokenizer, tgt_tokenizer, max_src_len, max_tgt_len):
        # Materialise non-list inputs so they can be indexed by position
        if isinstance(data, list):
            self.data = data
        else:
            self.data = list(data)
        self.src_tokenizer = src_tokenizer
        self.tgt_tokenizer = tgt_tokenizer
        self.max_src_len = max_src_len
        self.max_tgt_len = max_tgt_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        record = self.data[idx]
        src_ids = self.src_tokenizer.encode(record['docstring'], max_len=self.max_src_len)
        tgt_ids = self.tgt_tokenizer.encode(record['code'], max_len=self.max_tgt_len)
        return (
            torch.tensor(src_ids, dtype=torch.long),
            torch.tensor(tgt_ids, dtype=torch.long),
        )

def collate_fn(batch):
    """Right-pad the source and target sequences of a batch with index 0.

    ``batch`` is a sequence of ``(src, tgt)`` tensor pairs; the result is a
    ``(src_batch, tgt_batch)`` pair of batch-first padded tensors.
    """
    sources, targets = zip(*batch)
    padded = [
        pad_sequence(sequences, batch_first=True, padding_value=0)
        for sequences in (sources, targets)
    ]
    return padded[0], padded[1]

# Split dataset into consecutive train / validation / test slices
print("\n" + "="*60)
print("Creating Train/Val/Test Splits")
print("="*60)

_train_end = CONFIG['TRAIN_SIZE']
_val_end = _train_end + CONFIG['VAL_SIZE']
_test_end = _val_end + CONFIG['TEST_SIZE']

train_data = dataset.select(range(_train_end))
val_data = dataset.select(range(_train_end, _val_end))
test_data = dataset.select(range(_val_end, _test_end))

print(f"Train size: {len(train_data)}")
print(f"Validation size: {len(val_data)}")
print(f"Test size: {len(test_data)}")

# Wrap each split in a CodeDataset sharing the same tokenizers and length caps
train_dataset, val_dataset, test_dataset = (
    CodeDataset(split, src_tokenizer, tgt_tokenizer,
                CONFIG['MAX_DOCSTRING_LEN'], CONFIG['MAX_CODE_LEN'])
    for split in (train_data, val_data, test_data)
)

# Only the training loader shuffles; held-out loaders keep dataset order
train_loader = DataLoader(train_dataset, batch_size=CONFIG['BATCH_SIZE'],
                          shuffle=True, collate_fn=collate_fn)
val_loader, test_loader = (
    DataLoader(held_out, batch_size=CONFIG['BATCH_SIZE'],
               shuffle=False, collate_fn=collate_fn)
    for held_out in (val_dataset, test_dataset)
)

print(f"\nDataloaders created:")
print(f"  Train batches: {len(train_loader)}")
print(f"  Val batches: {len(val_loader)}")
print(f"  Test batches: {len(test_loader)}")

## 5. Model 1: Vanilla RNN Seq2Seq

In [None]:
class VanillaRNNEncoder(nn.Module):
    """Embedding -> dropout -> single-layer (optionally bidirectional) ``nn.RNN``."""

    def __init__(self, vocab_size, embedding_dim, hidden_dim, dropout=0.3, bidirectional=True):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.bidirectional = bidirectional
        # Index 0 is the padding token, so its embedding stays at zero
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.dropout = nn.Dropout(dropout)
        self.rnn = nn.RNN(
            embedding_dim,
            hidden_dim,
            batch_first=True,
            bidirectional=bidirectional,
        )

    def forward(self, x):
        """Return ``(outputs, hidden)`` of the RNN for a batch of token indices."""
        return self.rnn(self.dropout(self.embedding(x)))

class VanillaRNNDecoder(nn.Module):
    """One-step RNN decoder: embedding -> dropout -> ``nn.RNN`` -> linear vocabulary head."""

    def __init__(self, vocab_size, embedding_dim, hidden_dim, dropout=0.3):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.dropout = nn.Dropout(dropout)
        self.rnn = nn.RNN(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x, hidden):
        """Advance one step and return ``(vocabulary scores, new hidden state)``."""
        step_output, hidden = self.rnn(self.dropout(self.embedding(x)), hidden)
        # Drop the length-1 time axis before projecting onto the vocabulary
        scores = self.fc(step_output.squeeze(1))
        return scores, hidden

class VanillaRNNSeq2Seq(nn.Module):
    """Encoder-decoder wrapper that decodes one target position at a time.

    The encoder must return ``(outputs, hidden)`` and expose a ``bidirectional``
    flag; the decoder must expose ``fc.out_features`` (target vocabulary size).
    """

    def __init__(self, encoder, decoder, device):
        super(VanillaRNNSeq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, src, tgt, teacher_forcing_ratio=0.5):
        """Return scores of shape ``(batch, tgt_len, tgt_vocab)``.

        Position 0 of the result is left at zero: decoding starts from
        ``tgt[:, 0]`` and the first prediction is written at position 1.
        At every step the next decoder input is the ground-truth token with
        probability ``teacher_forcing_ratio``, otherwise the arg-max prediction.
        """
        batch_size = src.shape[0]
        tgt_len = tgt.shape[1]
        tgt_vocab_size = self.decoder.fc.out_features

        # Allocate directly on the target device instead of creating on CPU and copying
        outputs = torch.zeros(batch_size, tgt_len, tgt_vocab_size, device=self.device)

        # Encode
        _, hidden = self.encoder(src)

        # A bidirectional single-layer encoder yields one hidden state per
        # direction (index 0 and 1). The decoder is unidirectional, so the two
        # are averaged into a single (1, batch, hidden_dim) state.
        if self.encoder.bidirectional:
            hidden = ((hidden[0] + hidden[1]) / 2).unsqueeze(0).contiguous()

        # Decode
        decoder_input = tgt[:, 0].unsqueeze(1)
        for t in range(1, tgt_len):
            prediction, hidden = self.decoder(decoder_input, hidden)
            outputs[:, t, :] = prediction

            # One coin flip per step, shared by the whole batch
            teacher_force = random.random() < teacher_forcing_ratio
            top1 = prediction.argmax(1)
            decoder_input = tgt[:, t].unsqueeze(1) if teacher_force else top1.unsqueeze(1)

        return outputs

print("✓ VanillaRNNSeq2Seq model defined")

## 6. Model 2: LSTM Seq2Seq

In [None]:
class LSTMEncoder(nn.Module):
    """Embedding -> dropout -> stacked (optionally bidirectional) ``nn.LSTM``."""

    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_layers=2, dropout=0.3, bidirectional=True):
        super().__init__()
        self.num_layers = num_layers
        self.bidirectional = bidirectional
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.dropout = nn.Dropout(dropout)
        # Inter-layer dropout is only passed on when there is more than one layer
        inter_layer_dropout = dropout if num_layers > 1 else 0
        self.lstm = nn.LSTM(
            embedding_dim,
            hidden_dim,
            num_layers=num_layers,
            batch_first=True,
            dropout=inter_layer_dropout,
            bidirectional=bidirectional,
        )

    def forward(self, x):
        """Return ``(outputs, hidden, cell)`` for a batch of token indices."""
        outputs, (hidden, cell) = self.lstm(self.dropout(self.embedding(x)))
        return outputs, hidden, cell

class LSTMDecoder(nn.Module):
    """One-step LSTM decoder: embedding -> dropout -> stacked ``nn.LSTM`` -> linear head."""

    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_layers=2, dropout=0.3):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.dropout = nn.Dropout(dropout)
        # Inter-layer dropout is only passed on when there is more than one layer
        inter_layer_dropout = dropout if num_layers > 1 else 0
        self.lstm = nn.LSTM(
            embedding_dim,
            hidden_dim,
            num_layers=num_layers,
            batch_first=True,
            dropout=inter_layer_dropout,
        )
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x, hidden, cell):
        """Advance one step and return ``(vocabulary scores, hidden, cell)``."""
        step_output, (hidden, cell) = self.lstm(
            self.dropout(self.embedding(x)), (hidden, cell)
        )
        # Drop the length-1 time axis before projecting onto the vocabulary
        scores = self.fc(step_output.squeeze(1))
        return scores, hidden, cell

class LSTMSeq2Seq(nn.Module):
    """LSTM encoder-decoder wrapper that decodes one target position at a time."""

    def __init__(self, encoder, decoder, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    @staticmethod
    def _average_directions(state, num_layers):
        """Average the forward/backward halves of a ``(num_layers*2, batch, dim)`` state per layer."""
        _, batch, width = state.shape
        paired = state.view(num_layers, 2, batch, width)
        return (paired[:, 0] + paired[:, 1]).contiguous() / 2

    def forward(self, src, tgt, teacher_forcing_ratio=0.5):
        """Return scores of shape ``(batch, tgt_len, tgt_vocab)``; position 0 stays zero."""
        n_steps = tgt.shape[1]
        vocab = self.decoder.fc.out_features
        outputs = torch.zeros(src.shape[0], n_steps, vocab).to(self.device)

        # Encode; only the final states are used
        _, hidden, cell = self.encoder(src)

        # The decoder is unidirectional, so a bidirectional encoder's two
        # directions are averaged layer by layer
        if self.encoder.bidirectional:
            depth = self.encoder.num_layers
            hidden = self._average_directions(hidden, depth)
            cell = self._average_directions(cell, depth)

        # Decode, starting from the first target column
        decoder_input = tgt[:, 0].unsqueeze(1)
        for step in range(1, n_steps):
            prediction, hidden, cell = self.decoder(decoder_input, hidden, cell)
            outputs[:, step, :] = prediction

            # One coin flip per step decides between ground truth and arg-max
            use_truth = random.random() < teacher_forcing_ratio
            greedy = prediction.argmax(1)
            if use_truth:
                decoder_input = tgt[:, step].unsqueeze(1)
            else:
                decoder_input = greedy.unsqueeze(1)

        return outputs

print("✓ LSTMSeq2Seq model defined")

## 7. Model 3: LSTM with Attention

In [None]:
class Attention(nn.Module):
    """Additive (Bahdanau-style) attention over a sequence of encoder states."""

    def __init__(self, hidden_dim):
        super().__init__()
        # The scorer sees the decoder state concatenated with one encoder state
        self.attn = nn.Linear(hidden_dim * 2, hidden_dim)
        self.v = nn.Linear(hidden_dim, 1, bias=False)

    def forward(self, hidden, encoder_outputs):
        """
        hidden: (1, batch, hidden_dim)
        encoder_outputs: (batch, src_len, hidden_dim) - already projected

        Returns weights of shape (batch, src_len) that sum to 1 over src_len.
        """
        n_src = encoder_outputs.shape[1]

        # Copy the decoder state once per source position: (batch, src_len, hidden_dim)
        query = hidden.squeeze(0).unsqueeze(1)
        tiled_query = query.repeat(1, n_src, 1)

        # Score every source position, then normalise across positions
        pair = torch.cat((tiled_query, encoder_outputs), dim=2)
        energy = torch.tanh(self.attn(pair))
        scores = self.v(energy).squeeze(2)
        return torch.softmax(scores, dim=1)

class AttentionDecoder(nn.Module):
    """Single-layer LSTM decoder that attends over encoder outputs at every step.

    Encoder outputs wider than ``hidden_dim`` (``encoder_output_dim`` differs
    from ``hidden_dim``) are first mapped to ``hidden_dim`` by a linear layer.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, encoder_output_dim=None):
        super().__init__()
        self.hidden_dim = hidden_dim
        # Default: encoder outputs already have the decoder's width
        self.encoder_output_dim = hidden_dim if encoder_output_dim is None else encoder_output_dim

        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)

        # Optional projection of encoder outputs down/up to hidden_dim
        needs_projection = self.encoder_output_dim != hidden_dim
        self.encoder_proj = nn.Linear(self.encoder_output_dim, hidden_dim) if needs_projection else None

        # Attention operates purely in hidden_dim because of the projection above
        self.attention = Attention(hidden_dim)

        # LSTM input = token embedding concatenated with the attention context
        self.lstm = nn.LSTM(embedding_dim + hidden_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x, hidden, cell, encoder_outputs):
        """Advance one step; return ``(scores, hidden, cell, attention weights)``."""
        embedded = self.embedding(x)

        if self.encoder_proj is None:
            memory = encoder_outputs
        else:
            memory = self.encoder_proj(encoder_outputs)

        # Weighted sum of the (projected) encoder outputs: (batch, 1, hidden_dim)
        attn_weights = self.attention(hidden, memory)
        context = torch.bmm(attn_weights.unsqueeze(1), memory)

        step_input = torch.cat((embedded, context), dim=2)
        step_output, (hidden, cell) = self.lstm(step_input, (hidden, cell))
        scores = self.fc(step_output.squeeze(1))

        return scores, hidden, cell, attn_weights

class BiLSTMEncoder(nn.Module):
    """Single-layer bidirectional LSTM encoder with direction-averaged final states."""

    def __init__(self, vocab_size, embedding_dim, hidden_dim):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True, bidirectional=True)

    @staticmethod
    def _merge(state):
        """Average index 0 (forward) and index 1 (backward) into a (1, batch, hidden_dim) state."""
        forward_part = state[0].unsqueeze(0)
        backward_part = state[1].unsqueeze(0)
        return ((forward_part + backward_part) / 2).contiguous()

    def forward(self, x):
        """Return ``(outputs, hidden, cell)``; outputs keep both directions, states are averaged."""
        outputs, (hidden, cell) = self.lstm(self.embedding(x))
        return outputs, self._merge(hidden), self._merge(cell)

class LSTMAttentionSeq2Seq(nn.Module):
    """Seq2seq wrapper that decodes step by step with attention over encoder outputs.

    The encoder must return ``(encoder_outputs, hidden, cell)``. The decoder is
    called once per target position with a single-layer ``(1, batch, hidden_dim)``
    state and must expose ``fc.out_features`` (the target vocabulary size).
    """

    def __init__(self, encoder, decoder, device):
        super(LSTMAttentionSeq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def _last_layer_state(self, state):
        """Reduce an encoder state to the single-layer shape the decoder takes.

        ``state`` is laid out as ``(num_layers * num_directions, batch, hidden_dim)``
        with the directions of the last layer stored last. For a bidirectional
        encoder the last layer's forward and backward states are averaged;
        otherwise the last layer is taken as is. Encoders without a
        ``bidirectional`` attribute are treated as unidirectional, so a state
        that is already ``(1, batch, hidden_dim)`` passes through unchanged.
        """
        if getattr(self.encoder, 'bidirectional', False):
            return ((state[-2] + state[-1]) / 2).unsqueeze(0).contiguous()
        # Without this, a multi-layer unidirectional encoder would hand a
        # (num_layers, batch, hidden_dim) state to the single-layer decoder.
        return state[-1:].contiguous()

    def forward(self, src, tgt, teacher_forcing_ratio=0.5):
        """Return ``(outputs, attentions)``.

        ``outputs`` has shape ``(batch, tgt_len, tgt_vocab)`` and ``attentions``
        has shape ``(batch, tgt_len, src_len)``; position 0 of both stays zero
        because decoding starts from ``tgt[:, 0]``. At every step the next
        decoder input is the ground-truth token with probability
        ``teacher_forcing_ratio``, otherwise the arg-max prediction.
        """
        batch_size = src.shape[0]
        tgt_len = tgt.shape[1]
        tgt_vocab_size = self.decoder.fc.out_features

        # Allocate directly on the target device instead of creating on CPU and copying
        outputs = torch.zeros(batch_size, tgt_len, tgt_vocab_size, device=self.device)
        attentions = torch.zeros(batch_size, tgt_len, src.shape[1], device=self.device)

        # Encode
        encoder_outputs, hidden, cell = self.encoder(src)

        # Collapse layers/directions to the single-layer decoder state
        hidden = self._last_layer_state(hidden)
        cell = self._last_layer_state(cell)

        # Decode
        decoder_input = tgt[:, 0].unsqueeze(1)
        for t in range(1, tgt_len):
            prediction, hidden, cell, attn_weights = self.decoder(decoder_input, hidden, cell, encoder_outputs)
            outputs[:, t, :] = prediction
            attentions[:, t, :] = attn_weights

            # One coin flip per step, shared by the whole batch
            teacher_force = random.random() < teacher_forcing_ratio
            top1 = prediction.argmax(1)
            decoder_input = tgt[:, t].unsqueeze(1) if teacher_force else top1.unsqueeze(1)

        return outputs, attentions

print("✓ LSTMAttentionSeq2Seq model defined")

## 8. Training Functions

In [None]:
def train_epoch(model, dataloader, optimizer, criterion, device, teacher_forcing_ratio, use_attention=False):
    """Run one optimisation pass over ``dataloader`` and return the mean batch loss.

    Set ``use_attention`` for models whose forward call returns
    ``(outputs, attentions)`` instead of just ``outputs``.
    """
    model.train()
    running_loss = 0

    for src, tgt in tqdm(dataloader, desc="Training", leave=False):
        src = src.to(device)
        tgt = tgt.to(device)

        optimizer.zero_grad()

        if use_attention:
            logits, _ = model(src, tgt, teacher_forcing_ratio)
        else:
            logits = model(src, tgt, teacher_forcing_ratio)

        # Skip position 0 (the start token) and flatten batch and time axes
        vocab_dim = logits.shape[-1]
        flat_logits = logits[:, 1:].reshape(-1, vocab_dim)
        flat_targets = tgt[:, 1:].reshape(-1)

        loss = criterion(flat_logits, flat_targets)
        loss.backward()

        # Clip the global gradient norm to the configured maximum
        torch.nn.utils.clip_grad_norm_(model.parameters(), CONFIG['GRADIENT_CLIP'])

        optimizer.step()
        running_loss += loss.item()

    return running_loss / len(dataloader)

def evaluate(model, dataloader, criterion, device, use_attention=False):
    """Return the mean batch loss over ``dataloader`` without updating the model.

    The model is put in eval mode and called with ``teacher_forcing_ratio=0``,
    so every decoder input after the first is the model's own prediction.
    """
    model.eval()
    running_loss = 0

    with torch.no_grad():
        for src, tgt in tqdm(dataloader, desc="Evaluating", leave=False):
            src = src.to(device)
            tgt = tgt.to(device)

            if use_attention:
                logits, _ = model(src, tgt, teacher_forcing_ratio=0)
            else:
                logits = model(src, tgt, teacher_forcing_ratio=0)

            # Skip position 0 (the start token) and flatten batch and time axes
            vocab_dim = logits.shape[-1]
            flat_logits = logits[:, 1:].reshape(-1, vocab_dim)
            flat_targets = tgt[:, 1:].reshape(-1)

            running_loss += criterion(flat_logits, flat_targets).item()

    return running_loss / len(dataloader)

def train_model(model, train_loader, val_loader, optimizer, criterion, device,
                epochs, teacher_forcing_ratio, model_name, use_attention=False):
    """Train for ``epochs`` epochs and checkpoint the lowest-validation-loss weights.

    A checkpoint is written to ``models/{model_name}_best.pt`` whenever the
    validation loss improves. Returns ``(train_losses, val_losses)``, one entry
    per epoch.
    """
    history_train, history_val = [], []
    best_val_loss = float('inf')
    best_epoch = 0
    rule = "="*60

    print(f"\nTraining {model_name}...")
    print(rule)

    for epoch in range(epochs):
        train_loss = train_epoch(model, train_loader, optimizer, criterion, device,
                                 teacher_forcing_ratio, use_attention)
        val_loss = evaluate(model, val_loader, criterion, device, use_attention)

        history_train.append(train_loss)
        history_val.append(val_loss)

        # The line is finished below, once we know whether a checkpoint was saved
        print(f"Epoch {epoch+1:2d}/{epochs} | Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f}", end="")

        if not val_loss < best_val_loss:
            print()
            continue

        # New best validation loss: remember it and persist the weights
        best_val_loss = val_loss
        best_epoch = epoch
        checkpoint = {
            'model_state_dict': model.state_dict(),
            'epoch': epoch,
            'loss': val_loss,
            'config': CONFIG
        }
        torch.save(checkpoint, f'models/{model_name}_best.pt')
        print(" ✓ (saved)")

    print(rule)
    print(f"Best model: {model_name}_best.pt (Epoch {best_epoch+1}, Loss: {best_val_loss:.4f})")

    return history_train, history_val

print("✓ Training functions defined")

## 9. Train Model 1: Vanilla RNN

In [None]:
print("\n" + "="*60)
print("MODEL 1: VANILLA RNN SEQ2SEQ")
print("="*60)

# Vocabulary sizes come from the fitted tokenizers
src_vocab_size = len(src_tokenizer.word2idx)
tgt_vocab_size = len(tgt_tokenizer.word2idx)

# Assemble encoder, decoder and the seq2seq wrapper
rnn_encoder = VanillaRNNEncoder(
    src_vocab_size,
    CONFIG['EMBEDDING_DIM'],
    CONFIG['HIDDEN_DIM'],
    dropout=CONFIG['DROPOUT'],
    bidirectional=CONFIG['BIDIRECTIONAL'],
)
rnn_decoder = VanillaRNNDecoder(
    tgt_vocab_size,
    CONFIG['EMBEDDING_DIM'],
    CONFIG['HIDDEN_DIM'],
    dropout=CONFIG['DROPOUT'],
)
rnn_model = VanillaRNNSeq2Seq(rnn_encoder, rnn_decoder, device).to(device)

# Number of trainable parameters
rnn_params = sum(weight.numel() for weight in rnn_model.parameters() if weight.requires_grad)
print(f"Parameters: {rnn_params:,}\n")

# Loss ignores padding positions (index 0); Adam with the configured learning rate
criterion = nn.CrossEntropyLoss(ignore_index=0)
rnn_optimizer = optim.Adam(rnn_model.parameters(), lr=CONFIG['LEARNING_RATE'])

# Train
rnn_train_losses, rnn_val_losses = train_model(
    rnn_model,
    train_loader,
    val_loader,
    rnn_optimizer,
    criterion,
    device,
    CONFIG['EPOCHS'],
    CONFIG['TEACHER_FORCING_RATIO'],
    'vanilla_rnn',
    use_attention=False,
)

## 10. Train Model 2: LSTM

In [None]:
print("\n" + "="*60)
print("MODEL 2: LSTM SEQ2SEQ")
print("="*60)

# Assemble encoder, decoder and the seq2seq wrapper
lstm_encoder = LSTMEncoder(
    src_vocab_size,
    CONFIG['EMBEDDING_DIM'],
    CONFIG['HIDDEN_DIM'],
    num_layers=CONFIG['NUM_LAYERS'],
    dropout=CONFIG['DROPOUT'],
    bidirectional=CONFIG['BIDIRECTIONAL'],
)
lstm_decoder = LSTMDecoder(
    tgt_vocab_size,
    CONFIG['EMBEDDING_DIM'],
    CONFIG['HIDDEN_DIM'],
    num_layers=CONFIG['NUM_LAYERS'],
    dropout=CONFIG['DROPOUT'],
)
lstm_model = LSTMSeq2Seq(lstm_encoder, lstm_decoder, device).to(device)

# Number of trainable parameters
lstm_params = sum(weight.numel() for weight in lstm_model.parameters() if weight.requires_grad)
print(f"Parameters: {lstm_params:,}\n")

# Adam with the configured learning rate; the loss object is reused from model 1
lstm_optimizer = optim.Adam(lstm_model.parameters(), lr=CONFIG['LEARNING_RATE'])

# Train
lstm_train_losses, lstm_val_losses = train_model(
    lstm_model,
    train_loader,
    val_loader,
    lstm_optimizer,
    criterion,
    device,
    CONFIG['EPOCHS'],
    CONFIG['TEACHER_FORCING_RATIO'],
    'lstm',
    use_attention=False,
)

## 11. Train Model 3: LSTM with Attention

In [None]:
print("\n" + "="*60)
print("MODEL 3: LSTM WITH ATTENTION")
print("="*60)

# Encoder shares the LSTM encoder class and settings used by model 2
attn_encoder = LSTMEncoder(
    src_vocab_size,
    CONFIG['EMBEDDING_DIM'],
    CONFIG['HIDDEN_DIM'],
    num_layers=CONFIG['NUM_LAYERS'],
    dropout=CONFIG['DROPOUT'],
    bidirectional=CONFIG['BIDIRECTIONAL'],
)

# A bidirectional encoder emits outputs twice as wide as HIDDEN_DIM
if CONFIG['BIDIRECTIONAL']:
    encoder_output_dim = CONFIG['HIDDEN_DIM'] * 2
else:
    encoder_output_dim = CONFIG['HIDDEN_DIM']

attn_decoder = AttentionDecoder(
    tgt_vocab_size,
    CONFIG['EMBEDDING_DIM'],
    CONFIG['HIDDEN_DIM'],
    encoder_output_dim=encoder_output_dim,
)
attn_model = LSTMAttentionSeq2Seq(attn_encoder, attn_decoder, device).to(device)

# Number of trainable parameters
attn_params = sum(weight.numel() for weight in attn_model.parameters() if weight.requires_grad)
print(f"Parameters: {attn_params:,}\n")

# Adam with the configured learning rate; the loss object is reused from model 1
attn_optimizer = optim.Adam(attn_model.parameters(), lr=CONFIG['LEARNING_RATE'])

# Train (the attention model returns (outputs, attentions), hence use_attention=True)
attn_train_losses, attn_val_losses = train_model(
    attn_model,
    train_loader,
    val_loader,
    attn_optimizer,
    criterion,
    device,
    CONFIG['EPOCHS'],
    CONFIG['TEACHER_FORCING_RATIO'],
    'lstm_attention',
    use_attention=True,
)

## 12. Save Training Results and Configuration

In [None]:
# Persist per-epoch loss curves, parameter counts and the configuration
import pickle

training_history = {
    run_name: {'train_losses': train_curve, 'val_losses': val_curve}
    for run_name, train_curve, val_curve in (
        ('vanilla_rnn', rnn_train_losses, rnn_val_losses),
        ('lstm', lstm_train_losses, lstm_val_losses),
        ('lstm_attention', attn_train_losses, attn_val_losses),
    )
}

with open('models/training_history.pkl', 'wb') as f:
    pickle.dump(training_history, f)

# Trainable-parameter count of each model
model_params = dict(
    vanilla_rnn=rnn_params,
    lstm=lstm_params,
    lstm_attention=attn_params,
)

with open('models/model_params.json', 'w') as f:
    json.dump(model_params, f, indent=2)

# Configuration used for this run
with open('models/config.json', 'w') as f:
    json.dump(CONFIG, f, indent=2)

print("\n" + "="*60)
print("TRAINING COMPLETE")
print("="*60)
print("\nSaved files in 'models/' directory:")
for _saved_name in (
    'vanilla_rnn_best.pt',
    'lstm_best.pt',
    'lstm_attention_best.pt',
    'training_history.pkl',
    'model_params.json',
    'config.json',
    'src_tokenizer.json',
    'tgt_tokenizer.json',
):
    print(f"  ✓ {_saved_name}")
print("\nReady for analytics on MacBook M1!")