In [6]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from torch.utils.data import Dataset, DataLoader
from collections import defaultdict
import random
import os


In [7]:
# Toy parallel corpus: (Latin-script word, Devanagari-script word).
data_pairs = [
    ("namaste", "नमस्ते"),
    ("shiva", "शिव"),
    ("ganesha", "गणेश"),
    ("ram", "राम"),
    ("krishna", "कृष्ण"),
    ("sita", "सीता")
]

# Sorted, de-duplicated alphabets of the source and target sides.
input_chars = sorted({ch for source_word, _ in data_pairs for ch in source_word})
output_chars = sorted({ch for _, target_word in data_pairs for ch in target_word})

# Source vocabulary: real characters occupy 1..N, index 0 is reserved for padding.
input_char2idx = dict(zip(input_chars, range(1, len(input_chars) + 1)))
input_char2idx["<PAD>"] = 0

# Target vocabulary: characters occupy 1..M, then <PAD>=0, <SOS>=M+1, <EOS>=M+2.
output_char2idx = dict(zip(output_chars, range(1, len(output_chars) + 1)))
output_char2idx["<PAD>"] = 0
output_char2idx["<SOS>"] = len(output_chars) + 1
output_char2idx["<EOS>"] = len(output_chars) + 2

# Reverse lookup used to turn predicted ids back into characters.
idx2output_char = dict(zip(output_char2idx.values(), output_char2idx.keys()))

# Longest word on either side of the corpus, plus two slots for <SOS> and <EOS>.
MAX_LENGTH = 2 + max(len(word) for pair in data_pairs for word in pair)


In [8]:
class CharSeqDataset(Dataset):
    """Map-style dataset that turns (source, target) string pairs into id tensors.

    Characters are looked up in the module-level ``input_char2idx`` and
    ``output_char2idx`` tables, and both sequences are right-padded with
    ``<PAD>`` up to the module-level ``MAX_LENGTH``.
    """

    def __init__(self, data_pairs):
        # Sequence of (source_string, target_string) tuples, kept by reference.
        self.data = data_pairs

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(encoder_input, decoder_input, decoder_target)`` for item ``idx``.

        The target is wrapped as ``<SOS> ... <EOS>`` and padded; the decoder
        input is that sequence without its last element and the decoder target
        is the same sequence without its first element, so position ``t`` of
        the input lines up with the token expected at position ``t`` of the
        target.
        """
        source_text, target_text = self.data[idx]

        encoder_ids = [input_char2idx[symbol] for symbol in source_text]
        wrapped_ids = [
            output_char2idx["<SOS>"],
            *[output_char2idx[symbol] for symbol in target_text],
            output_char2idx["<EOS>"],
        ]

        # A non-positive pad count yields an empty list, i.e. no padding is added.
        encoder_ids.extend([input_char2idx["<PAD>"]] * (MAX_LENGTH - len(encoder_ids)))
        wrapped_ids.extend([output_char2idx["<PAD>"]] * (MAX_LENGTH - len(wrapped_ids)))

        encoder_input = torch.tensor(encoder_ids)
        decoder_input = torch.tensor(wrapped_ids[:-1])
        decoder_target = torch.tensor(wrapped_ids[1:])
        return encoder_input, decoder_input, decoder_target


In [9]:
class Seq2SeqModel(nn.Module):
    """Recurrent encoder-decoder over integer token ids.

    The encoder's final recurrent state is used as the decoder's initial
    state; the decoder outputs are projected to target-vocabulary logits.

    Args:
        input_vocab_size: Number of rows in the encoder embedding table.
        target_vocab_size: Number of rows in the decoder embedding table and
            the size of the output logit dimension.
        embedding_dim: Width of both embedding tables.
        hidden_dim: Hidden size of the encoder and decoder RNNs.
        num_layers: Number of stacked recurrent layers in each RNN.
        cell_type: One of ``'rnn'``, ``'lstm'`` or ``'gru'`` (case-insensitive).

    Raises:
        KeyError: If ``cell_type`` is not one of the supported names.
    """

    def __init__(self, input_vocab_size, target_vocab_size, embedding_dim, hidden_dim,
                 num_layers=1, cell_type='lstm'):
        super().__init__()

        self.cell_type = cell_type.lower()
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers

        rnn_classes = {'rnn': nn.RNN, 'lstm': nn.LSTM, 'gru': nn.GRU}
        if self.cell_type not in rnn_classes:
            # Still a KeyError (as a plain dict lookup would raise), but with a
            # message that tells the caller which values are accepted.
            raise KeyError(
                f"Unsupported cell_type {cell_type!r}; expected one of {sorted(rnn_classes)}"
            )
        rnn = rnn_classes[self.cell_type]

        # Index 0 is the padding id on both sides.
        self.encoder_embedding = nn.Embedding(input_vocab_size, embedding_dim, padding_idx=0)
        self.decoder_embedding = nn.Embedding(target_vocab_size, embedding_dim, padding_idx=0)

        # Encoder and decoder share the cell type and depth so the encoder's
        # final state can be passed to the decoder unchanged.
        self.encoder = rnn(embedding_dim, hidden_dim, num_layers=num_layers, batch_first=True)
        self.decoder = rnn(embedding_dim, hidden_dim, num_layers=num_layers, batch_first=True)

        self.output_fc = nn.Linear(hidden_dim, target_vocab_size)

    def forward(self, src, tgt):
        """Compute target-vocabulary logits for every decoder position.

        Args:
            src: Integer id tensor of source tokens, batch dimension first.
            tgt: Integer id tensor of decoder-input tokens, batch dimension first.

        Returns:
            Tensor of logits with one ``target_vocab_size``-wide vector per
            position of ``tgt``.
        """
        src_embed = self.encoder_embedding(src)
        tgt_embed = self.decoder_embedding(tgt)

        # Only the encoder's final state is kept. Sequences are not packed, so
        # any trailing padding positions in ``src`` are also stepped through.
        _, hidden = self.encoder(src_embed)

        # Decode the whole decoder-input sequence in one call, seeded with the
        # encoder state.
        output, _ = self.decoder(tgt_embed, hidden)

        return self.output_fc(output)


In [10]:
def train_seq2seq(model, dataloader, criterion, optimizer, device, epochs=5):
    """Run ``epochs`` passes over ``dataloader`` and print the mean batch loss of each.

    Every batch is a ``(src, tgt_input, tgt_output)`` triple. ``model`` is
    called as ``model(src, tgt_input)``; its output is flattened to one row of
    logits per position and scored by ``criterion`` against the flattened
    ``tgt_output``.

    Args:
        model: Module invoked as ``model(src, tgt_input)``.
        dataloader: Re-iterable of ``(src, tgt_input, tgt_output)`` tensor
            triples. It does not need to implement ``__len__``.
        criterion: Loss callable taking ``(logits_2d, targets_1d)``.
        optimizer: Optimizer over the model's parameters.
        device: Device every batch tensor is moved to before the forward pass.
        epochs: Number of passes over ``dataloader``.

    Returns:
        list[float]: Mean batch loss of each epoch, in order.

    Raises:
        ValueError: If ``dataloader`` yields no batches in an epoch.
    """
    model.train()
    epoch_losses = []
    for epoch in range(epochs):
        total_loss = 0.0
        # Counted by hand so loaders without ``__len__`` work as well.
        num_batches = 0
        for src, tgt_input, tgt_output in dataloader:
            src, tgt_input, tgt_output = src.to(device), tgt_input.to(device), tgt_output.to(device)

            optimizer.zero_grad()
            outputs = model(src, tgt_input)  # (B, T, V)
            # reshape (not view) so non-contiguous tensors are accepted too.
            loss = criterion(outputs.reshape(-1, outputs.size(-1)), tgt_output.reshape(-1))
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            num_batches += 1

        if num_batches == 0:
            # Explicit error instead of a ZeroDivisionError from the average below.
            raise ValueError("dataloader yielded no batches; nothing to train on")

        mean_loss = total_loss / num_batches
        epoch_losses.append(mean_loss)
        print(f"Epoch {epoch+1}, Loss: {mean_loss:.4f}")

    return epoch_losses


In [19]:
def predict_seq2seq(model, src_seq, device, max_len=MAX_LENGTH):
    """Greedily decode the output string for one source string.

    The source is converted to ids with ``input_char2idx``, right-padded to
    ``MAX_LENGTH`` and encoded once. The decoder is then stepped one token at
    a time starting from ``<SOS>``, always taking the highest-scoring token,
    until ``<EOS>`` is produced or ``max_len`` steps have run.

    Args:
        model: Object exposing ``encoder_embedding``, ``encoder``,
            ``decoder_embedding``, ``decoder`` and ``output_fc``.
        src_seq: Source string; every character must be a key of
            ``input_char2idx``.
        device: Device on which the tensors are created.
        max_len: Maximum number of decoder steps.

    Returns:
        str: The decoded characters joined together, without ``<EOS>``.

    Raises:
        KeyError: If ``src_seq`` contains a character missing from
            ``input_char2idx``.

    Note:
        Calls ``model.eval()`` and does not switch the model back afterwards.
    """
    model.eval()

    # Convert input characters to indices, padded the same way CharSeqDataset
    # pads source sequences.
    src_indices = [input_char2idx[ch] for ch in src_seq]
    src_indices += [input_char2idx["<PAD>"]] * (MAX_LENGTH - len(src_indices))
    src_tensor = torch.tensor(src_indices, dtype=torch.long, device=device).unsqueeze(0)  # (1, T)

    sos_id = output_char2idx["<SOS>"]
    eos_id = output_char2idx["<EOS>"]
    # <PAD> and <SOS> are never valid output characters; without masking, an
    # argmax landing on them would put the literal marker text in the result.
    blocked_ids = [output_char2idx["<PAD>"], sos_id]

    with torch.no_grad():
        # Encoder forward
        encoder_embedded = model.encoder_embedding(src_tensor)  # (1, T, embed_dim)
        _, hidden = model.encoder(encoder_embedded)

        # Start decoder with <SOS>
        decoder_input = torch.tensor([[sos_id]], dtype=torch.long, device=device)  # (1, 1)
        decoded_output = []

        for _ in range(max_len):
            decoder_embedded = model.decoder_embedding(decoder_input)  # (1, 1, embed_dim)
            output, hidden = model.decoder(decoder_embedded, hidden)   # (1, 1, hidden_dim)
            logits = model.output_fc(output[:, 0, :])                   # (1, vocab_size)
            logits[:, blocked_ids] = float("-inf")

            # keepdim=True gives a (1, 1) id tensor that is fed straight back
            # in as the next decoder input.
            next_token = logits.argmax(1, keepdim=True)
            predicted_id = next_token.item()

            if predicted_id == eos_id:
                break

            decoded_output.append(idx2output_char[predicted_id])
            decoder_input = next_token

    return ''.join(decoded_output)


In [21]:
# Reproducibility: weight initialisation and DataLoader shuffling both draw
# from torch's global RNG, so seed it (and the other imported RNGs) before
# anything is built. Change SEED to obtain a different run.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# Device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Hyperparameters
embedding_dim = 64
hidden_dim = 128
num_layers = 1
cell_type = 'lstm'

# Vocabulary sizes (the special tokens are already entries of the lookup tables)
input_vocab_size = len(input_char2idx)
target_vocab_size = len(output_char2idx)

# Dataset and DataLoader
dataset = CharSeqDataset(data_pairs)
dataloader = DataLoader(dataset, batch_size=2, shuffle=True)

# Model
model = Seq2SeqModel(input_vocab_size, target_vocab_size, embedding_dim, hidden_dim,
                     num_layers=num_layers, cell_type=cell_type).to(device)

# Index 0 is <PAD>; padded target positions do not contribute to the loss.
criterion = nn.CrossEntropyLoss(ignore_index=0)
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Train
train_seq2seq(model, dataloader, criterion, optimizer, device, epochs=30)

# Predictions below are made on the training pairs themselves, so they show
# how well the model fits this data rather than how it handles unseen words.
print("\nSample Predictions:")
for x, y in data_pairs:
    prediction = predict_seq2seq(model, x, device)
    print(f"{x} -> {prediction}")




Epoch 1, Loss: 2.9893
Epoch 2, Loss: 2.9047
Epoch 3, Loss: 2.8330
Epoch 4, Loss: 2.7629
Epoch 5, Loss: 2.6846
Epoch 6, Loss: 2.5967
Epoch 7, Loss: 2.4911
Epoch 8, Loss: 2.3620
Epoch 9, Loss: 2.2039
Epoch 10, Loss: 2.0232
Epoch 11, Loss: 1.8540
Epoch 12, Loss: 1.6640
Epoch 13, Loss: 1.4942
Epoch 14, Loss: 1.3076
Epoch 15, Loss: 1.1343
Epoch 16, Loss: 0.9805
Epoch 17, Loss: 0.8342
Epoch 18, Loss: 0.7057
Epoch 19, Loss: 0.5953
Epoch 20, Loss: 0.5129
Epoch 21, Loss: 0.4300
Epoch 22, Loss: 0.3679
Epoch 23, Loss: 0.3119
Epoch 24, Loss: 0.2690
Epoch 25, Loss: 0.2337
Epoch 26, Loss: 0.1991
Epoch 27, Loss: 0.1798
Epoch 28, Loss: 0.1579
Epoch 29, Loss: 0.1426
Epoch 30, Loss: 0.1282

Sample Predictions:
namaste -> नमस्ते
shiva -> शिव
ganesha -> गणेश
ram -> राम
krishna -> कृष्ण
sita -> सीता
