<a href="https://colab.research.google.com/github/Suhaila-Hassan/LSTM-Machine-Translation/blob/main/LSTM_Machine_Translation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# LSTM Machine Translation (English-French)

In [None]:
#!pip install --upgrade torch torchvision torchaudio
#!pip install --upgrade datasets fsspec

# Libraries

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from datasets import load_dataset
from collections import Counter
import re
import random

# Text Preprocessing

In [None]:
def tokenize(text):
    """Split ``text`` into lowercase word tokens.

    The text is lowercased first, then every maximal run of word characters
    (letters, digits, underscore) is returned as one token. Punctuation and
    whitespace are dropped, so an apostrophe splits a word in two
    (``"j'aime"`` becomes ``["j", "aime"]``).

    Args:
        text: The raw sentence.

    Returns:
        A list of token strings in order of appearance (empty for empty input).
    """
    lowered = text.lower()
    return [match.group(0) for match in re.finditer(r"\b\w+\b", lowered)]

def build_vocab(sentences, max_size=5000):
    """Build a token -> id mapping from the most frequent words in ``sentences``.

    Ids 0-3 are reserved for ``<pad>``, ``<unk>``, ``<sos>`` and ``<eos>``.
    The remaining ``max_size - 4`` slots go to the most common tokens
    (as produced by ``tokenize``), numbered in descending frequency order.

    Args:
        sentences: Iterable of raw sentence strings.
        max_size: Upper bound on the vocabulary size, special tokens included.

    Returns:
        A dict mapping each token string to its integer id.
    """
    vocab = {"<pad>": 0, "<unk>": 1, "<sos>": 2, "<eos>": 3}
    frequencies = Counter(
        token for sentence in sentences for token in tokenize(sentence)
    )
    remaining_slots = max_size - len(vocab)
    first_free_id = len(vocab)
    ranked = frequencies.most_common(remaining_slots)
    for token_id, (word, _) in enumerate(ranked, start=first_free_id):
        vocab[word] = token_id
    return vocab

def encode(sentence, vocab, max_len):
    """Convert a sentence into a fixed-length list of token ids.

    The sentence is tokenized, wrapped in ``<sos>`` / ``<eos>``, mapped through
    ``vocab`` (unknown tokens become ``<unk>``) and right-padded with ``<pad>``
    up to ``max_len``.

    When the sentence is too long, the *word* tokens are truncated rather than
    the finished sequence, so the result still ends with ``<eos>``. Cutting the
    finished sequence instead would drop the end marker for every long
    sentence.

    Args:
        sentence: The raw sentence string.
        vocab: Token -> id mapping containing the four special tokens.
        max_len: Length of the returned list.

    Returns:
        A list of ``max_len`` integer ids (for ``max_len >= 0``).
    """
    limit = max(0, max_len)
    # Reserve two positions for <sos> and <eos> before truncating the words.
    words = tokenize(sentence)[:max(0, limit - 2)]
    tokens = ["<sos>"] + words + ["<eos>"]
    unk_id = vocab["<unk>"]
    token_ids = [vocab.get(token, unk_id) for token in tokens]
    # Only shortens anything when max_len < 2 and both markers cannot fit.
    token_ids = token_ids[:limit]
    return token_ids + [vocab["<pad>"]] * (limit - len(token_ids))

# Translation Dataset Class

In [None]:
class TranslationDataset(Dataset):
    """Map-style dataset of (English, French) sentence pairs as id tensors.

    Raw sentence pairs are stored at construction time; encoding to ids is
    done on each ``__getitem__`` call.
    """

    def __init__(self, data, src_vocab, tgt_vocab, max_len=20):
        """Collect sentence pairs and remember the vocabularies.

        Args:
            data: Iterable of examples, each exposing
                ``example['translation']['en']`` and ``['fr']``.
            src_vocab: Token -> id mapping for the English side.
            tgt_vocab: Token -> id mapping for the French side.
            max_len: Fixed length every encoded sequence is cut/padded to.
        """
        self.pairs = []
        for example in data:
            translation = example['translation']
            self.pairs.append((translation['en'], translation['fr']))
        self.src_vocab = src_vocab
        self.tgt_vocab = tgt_vocab
        self.max_len = max_len

    def __len__(self):
        """Return the number of sentence pairs."""
        return len(self.pairs)

    def __getitem__(self, idx):
        """Return the ``idx``-th pair as ``(src_tensor, tgt_tensor)`` of ids."""
        english, french = self.pairs[idx]
        encoded = (
            encode(english, self.src_vocab, self.max_len),
            encode(french, self.tgt_vocab, self.max_len),
        )
        return tuple(torch.tensor(ids) for ids in encoded)

# LSTM Translation Model Class

In [None]:
class SimpleLSTMTranslator(nn.Module):
    """Single-LSTM tagger-style translator.

    Source ids are embedded, passed through one LSTM, and every time step's
    hidden state is projected to target-vocabulary logits. There is no
    separate decoder: output position ``i`` is predicted from the source
    prefix up to position ``i``.
    """

    def __init__(self, input_dim, output_dim, emb_dim=128, hidden_dim=256, dropout=0.2):
        """Create the layers.

        Args:
            input_dim: Source vocabulary size (rows of the embedding table).
            output_dim: Target vocabulary size (width of the logits).
            emb_dim: Embedding dimension.
            hidden_dim: LSTM hidden-state dimension.
            dropout: Dropout probability applied to the embeddings.
        """
        super().__init__()
        # Index 0 is the padding id, so its embedding stays at zero.
        self.embedding = nn.Embedding(
            num_embeddings=input_dim, embedding_dim=emb_dim, padding_idx=0
        )
        self.lstm = nn.LSTM(input_size=emb_dim, hidden_size=hidden_dim, batch_first=True)
        self.dropout = nn.Dropout(p=dropout)
        self.fc = nn.Linear(in_features=hidden_dim, out_features=output_dim)

    def forward(self, x):
        """Return per-position logits for a batch-first tensor of source ids."""
        hidden_states, _ = self.lstm(self.dropout(self.embedding(x)))
        return self.fc(hidden_states)

# Load and Prepare Dataset

In [None]:
# First quarter of the English-French "opus_books" training split.
dataset = load_dataset("opus_books", name="en-fr", split="train[:25%]")

# Raw sentences per language, used only to count word frequencies.
english_sentences = [example['translation']['en'] for example in dataset]
french_sentences = [example['translation']['fr'] for example in dataset]

# One vocabulary per language, plus the id -> token view needed for decoding.
src_vocab = build_vocab(english_sentences)
tgt_vocab = build_vocab(french_sentences)
inv_tgt_vocab = {token_id: token for token, token_id in tgt_vocab.items()}

# Every sequence is cut/padded to 20 ids; batches are reshuffled each epoch.
train_dataset = TranslationDataset(dataset, src_vocab, tgt_vocab, max_len=20)
train_loader = DataLoader(dataset=train_dataset, batch_size=64, shuffle=True)

# Training Loop

In [None]:
# Seed PyTorch's global RNG before the model is built so that weight
# initialisation (and later draws from the global generator) are repeatable
# when the notebook is run top to bottom.
torch.manual_seed(42)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = SimpleLSTMTranslator(input_dim=len(src_vocab), output_dim=len(tgt_vocab)).to(device)
# The loss is computed against *target* ids, so the padding id to ignore must
# come from the target vocabulary, not the source one.
criterion = nn.CrossEntropyLoss(ignore_index=tgt_vocab["<pad>"])
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [None]:
# Train for 100 passes over the data, reporting the summed batch loss per epoch.
for epoch in range(100):
    model.train()  # enable dropout
    total_loss = 0
    for src_batch, tgt_batch in train_loader:
        src_batch = src_batch.to(device)
        tgt_batch = tgt_batch.to(device)

        optimizer.zero_grad()
        output = model(src_batch)
        # Collapse batch and time so each position is one classification
        # example: logits become (batch * seq, vocab), targets (batch * seq,).
        loss = criterion(output.flatten(0, 1), tgt_batch.flatten())
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
    print(f"Epoch {epoch + 1} Loss: {total_loss:.4f}")

Epoch 1 Loss: 2738.7507
Epoch 2 Loss: 2506.4531
Epoch 3 Loss: 2413.4967
Epoch 4 Loss: 2343.9474
Epoch 5 Loss: 2286.5246
Epoch 6 Loss: 2238.7454
Epoch 7 Loss: 2197.2786
Epoch 8 Loss: 2161.3330
Epoch 9 Loss: 2128.2579
Epoch 10 Loss: 2098.6260
Epoch 11 Loss: 2072.2800
Epoch 12 Loss: 2047.3917
Epoch 13 Loss: 2025.7697
Epoch 14 Loss: 2004.8856
Epoch 15 Loss: 1985.0059
Epoch 16 Loss: 1967.2796
Epoch 17 Loss: 1950.0536
Epoch 18 Loss: 1934.7092
Epoch 19 Loss: 1919.2958
Epoch 20 Loss: 1904.9107
Epoch 21 Loss: 1892.2956
Epoch 22 Loss: 1879.4934
Epoch 23 Loss: 1867.2346
Epoch 24 Loss: 1855.0782
Epoch 25 Loss: 1844.7754
Epoch 26 Loss: 1834.0038
Epoch 27 Loss: 1823.8037
Epoch 28 Loss: 1814.0546
Epoch 29 Loss: 1804.8464
Epoch 30 Loss: 1796.5131
Epoch 31 Loss: 1787.5508
Epoch 32 Loss: 1780.4621
Epoch 33 Loss: 1771.7572
Epoch 34 Loss: 1764.3378
Epoch 35 Loss: 1756.9680
Epoch 36 Loss: 1750.0656
Epoch 37 Loss: 1742.8110
Epoch 38 Loss: 1736.7917
Epoch 39 Loss: 1730.8754
Epoch 40 Loss: 1725.0245
Epoch 41 

# Inference/Translation Function

In [None]:
def translate_sentence(sentence, model, src_vocab, tgt_vocab, inv_tgt_vocab, max_len=20, device='cpu'):
    """Translate one sentence by taking the most likely target token per position.

    The sentence is encoded with ``encode``, run through ``model`` in eval
    mode, and the arg-max id at each position is mapped back to a word.
    Decoding stops at the first ``<pad>`` or ``<eos>``. The ``<sos>`` marker is
    skipped so it never appears in the returned text.

    Args:
        sentence: Source sentence string.
        model: Trained model mapping a batch of source ids to per-position logits.
        src_vocab: Source token -> id mapping.
        tgt_vocab: Target token -> id mapping (unused; kept for call compatibility).
        inv_tgt_vocab: Target id -> token mapping.
        max_len: Length the source is cut/padded to.
        device: Device the input tensor is moved to; should match the model's.

    Returns:
        The predicted target words joined by single spaces.

    Note:
        The model is switched to eval mode and left in that mode.
    """
    model.eval()
    input_ids = encode(sentence, src_vocab, max_len)
    # Explicit dtype keeps the tensor integral even if the id list is empty.
    input_tensor = torch.tensor(input_ids, dtype=torch.long).unsqueeze(0).to(device)

    with torch.no_grad():
        logits = model(input_tensor)
        predicted_ids = logits.argmax(dim=-1).squeeze(0).tolist()

    tokens = []
    for idx in predicted_ids:
        word = inv_tgt_vocab.get(idx, "<unk>")
        if word in {"<pad>", "<eos>"}:
            break
        if word == "<sos>":
            # Structural marker, not part of the translation.
            continue
        tokens.append(word)
    return " ".join(tokens)

# Translation Example

In [None]:
# Translate a single sample sentence with the trained model and show the result.
english_input = "I love books"
french_output = translate_sentence(
    sentence=english_input,
    model=model,
    src_vocab=src_vocab,
    tgt_vocab=tgt_vocab,
    inv_tgt_vocab=inv_tgt_vocab,
    device=device,
)
print(
    "Translation Example",
    f"English: {english_input}",
    f"French (predicted): {french_output}",
    sep="\n",
)


Translation Example
English: I love books
French (predicted): <sos> je aime ai livres
