<a href="https://colab.research.google.com/github/Reaper-ai/ML_AI/blob/main/04_Seq2Seq.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Step 1: Download the sentence and link files
!wget https://downloads.tatoeba.org/exports/sentences.tar.bz2
!wget https://downloads.tatoeba.org/exports/links.tar.bz2

# Step 2: Extract the files
!tar -xvjf sentences.tar.bz2
!tar -xvjf links.tar.bz2

print("dowload complete")

--2025-06-14 09:49:00--  https://downloads.tatoeba.org/exports/sentences.tar.bz2
Resolving downloads.tatoeba.org (downloads.tatoeba.org)... 94.130.77.194
Connecting to downloads.tatoeba.org (downloads.tatoeba.org)|94.130.77.194|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 204819212 (195M) [application/octet-stream]
Saving to: ‘sentences.tar.bz2.2’


2025-06-14 09:49:10 (20.3 MB/s) - ‘sentences.tar.bz2.2’ saved [204819212/204819212]

--2025-06-14 09:49:10--  https://downloads.tatoeba.org/exports/links.tar.bz2
Resolving downloads.tatoeba.org (downloads.tatoeba.org)... 94.130.77.194
Connecting to downloads.tatoeba.org (downloads.tatoeba.org)|94.130.77.194|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 139235583 (133M) [application/octet-stream]
Saving to: ‘links.tar.bz2.2’


2025-06-14 09:49:18 (20.1 MB/s) - ‘links.tar.bz2.2’ saved [139235583/139235583]

sentences.csv
links.csv
dowload complete


In [None]:
import random
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence


print("setup complete")

setup complete


In [None]:
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(device)

cuda


In [None]:
def load_tatoeba_eng_spa(max_words=5, max_samples=10000):
    """Build lower-cased (English, Spanish) sentence pairs from the Tatoeba exports.

    Reads ``sentences.csv`` (``id<TAB>lang<TAB>text``) and ``links.csv``
    (``id<TAB>translation_id``) from the current working directory.

    Args:
        max_words: Maximum number of whitespace-separated words allowed in
            each sentence; a pair is kept only if both sides fit.
        max_samples: Stop reading links once this many pairs were collected.

    Returns:
        A list of unique ``(english, spanish)`` tuples, both lower-cased, in
        order of first appearance in ``links.csv``.
    """
    # id -> (lang, text) for the two languages of interest only.
    sentence_dict = {}
    with open("sentences.csv", encoding="utf-8") as f:
        for line in f:
            parts = line.strip().split("\t")
            if len(parts) != 3:
                continue
            sid, lang, text = parts
            if lang in ("eng", "spa"):
                sentence_dict[sid] = (lang, text)

    pairs = []
    # links.csv lists every link in both directions (a->b and b->a), so the
    # same pair would otherwise be emitted twice and could end up on both
    # sides of a later train/test split.
    seen = set()
    with open("links.csv", encoding="utf-8") as f:
        for line in f:
            if len(pairs) >= max_samples:
                break

            ids = line.strip().split("\t")
            if len(ids) != 2:
                # Skip malformed lines instead of failing on tuple unpacking.
                continue

            first = sentence_dict.get(ids[0])
            second = sentence_dict.get(ids[1])
            if first is None or second is None:
                continue
            if first[0] == second[0]:
                # eng-eng or spa-spa link: not a translation pair we want.
                continue

            # Normalise the direction so English is always first.
            eng, spa = (first, second) if first[0] == "eng" else (second, first)
            eng_text, spa_text = eng[1], spa[1]
            if len(eng_text.split()) > max_words or len(spa_text.split()) > max_words:
                continue

            pair = (eng_text.lower(), spa_text.lower())
            if pair in seen:
                continue
            seen.add(pair)
            pairs.append(pair)

    return pairs


In [None]:
# tokenize and vocab

from collections import Counter

def tokenize(sentence):
    """Lower-case ``sentence`` and split it on runs of whitespace.

    Punctuation is not separated, so it stays attached to its word.
    """
    trimmed = sentence.strip()
    lowered = trimmed.lower()
    return lowered.split()

class Vocab:
    """Token <-> index mapping built from a flat list of tokens.

    Attributes:
        freq: ``Counter`` with the frequency of every token seen.
        itos: index -> token list; special tokens come first, in the order
            given, followed by regular tokens in order of first appearance.
        stoi: token -> index dictionary (inverse of ``itos``).
    """

    def __init__(self, tokens, min_freq=1, specials=("<pad>", "<sos>", "<eos>", "<unk>")):
        """Build the vocabulary.

        Args:
            tokens: Iterable of token strings (with repetitions).
            min_freq: Tokens occurring fewer times than this are left out.
            specials: Special tokens placed at the lowest indices. Any
                sequence is accepted; an immutable default avoids sharing a
                mutable list between instances.
        """
        self.freq = Counter(tokens)
        # dict.fromkeys drops repeated specials while keeping their order, so
        # itos and stoi always stay exact inverses of each other.
        self.itos = list(dict.fromkeys(specials))
        reserved = set(self.itos)
        self.itos += [tok for tok, freq in self.freq.items() if freq >= min_freq and tok not in reserved]
        self.stoi = {tok: i for i, tok in enumerate(self.itos)}

    def __len__(self):
        """Return the number of entries, specials included."""
        return len(self.itos)

    def numericalize(self, tokens):
        """Map tokens to indices, using ``<unk>`` for out-of-vocabulary tokens.

        Raises:
            KeyError: If a token is unknown and the vocabulary was built
                without an ``<unk>`` special.
        """
        # Look up <unk> once and lazily: a vocabulary without <unk> must still
        # be able to encode sequences made only of known tokens.
        unk_index = self.stoi.get("<unk>")
        indices = []
        for tok in tokens:
            index = self.stoi.get(tok, unk_index)
            if index is None:
                raise KeyError(f"token {tok!r} is not in the vocabulary and no '<unk>' special is defined")
            indices.append(index)
        return indices


In [None]:
# build dataset
def build_dataset(pairs):
    """Tokenize sentence pairs, build both vocabularies and numericalize.

    Args:
        pairs: Iterable of ``(source_sentence, target_sentence)`` strings.
            It is consumed in a single pass, so one-shot iterables
            (generators) are handled as well as lists.

    Returns:
        ``(data, src_vocab, tgt_vocab)`` where ``data`` is a list of
        ``(src_ids, tgt_ids)`` index lists, each wrapped in ``<sos>`` ...
        ``<eos>``, and the vocabularies are ``Vocab`` instances built from all
        source / target tokens respectively.
    """
    # Tokenize every sentence exactly once and reuse the result for both the
    # vocabulary construction and the numericalization below.
    tokenized = [(tokenize(src), tokenize(tgt)) for src, tgt in pairs]

    src_vocab = Vocab([tok for src_toks, _ in tokenized for tok in src_toks])
    tgt_vocab = Vocab([tok for _, tgt_toks in tokenized for tok in tgt_toks])

    def _encode(vocab, toks):
        # <sos> + token ids + <eos>
        return [vocab.stoi["<sos>"]] + vocab.numericalize(toks) + [vocab.stoi["<eos>"]]

    data = [
        (_encode(src_vocab, src_toks), _encode(tgt_vocab, tgt_toks))
        for src_toks, tgt_toks in tokenized
    ]

    return data, src_vocab, tgt_vocab


In [None]:
class TranslationDataset(Dataset):
    """Map-style dataset exposing pre-built examples by position."""

    def __init__(self, data):
        # The sequence of examples is stored by reference, not copied.
        self.data = data

    def __len__(self):
        """Number of stored examples."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the example stored at position ``idx``, unchanged."""
        example = self.data[idx]
        return example

def collate_fn(batch):
    """Collate ``(src_ids, tgt_ids)`` examples into two padded tensors.

    Each side is right-padded with 0 up to the longest sequence of that side
    in the batch; the results are laid out batch-first.
    """
    def _pad(sequences):
        tensors = [torch.tensor(seq) for seq in sequences]
        return pad_sequence(tensors, batch_first=True, padding_value=0)

    sources, targets = zip(*batch)
    return _pad(sources), _pad(targets)


In [None]:
# Load up to 25,000 eng/spa pairs whose sentences have at most 12 words each,
# then tokenize and numericalize them.
# NOTE: both vocabularies are built from ALL pairs, i.e. before the
# train/test split below, so test-set words are part of the vocabulary.
pairs = load_tatoeba_eng_spa(max_words=12, max_samples=25000)  # both limits can be raised later
data, src_vocab, tgt_vocab = build_dataset(pairs)

from sklearn.model_selection import train_test_split

# 1. Split the dataset: 80% train / 20% test, with a fixed seed for a reproducible split
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

# 2. Wrap in Dataset class
train_dataset = TranslationDataset(train_data)
test_dataset = TranslationDataset(test_data)

# 3. Create DataLoaders (batches of 64, padded per batch by collate_fn;
#    only the training loader is shuffled)
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False, collate_fn=collate_fn)



In [None]:
# Sample check

def decode(tokens, vocab):
    """Turn token ids back into a space-joined string.

    Ids are looked up in ``vocab.itos``; the ``<pad>``, ``<sos>`` and ``<eos>``
    markers are left out of the result.
    """
    skipped = ("<pad>", "<sos>", "<eos>")
    words = []
    for token_id in tokens:
        word = vocab.itos[token_id]
        if word in skipped:
            continue
        words.append(word)
    return ' '.join(words)

# Sanity check: pull one (shuffled) batch from the training loader and make
# sure the ids map back to readable sentences.
src, tgt = next(iter(train_loader))

# Pick the first sample of the batch and convert the tensor rows to plain lists
src_sentence = src[0].tolist()
tgt_sentence = tgt[0].tolist()

# decode() drops <pad>/<sos>/<eos>, so only the actual words are printed
print("Decoded SRC:", decode(src_sentence, src_vocab))
print("Decoded TGT:", decode(tgt_sentence, tgt_vocab))


Decoded SRC: i thought you liked to learn new things.
Decoded TGT: yo pensaba que te gustaba aprender cosas nuevas.


In [None]:
class Encoder(nn.Module):
    """Embedding + (stacked) LSTM that summarises a source sequence.

    Args:
        input_size: Size of the source vocabulary (number of embeddings).
        hidden_size: Used for both the embedding dimension and the LSTM
            hidden size.
        num_layers: Number of stacked LSTM layers.
        dropout: Dropout applied between stacked LSTM layers. It has no effect
            for a single layer, so it is not forwarded in that case.
    """

    def __init__(self, input_size, hidden_size, num_layers=1, dropout=0.1):
        super().__init__()
        self.embedding = nn.Embedding(input_size, hidden_size)
        # nn.LSTM only applies dropout between layers and warns when a
        # non-zero value is combined with num_layers == 1 (the defaults here).
        lstm_dropout = dropout if num_layers > 1 else 0.0
        self.lstm = nn.LSTM(hidden_size, hidden_size, num_layers, dropout=lstm_dropout, batch_first=True)

    def forward(self, X):
        """Encode a batch-first tensor of token ids.

        Returns:
            ``(hidden, cell)``: the final LSTM states; the per-step outputs
            are discarded.
        """
        embedded = self.embedding(X)
        _, (hidden, cell) = self.lstm(embedded)
        return hidden, cell


In [None]:
class Decoder(nn.Module):
    """Embedding + (stacked) LSTM + linear projection onto the target vocabulary.

    Args:
        output_size: Size of the target vocabulary; also the width of the
            logits produced by ``forward``.
        hidden_size: Used for both the embedding dimension and the LSTM
            hidden size.
        num_layers: Number of stacked LSTM layers.
        dropout: Dropout applied between stacked LSTM layers. It has no effect
            for a single layer, so it is not forwarded in that case.
    """

    def __init__(self, output_size, hidden_size, num_layers=1, dropout=0.1):
        super().__init__()
        self.output_size = output_size
        self.embedding = nn.Embedding(output_size, hidden_size)
        # nn.LSTM only applies dropout between layers and warns when a
        # non-zero value is combined with num_layers == 1 (the defaults here).
        lstm_dropout = dropout if num_layers > 1 else 0.0
        self.lstm = nn.LSTM(hidden_size, hidden_size, num_layers, dropout=lstm_dropout, batch_first=True)
        self.out_layer = nn.Linear(hidden_size, output_size)

    def forward(self, input_, hidden):
        """Run the decoder on ``input_`` starting from the given LSTM state.

        Args:
            input_: Batch-first tensor of target token ids.
            hidden: ``(hidden, cell)`` tuple used as the initial LSTM state.

        Returns:
            ``(logits, hidden, cell)``: vocabulary logits for every input
            step plus the updated LSTM states.
        """
        embedded = self.embedding(input_)
        out, (hidden, cell) = self.lstm(embedded, hidden)
        return self.out_layer(out), hidden, cell

In [None]:
class Seq2Seq(nn.Module):
    """Encoder-decoder wrapper that decodes step by step with teacher forcing.

    Args:
        encoder: Module returning ``(hidden, cell)`` for a source batch.
        decoder: Module exposing ``output_size`` and returning
            ``(logits, hidden, cell)`` for one decoding step.
        teacher_forcing: Probability, per decoding step, of feeding the
            ground-truth token instead of the model's own prediction.
    """

    def __init__(self, encoder, decoder, teacher_forcing=0.7):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.teacher_forcing = teacher_forcing

    def forward(self, src, target):
        """Decode ``target.size(1) - 1`` steps conditioned on ``src``.

        Args:
            src: Batch-first tensor of source token ids.
            target: Batch-first tensor of target token ids; column 0 is used
                as the first decoder input.

        Returns:
            Tensor of shape ``(batch, target_len, decoder.output_size)`` with
            the logits of every step. Position 0 is never written and stays
            all zeros.
        """
        batch_size = target.size(0)
        target_len = target.size(1)

        hidden, cell = self.encoder(src)

        # Allocate on the same device as the inputs rather than on a global
        # `device` variable, so the module works wherever it is moved to.
        outputs = torch.zeros(
            batch_size, target_len, self.decoder.output_size, device=target.device
        )

        input_ = target[:, 0].unsqueeze(1)  # first decoder input (<sos>)

        for i in range(1, target_len):
            output, hidden, cell = self.decoder(input_, (hidden, cell))
            outputs[:, i] = output.squeeze(1)

            # One coin flip per step decides the next input for the whole batch.
            teacher_force = random.random() < self.teacher_forcing
            if teacher_force:
                input_ = target[:, i].unsqueeze(1)
            else:
                input_ = output.argmax(2)  # (batch, 1) greedy prediction

        return outputs


In [None]:
# Hyperparameters
# Vocabulary sizes determine the embedding tables and the width of the output layer.
input_dim = len(src_vocab)
output_dim = len(tgt_vocab)
# hidden_dim is used for both the embedding size and the LSTM hidden size.
hidden_dim = 300
layers = 4
dropout = 0.2

# Initialize models
# Encoder and decoder share `hidden_dim` and `layers` because Seq2Seq passes
# the encoder's final (hidden, cell) state directly to the decoder LSTM.
enc = Encoder(input_dim, hidden_dim, layers, dropout)
dec = Decoder(output_dim, hidden_dim, layers, dropout)
model = Seq2Seq(enc, dec).to(device)



In [None]:
# Optimizer and loss
optimizer = optim.Adam(model.parameters())
# The loss is computed over target-side token ids, so the padding index to
# ignore must come from the target vocabulary. Reading it from the source
# vocabulary only works while both vocabularies place <pad> at the same index.
PAD_IDX = tgt_vocab.stoi['<pad>']
loss_fn = nn.CrossEntropyLoss(ignore_index=PAD_IDX)

In [None]:
from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction
import numpy as np

def tokenize_for_bleu(sentence, vocab):
    """Convert token ids to words for BLEU scoring.

    The sequence is cut at the first ``<eos>``: whatever a model emits after
    its end-of-sentence marker is not part of the translation and must not be
    scored. ``<pad>`` and ``<sos>`` are dropped.

    Args:
        sentence: Iterable of integer token ids.
        vocab: Object exposing ``itos`` (index -> token).

    Returns:
        List of word strings.
    """
    words = []
    for token in sentence:
        word = vocab.itos[token]
        if word == '<eos>':
            break
        if word in ('<pad>', '<sos>'):
            continue
        words.append(word)
    return words

def calculate_bleu(trg, preds, max_n=4):
    """Corpus-level BLEU of predicted token ids against reference token ids.

    This is a pure metric: it does not touch the model or its train/eval mode.

    Args:
        trg: Tensor of reference token ids, one row per sentence.
        preds: Tensor of predicted token ids, one row per sentence.
        max_n: Highest n-gram order to score; orders 1..max_n get uniform
            weights. The default of 4 is standard BLEU-4.

    Returns:
        BLEU score in ``[0, 1]`` (smoothed with NLTK's ``method1``).

    Raises:
        ValueError: If ``max_n`` is smaller than 1.
    """
    if max_n < 1:
        raise ValueError("max_n must be >= 1")

    smoothing = SmoothingFunction().method1  # Smoothing for short sentences
    weights = tuple(1.0 / max_n for _ in range(max_n))

    targets = []
    predictions = []
    for i in range(trg.shape[0]):
        trg_tokens = tokenize_for_bleu(trg[i].tolist(), tgt_vocab)
        pred_tokens = tokenize_for_bleu(preds[i].tolist(), tgt_vocab)

        # corpus_bleu expects a list of references per hypothesis.
        targets.append([trg_tokens])
        predictions.append(pred_tokens)

    return corpus_bleu(targets, predictions, weights=weights, smoothing_function=smoothing)

In [None]:
# Train loop
def train(model, train_loader, clip, t):
    """Train ``model`` for one epoch.

    Uses the module-level ``optimizer`` and ``loss_fn``.

    Args:
        model: Seq2Seq model; its ``teacher_forcing`` attribute is overwritten.
        train_loader: Iterable yielding ``(src, trg)`` batch-first id tensors.
        clip: Maximum gradient norm passed to ``clip_grad_norm_``.
        t: Zero-based epoch index; the teacher-forcing probability decays
            linearly as ``0.7 - t / 100`` and is floored at 0.
    """
    model.teacher_forcing = max(0.0, 0.7 - (t / 100))
    model.train()

    device = next(model.parameters()).device

    for src, trg in train_loader:
        src = src.to(device)
        trg = trg.to(device)

        optimizer.zero_grad()
        output = model(src, trg)

        # Position 0 of every sequence holds <sos> in the target and an
        # unwritten all-zero row in the output, so it must be excluded for
        # EVERY sample. Slice along the time axis before flattening; slicing
        # the flattened tensor would only drop the first sample's row.
        output_dim = output.shape[-1]
        logits = output[:, 1:].reshape(-1, output_dim)
        labels = trg[:, 1:].reshape(-1)

        loss = loss_fn(logits, labels)
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()



In [None]:
# Test loop
def test(model, test_loader):
    """Evaluate ``model`` without teacher forcing.

    Uses the module-level ``loss_fn`` and ``calculate_bleu``.

    Args:
        model: Seq2Seq model; put into eval mode. Its ``teacher_forcing``
            value is set to 0 during evaluation and restored afterwards.
        test_loader: Iterable yielding ``(src, trg)`` batch-first id tensors.

    Returns:
        ``(bleu, loss)``: the mean of the per-batch BLEU scores scaled to
        0-100, and the mean per-batch loss. For an empty loader the result is
        ``(0.0, nan)``.
    """
    model.eval()
    t_loss = 0.0
    t_bleu = 0.0
    n_batches = 0

    device = next(model.parameters()).device

    # Decode from the model's own predictions only, then put the caller's
    # teacher-forcing setting back even if evaluation fails half-way.
    previous_teacher_forcing = model.teacher_forcing
    model.teacher_forcing = 0
    try:
        with torch.inference_mode():
            for src, trg in test_loader:
                src = src.to(device)
                trg = trg.to(device)

                output = model(src, trg)

                # Exclude position 0 (<sos> target / unwritten zero logits) of
                # EVERY sequence by slicing the time axis before flattening.
                output_dim = output.shape[-1]
                loss_output = output[:, 1:].reshape(-1, output_dim)
                loss_trg = trg[:, 1:].reshape(-1)

                loss = loss_fn(loss_output, loss_trg)
                t_loss += loss.item()

                predictions = output.argmax(dim=2)
                t_bleu += calculate_bleu(trg, predictions)
                n_batches += 1
    finally:
        model.teacher_forcing = previous_teacher_forcing

    if n_batches == 0:
        # Nothing was evaluated; avoid dividing by zero.
        return 0.0, float("nan")

    avg_loss = t_loss / n_batches
    avg_bleu = t_bleu / n_batches
    return avg_bleu * 100, avg_loss

In [None]:
epochs = 50
for t in range(epochs):
    # The epoch index is also passed on because train() derives the
    # teacher-forcing probability from it; gradients are clipped to norm 1.
    train(model, train_loader, 1, t)

    # Evaluate only on every fifth epoch (t = 0, 5, 10, ...).
    if t % 5 != 0:
        continue
    bleu, loss = test(model, test_loader)
    print(f"Epoch: {t+1}---------------------------------,\n BLEU score: {bleu:.2f}, Avg loss: {loss:.5f}")

Epoch: 1---------------------------------,
 BLEU score: 0.21, Avg loss: 6.84835
Epoch: 6---------------------------------,
 BLEU score: 0.95, Avg loss: 6.31120
Epoch: 11---------------------------------,
 BLEU score: 3.08, Avg loss: 6.11212
Epoch: 16---------------------------------,
 BLEU score: 5.72, Avg loss: 6.20893
Epoch: 21---------------------------------,
 BLEU score: 7.80, Avg loss: 6.41080
Epoch: 26---------------------------------,
 BLEU score: 10.10, Avg loss: 6.59285
Epoch: 31---------------------------------,
 BLEU score: 11.89, Avg loss: 6.80722
Epoch: 36---------------------------------,
 BLEU score: 12.87, Avg loss: 6.98391
Epoch: 41---------------------------------,
 BLEU score: 14.06, Avg loss: 7.21200
Epoch: 46---------------------------------,
 BLEU score: 14.49, Avg loss: 7.43968
