## Get the data file

In [1]:
device = 'cuda'


In [4]:
file_path =  "spa.txt"

with open(file_path, encoding="utf-8") as f:
    lines = f.read().strip().split("\n")

In [5]:
def clean_pair(line):
    parts = line.strip().split("\t")

    if len(parts) < 2:
        print(f"PROBLEM: {line}")
        return None
    eng = parts[0].strip()
    spa = parts[1].strip().split("CC-BY")[0].strip()  # Remove attribution text
    return eng, spa

In [6]:
clean_pairs = [clean_pair(line) for line in lines]
clean_pairs = [pair for pair in clean_pairs if pair is not None]
clean_pairs = clean_pairs #TEMP FOR TESTING (WILL BRICK PC)

PROBLEM: The last piece of cake wa


In [7]:
clean_pairs[:10]

[('Go.', 'Ve.'),
 ('Go.', 'Vete.'),
 ('Go.', 'Vaya.'),
 ('Go.', 'Váyase.'),
 ('Hi.', 'Hola.'),
 ('Run!', '¡Corre!'),
 ('Run!', '¡Corran!'),
 ('Run!', '¡Huye!'),
 ('Run!', '¡Corra!'),
 ('Run!', '¡Corred!')]

## Simple Word Tokenizer

In [8]:
from itertools import chain

In [9]:
# Vocab objects
split_eng_sent = [["<SOS>"] + eng_sent.lower().strip().split() + ["<EOS>"]
                  for eng_sent, spa_sent in clean_pairs]


split_spa_sent = [["<SOS>"] + spa_sent.lower().strip().split() + ["<EOS>"]
                  for eng_sent, spa_sent in clean_pairs]

eng_vocab = set(chain.from_iterable(split_eng_sent))
spa_vocab = set(chain.from_iterable(split_spa_sent))

vocab_to_ind_eng = {word: i for i, word in enumerate(eng_vocab)}
vocab_to_ind_spa = {word: i for i, word in enumerate(spa_vocab)}

ind_to_word_eng = {i:w for w, i in vocab_to_ind_eng.items()}
ind_to_word_spa = {i:w for w, i in vocab_to_ind_spa.items()}

In [10]:
#Encode the sentences
def encode_sentence_vectors(sentence: list, vocab_to_ind: dict):
    return [vocab_to_ind[word] for word in sentence]


eng_senteces_encoded = [encode_sentence_vectors(sentence, vocab_to_ind_eng)
                        for sentence in split_eng_sent]

spa_senteces_encoded = [encode_sentence_vectors(sentence, vocab_to_ind_spa)
                        for sentence in split_spa_sent]

In [11]:
print(split_eng_sent[0])
print(eng_senteces_encoded[0])

['<SOS>', 'go.', '<EOS>']
[8657, 8050, 17370]


## Create a padded dataset

In [12]:
import torch
from torch.utils.data import Dataset, DataLoader

def pad(seq, max_len):
    return seq + [0]*(max_len - len(seq))

max_eng_len = max(len(s) for s in eng_senteces_encoded)
max_spa_len = max(len(s) for s in spa_senteces_encoded)

padded_eng = [pad(s, max_eng_len) for s in eng_senteces_encoded]
padded_spa = [pad(s, max_spa_len) for s in spa_senteces_encoded]

class TranslationDataset(Dataset):
    def __init__(self, eng, spa):
        self.eng = eng
        self.spa = spa

    def __len__(self):
        return len(self.eng)

    def __getitem__(self, idx):
        return torch.tensor(self.eng[idx]), torch.tensor(self.spa[idx])


#Just normal batching here
dl = DataLoader(TranslationDataset(padded_eng, padded_spa), batch_size=32, shuffle=True)


## A model

In [13]:
import torch
import torch.nn as nn
import math

class PositionalEncoding(nn.Module):
    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super().__init__()
        self.dropout = nn.Dropout(dropout)
        pe = torch.zeros(max_len, d_model)
        pos = torch.arange(0, max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * -math.log(10000.0) / d_model)
        pe[:, 0::2] = torch.sin(pos * div_term)
        pe[:, 1::2] = torch.cos(pos * div_term)
        self.register_buffer('pe', pe.unsqueeze(0))

    def forward(self, x):
        return self.dropout(x + self.pe[:, :x.size(1)])


In [14]:
class Seq2SeqTransformer(nn.Module):
    def __init__(self, num_encoder_layers, num_decoder_layers, emb_size, nhead,
                 src_vocab_size, tgt_vocab_size, dim_feedforward=512, dropout=0.1):
        super().__init__()
        self.src_tok_emb = nn.Embedding(src_vocab_size, emb_size)
        self.tgt_tok_emb = nn.Embedding(tgt_vocab_size, emb_size)
        self.positional_encoding = PositionalEncoding(emb_size, dropout)

        self.transformer = nn.Transformer(d_model=emb_size, nhead=nhead,
                                          num_encoder_layers=num_encoder_layers,
                                          num_decoder_layers=num_decoder_layers,
                                          dim_feedforward=dim_feedforward,
                                          dropout=dropout)
        self.generator = nn.Linear(emb_size, tgt_vocab_size)

    def forward(self, src, tgt, src_mask, tgt_mask, src_pad_mask, tgt_pad_mask, mem_pad_mask):
        src_emb = self.positional_encoding(self.src_tok_emb(src))
        tgt_emb = self.positional_encoding(self.tgt_tok_emb(tgt))
        out = self.transformer(src_emb, tgt_emb, src_mask, tgt_mask,
                               None, src_pad_mask, tgt_pad_mask, mem_pad_mask)
        return self.generator(out)


In [15]:
def generate_square_subsequent_mask(sz, device):
    return torch.triu(torch.full((sz, sz), float('-inf'), device=device), 1)

def create_mask(src, tgt, pad_idx=0):
    src_seq_len = src.size(0)
    tgt_seq_len = tgt.size(0)
    device = src.device  # assume both src and tgt are on the same device

    tgt_mask = generate_square_subsequent_mask(tgt_seq_len, device)
    src_mask = torch.zeros((src_seq_len, src_seq_len), device=device).type(torch.bool)

    src_padding_mask = (src == pad_idx).transpose(0, 1)
    tgt_padding_mask = (tgt == pad_idx).transpose(0, 1)

    return src_mask, tgt_mask, src_padding_mask, tgt_padding_mask


In [16]:
from itertools import chain

# Initialize model with your vocab sizes
src_vocab_size = len(vocab_to_ind_eng)
tgt_vocab_size = len(vocab_to_ind_spa)

model = Seq2SeqTransformer(
    num_encoder_layers=3,
    num_decoder_layers=3,
    emb_size=256,
    nhead=8,
    src_vocab_size=src_vocab_size,
    tgt_vocab_size=tgt_vocab_size,
)

# One batch from your DataLoader
for eng_batch, spa_batch in dl:
    # Transpose to [seq_len, batch_size] for Transformer
    eng_batch = eng_batch.transpose(0, 1)
    spa_batch = spa_batch.transpose(0, 1)

    # Create masks (pad_idx = 0)
    src_mask, tgt_mask, src_pad_mask, tgt_pad_mask = create_mask(
        eng_batch, spa_batch, pad_idx=0
    )

    # Forward pass
    output = model(
        eng_batch,
        spa_batch,
        src_mask,
        tgt_mask,
        src_pad_mask,
        tgt_pad_mask,
        src_pad_mask  # memory key padding mask
    )

    print("Output shape:", output.shape)  # [tgt_seq_len, batch_size, tgt_vocab_size]
    break  # only one batch




Output shape: torch.Size([21, 32, 36000])




## Training loop

In [17]:
def train_model(model, dataloader, optimizer, loss_fn, num_epochs, pad_idx=0, device='cpu'):
    model.to(device)
    model.train()
    losses = []

    for epoch in range(num_epochs):
        total_loss = 0
        print(f"Starting epoch {epoch}")
        dl_len = len(dataloader)
        i = 1

        for eng_batch, spa_batch in dataloader:
            print(f"\rStarting {i}/{dl_len}", end="")
            i += 1
            eng_batch = eng_batch.transpose(0, 1).to(device)  # [seq_len, batch]
            spa_batch = spa_batch.transpose(0, 1).to(device)  # [seq_len, batch]

            tgt_input = spa_batch[:-1, :]
            tgt_output = spa_batch[1:, :]

            src_mask, tgt_mask, src_pad_mask, tgt_pad_mask = create_mask(eng_batch, tgt_input, pad_idx)

            logits = model(
                eng_batch,
                tgt_input,
                src_mask,
                tgt_mask,
                src_pad_mask,
                tgt_pad_mask,
                src_pad_mask  # memory padding mask
            )

            optimizer.zero_grad()

            # Flatten predictions and targets
            logits_flat = logits.reshape(-1, logits.shape[-1])
            tgt_output_flat = tgt_output.reshape(-1)

            loss = loss_fn(logits_flat, tgt_output_flat)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        avg_loss = total_loss / len(dataloader)
        print(f"Epoch {epoch + 1}/{num_epochs} — Loss: {avg_loss:.4f}")
        losses.append(avg_loss)

    return losses


In [18]:
model = Seq2SeqTransformer(
    num_encoder_layers=3,
    num_decoder_layers=3,
    emb_size=256,
    nhead=8,
    src_vocab_size=len(vocab_to_ind_eng),
    tgt_vocab_size=len(vocab_to_ind_spa)
)

optimizer = torch.optim.Adam(model.parameters(), lr=0.0005)
loss_fn = nn.CrossEntropyLoss(ignore_index=0)

# Train it
losses = train_model(model, dl, optimizer, loss_fn, num_epochs=5, pad_idx=0, device='cuda')

# prompt: save model weights

torch.save(model.state_dict(), "model_weights.pth")
print("Model weights saved to model_weights.pth")



Starting epoch 0
Starting 3555/3555Epoch 1/5 — Loss: 4.5892
Starting epoch 1
Starting 3555/3555Epoch 2/5 — Loss: 3.2032
Starting epoch 2
Starting 3555/3555Epoch 3/5 — Loss: 2.5553
Starting epoch 3
Starting 3555/3555Epoch 4/5 — Loss: 2.1250
Starting epoch 4
Starting 3555/3555Epoch 5/5 — Loss: 1.8292
Model weights saved to model_weights.pth


In [1]:
import matplotlib.pyplot as plt

plt.plot(losses)
plt.title("Training Loss Over Epochs")
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.grid(True)
plt.show()


NameError: name 'losses' is not defined

In [None]:
def greedy_decode(model, src_sentence, src_vocab, tgt_vocab, ind_to_word_spa, max_len=50, device='cuda'):
    model.eval()

    sos_token = tgt_vocab["<SOS>"]
    eos_token = tgt_vocab["<EOS>"]
    pad_token = tgt_vocab.get("<PAD>", 0)

    # Encode source sentence
    tokens = ["<SOS>"] + src_sentence.lower().strip().split() + ["<EOS>"]
    src_indices = [src_vocab.get(tok, src_vocab.get("<UNK>", 0)) for tok in tokens]
    src_tensor = torch.tensor(src_indices).unsqueeze(1).to(device)  # [seq_len, 1]
    src_mask = torch.zeros((src_tensor.size(0), src_tensor.size(0)), device=device).type(torch.bool)

    # Encoder output
    src_emb = model.positional_encoding(model.src_tok_emb(src_tensor))
    memory = model.transformer.encoder(src_emb, src_mask)

    # Start decoding with <SOS>
    tgt_indices = [sos_token]
    for _ in range(max_len):
        tgt_tensor = torch.tensor(tgt_indices).unsqueeze(1).to(device)
        tgt_mask = generate_square_subsequent_mask(tgt_tensor.size(0), 'cuda')#s.to(device)

        tgt_emb = model.positional_encoding(model.tgt_tok_emb(tgt_tensor))
        out = model.transformer.decoder(tgt_emb, memory, tgt_mask)
        out = model.generator(out)

        next_token = out[-1].argmax(-1).item()

        if next_token == eos_token:
            break

        tgt_indices.append(next_token)

    decoded = [ind_to_word_spa.get(idx, "") for idx in tgt_indices[1:]]  # skip SOS
    return " ".join(decoded)


In [None]:
def translate(eng_sentence, model, src_vocab, tgt_vocab, ind_to_word_spa):
    return greedy_decode(model, eng_sentence, src_vocab, tgt_vocab, ind_to_word_spa)


In [None]:
print(translate("I'm very excited to have dinner with you", model, vocab_to_ind_eng, vocab_to_ind_spa, ind_to_word_spa))

NameError: name 'translate' is not defined