In [3]:
import sys
sys.path.append('../src')

from model.transformer import Transformer

In [2]:
import pandas as pd
import torch
from torch import nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
import torch.nn.functional as F
import re
from tqdm.auto import tqdm
from tqdm import tqdm

## Data Processing

In [5]:
# Load the parallel corpus. tokenize() reads the first column of the frame it
# is given as English and the second as German, by position rather than by name.
train_data = pd.read_csv("../src/data/translation_train.csv")

# The held-out split is loaded here; the cells visible in this notebook do not use it yet.
test_data = pd.read_csv("../src/data/translation_test.csv")

In [6]:
def pad_sequences(seq, max_len, pad_token='#'):
    """Right-pad a token list with ``pad_token`` up to ``max_len`` entries.

    A new list is returned; ``seq`` itself is not modified. Sequences that are
    already ``max_len`` long or longer come back with their contents unchanged
    (nothing is truncated).
    """
    missing = max_len - len(seq)
    # A non-positive count multiplies out to an empty filler list.
    filler = [pad_token] * missing
    return seq + filler

def clean_text(text: str) -> str:
    """Normalise a raw sentence before character-level tokenisation.

    Non-breaking spaces, underscores and tabs are deleted outright, every
    character that is neither a word character nor whitespace is removed, and
    the result is lower-cased with surrounding whitespace stripped.
    """
    for unwanted in ('\xa0', '_', '\t'):
        text = text.replace(unwanted, '')
    without_punct = re.sub(r'[^\w\s]', '', text)
    return without_punct.lower().strip()

def tokenize(data):
    """Turn a two-column frame of sentence pairs into padded character sequences.

    For every row, the first column is treated as English and the second as
    German (by position). Each sentence is cleaned with ``clean_text``, split
    into single characters and wrapped in '<' (start) and '>' (end) markers.
    All sequences of a language are then right-padded with ``pad_sequences``
    to that language's longest sequence in ``data``.

    Returns:
        ``(pairs, eng_maxlen, ger_maxlen)`` where ``pairs`` is a list of
        ``{"en": [...], "ge": [...]}`` dicts and the two lengths include the
        start/end markers.
    """
    pairs = []
    for _, row in data.iterrows():
        en_chars = ['<', *clean_text(row.iloc[0]), '>']
        ge_chars = ['<', *clean_text(row.iloc[1]), '>']
        pairs.append({"en": en_chars, "ge": ge_chars})

    # Longest sequence per language; 0 when the frame has no rows.
    eng_maxlen = max((len(pair['en']) for pair in pairs), default=0)
    ger_maxlen = max((len(pair['ge']) for pair in pairs), default=0)

    # Pad every sequence out to its language's maximum length.
    for pair in pairs:
        pair['en'] = pad_sequences(pair['en'], eng_maxlen)
        pair['ge'] = pad_sequences(pair['ge'], ger_maxlen)

    return pairs, eng_maxlen, ger_maxlen

def get_vocab_separate(data):
    """Build independent token-to-id vocabularies for English and German.

    Both vocabularies start with the reserved entries '#' -> 0 (padding),
    '<' -> 1 (start) and '>' -> 2 (end). Every further token receives the next
    free id in order of first appearance in ``data``.

    Args:
        data: iterable of dicts with 'en' and 'ge' token lists.

    Returns:
        ``(en_vocab, ge_vocab)``.
    """
    specials = ('#', '<', '>')
    en_vocab = {tok: idx for idx, tok in enumerate(specials)}
    ge_vocab = dict(en_vocab)

    for pair in data:
        # Ids are dense, so the current size is always the next unused id.
        for tok in pair['en']:
            en_vocab.setdefault(tok, len(en_vocab))
        for tok in pair['ge']:
            ge_vocab.setdefault(tok, len(ge_vocab))

    return en_vocab, ge_vocab


def embed_tokens(data, ge_vocab_dict, en_vocab_dict, unknown_id=0):
    """Replace every token in ``data`` with its integer id.

    Note the parameter order: the German vocabulary comes first, then the
    English one.

    Args:
        data: iterable of dicts with 'en' and 'ge' token lists.
        ge_vocab_dict: token -> id mapping applied to the 'ge' sequences.
        en_vocab_dict: token -> id mapping applied to the 'en' sequences.
        unknown_id: id used for tokens missing from the vocabulary. Defaults
            to 0, the padding id, which is the same fallback ``translate``
            uses for unseen source characters.

    Returns:
        A new list of ``{'en': [...], 'ge': [...]}`` dicts holding ids; the
        input is left untouched.
    """
    # Falling back to an integer keeps the id lists tensor-convertible: a bare
    # ``dict.get(tok)`` would insert None for out-of-vocabulary tokens.
    return [
        {
            'en': [en_vocab_dict.get(tok, unknown_id) for tok in pair['en']],
            'ge': [ge_vocab_dict.get(tok, unknown_id) for tok in pair['ge']],
        }
        for pair in data
    ]

In [7]:
# Run the preprocessing pipeline on the training split: character sequences,
# per-language vocabularies, then id-encoded sentence pairs.
tokenized_data, eng_maxlen, ger_maxlen = tokenize(train_data)
en_vocab_dict, ge_vocab_dict = get_vocab_separate(tokenized_data)
# embed_tokens takes the German vocabulary first, then the English one.
embedded_data = embed_tokens(tokenized_data, ge_vocab_dict, en_vocab_dict)

# Vocabulary sizes include the three reserved tokens ('#', '<', '>');
# the maximum lengths include the '<' and '>' markers.
print("English Vocabulary Size:", len(en_vocab_dict))
print("German Vocabulary Size:", len(ge_vocab_dict))
print("Max English Sequence Length:", eng_maxlen)
print("Max German Sequence Length:", ger_maxlen)

English Vocabulary Size: 40
German Vocabulary Size: 48
Max English Sequence Length: 203
Max German Sequence Length: 249


In [8]:
class CharTranslationDataset(Dataset):
    """Map-style dataset over already id-encoded sentence pairs.

    Each record is expected to expose 'en' and 'ge' entries holding token ids;
    indexing yields the pair as two tensors, English first and German second.
    """

    def __init__(self, data):
        # Records are kept by reference; nothing is copied or converted here.
        self.data = data

    def __len__(self):
        """Number of sentence pairs."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(en_ids, ge_ids)`` tensors for the pair at ``idx``."""
        record = self.data[idx]
        en_ids = torch.tensor(record['en'])
        ge_ids = torch.tensor(record['ge'])
        return en_ids, ge_ids

In [9]:
# Wrap the id-encoded training pairs; train() hands this object to a DataLoader.
dataset = CharTranslationDataset(embedded_data)

## Model

In [10]:
# Small character-level model: English vocabulary on the encoder side,
# German vocabulary on the decoder side.
model = Transformer(
    encoder_vocab_size=len(en_vocab_dict),
    decoder_vocab_size=len(ge_vocab_dict),
    embed_dim=32,
    num_heads=4,
    ff_hidden_dim=64,
    num_layers=3,
    # Sized to the longer of the two padded sequence lengths.
    # NOTE(review): presumably the positional-encoding capacity — confirm against Transformer.
    max_len=max(eng_maxlen, ger_maxlen)
)

## Training

In [14]:
def train(model, dataset, batch_size=32, epochs=10, lr=1e-4, device='cpu'):
    """Train ``model`` on ``dataset`` with teacher forcing.

    Args:
        model: called as ``model(src, dec_input)``; its output is flattened
            over all leading dimensions and scored against the target ids.
        dataset: yields ``(src, tgt)`` tensor pairs.
        batch_size: pairs per batch (batches are reshuffled every epoch).
        epochs: number of passes over ``dataset``.
        lr: Adam learning rate.
        device: device the model and every batch are moved to.

    Prints the mean per-batch loss after each epoch and returns nothing.
    """
    batches = DataLoader(dataset, batch_size=batch_size, shuffle=True)

    model = model.to(device)
    # Target id 0 is the padding token, so it is left out of the loss.
    criterion = nn.CrossEntropyLoss(ignore_index=0)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    model.train()

    for epoch in range(epochs):
        batch_losses = []

        progress = tqdm(batches, desc=f"Epoch {epoch+1}/{epochs}")
        for src, tgt in progress:
            src, tgt = src.to(device), tgt.to(device)

            # Teacher forcing: the decoder sees the target without its last
            # position and is scored against the target shifted left by one.
            logits = model(src, tgt[:, :-1])
            flat_logits = logits.reshape(-1, logits.size(-1))
            flat_target = tgt[:, 1:].reshape(-1)

            loss = criterion(flat_logits, flat_target)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            batch_loss = loss.item()
            batch_losses.append(batch_loss)
            progress.set_postfix(loss=batch_loss)

        avg_loss = sum(batch_losses) / len(batches)
        print(f"Epoch {epoch+1}: Avg Loss = {avg_loss:.4f}")

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# A single pass over the training data.
train(model, dataset, batch_size=32, epochs=1, lr=1e-4, device=device)

## Inference

In [17]:
def translate(model, sentence, en_vocab_dict, ge_vocab_dict, device='cpu', max_len=50):
    """Greedily decode a German character sequence for an English sentence.

    Args:
        model: object exposing ``encoder_embedding``, ``encoder_pos``,
            ``encoder_layers``, ``decoder_embedding``, ``decoder_pos``,
            ``decoder_layers`` and ``output_projection``; it is expected to
            already live on ``device``.
        sentence: source text. It is split into characters exactly as given,
            whereas the training data goes through ``clean_text`` first, so
            callers should normalise it the same way. Characters missing from
            ``en_vocab_dict`` are mapped to id 0 (padding).
        en_vocab_dict: source character -> id mapping.
        ge_vocab_dict: target character -> id mapping; must contain '<' and '>'.
        device: device on which the input tensors are created.
        max_len: maximum number of characters to generate.

    Returns:
        The decoded string, without the start marker and without the end
        marker. When ``max_len`` is reached before the model emits '>', every
        generated character is kept.
    """
    sos_id = ge_vocab_dict['<']
    eos_id = ge_vocab_dict['>']
    idx2char = {idx: ch for ch, idx in ge_vocab_dict.items()}

    model.eval()
    with torch.inference_mode():
        tokens = ['<'] + list(sentence) + ['>']
        token_ids = [en_vocab_dict.get(tok, 0) for tok in tokens]
        src_tensor = torch.tensor([token_ids], device=device)

        # Encode the source once; every decoding step reuses the result.
        enc_out = model.encoder_pos(model.encoder_embedding(src_tensor))
        for layer in model.encoder_layers:
            enc_out = layer(enc_out)

        dec_input = torch.tensor([[sos_id]], device=device)
        # Generated ids are collected separately from dec_input so the result
        # never depends on slicing off a trailing token that may not be the
        # end marker, and so max_len=0 cleanly yields an empty string.
        generated = []

        for _ in range(max_len):
            dec_out = model.decoder_pos(model.decoder_embedding(dec_input))
            for layer in model.decoder_layers:
                dec_out = layer(dec_out, enc_out)

            logits = model.output_projection(dec_out)
            # Greedy choice at the last decoded position.
            next_token = logits[0, -1].argmax(-1).item()

            if next_token == eos_id:
                break

            generated.append(next_token)
            step = torch.tensor([[next_token]], device=device)
            dec_input = torch.cat([dec_input, step], dim=1)

    return ''.join(idx2char[tok] for tok in generated)