In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import math
import sentencepiece as spm
import wandb
from torch.utils.data import Dataset, DataLoader
from collections import defaultdict


In [None]:
# ======================
# 1. Embedding + Positional Encoding
# ======================

class Embedder(nn.Module):
    """Learned lookup table mapping integer token ids to ``d_model``-sized vectors."""

    def __init__(self, vocab_size, d_model):
        """Create an embedding table with ``vocab_size`` rows of width ``d_model``."""
        super().__init__()
        self.embed = nn.Embedding(num_embeddings=vocab_size, embedding_dim=d_model)

    def forward(self, x):
        """Return the embedding rows selected by the integer ids in ``x``."""
        token_vectors = self.embed(x)
        return token_vectors


class PositionalEncoder(nn.Module):
    """Adds fixed sinusoidal position information to a batch of embeddings.

    For position ``pos`` and even column ``j`` the table holds
    ``sin(pos / 10000 ** (j / d_model))`` in column ``j`` and the cosine of the
    same angle in column ``j + 1``, so each sin/cos pair shares one frequency.

    The table is registered as the buffer ``pe`` with shape
    ``(1, max_seq_len, d_model)``; it is not a trainable parameter.
    """

    def __init__(self, d_model, max_seq_len=200):
        """Precompute the encoding table.

        Args:
            d_model: width of the embeddings the table is added to (odd
                values are supported; the last column is then a sine).
            max_seq_len: number of positions to precompute.
        """
        super().__init__()

        positions = torch.arange(max_seq_len, dtype=torch.float32).unsqueeze(1)
        even_cols = torch.arange(0, d_model, 2, dtype=torch.float32)
        # 10000 ** (-j / d_model), computed in log space for numerical stability.
        inv_freq = torch.exp(even_cols * (-math.log(10000.0) / d_model))
        angles = positions * inv_freq  # (max_seq_len, ceil(d_model / 2))

        pe = torch.zeros(max_seq_len, d_model)
        pe[:, 0::2] = torch.sin(angles)
        # With an odd d_model there is one fewer cosine column than sine columns.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])

        self.register_buffer('pe', pe.unsqueeze(0))

    def forward(self, x):
        """Return ``x`` plus the encodings for its first ``x.size(1)`` positions.

        Raises:
            ValueError: if ``x`` has more positions than were precomputed.
        """
        seq_len = x.size(1)
        if seq_len > self.pe.size(1):
            raise ValueError(
                f"sequence length {seq_len} exceeds max_seq_len {self.pe.size(1)}"
            )
        return x + self.pe[:, :seq_len]



In [None]:
# ======================
# 2. Multi-head Attention
# ======================

def attention(q, k, v, mask=None, dropout=None):
    """Scaled dot-product attention.

    Args:
        q, k, v: query, key and value tensors; ``q`` and ``k`` are contracted
            over their last dimension, whose size also sets the scale.
        mask: optional tensor broadcastable to the score matrix; entries equal
            to 0 are excluded by filling their score with ``-1e9``.
        dropout: optional callable applied to the attention weights.

    Returns:
        The attention-weighted combination of ``v``.
    """
    scale = math.sqrt(q.size(-1))
    logits = torch.matmul(q, k.transpose(-2, -1)) / scale

    if mask is not None:
        # A large negative score becomes (numerically) zero weight after softmax.
        logits = logits.masked_fill(mask == 0, -1e9)

    weights = F.softmax(logits, dim=-1)
    weights = dropout(weights) if dropout else weights

    return torch.matmul(weights, v)


class MultiHeadAttention(nn.Module):
    """Multi-head attention with learned query/key/value/output projections.

    The ``d_model`` features are split evenly across ``heads`` heads of width
    ``d_k = d_model // heads``; each head attends independently and the
    results are concatenated and passed through a final linear layer.
    """

    def __init__(self, heads, d_model, dropout=0.1):
        """Build the projections.

        Args:
            heads: number of attention heads (must be positive and divide
                ``d_model``).
            d_model: feature width of the inputs and outputs.
            dropout: dropout probability applied to the attention weights.

        Raises:
            ValueError: if ``heads`` is not positive or does not divide
                ``d_model``.
        """
        super().__init__()

        # Without this check the reshapes in forward() can succeed with the
        # wrong sequence length and silently mix features across tokens.
        if heads <= 0 or d_model % heads != 0:
            raise ValueError(
                f"d_model ({d_model}) must be divisible by a positive number of heads ({heads})"
            )

        self.d_model = d_model
        self.d_k = d_model // heads
        self.h = heads

        self.q_linear = nn.Linear(d_model, d_model)
        self.k_linear = nn.Linear(d_model, d_model)
        self.v_linear = nn.Linear(d_model, d_model)

        self.dropout = nn.Dropout(dropout)
        self.out = nn.Linear(d_model, d_model)

    def forward(self, q, k, v, mask=None):
        """Attend from ``q`` to ``k``/``v`` and return a ``d_model``-wide result.

        Args:
            q, k, v: tensors whose first dimension is the batch and whose
                last dimension is ``d_model``.
            mask: optional mask forwarded unchanged to ``attention``.
        """
        bs = q.size(0)

        # Project, then reshape to (batch, heads, seq, d_k).
        q = self.q_linear(q).view(bs, -1, self.h, self.d_k).transpose(1, 2)
        k = self.k_linear(k).view(bs, -1, self.h, self.d_k).transpose(1, 2)
        v = self.v_linear(v).view(bs, -1, self.h, self.d_k).transpose(1, 2)

        # Per-head scaled dot-product attention.
        scores = attention(q, k, v, mask, self.dropout)

        # Move heads back next to d_k and merge them: (batch, seq, d_model).
        concat = scores.transpose(1, 2).contiguous().view(bs, -1, self.d_model)

        return self.out(concat)



In [None]:
# ======================
# 3. Feed Forward Network
# ======================

class FeedForward(nn.Module):
    """Position-wise two-layer MLP: expand to ``d_ff``, ReLU, dropout, project back."""

    def __init__(self, d_model, d_ff=2048, dropout=0.1):
        """Create the expanding and contracting linear layers plus dropout."""
        super().__init__()
        self.linear_1 = nn.Linear(in_features=d_model, out_features=d_ff)
        self.dropout = nn.Dropout(p=dropout)
        self.linear_2 = nn.Linear(in_features=d_ff, out_features=d_model)

    def forward(self, x):
        """Apply the MLP to the last dimension of ``x``."""
        hidden = F.relu(self.linear_1(x))
        hidden = self.dropout(hidden)
        return self.linear_2(hidden)


# ======================
# 4. Normalization Layer
# ======================

class Norm(nn.Module):
    """Layer normalisation over the last dimension with a learned scale and shift.

    Each feature vector is centred on its mean and divided by its standard
    deviation (``Tensor.std`` with its default settings) plus ``eps``.
    """

    def __init__(self, d_model, eps=1e-6):
        """Create the per-feature scale ``alpha`` (ones) and shift ``bias`` (zeros)."""
        super().__init__()
        self.size = d_model

        self.alpha = nn.Parameter(torch.ones(self.size))
        self.bias = nn.Parameter(torch.zeros(self.size))
        self.eps = eps

    def forward(self, x):
        """Normalise ``x`` along its last dimension."""
        centered = x - x.mean(-1, keepdim=True)
        # eps is added to the standard deviation itself, not to the variance.
        spread = x.std(-1, keepdim=True) + self.eps
        return self.alpha * centered / spread + self.bias


# ======================
# 5. Encoder Layer
# ======================

class EncoderLayer(nn.Module):
    """One pre-norm encoder block: self-attention then feed-forward, each with a residual."""

    def __init__(self, d_model, heads, dropout=0.1):
        """Build the two normalisations, the attention and feed-forward sub-layers, and their dropouts."""
        super().__init__()
        self.norm_1 = Norm(d_model)
        self.norm_2 = Norm(d_model)

        self.attention = MultiHeadAttention(heads, d_model, dropout=dropout)
        self.ff = FeedForward(d_model, dropout=dropout)

        self.dropout_1 = nn.Dropout(dropout)
        self.dropout_2 = nn.Dropout(dropout)

    def forward(self, x, mask):
        """Run ``x`` through the block; ``mask`` is passed to the self-attention."""
        # Sub-layer 1: normalise, self-attend, add back onto the residual stream.
        normed = self.norm_1(x)
        attended = self.attention(normed, normed, normed, mask)
        x = x + self.dropout_1(attended)

        # Sub-layer 2: normalise, feed-forward, add back.
        transformed = self.ff(self.norm_2(x))
        return x + self.dropout_2(transformed)


# ======================
# 6. Decoder Layer
# ======================

class DecoderLayer(nn.Module):
    """One pre-norm decoder block.

    Three residual sub-layers in order: masked self-attention over the target,
    attention from the target onto the encoder output, and a feed-forward
    network.
    """

    def __init__(self, d_model, heads, dropout=0.1):
        """Build the sub-layers.

        Args:
            d_model: feature width.
            heads: number of attention heads.
            dropout: dropout probability used for every dropout in the block,
                including the attention-weight dropout inside both attention
                modules.
        """
        super().__init__()

        self.norm_1 = Norm(d_model)
        self.norm_2 = Norm(d_model)
        self.norm_3 = Norm(d_model)

        # Forward the configured rate so the attention modules do not fall
        # back to their own default when a different ``dropout`` is requested.
        self.attn_1 = MultiHeadAttention(heads, d_model, dropout=dropout)
        self.attn_2 = MultiHeadAttention(heads, d_model, dropout=dropout)

        self.ff = FeedForward(d_model, dropout=dropout)

        self.dropout_1 = nn.Dropout(dropout)
        self.dropout_2 = nn.Dropout(dropout)
        self.dropout_3 = nn.Dropout(dropout)

    def forward(self, x, enc_out, src_mask, tgt_mask):
        """Run the target states ``x`` through the block.

        Args:
            x: target-side hidden states.
            enc_out: encoder output, used as keys and values in the second
                attention.
            src_mask: mask applied when attending to ``enc_out``.
            tgt_mask: mask applied in the target self-attention.
        """
        # Self-attention over the target sequence.
        x2 = self.norm_1(x)
        x = x + self.dropout_1(self.attn_1(x2, x2, x2, tgt_mask))

        # Queries come from the target; keys and values from the encoder.
        x2 = self.norm_2(x)
        x = x + self.dropout_2(self.attn_2(x2, enc_out, enc_out, src_mask))

        x2 = self.norm_3(x)
        x = x + self.dropout_3(self.ff(x2))

        return x


# ======================
# 7. Full Encoder
# ======================

class Encoder(nn.Module):
    """Stack of ``N`` encoder layers on top of token embeddings and positional encodings."""

    def __init__(self, vocab_size, d_model, N, heads, dropout=0.1):
        """Build the embedding, the positional encoder, ``N`` layers and a final norm."""
        super().__init__()

        self.N = N
        self.embed = Embedder(vocab_size, d_model)
        self.pe = PositionalEncoder(d_model)

        blocks = [EncoderLayer(d_model, heads, dropout) for _ in range(N)]
        self.layers = nn.ModuleList(blocks)

        self.norm = Norm(d_model)

    def forward(self, src, mask):
        """Encode the token ids ``src``; ``mask`` is handed to every layer."""
        hidden = self.pe(self.embed(src))

        for layer in self.layers:
            hidden = layer(hidden, mask)

        return self.norm(hidden)


# ======================
# 8. Full Decoder
# ======================

class Decoder(nn.Module):
    """Stack of ``N`` decoder layers on top of token embeddings and positional encodings."""

    def __init__(self, vocab_size, d_model, N, heads, dropout=0.1):
        """Build the embedding, the positional encoder, ``N`` layers and a final norm."""
        super().__init__()

        self.N = N
        self.embed = Embedder(vocab_size, d_model)
        self.pe = PositionalEncoder(d_model)

        blocks = [DecoderLayer(d_model, heads, dropout) for _ in range(N)]
        self.layers = nn.ModuleList(blocks)

        self.norm = Norm(d_model)

    def forward(self, tgt, enc_out, src_mask, tgt_mask):
        """Decode the token ids ``tgt`` against ``enc_out``; both masks go to every layer."""
        hidden = self.pe(self.embed(tgt))

        for layer in self.layers:
            hidden = layer(hidden, enc_out, src_mask, tgt_mask)

        return self.norm(hidden)



In [None]:

# ======================
# 9. Seq2Seq Transformer
# ======================

class Transformer(nn.Module):
    """Encoder-decoder model that maps source ids and target ids to target-vocabulary logits."""

    def __init__(self, src_vocab, tgt_vocab, d_model=256, N=6, heads=8, dropout=0.1):
        """Build an encoder, a decoder and the linear projection onto ``tgt_vocab`` logits."""
        super().__init__()
        stack_kwargs = dict(d_model=d_model, N=N, heads=heads, dropout=dropout)
        self.encoder = Encoder(src_vocab, **stack_kwargs)
        self.decoder = Decoder(tgt_vocab, **stack_kwargs)
        self.out = nn.Linear(d_model, tgt_vocab)

    def forward(self, src, tgt, src_mask, tgt_mask):
        """Encode ``src``, decode ``tgt`` against it, and return unnormalised logits."""
        memory = self.encoder(src, src_mask)
        decoded = self.decoder(tgt, memory, src_mask, tgt_mask)
        return self.out(decoded)


### SENTENCEPIECE TOKENIZER

In [None]:
class SentencePieceTokenizer:
    """Small wrapper that trains a SentencePiece BPE model and encodes with BOS/EOS markers."""

    # Special-token ids handed to the trainer and used when framing sequences.
    _PAD_ID = 0
    _BOS_ID = 1
    _EOS_ID = 2
    _UNK_ID = 3

    def __init__(self, model_prefix, vocab_size=8000):
        """Remember the file prefix and target vocabulary size; no model is loaded yet."""
        self.model = None
        self.model_prefix = model_prefix
        self.vocab_size = vocab_size

    def train(self, texts):
        """Train a BPE model on ``texts`` and load it into ``self.model``.

        The stripped texts are written one per line to ``<model_prefix>.txt``,
        which is then used as the trainer input.
        """
        corpus_path = f"{self.model_prefix}.txt"
        with open(corpus_path, "w", encoding="utf8") as corpus:
            corpus.writelines(line.strip() + "\n" for line in texts)

        spm.SentencePieceTrainer.train(
            input=corpus_path,
            model_prefix=self.model_prefix,
            vocab_size=self.vocab_size,
            character_coverage=1.0,
            model_type="bpe",
            bos_id=self._BOS_ID,
            eos_id=self._EOS_ID,
            pad_id=self._PAD_ID,
            unk_id=self._UNK_ID,
        )

        processor = spm.SentencePieceProcessor()
        processor.load(f"{self.model_prefix}.model")
        self.model = processor

    def encode(self, text, max_len=100):
        """Return ``[BOS] + ids + [EOS]`` for ``text``.

        Only the piece ids are truncated to ``max_len``; the two markers are
        added afterwards, so the result has at most ``max_len + 2`` entries.
        """
        piece_ids = self.model.encode(text, out_type=int)[:max_len]
        return [self._BOS_ID] + piece_ids + [self._EOS_ID]

    def decode(self, ids):
        """Turn a sequence of ids back into text using the loaded model."""
        return self.model.decode(ids)

    def vocab_size_(self):
        """Return the number of pieces in the loaded model."""
        return self.model.get_piece_size()


### DATASET + COLLATE

In [None]:
class NMTDataset(Dataset):
    """Parallel-text dataset that tokenises each sentence pair on access."""

    def __init__(self, src_texts, tgt_texts, src_tok, tgt_tok, max_len=100):
        """Store the raw texts, one tokenizer per side, and the ``max_len`` passed to ``encode``."""
        self.src, self.tgt = src_texts, tgt_texts
        self.src_tok, self.tgt_tok = src_tok, tgt_tok
        self.max_len = max_len

    def __len__(self):
        """Number of examples, taken from the source side."""
        return len(self.src)

    def __getitem__(self, idx):
        """Return the ``idx``-th pair as two 1-D ``LongTensor``s of token ids."""
        encoded = (
            self.src_tok.encode(self.src[idx], self.max_len),
            self.tgt_tok.encode(self.tgt[idx], self.max_len),
        )
        src_tensor, tgt_tensor = (torch.LongTensor(ids) for ids in encoded)
        return src_tensor, tgt_tensor

def collate_batch(batch):
    """Stack variable-length (src, tgt) pairs into two right-padded batch-first tensors.

    Shorter sequences are padded with 0.
    """
    def _pad(sequences):
        return nn.utils.rnn.pad_sequence(sequences, batch_first=True, padding_value=0)

    src_seqs, tgt_seqs = zip(*batch)
    return _pad(src_seqs), _pad(tgt_seqs)

def make_src_mask(src):
    """Return a boolean mask that is True where ``src`` is not the padding id 0.

    Two singleton axes are inserted after the batch axis so the mask
    broadcasts against per-head attention scores.
    """
    not_pad = src.ne(0)
    return not_pad[:, None, None]

def make_tgt_mask(tgt):
    """Combine a padding mask with a causal (lower-triangular) mask for ``tgt``.

    An entry is True only when the attended-to token is not the padding id 0
    and its position is not after the attending position.
    """
    steps = torch.arange(tgt.size(1), device=tgt.device)
    # causal[i, j] is True when position j is at or before position i.
    causal = steps.unsqueeze(0) <= steps.unsqueeze(1)
    not_pad = tgt.ne(0)[:, None, None]
    return not_pad & causal


### TRAINING + PPL + W&B

In [None]:
from zmq import device


def validate(model, loader, criterion, device=None):
    """Evaluate ``model`` on ``loader`` and return ``(avg_loss, perplexity)``.

    Args:
        model: the seq2seq module; it is switched to eval mode.
        loader: iterable of ``(src, tgt)`` id batches.
        criterion: loss called as ``criterion(logits, targets)`` on flattened
            logits and targets.
        device: where to move each batch. When omitted, the device of the
            model's first parameter is used (CPU if it has none), so the
            function does not depend on a module-level ``device`` name.

    Returns:
        The mean of the per-batch losses and ``exp`` of that mean
        (``inf`` if the exponential overflows).

    Raises:
        ValueError: if ``loader`` yields no batches.
    """
    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device("cpu")

    model.eval()
    total_loss = 0.0
    num_batches = 0
    with torch.no_grad():
        for src, tgt in loader:
            src, tgt = src.to(device), tgt.to(device)

            # Teacher forcing: feed all but the last token, predict all but the first.
            tgt_in = tgt[:, :-1]
            tgt_out = tgt[:, 1:]

            src_mask = make_src_mask(src)
            tgt_mask = make_tgt_mask(tgt_in)

            logits = model(src, tgt_in, src_mask, tgt_mask)
            logits = logits.reshape(-1, logits.size(-1))
            tgt_out = tgt_out.reshape(-1)

            total_loss += criterion(logits, tgt_out).item()
            num_batches += 1

    if num_batches == 0:
        raise ValueError("validate() received an empty loader; nothing to evaluate")

    # Note: this averages batch losses, so every batch counts equally
    # regardless of how many tokens it contains.
    avg_loss = total_loss / num_batches
    try:
        ppl = math.exp(avg_loss)
    except OverflowError:
        ppl = float("inf")
    return avg_loss, ppl


def train_pipeline(train_src, train_tgt, val_src, val_tgt,
                   vocab_size=8000, batch_size=32, epochs=20,
                   d_model=256, n_layers=6, heads=8, lr=1e-4, device=None):
    """Train tokenizers and a Transformer on a parallel corpus, logging to W&B.

    Steps: start a W&B run, train one SentencePiece tokenizer per language on
    the training texts, build the data loaders and the model, then run
    ``epochs`` epochs of teacher-forced training with Adam. After each epoch
    the model is validated, metrics are logged and printed, and the weights
    are written to ``best_model.pt`` whenever validation loss improves.

    Args:
        train_src, train_tgt: training sentences for the two languages.
        val_src, val_tgt: validation sentences for the two languages.
        vocab_size: SentencePiece vocabulary size for each language.
        batch_size: examples per batch.
        epochs: number of passes over the training data.
        d_model: model width.
        n_layers: number of encoder layers and of decoder layers.
        heads: attention heads per layer.
        lr: Adam learning rate.
        device: torch device to train on; defaults to CUDA when available,
            otherwise CPU.

    Returns:
        ``(model, src_tok, tgt_tok)`` where ``model`` holds the weights from
        the final epoch (not necessarily the best checkpoint).
    """
    # Resolve the device locally instead of relying on a notebook global.
    if device is None:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # ====== WANDB ======
    # The logged config is built from the same values used to build the model.
    wandb.init(project="transformer-nmt", config={
        "vocab_size": vocab_size,
        "d_model": d_model,
        "heads": heads,
        "layers": n_layers,
        "epochs": epochs,
        "batch_size": batch_size,
        "lr": lr,
    })

    # ====== TOKENIZER SP ======
    src_tok = SentencePieceTokenizer("sp_src", vocab_size)
    tgt_tok = SentencePieceTokenizer("sp_tgt", vocab_size)
    src_tok.train(train_src)
    tgt_tok.train(train_tgt)

    # ====== DATASET ======
    train_ds = NMTDataset(train_src, train_tgt, src_tok, tgt_tok)
    val_ds = NMTDataset(val_src, val_tgt, src_tok, tgt_tok)

    train_loader = DataLoader(train_ds, batch_size=batch_size,
                              shuffle=True, collate_fn=collate_batch)
    val_loader = DataLoader(val_ds, batch_size=batch_size,
                            shuffle=False, collate_fn=collate_batch)

    # ====== MODEL ======
    # Vocabulary sizes come from the trained tokenizers rather than the request.
    model = Transformer(
        src_tok.vocab_size_(),
        tgt_tok.vocab_size_(),
        d_model=d_model,
        N=n_layers,
        heads=heads
    ).to(device)

    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    # Id 0 is the padding id, so padded target positions do not contribute.
    criterion = nn.CrossEntropyLoss(ignore_index=0)

    best_loss = float("inf")

    # ====== TRAIN LOOP ======
    for ep in range(epochs):
        model.train()
        total_loss = 0

        for src, tgt in train_loader:
            src, tgt = src.to(device), tgt.to(device)

            # Teacher forcing: input drops the last token, target drops the first.
            tgt_in = tgt[:, :-1]
            tgt_out = tgt[:, 1:]

            src_mask = make_src_mask(src)
            tgt_mask = make_tgt_mask(tgt_in)

            logits = model(src, tgt_in, src_mask, tgt_mask)
            loss = criterion(
                logits.reshape(-1, logits.size(-1)),
                tgt_out.reshape(-1)
            )

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        train_loss = total_loss / len(train_loader)

        # ====== VALIDATION ======
        val_loss, ppl = validate(model, val_loader, criterion)

        # ====== LOG W&B ======
        wandb.log({
            "train_loss": train_loss,
            "val_loss": val_loss,
            "perplexity": ppl
        })

        print(f"[Epoch {ep+1}] Train={train_loss:.3f}  Val={val_loss:.3f}  PPL={ppl:.2f}")

        # ====== SAVE BEST ======
        if val_loss < best_loss:
            best_loss = val_loss
            torch.save(model.state_dict(), "best_model.pt")

    return model, src_tok, tgt_tok


In [None]:
import os

def _read_lines(file_path):
    """Read a UTF-8 text file and return its lines without line terminators."""
    # The context manager closes the handle even if reading fails.
    with open(file_path, encoding="utf8") as handle:
        return handle.read().splitlines()


def load_iwslt15_text(path):
    """Load the English/Vietnamese train, dev and test splits from ``path``.

    Expects ``train``, ``tst2012`` (dev) and ``tst2013`` (test) files with
    ``.en`` and ``.vi`` extensions inside ``path``. Prints the number of
    English lines in each split.

    Returns:
        ``((train_en, train_vi), (dev_en, dev_vi), (test_en, test_vi))`` where
        each element is a list of lines.
    """
    def _load_pair(stem):
        return (
            _read_lines(os.path.join(path, f"{stem}.en")),
            _read_lines(os.path.join(path, f"{stem}.vi")),
        )

    train_en, train_vi = _load_pair("train")
    dev_en, dev_vi = _load_pair("tst2012")
    test_en, test_vi = _load_pair("tst2013")

    print("Loaded IWSLT15:")
    print(" - Train:", len(train_en))
    print(" - Dev  :", len(dev_en))
    print(" - Test :", len(test_en))

    return (train_en, train_vi), (dev_en, dev_vi), (test_en, test_vi)


In [None]:
# Use the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(device)



cuda


In [None]:

# Load the parallel corpus; the test split is not needed for training.
train_pair, dev_pair, _ = load_iwslt15_text("/content/drive/MyDrive/Assignment_nlp/data")
train_en, train_vi = train_pair
dev_en, dev_vi = dev_pair

# Train on the training split, validating on the dev split after each epoch.
model, sp_src, sp_tgt = train_pipeline(train_en, train_vi, dev_en, dev_vi, vocab_size=8000)

Loaded IWSLT15:
 - Train: 133317
 - Dev  : 1553
 - Test : 1268


  | |_| | '_ \/ _` / _` |  _/ -_)
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33m23020424[0m ([33mvuminhson-vietnam-national-university-hanoi[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


[Epoch 1] Train=4.546  Val=3.812  PPL=45.26
[Epoch 2] Train=3.562  Val=3.336  PPL=28.11
[Epoch 3] Train=3.144  Val=3.068  PPL=21.49
[Epoch 4] Train=2.858  Val=2.906  PPL=18.28
[Epoch 5] Train=2.642  Val=2.779  PPL=16.10
[Epoch 6] Train=2.474  Val=2.683  PPL=14.63
[Epoch 7] Train=2.339  Val=2.616  PPL=13.68
[Epoch 8] Train=2.229  Val=2.602  PPL=13.50
[Epoch 9] Train=2.135  Val=2.569  PPL=13.06
[Epoch 10] Train=2.054  Val=2.543  PPL=12.72
[Epoch 11] Train=1.982  Val=2.525  PPL=12.49
[Epoch 12] Train=1.918  Val=2.519  PPL=12.42
[Epoch 13] Train=1.859  Val=2.516  PPL=12.38
[Epoch 14] Train=1.807  Val=2.513  PPL=12.34
[Epoch 15] Train=1.758  Val=2.532  PPL=12.58
[Epoch 16] Train=1.712  Val=2.530  PPL=12.55
[Epoch 17] Train=1.669  Val=2.525  PPL=12.49
[Epoch 18] Train=1.631  Val=2.542  PPL=12.71
[Epoch 19] Train=1.594  Val=2.555  PPL=12.87
[Epoch 20] Train=1.559  Val=2.579  PPL=13.18


In [None]:
(train_en, train_vi), (dev_en, dev_vi), (test_en, test_vi) = load_iwslt15_text("/content/drive/MyDrive/Assignment_nlp/data")

# train_pipeline returns the last-epoch weights, but it saves the weights with
# the lowest validation loss to "best_model.pt". Restore that checkpoint so the
# test metrics describe the selected model rather than the final epoch.
model.load_state_dict(torch.load("best_model.pt", map_location=device))

# Reuse the tokenizers fitted during training so ids match the model's vocabularies.
test_ds = NMTDataset(test_en, test_vi, sp_src, sp_tgt)
test_loader = DataLoader(test_ds, batch_size=32, shuffle=False, collate_fn=collate_batch)

# Same loss configuration as training: padding id 0 is ignored.
test_loss, test_ppl = validate(model, test_loader, nn.CrossEntropyLoss(ignore_index=0))

print("TEST LOSS:", test_loss)
print("TEST PPL:", test_ppl)



Loaded IWSLT15:
 - Train: 133317
 - Dev  : 1553
 - Test : 1268
TEST LOSS: 2.4414097666740417
TEST PPL: 11.489226459645373
