In [None]:
!pip install torchdata 



## Data Pre-processing



In [None]:
import torch

# Run on the GPU whenever CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = torch.device("cuda")
else:
    DEVICE = torch.device("cpu")

print(f"Pytorch version is:  {torch.__version__}")
print(f"You are using:  {DEVICE}")

Pytorch version is:  1.11.0+cu113
You are using:  cuda


In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch import Tensor

from torchtext.datasets import Multi30k

from typing import Tuple, List

import random
import math
import os
import time

# Single seed shared by every random number generator used in this notebook.
SEED = 1

# Seed Python's `random` module and PyTorch's generator so runs are repeatable.
random.seed(SEED)
torch.manual_seed(SEED)
# cuDNN is switched off entirely; the deterministic flag is set as well so that
# re-enabling cuDNN later would still request deterministic kernels.
torch.backends.cudnn.enabled = False
torch.backends.cudnn.deterministic = True

class Placeholder:
    """Marker object whose ``DO`` attribute fails loudly when evaluated.

    Reading ``TO.DO`` anywhere in the notebook raises ``NotImplementedError``,
    flagging a part of the assignment that still has to be written.
    """

    @property
    def DO(self):
        """Always raise ``NotImplementedError``; there is no value to return."""
        message = "You haven't yet implemented this part of the assignment yet"
        raise NotImplementedError(message)


# Shared instance so unfinished code can be written as ``TO.DO``.
TO = Placeholder()

In [None]:
%%capture
! python -m spacy download en
! python -m spacy download de
from torchtext.data.utils import get_tokenizer
de_tokenizer = get_tokenizer('spacy', language='de')
en_tokenizer = get_tokenizer('spacy', language='en')

In [None]:
from torchtext.vocab import build_vocab_from_iterator

train_data, valid_data, test_data = Multi30k()

# Special tokens are listed first so they occupy the lowest vocabulary indices.
specials = ["<unk>", "<pad>", "<bos>", "<eos>"]

# Element 0 of each training pair feeds the German vocabulary ...
de_generator = map(
    lambda pair: de_tokenizer(pair[0].strip().lower()),
    Multi30k(split="train"),
)
de_vocab = build_vocab_from_iterator(de_generator, specials=specials, min_freq=2)

# ... and element 1 feeds the English vocabulary.
en_generator = map(
    lambda pair: en_tokenizer(pair[1].strip().lower()),
    Multi30k(split="train"),
)
en_vocab = build_vocab_from_iterator(en_generator, specials=specials, min_freq=2)

# Any token missing from a vocabulary is mapped to that vocabulary's <unk> index.
for vocab in [de_vocab, en_vocab]:
    unk_index = vocab["<unk>"]
    vocab.set_default_index(unk_index)



In [None]:
BOS_IDX = de_vocab['<bos>']
EOS_IDX = de_vocab['<eos>']

def data_process(raw_dataset) -> List[Tuple[Tensor, Tensor]]:
    """Turn raw (German, English) sentence pairs into index tensors.

    Each sentence is stripped, lower-cased, tokenized with the tokenizer of
    its own language, wrapped in ``<bos>`` / ``<eos>`` and encoded with the
    matching vocabulary.

    Args:
        raw_dataset: iterable of pairs whose element 0 is the German sentence
            and element 1 the English sentence.

    Returns:
        A list with one ``(german_ids, english_ids)`` tuple of 1-D
        ``torch.long`` tensors per input pair.
    """
    ret = []
    for pair in raw_dataset:
        d_tokens = de_tokenizer(pair[0].strip().lower())
        # English text must go through the English tokenizer: en_vocab was
        # built from en_tokenizer output, so the German tokenizer would
        # produce tokens that fall back to <unk> more often than necessary.
        e_tokens = en_tokenizer(pair[1].strip().lower())

        # Wrap both sentences in the sequence boundary markers.
        d_tokens = ['<bos>', *d_tokens, '<eos>']
        e_tokens = ['<bos>', *e_tokens, '<eos>']

        d_tens = torch.tensor([de_vocab[token] for token in d_tokens], dtype=torch.long)
        e_tens = torch.tensor([en_vocab[token] for token in e_tokens], dtype=torch.long)
        ret.append((d_tens, e_tens))

    return ret

train_data, valid_data, test_data = Multi30k()
train_data_processed = data_process(train_data)
valid_data_processed = data_process(valid_data)
test_data_processed = data_process(test_data)



In [None]:
# Making sure German isn't reversed
de_itos = de_vocab.get_itos()
en_itos = en_vocab.get_itos()
de_encoded, en_encoded = train_data_processed[0]
print(" ".join([de_itos[item] for item in de_encoded]))
print(" ".join([en_itos[item] for item in en_encoded]))

<bos> zwei junge weiße männer sind im freien in der nähe vieler büsche . <eos>
<bos> two young , white males are outside near many bushes . <eos>


In [None]:
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader

BATCH_SIZE = 128
PAD_IDX = de_vocab['<pad>']

def collate_fn(data_batch) -> Tuple[Tensor, Tensor]:
    """Pad a list of encoded sentence pairs into two batch-first tensors.

    Args:
        data_batch: sequence of ``(german_ids, english_ids)`` tensor pairs.
            The pairs are used as-is; no boundary tokens are added here.

    Returns:
        ``(de_batch, en_batch)``: both padded with ``PAD_IDX`` to the longest
        sequence of their own language, laid out batch-first and moved to
        ``DEVICE``.
    """
    de_batch = [pair[0] for pair in data_batch]
    en_batch = [pair[1] for pair in data_batch]
    # pad_sequence already returns a fresh tensor; wrapping it in
    # torch.tensor() would only add a copy and a UserWarning.
    de_batch = pad_sequence(de_batch, padding_value=PAD_IDX, batch_first=True).to(DEVICE)
    en_batch = pad_sequence(en_batch, padding_value=PAD_IDX, batch_first=True).to(DEVICE)
    return de_batch, en_batch

# Only the training data is shuffled. Evaluation loaders keep a fixed order so
# the batch-averaged validation/test loss is identical from run to run.
train_dl = DataLoader(
    train_data_processed,
    batch_size=BATCH_SIZE,
    shuffle=True,
    collate_fn=collate_fn,
)
valid_dl = DataLoader(
    valid_data_processed,
    batch_size=BATCH_SIZE,
    shuffle=False,
    collate_fn=collate_fn,
)
test_dl = DataLoader(
    test_data_processed,
    batch_size=BATCH_SIZE,
    shuffle=False,
    collate_fn=collate_fn,
)

# Transformer Implementation

In [None]:
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position information to embeddings, then apply dropout.

    Args:
        d_model: size of the embedding dimension (even columns hold the sine
            terms, odd columns the cosine terms).
        max_len: number of positions for which encodings are precomputed.
        dropout: dropout probability applied after the addition.
    """

    def __init__(
        self,
        d_model: int,
        max_len: int = 512,
        dropout: float = 0.1,
    ):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)
        # Filled with a sentinel value; the two assignments below overwrite
        # the even and the odd columns respectively.
        pe = torch.full((max_len, d_model), -100.0)
        pos = torch.arange(max_len).unsqueeze(1)
        indxs = torch.arange(d_model // 2).unsqueeze(0) * 2
        denom = 10000 ** (indxs / d_model)
        pe[:, ::2] = torch.sin(pos / denom)
        pe[:, 1::2] = torch.cos(pos / denom)
        # A non-persistent buffer follows the module through .to()/.cuda()
        # without adding a key to state_dict().
        self.register_buffer("pe", pe.unsqueeze(0), persistent=False)

    def forward(self, embed):
        """Return ``dropout(embed + pe[:, :seq_len])``.

        Args:
            embed: tensor indexed as (batch, seq_len, d_model).

        Raises:
            ValueError: if ``seq_len`` exceeds the precomputed ``max_len``.
        """
        seq_len = embed.size(1)
        if seq_len > self.pe.size(1):
            raise ValueError(
                f"sequence length {seq_len} exceeds max_len {self.pe.size(1)}"
            )
        # Slice the first seq_len positions so each token receives the
        # encoding of its own position (a plain index would pick one position
        # and broadcast it to every token).
        positions = self.pe[:, :seq_len].to(embed.device)
        return self.dropout(embed + positions)

In [None]:
# Large negative number added to attention scores that must end up with
# (effectively) zero probability after the softmax.
LOG_PROB_ZERO = -1e9


def get_padding_mask(seq, pad_idx=0, dtype=torch.float32):
    """Build an additive mask that suppresses padding positions.

    Entries equal to ``pad_idx`` become ``LOG_PROB_ZERO`` and all others zero.
    Two singleton dimensions are inserted at position 1 and the result is
    moved to ``DEVICE``.
    """
    is_pad = (seq == pad_idx).to(dtype)
    additive = is_pad * LOG_PROB_ZERO
    return additive.unsqueeze(1).unsqueeze(1).to(DEVICE)


def get_lookahead_mask(n):
    """Return a ``(1, n, n)`` additive causal mask on ``DEVICE``.

    Entries on and below the diagonal are zero; entries strictly above it
    hold ``LOG_PROB_ZERO``.
    """
    penalties = torch.full((n, n), LOG_PROB_ZERO)
    # Keep the penalty only strictly above the diagonal; the rest becomes zero.
    causal = torch.triu(penalties, diagonal=1)
    return causal.unsqueeze(0).to(DEVICE)


In [None]:
def scaled_dot_product_attention(q, k, v, mask=None):
    """Compute softmax(q kᵀ / sqrt(d_k)) v.

    Args:
        q, k, v: query, key and value tensors; the last dimension of ``k``
            provides the scaling factor.
        mask: optional additive mask, added in place to the raw scores
            before the softmax.

    Returns:
        ``(output, weights)``: the weighted sum of values and the attention
        weights it was computed from.
    """
    scale = math.sqrt(k.size(-1))
    scores = torch.matmul(q, k.transpose(-2, -1)) / scale
    if mask is not None:
        scores.add_(mask)
    weights = F.softmax(scores, dim=-1)
    return torch.matmul(weights, v), weights

In [None]:
class MultiHeadAttention(nn.Module):
    """Multi-head attention built on ``scaled_dot_product_attention``.

    ``d_model`` must be divisible by ``n_heads``; each head works on
    ``d_model // n_heads`` features.
    """

    def __init__(self, d_model, n_heads):
        super().__init__()

        assert d_model % n_heads == 0
        self.d_model = d_model
        self.n_heads = n_heads
        self.depth = d_model // n_heads

        # Input projections for keys, values and queries.
        self.wk = nn.Linear(d_model, d_model)
        self.wv = nn.Linear(d_model, d_model)
        self.wq = nn.Linear(d_model, d_model)

        # Output projection applied to the re-assembled heads.
        self.fc = nn.Linear(d_model, d_model)

    def split_heads(self, x):
        """(batch, seq, d_model) -> (batch, n_heads, seq, depth)."""
        n_batch, n_tokens, _ = x.size()
        per_head = x.reshape(n_batch, n_tokens, self.n_heads, self.depth)
        return per_head.permute(0, 2, 1, 3)

    def merge_heads(self, x):
        """(batch, n_heads, seq, depth) -> (batch, seq, n_heads * depth)."""
        n_batch, _, n_tokens, _ = x.shape
        token_major = x.permute(0, 2, 1, 3)
        return token_major.reshape(n_batch, n_tokens, -1)

    def forward(self, q, k, v, masks):
        """Project the inputs, attend per head and project the merged result.

        Returns:
            ``(out, attn_weights)``: the output after ``fc`` and the per-head
            attention weights.
        """
        k_heads = self.split_heads(self.wk(k))
        v_heads = self.split_heads(self.wv(v))
        q_heads = self.split_heads(self.wq(q))
        context, attn_weights = scaled_dot_product_attention(
            q_heads, k_heads, v_heads, masks
        )
        out = self.fc(self.merge_heads(context))
        return out, attn_weights

In [None]:
class EncoderLayer(nn.Module):
    """One encoder block: self-attention then a position-wise feed-forward net.

    Each sub-layer is followed by dropout, a residual connection to that
    sub-layer's own input, and layer normalization.

    Args:
        num_heads: number of attention heads (must divide ``d_model``).
        d_model: feature size of the inputs and outputs.
        d_ff: hidden size of the feed-forward network.
        dropout: dropout probability used after each sub-layer.
    """

    def __init__(self, num_heads, d_model, d_ff, dropout):
        super().__init__()
        assert d_model % num_heads == 0
        self.mha = MultiHeadAttention(d_model, num_heads)
        self.mha_dropout = nn.Dropout(dropout)
        self.layer_norm1 = nn.LayerNorm(d_model)

        self.ff = nn.Sequential(
            nn.Linear(d_model, d_ff), nn.ReLU(), nn.Linear(d_ff, d_model)
        )
        self.ff_dropout = nn.Dropout(dropout)
        self.layer_norm2 = nn.LayerNorm(d_model)
        self.d_model = d_model

    def forward(self, x, padding_mask):
        """Apply the block to ``x``, using ``padding_mask`` in the attention."""
        # Sub-layer 1: self-attention with a residual to the layer input.
        identity = x
        x, _ = self.mha(x, x, x, padding_mask)
        x = self.mha_dropout(x)
        x = self.layer_norm1(x + identity)

        # Sub-layer 2: feed-forward. The residual must come from this
        # sub-layer's input (the normalized attention output), not from the
        # original layer input.
        identity = x
        x = self.ff(x)
        x = self.ff_dropout(x)
        x = self.layer_norm2(x + identity)
        return x

In [None]:
class Encoder(nn.Module):
    """Stack of ``n_layers`` encoder layers applied one after another.

    Args:
        num_heads, d_model, d_ff, dropout: forwarded to every ``EncoderLayer``.
        n_layers: number of layers in the stack.
    """

    def __init__(self, num_heads, d_model, d_ff, n_layers, dropout):
        super().__init__()
        self.layers = nn.ModuleList(
            [
                EncoderLayer(num_heads, d_model, d_ff, dropout)
                for _ in range(n_layers)
            ]
        )
        self.d_model = d_model

    def forward(self, x, padding_mask):
        """Run ``x`` through every layer, passing ``padding_mask`` to each.

        The batch dimension is not constrained: the stack works for any
        batch size, independent of the notebook's training configuration.
        """
        assert x.shape[2] == self.d_model, (
            f"expected feature size {self.d_model}, got {x.shape[2]}"
        )
        for layer in self.layers:
            x = layer(x, padding_mask)
        return x

In [None]:
class DecoderLayer(nn.Module):
    """One decoder block with three sub-layers.

    The sub-layers are masked self-attention, attention over the encoder
    output, and a feed-forward network. Each is followed by dropout, a
    residual connection to its own input, and layer normalization.
    """

    def __init__(self, num_heads, d_model, d_ff, dropout):
        super().__init__()
        assert d_model % num_heads == 0

        # Sub-layer 1: masked self-attention.
        self.masked_mha = MultiHeadAttention(d_model, num_heads)
        self.masked_mha_dropout = nn.Dropout(dropout)
        self.layer_norm1 = nn.LayerNorm(d_model)

        # Sub-layer 2: attention over the encoder output.
        self.mha = MultiHeadAttention(d_model, num_heads)
        self.mha_dropout = nn.Dropout(dropout)
        self.layer_norm2 = nn.LayerNorm(d_model)

        # Sub-layer 3: position-wise feed-forward network.
        self.ff = nn.Sequential(
            nn.Linear(d_model, d_ff), nn.ReLU(), nn.Linear(d_ff, d_model)
        )
        self.ff_dropout = nn.Dropout(dropout)
        self.layer_norm3 = nn.LayerNorm(d_model)
        self.d_model = d_model

    def forward(self, x, enc_output, lookahead_mask, padding_mask):
        """Apply the three sub-layers to ``x`` and return the result.

        ``lookahead_mask`` is used by the self-attention; ``padding_mask`` by
        the attention whose keys and values are ``enc_output``.
        """
        # Masked self-attention over the decoder input.
        self_attended, _ = self.masked_mha(x, x, x, lookahead_mask)
        self_attended = self.masked_mha_dropout(self_attended)
        self_attended += x
        x = self.layer_norm1(self_attended)

        # Queries come from the decoder; keys and values from the encoder.
        cross_attended, _ = self.mha(x, enc_output, enc_output, padding_mask)
        cross_attended = self.mha_dropout(cross_attended)
        cross_attended += x
        x = self.layer_norm2(cross_attended)

        # Position-wise feed-forward network.
        transformed = self.ff_dropout(self.ff(x))
        transformed += x
        return self.layer_norm3(transformed)
        

In [None]:
class Decoder(nn.Module):
    """Stack of ``n_layers`` decoder layers applied in sequence."""

    def __init__(self, num_heads, d_model, d_ff, n_layers, dropout):
        super().__init__()
        stack = []
        for _ in range(n_layers):
            stack.append(DecoderLayer(num_heads, d_model, d_ff, dropout))
        self.layers = nn.ModuleList(stack)
        self.d_model = d_model

    def forward(self, x, enc_output, lookahead_mask, padding_mask):
        """Feed ``x`` through every layer; the other arguments go to each layer unchanged."""
        hidden = x
        for decoder_layer in self.layers:
            hidden = decoder_layer(hidden, enc_output, lookahead_mask, padding_mask)
        return hidden

In [None]:
class Transformer(nn.Module):
    """Encoder-decoder model that maps source token ids to target-vocabulary logits.

    Args:
        src_vocab_size: number of rows in the source embedding table.
        tgt_vocab_size: number of rows in the target embedding table and size
            of the output projection.
        num_heads, d_model, d_ff: forwarded to the encoder and decoder layers.
        n_enc_layers, n_dec_layers: depth of the encoder / decoder stacks.
        max_len: number of positions precomputed by the positional encoding.
        pad_idx: token id treated as padding when the masks are built.
        dropout: dropout probability used by the encoder, the decoder and the
            positional encoding.
    """

    def __init__(
        self,
        src_vocab_size,
        tgt_vocab_size,
        num_heads,
        d_model,
        d_ff,
        n_enc_layers,
        n_dec_layers,
        max_len=1024,
        pad_idx=0,
        dropout=0.1,
    ):
        super().__init__()
        self.encoder = Encoder(num_heads, d_model, d_ff, n_enc_layers, dropout)
        self.decoder = Decoder(num_heads, d_model, d_ff, n_dec_layers, dropout)
        # Forward the configured dropout so the positional encoding does not
        # silently keep its own default rate.
        self.pos_encoding = PositionalEncoding(d_model, max_len, dropout=dropout)
        self.in_embedding = nn.Embedding(src_vocab_size, d_model)
        self.tgt_embedding = nn.Embedding(tgt_vocab_size, d_model)
        self.out_linear = nn.Linear(d_model, tgt_vocab_size)
        self.pad_idx = pad_idx
        self.d_model = d_model

    def forward(self, x, y):
        """Encode source ids ``x``, decode target ids ``y`` and return logits.

        Both inputs are presumably batch-first ``(batch, seq)`` id tensors
        (verify against the caller). The result is the decoder output
        projected to ``tgt_vocab_size``.
        """
        input_padding_mask, lookahead_mask = self._get_masks(x, y)

        x = self.in_embedding(x)
        x = self.pos_encoding(x)
        enc_output = self.encoder(x, input_padding_mask)

        y = self.tgt_embedding(y)
        y = self.pos_encoding(y)
        # The source padding mask is reused for the decoder's attention over
        # the encoder output.
        dec_output = self.decoder(y, enc_output, lookahead_mask, input_padding_mask)
        out = self.out_linear(dec_output)
        return out

    def _get_masks(self, x, y):
        """Build the source padding mask and the decoder self-attention mask.

        The decoder mask is the element-wise minimum of the target padding
        mask and the causal mask, so a position is blocked when either blocks it.
        """
        input_padding_mask = get_padding_mask(x, pad_idx=self.pad_idx)
        target_padding_mask = get_padding_mask(y, pad_idx=self.pad_idx)
        lookahead_mask = get_lookahead_mask(y.shape[-1])
        lookahead_mask = torch.minimum(target_padding_mask, lookahead_mask)
        return input_padding_mask, lookahead_mask

In [None]:
NUM_HEADS = 8
D_MODEL = 256
D_FF = 1024
N_ENC_LAYERS = 6
N_DEC_LAYERS = 6

# Resolve the padding index first: the model needs it to build its masks.
# "<pad>" is the second special token, so the Transformer default pad_idx=0
# would mask "<unk>" instead of the padding.
PAD_IDX = de_vocab["<pad>"]
assert PAD_IDX == en_vocab["<pad>"]

model = Transformer(
    len(de_vocab),
    len(en_vocab),
    num_heads=NUM_HEADS,
    d_model=D_MODEL,
    d_ff=D_FF,
    n_enc_layers=N_ENC_LAYERS,
    n_dec_layers=N_DEC_LAYERS,
    pad_idx=PAD_IDX,
).to(DEVICE)
optimizer = optim.Adam(model.parameters())
# Padded target positions do not contribute to the loss.
criterion = nn.CrossEntropyLoss(ignore_index=PAD_IDX)

In [None]:
class AdamWrapper:
    """Optimizer wrapper that sets the learning rate before every step.

    The rate is
    ``factor * model_size ** -0.5 * min(step ** -0.5, step * warmup ** -1.5)``:
    it grows linearly for ``warmup`` steps and then decays with the inverse
    square root of the step number.

    Args:
        model_size: model dimension used to scale the rate.
        factor: constant multiplier applied to the schedule.
        warmup: number of warm-up steps.
        optimizer: wrapped optimizer; it must expose ``param_groups``,
            ``step()`` and ``zero_grad()``.
    """

    def __init__(self, model_size, factor, warmup, optimizer):
        self.optimizer = optimizer
        self._step = 0
        self.warmup = warmup
        self.factor = factor
        self.model_size = model_size
        self._rate = 0

    def step(self):
        """Advance the step counter, update every group's ``lr`` and step the optimizer."""
        self._step += 1
        rate = self.rate()
        for p in self.optimizer.param_groups:
            p['lr'] = rate
        self._rate = rate
        self.optimizer.step()

    def rate(self, step=None):
        """Return the learning rate for ``step`` (defaults to the current step).

        Step 0, i.e. before the first update, yields ``0.0`` instead of
        raising ``ZeroDivisionError`` from ``0 ** -0.5``.

        Raises:
            ValueError: if ``step`` is negative.
        """
        if step is None:
            step = self._step
        if step < 0:
            raise ValueError(f"step must be non-negative, got {step}")
        if step == 0:
            # The warm-up branch step * warmup ** -1.5 is exactly zero here.
            return 0.0
        return self.factor * \
            (self.model_size ** (-0.5) *
            min(step ** (-0.5), step * self.warmup ** (-1.5)))

    def zero_grad(self, *args, **kwargs):
        """Delegate to the wrapped optimizer's ``zero_grad``."""
        return self.optimizer.zero_grad(*args, **kwargs)

# The wrapped Adam starts at lr=0; AdamWrapper overwrites the rate on every step.
optimizer = AdamWrapper(
    model_size=D_MODEL,
    factor=1,
    warmup=2000,
    optimizer=torch.optim.Adam(
        model.parameters(), lr=0, betas=(0.9, 0.98), eps=1e-9
    ),
)

In [None]:
# training loop 
def train(model, train_dl, optimizer, criterion, clip):
    """Run one training epoch and return the mean per-batch loss.

    For each ``(src, tgt)`` batch the model is fed ``tgt`` without its last
    column and scored against ``tgt`` without its first column. Gradients are
    clipped to norm ``clip`` before the optimizer step.
    """
    model.train()
    running_loss = 0
    for src, tgt in train_dl:
        decoder_input, targets = tgt[:, :-1], tgt[:, 1:]

        optimizer.zero_grad()
        logits = model(src, decoder_input)
        flat_logits = logits.reshape(-1, logits.shape[-1])
        loss = criterion(flat_logits, targets.reshape(-1))
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        running_loss += loss.item()

    return running_loss / len(train_dl)


def evaluate(model, val_dl, criterion):
    """Return the mean per-batch loss over ``val_dl`` without updating the model.

    The model is switched to eval mode and gradients are disabled. Inputs and
    targets are the same shifted views of ``tgt`` used during training.
    """
    model.eval()
    running_loss = 0
    with torch.no_grad():
        for src, tgt in val_dl:
            logits = model(src, tgt[:, :-1])
            flat_logits = logits.reshape(-1, logits.shape[-1])
            flat_targets = tgt[:, 1:].reshape(-1)
            running_loss += criterion(flat_logits, flat_targets).item()

    return running_loss / len(val_dl)

In [None]:
def epoch_time(start_time, end_time):
    """Split the duration between two timestamps (in seconds) into whole minutes and seconds.

    Returns:
        ``(minutes, seconds)`` as integers, both truncated toward zero.
    """
    duration = end_time - start_time
    whole_minutes = int(duration / 60)
    leftover_seconds = int(duration - whole_minutes * 60)
    return whole_minutes, leftover_seconds


N_EPOCHS = 5
CLIP = 10
SAVE_DIR = "models"
MODEL_SAVE_PATH = os.path.join(SAVE_DIR, "cpsc477_hw4_transformer.pt")

best_valid_loss = float("inf")

# Make sure the checkpoint directory exists before the first save.
os.makedirs(SAVE_DIR, exist_ok=True)

for epoch in range(N_EPOCHS):
    start_time = time.time()

    train_loss = train(model, train_dl, optimizer, criterion, CLIP)
    valid_loss = evaluate(model, valid_dl, criterion)

    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    # Checkpoint only when the validation loss improves on the best seen so far.
    improved = valid_loss < best_valid_loss
    if improved:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), MODEL_SAVE_PATH)

    report_lines = [
        f"Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s",
        f"\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}",
        f"\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}",
    ]
    for report_line in report_lines:
        print(report_line)




Epoch: 01 | Time: 0m 33s
	Train Loss: 6.097 | Train PPL: 444.405
	 Val. Loss: 4.329 |  Val. PPL:  75.853
Epoch: 02 | Time: 0m 33s
	Train Loss: 3.900 | Train PPL:  49.383
	 Val. Loss: 3.351 |  Val. PPL:  28.538
Epoch: 03 | Time: 0m 33s
	Train Loss: 3.207 | Train PPL:  24.695
	 Val. Loss: 2.829 |  Val. PPL:  16.933
Epoch: 04 | Time: 0m 33s
	Train Loss: 2.714 | Train PPL:  15.083
	 Val. Loss: 2.478 |  Val. PPL:  11.914
Epoch: 05 | Time: 0m 33s
	Train Loss: 2.358 | Train PPL:  10.568
	 Val. Loss: 2.242 |  Val. PPL:   9.409


In [None]:
def translate(
    model, de_vocab, de_tokenizer, en_vocab, sentence, max_length=512
):
    """Greedily translate one German sentence into English.

    The sentence is stripped, lower-cased, tokenized and wrapped in
    ``BOS_IDX`` / ``EOS_IDX``. Decoding starts from ``BOS_IDX`` and appends
    the arg-max token until ``EOS_IDX`` is produced or the output holds
    ``max_length`` tokens.

    Args:
        model: callable as ``model(src_ids, tgt_ids)`` returning logits; it
            is switched to eval mode.
        de_vocab: vocabulary used to encode the source tokens.
        de_tokenizer: tokenizer applied to the source sentence.
        en_vocab: vocabulary used to decode the generated ids.
        sentence: raw source sentence.
        max_length: upper bound on the number of generated ids, including
            the leading ``<bos>``.

    Returns:
        The generated tokens joined by single spaces, without ``<bos>`` and
        without the final ``<eos>`` when one was produced.
    """
    def process_sentence(sentence, vocab, tokenizer):
        tokens = tokenizer(sentence.strip().lower())
        return torch.tensor(
            [BOS_IDX] + [vocab[token] for token in tokens] + [EOS_IDX],
            dtype=torch.long,
        )

    encoded_input = process_sentence(sentence, de_vocab, de_tokenizer)
    encoded_input = encoded_input.unsqueeze(0).to(DEVICE)
    output = torch.full((1, 1), BOS_IDX, dtype=torch.long, device=DEVICE)
    model.eval()
    end_slice_i = None
    # Inference only: without no_grad every decoding step would keep an
    # autograd graph alive and memory would grow with the output length.
    with torch.no_grad():
        for _ in range(max_length - 1):
            logits = model(encoded_input, output)
            # keepdim keeps the (1, 1) shape needed for concatenation.
            next_id = torch.argmax(logits[:, -1, :], dim=-1, keepdim=True)
            output = torch.cat((output, next_id), dim=1)
            if next_id.item() == EOS_IDX:
                # Drop the trailing <eos> from the returned text.
                end_slice_i = -1
                break
    output = output.squeeze(0).tolist()
    tokens = en_vocab.lookup_tokens(output[1:end_slice_i])
    return " ".join(tokens)

In [None]:
test_data = Multi30k(split="test")

# Translate the first three test pairs and show them next to the references.
for _, (de, en) in zip(range(3), test_data):
    translated = translate(model, de_vocab, de_tokenizer, en_vocab, de)
    print(
        f"German: {de.strip()}",
        f"Ground truth: {en.strip()}",
        f"Translation: {translated}",
        "",
        sep="\n",
    )



German: Ein Mann mit einem orangefarbenen Hut, der etwas anstarrt.
Ground truth: A man in an orange hat starring at something.
Translation: a man with an orange hat , <unk> , <unk> , <unk> , <unk> , <unk> , <unk> , and a man with a <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> 