# Task description
- Translate text from Chinese to English.
- Main goal: Get familiar with transformer.

In [1]:
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [2]:
cd '/content/drive/MyDrive/113_master/DL/Lab03'

/content/drive/MyDrive/113_master/DL/Lab03


## install the required package

In [22]:
!pip install torchmetrics

Collecting torchmetrics
  Downloading torchmetrics-1.5.1-py3-none-any.whl.metadata (20 kB)
Collecting lightning-utilities>=0.8.0 (from torchmetrics)
  Downloading lightning_utilities-0.11.8-py3-none-any.whl.metadata (5.2 kB)
Downloading torchmetrics-1.5.1-py3-none-any.whl (890 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m890.6/890.6 kB[0m [31m17.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading lightning_utilities-0.11.8-py3-none-any.whl (26 kB)
Installing collected packages: lightning-utilities, torchmetrics
Successfully installed lightning-utilities-0.11.8 torchmetrics-1.5.1


## Import package

In [3]:
import os
import json
import math
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from torchsummary import summary

## Fix random seed

In [4]:
def set_seed(seed):
    np.random.seed(seed)
    random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

set_seed(87)

# Data
- Original dataset is [20k-en-zh-translation-pinyin-hsk](https://huggingface.co/datasets/swaption2009/20k-en-zh-translation-pinyin-hsk)
- We select 50000 English-Chinese sentence pairs for translation task

- Args:
  - BATCH_SIZE
  - data_dir: the path to the given translation dataset
- Tokenizer: BertTokenizer
  - encode: convert text to token ID
  - decode: convert token ID back to text
- Add paddings
  - make all the sentences the same length by inserting token ID = PAD_IDX at the back

In [5]:
data_dir = "./translation_data.json"
BATCH_SIZE = 64

## Show the raw data

In [6]:
translation_raw_data = pd.read_json(data_dir)
translation_raw_data = translation_raw_data
display(translation_raw_data)

Unnamed: 0,english,chinese
0,"Slowly and not without struggle, America began...",美国缓慢地开始倾听，但并非没有艰难曲折。
1,Dithering is a technique that blends your colo...,抖动是关于颜色混合的技术，使你的作品看起来更圆滑，或者只是创作有趣的材质。
2,This paper discusses the petrologic characteri...,本文以珲春早第三纪含煤盆地的地质构违背景为依据，分析了煤系地层的岩石学特征。
3,The second encounter relates to my grandfather...,第二次事件跟我爷爷的宝贝匣子有关。
4,One way to address these challenges would be t...,解决这些挑战的途径包括依照麻瓜在南非的经验设立真相与和解委员会。
...,...,...
49995,You were too obtuse to take the hint.,你太迟钝了， 没有理解这种暗示。
49996,"Therefore, in the event the mortgagee of ship ...",因此，在这种情况下船舶抵押权人放弃了债务人提供的担保就会影响其他担保人的利益，导致抵押权人的...
49997,"Fourth, puncture administrative bloat.",第四，削弱行政膨胀。
49998,Massimo Oddo says he won't be thinking about h...,马西莫。奥多声明他不会在世界杯决赛圈比赛结束之前考虑未来的俱乐部。


## Tokenizer

In [7]:
from transformers import BertTokenizer
tokenizer_en = BertTokenizer.from_pretrained("bert-base-cased")
tokenizer_cn = BertTokenizer.from_pretrained("bert-base-chinese")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]



tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/110k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/269k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/624 [00:00<?, ?B/s]

In [8]:
english_seqs = translation_raw_data["english"].apply(lambda x: tokenizer_en.encode(x, add_special_tokens=True, padding=False))
chinese_seqs = translation_raw_data["chinese"].apply(lambda x: tokenizer_cn.encode(x, add_special_tokens=True, padding=False))

MAX_TOKENIZE_LENGTH = max(english_seqs.str.len().max(),chinese_seqs.str.len().max()) # longest string
MAX_TOKENIZE_LENGTH = pow(2, math.ceil(math.log(MAX_TOKENIZE_LENGTH)/math.log(2)))   # closest upper to the power of 2

print("max tokenize length:", MAX_TOKENIZE_LENGTH)

max tokenize length: 128


## Add paddings

In [9]:
PAD_IDX = 0
BOS_IDX = chinese_seqs.iloc[0][0]
EOS_IDX = chinese_seqs.iloc[0][-1]

def add_padding(token_list, max_length, pad_token = 0):
    if len(token_list) < max_length:
        token_list.extend([pad_token] * (max_length - len(token_list)))
    return token_list

chinese_seqs = chinese_seqs.apply(lambda x: add_padding(x,MAX_TOKENIZE_LENGTH))
english_seqs = english_seqs.apply(lambda x: add_padding(x,MAX_TOKENIZE_LENGTH))

In [10]:
# check the padding result
print("=====chinese tokenized data=====")
print(chinese_seqs.iloc[0])

print("=====english tokenized data=====")
print(english_seqs.iloc[0])

=====chinese tokenized data=====
[101, 5401, 1744, 5353, 2714, 1765, 2458, 1993, 967, 1420, 8024, 852, 2400, 7478, 3766, 3300, 5680, 7410, 3289, 2835, 511, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
=====english tokenized data=====
[101, 13060, 1105, 1136, 1443, 5637, 117, 1738, 1310, 1106, 5113, 119, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


## Datalodader
- Split dataset into training dataset(90%) and validation dataset(10%).
- Create dataloader to iterate the data.

In [11]:
data_size = len(translation_raw_data)
train_size = int(0.9*data_size)
test_size = data_size - train_size
print("train_size:",train_size)
print("test_size:",test_size)

en_training_data = []
cn_training_data = []
en_testing_data = []
cn_testing_data = []

for i in range(data_size):
    if (i < train_size):
        en_training_data.append(torch.Tensor(english_seqs.iloc[i]))
        cn_training_data.append(torch.Tensor(chinese_seqs.iloc[i]))
    else:
        en_testing_data.append(torch.Tensor(english_seqs.iloc[i]))
        cn_testing_data.append(torch.Tensor(chinese_seqs.iloc[i]))


class TextTranslationDataset(Dataset):
    def __init__(self, src, dst):
        self.src_list = src
        self.dst_list = dst

    def __len__(self):
        return len(self.src_list)

    def __getitem__(self, idx):
        return self.src_list[idx], self.dst_list[idx]

cn_to_en_train_set = TextTranslationDataset(cn_training_data, en_training_data)
cn_to_en_test_set = TextTranslationDataset(cn_testing_data, en_testing_data)

cn_to_en_train_loader = DataLoader(cn_to_en_train_set, batch_size=BATCH_SIZE, shuffle=False)
cn_to_en_test_loader = DataLoader(cn_to_en_test_set, batch_size=BATCH_SIZE, shuffle=True)

train_size: 45000
test_size: 5000


# Model
- TO-DO: Finish the model by yourself
- Base transformer layers in [Attention Is All You Need](https://arxiv.org/abs/1706.03762)
    - TransformerEncoderLayer:
    - TransformerDecoderLayer:
- Positional encoding and input embedding
- Note that you may need masks when implementing attention mechanism
    - Padding mask: prevent input from attending to padding tokens
    - Causal mask: prevent decoder input from attending to future input

In [145]:
class MultiHeadAttention(nn.Module):
    def __init__(self, d_model, head_num):
        super().__init__()
        assert d_model % head_num == 0, "d_model must be divisible by head_num"

        self.d_model = d_model
        self.head_num = head_num
        self.depth = d_model // head_num

        # Linear layers
        self.W_q = nn.Linear(d_model, d_model)
        self.W_k = nn.Linear(d_model, d_model)
        self.W_v = nn.Linear(d_model, d_model)
        self.W_o = nn.Linear(d_model, d_model)

        self.softmax = nn.Softmax(dim=-1)

    def split_heads(self, x):
        batch_size = x.size(1)  # x shape: (seq_len, batch_size, d_model)
        x = x.permute(1, 0, 2)  # (batch_size, seq_len, d_model)
        x = x.view(batch_size, -1, self.head_num, self.depth)  # (batch_size, seq_len, head_num, depth)
        return x.permute(0, 2, 1, 3)  # (batch_size, head_num, seq_len, depth)

    def scaled_dot_product_attention(self, Q, K, V, mask=None):
        matmul_qk = torch.matmul(Q, K.transpose(-2, -1))  # (batch_size, head_num, seq_len_q, seq_len_k)
        dk = Q.size(-1)
        scaled_attention_logits = matmul_qk / math.sqrt(dk)

        if mask is not None:
          # mask shape should be broadcastable to (batch_size, head_num, seq_len_q, seq_len_k)
          try:
            scaled_attention_logits = scaled_attention_logits.masked_fill(mask == 0, float('-inf'))
          except:
            print(f"scaled_attention_logits: {scaled_attention_logits.shape}, mask: {mask.shape}")
        attention_weights = self.softmax(scaled_attention_logits)
        attention_output = torch.matmul(attention_weights, V)  # (batch_size, head_num, seq_len_q, depth)
        return attention_output, attention_weights

    def forward(self, Q, K, V, attention_mask=None):
        # Linear projections
        Q = self.W_q(Q)
        K = self.W_k(K)
        V = self.W_v(V)

        # Split heads
        Q = self.split_heads(Q)
        K = self.split_heads(K)
        V = self.split_heads(V)

        # Apply attention
        attention_output, attention_weights = self.scaled_dot_product_attention(Q, K, V, mask=attention_mask)

        # Concatenate heads
        batch_size = attention_output.size(0)
        seq_len = attention_output.size(2)
        attention_output = attention_output.permute(0, 2, 1, 3).contiguous()
        attention_output = attention_output.view(batch_size, seq_len, -1)  # (batch_size, seq_len, d_model)

        # Final linear layer
        output = self.W_o(attention_output)

        # Return to (seq_len, batch_size, d_model)
        output = output.permute(1, 0, 2)

        return output, attention_weights


In [146]:
class TransformerEncoderLayer(nn.Module):
    def __init__(self, d_model, dim_feedforward, nhead, dropout = 0.1):
        super().__init__()
        # Multi-head attention using your previous MultiHeadAttention class
        self.self_attn = MultiHeadAttention(d_model=d_model, head_num=nhead)

        # Feed-forward network
        self.linear1 = nn.Linear(d_model, dim_feedforward)  # First linear layer
        self.dropout = nn.Dropout(dropout)                  # Dropout layer
        self.linear2 = nn.Linear(dim_feedforward, d_model)  # Second linear layer

        # Layer normalization
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)

        # Additional dropout
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)

    def forward(self, x, src_padding_mask=None):
        # Self-attention with source padding mask
        attn_output, _ = self.self_attn(x, x, x, attention_mask=src_padding_mask)

        # Rest of the code remains the same
        x = x + self.dropout1(attn_output)
        x = self.norm1(x)

        # Feed-forward network
        ff_output = self.linear2(self.dropout(torch.relu(self.linear1(x))))

        x = x + self.dropout2(ff_output)
        x = self.norm2(x)

        return x


In [147]:
class TransformerDecoderLayer(nn.Module):
    def __init__(self, d_model, dim_feedforward, nhead, dropout = 0.1):
        super().__init__()
        # Self-attention for the decoder (with causal masking)
        self.self_attn = MultiHeadAttention(d_model=d_model, head_num=nhead)

        # Cross-attention to attend to the encoder output
        self.cross_attn = MultiHeadAttention(d_model=d_model, head_num=nhead)

        # Feed-forward network
        self.linear1 = nn.Linear(d_model, dim_feedforward)  # First linear layer
        self.dropout = nn.Dropout(dropout)                  # Dropout layer
        self.linear2 = nn.Linear(dim_feedforward, d_model)  # Second linear layer

        # Layer normalization
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)

        # Additional dropout
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)
        self.dropout3 = nn.Dropout(dropout)

    def forward(self, x, enc_output, src_padding_mask=None, tgt_padding_mask=None, tgt_future_mask=None):
        # Combine target padding mask and future mask
        if tgt_padding_mask is not None and tgt_future_mask is not None:
            tgt_seq_len = tgt_padding_mask.shape[-1]
            # print(f'src_padding_mask: {src_padding_mask.shape}, \ntgt_padding_mask: {tgt_padding_mask.shape}, \ntgt_future_mask: {tgt_future_mask.shape}')
            tgt_padding_mask = tgt_padding_mask.expand(-1, -1, tgt_seq_len, -1)
            # print(f'extended: {tgt_future_mask.shape}')
            attention_mask = tgt_padding_mask & tgt_future_mask  # Logical AND
            # print(f'attention_mask: {attention_mask.shape}')
        elif tgt_padding_mask is not None:
            attention_mask = tgt_padding_mask
        elif tgt_future_mask is not None:
            attention_mask = tgt_future_mask
        else:
            attention_mask = None

        # Self-attention with combined mask
        self_attn_output, _ = self.self_attn(x, x, x, attention_mask=attention_mask)
        x = x + self.dropout1(self_attn_output)
        x = self.norm1(x)

        # Cross-attention with encoder output and source padding mask
        cross_attn_output, _ = self.cross_attn(x, enc_output, enc_output, attention_mask=src_padding_mask)
        x = x + self.dropout2(cross_attn_output)
        x = self.norm2(x)

        # Feed-forward network
        ff_output = self.linear2(self.dropout(torch.relu(self.linear1(x))))
        x = x + self.dropout3(ff_output)
        x = self.norm3(x)

        return x


In [148]:
class Transformer(nn.Module):
    def __init__(self, d_model, num_heads, num_encoder_layers, num_decoder_layers, d_ff, dropout = 0.1):
        super().__init__()
        # Encoder stack
        self.encoder_layers = nn.ModuleList([
            TransformerEncoderLayer(d_model=d_model, dim_feedforward=d_ff, nhead=num_heads, dropout=dropout)
            for _ in range(num_encoder_layers)
        ])

        # Decoder stack
        self.decoder_layers = nn.ModuleList([
            TransformerDecoderLayer(d_model=d_model, dim_feedforward=d_ff, nhead=num_heads, dropout=dropout)
            for _ in range(num_decoder_layers)
        ])

        # Final linear layer for projection to output vocabulary size
        self.final_layer = nn.Linear(d_model, d_model)

    def forward(self, src_embeded, tgt_embeded, src_padding_mask, tgt_padding_mask, tgt_future_mask):
        # Encoding the source sequence
        enc_output = self.encode(src_embeded, src_padding_mask=src_padding_mask)

        # Decoding the target sequence using the encoder output
        dec_output = self.decode(tgt_embeded, enc_output, src_padding_mask=src_padding_mask,
                                 tgt_padding_mask=tgt_padding_mask, tgt_future_mask=tgt_future_mask)

        # Final projection to output layer
        output = self.final_layer(dec_output)

        return output

    def encode(self, src_embeded, src_padding_mask=None):
        x = src_embeded
        for layer in self.encoder_layers:
            x = layer(x, src_padding_mask=src_padding_mask)
        return x

    def decode(self, tgt_embeded, enc_output, src_padding_mask=None, tgt_padding_mask=None, tgt_future_mask=None):
        x = tgt_embeded
        for layer in self.decoder_layers:
            x = layer(x, enc_output, src_padding_mask=src_padding_mask,
                      tgt_padding_mask=tgt_padding_mask, tgt_future_mask=tgt_future_mask)
        return x


In [149]:
class PositionalEncoding(nn.Module):
    def __init__(self, emb_size, dropout = 0.1, maxlen = 5000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        # Create a matrix to hold the positional encodings
        position = torch.arange(0, maxlen).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, emb_size, 2) * (-math.log(10000.0) / emb_size))

        # Compute the positional encodings for each dimension of the embedding
        pe = torch.zeros(maxlen, emb_size)
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)

        # Register the positional encodings as a buffer, so it won’t be updated during backprop
        self.register_buffer('pe', pe)

    def forward(self, token_embedding):
        # Add positional encoding to token embedding
        seq_len = token_embedding.size(1)
        token_embedding = token_embedding + self.pe[:seq_len, :].unsqueeze(0)
        return self.dropout(token_embedding)


class TokenEmbedding(nn.Module):
    def __init__(self, vocab_size, emb_size):
        super().__init__()
        # Embedding layer to map token IDs to embeddings
        self.embedding = nn.Embedding(vocab_size, emb_size)
        self.emb_size = emb_size

    def forward(self, tokens):
        # Scale the embeddings by the square root of the embedding dimension (normalization)
        return self.embedding(tokens) * math.sqrt(self.emb_size)

In [150]:
def create_mask(src, tgt, pad_token_idx=0):
    src_seq_len, batch_size = src.size()  # (src_seq_len, batch_size)
    tgt_seq_len, _ = tgt.size()  # (tgt_seq_len, batch_size)

    # Source padding mask (for encoder and decoder cross-attention)
    src_padding_mask = (src != pad_token_idx).transpose(0, 1)  # (batch_size, src_seq_len)
    src_padding_mask = src_padding_mask.unsqueeze(1).unsqueeze(2)  # (batch_size, 1, 1, src_seq_len)

    # Target padding mask (for decoder self-attention)
    tgt_padding_mask = (tgt != pad_token_idx).transpose(0, 1)  # (batch_size, tgt_seq_len)
    tgt_padding_mask = tgt_padding_mask.unsqueeze(1).unsqueeze(2)  # (batch_size, 1, 1, tgt_seq_len)

    # Future mask (causal mask) for decoder self-attention
    tgt_future_mask = torch.tril(torch.ones((tgt_seq_len, tgt_seq_len), device=tgt.device)).bool()  # (tgt_seq_len, tgt_seq_len)
    tgt_future_mask = tgt_future_mask.unsqueeze(0).unsqueeze(1)  # (1, 1, tgt_seq_len, tgt_seq_len)

    return src_padding_mask, tgt_padding_mask, tgt_future_mask

In [151]:
class Seq2SeqNetwork(nn.Module):
    def __init__(self,
                 num_encoder_layers,
                 num_decoder_layers,
                 emb_size,
                 nhead,
                 src_vocab_size,
                 tgt_vocab_size,
                 dim_feedforward,
                 dropout = 0.05):
        super().__init__()
        self.transformer = Transformer(
            d_model=emb_size,
            num_heads=nhead,
            num_encoder_layers=num_encoder_layers,
            num_decoder_layers=num_decoder_layers,
            d_ff=dim_feedforward,
            dropout=dropout
        )
        self.generator = nn.Linear(emb_size, tgt_vocab_size)
        self.src_tok_emb = TokenEmbedding(src_vocab_size, emb_size)
        self.tgt_tok_emb = TokenEmbedding(tgt_vocab_size, emb_size)
        self.positional_encoding = PositionalEncoding(emb_size, dropout=dropout)

    def forward(self,
                src,
                trg,
                tgt_future_mask=None,
                src_padding_mask=None,
                tgt_padding_mask=None):
        src = src.long()
        trg = trg.long()
        src_emb = self.positional_encoding(self.src_tok_emb(src))
        tgt_emb = self.positional_encoding(self.tgt_tok_emb(trg))
        outs = self.transformer(src_emb, tgt_emb, src_padding_mask=src_padding_mask, tgt_padding_mask=tgt_padding_mask, tgt_future_mask=tgt_future_mask)
        return self.generator(outs)


    def encode(self, src, src_padding_mask=None):
        return self.transformer.encode(self.positional_encoding(self.src_tok_emb(src)), src_padding_mask=src_padding_mask)

    def decode(self, tgt, memory, src_padding_mask=None, tgt_padding_mask=None, tgt_future_mask=None):
        return self.transformer.decode(self.positional_encoding(self.tgt_tok_emb(tgt)), memory, src_padding_mask=src_padding_mask, tgt_padding_mask=tgt_padding_mask, tgt_future_mask=tgt_future_mask)

## Note: The parameter size of model should be less than 100M (100,000k) !!!

In [152]:
EMB_SIZE = 128
NHEAD = 1
FFN_HID_DIM = 128
NUM_ENCODER_LAYERS = 1
NUM_DECODER_LAYERS = 1
SRC_VOCAB_SIZE = tokenizer_cn.vocab_size
TGT_VOCAB_SIZE = tokenizer_en.vocab_size
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

transformer = Seq2SeqNetwork(NUM_ENCODER_LAYERS, NUM_DECODER_LAYERS, EMB_SIZE,
                                 NHEAD, SRC_VOCAB_SIZE, TGT_VOCAB_SIZE, FFN_HID_DIM)

for p in transformer.parameters():
    if p.dim() > 1:
        nn.init.xavier_uniform_(p)

transformer = transformer.to(DEVICE)
param_transformer = sum(p.numel() for p in transformer.parameters())
print (f"The parameter size of transformer is {param_transformer/1000} k")
#   The parameter size of model should be less than 100M (100,000k) !!!
#   The parameter size of model should be less than 100M (100,000k) !!!
#   The parameter size of model should be less than 100M (100,000k) !!!

The parameter size of transformer is 10438.34 k


# Training
- You can change the training setting by yourself including
  - Number of epoch
  - Optimizer
  - Learning rate
  - Learning rate scheduler
  - etc...

In [157]:
NUM_EPOCHS = 5
loss_fn = torch.nn.CrossEntropyLoss(ignore_index=PAD_IDX)
optimizer = torch.optim.Adam(transformer.parameters(), lr=0.1, betas=(0.9, 0.98), eps=1e-9)

## Translation quality metrics: BLEU score

In [158]:
from torchmetrics.text import BLEUScore

def bleu_score_func(predicted, truth, grams=1):
    preds = [predicted]
    truth = [[truth]]
    bleu = BLEUScore(n_gram=grams)
    return bleu(preds, truth)


def BLEU_batch(predict, truth, output_tokenizer):
    batch_size = predict.size(1)
    total_score = 0
    for i in range(batch_size):
        predict_str = output_tokenizer.decode(predict[:, i], skip_special_tokens=True)
        truth_str = output_tokenizer.decode(truth[:, i], skip_special_tokens=True)
        score_gram1 = bleu_score_func(predict_str.lower(), truth_str.lower(), grams=1)
        #score_gram2 = bleu_score_func(predict_str.lower(), truth_str, grams=2)
        #score_gram3 = bleu_score_func(predict_str.lower(), truth_str, grams=3)
        #score_gram4 = bleu_score_func(predict_str.lower(), truth_str, grams=4)
        #total_score = total_score + (score_gram1 + score_gram2 + score_gram3 + score_gram4) / 4.0
        total_score = total_score + score_gram1
    total_score = total_score / batch_size
    return total_score

## Training and Evaluation Functions

In [161]:
def train_epoch(model, optimizer, train_dataloader):
    model.train()
    losses = 0

    for src, tgt in train_dataloader:
        src = src.transpose(0, 1)
        tgt = tgt.transpose(0, 1)

        src = src.to(DEVICE)
        tgt = tgt.to(DEVICE)

        tgt_input = tgt[:-1, :]
        src_padding_mask, tgt_padding_mask, tgt_future_mask = create_mask(src, tgt_input)

        logits = model(src, tgt_input, src_padding_mask=src_padding_mask, tgt_padding_mask=tgt_padding_mask, tgt_future_mask=tgt_future_mask)

        optimizer.zero_grad()

        tgt_out = tgt[1:, :]

        loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1).long())
        loss.backward()

        optimizer.step()
        losses += loss.item()

    return losses / len(list(train_dataloader))


def evaluate(model, val_dataloader):
    model.eval()
    losses = 0
    score = 0

    for src, tgt in val_dataloader:
        src = src.transpose(0, 1)
        tgt = tgt.transpose(0, 1)

        src = src.to(DEVICE)
        tgt = tgt.to(DEVICE)

        tgt_input = tgt[:-1, :]
        src_padding_mask, tgt_padding_mask, tgt_future_mask = create_mask(src, tgt_input)

        logits = model(src, tgt_input, src_padding_mask=src_padding_mask, tgt_padding_mask=tgt_padding_mask, tgt_future_mask=tgt_future_mask)

        tgt_out = tgt[1:, :]
        _, tgt_predict = torch.max(logits, dim=-1)
        score_batch = BLEU_batch(tgt_predict, tgt_out, tokenizer_en)

        loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1).long())
        losses += loss.item()
        score += score_batch

    return (losses / len(list(val_dataloader))), (score / len(list(val_dataloader)))

## Start training
- MODEL_SAVE_PATH: path for storing the best model

In [25]:
MODEL_SAVE_PATH = "./model.ckpt"

In [162]:
from timeit import default_timer as timer
transformer = transformer.to(DEVICE)

best_acc = 0
for epoch in range(1, NUM_EPOCHS+1):
    start_time = timer()
    train_loss = train_epoch(transformer, optimizer, cn_to_en_train_loader)
    end_time = timer()
    val_loss, val_acc = evaluate(transformer, cn_to_en_test_loader)

    print((f"Epoch: {epoch}, Train loss: {train_loss:.3f}, Val loss: {val_loss:.3f}, Val Acc: {val_acc:.3f}, "f"Epoch time = {(end_time - start_time):.3f}s"))


    # Save the best model so far.
    if val_acc > best_acc:
        best_acc = val_acc
        best_state_dict = transformer.state_dict()
        torch.save(best_state_dict, MODEL_SAVE_PATH)
        print("(model saved)")

Epoch: 1, Train loss: 5.125, Val loss: 5.929, Val Acc: 0.139, Epoch time = 66.158s
(model saved)


KeyboardInterrupt: 

In [83]:
a = torch.randn(64, 1, 1, 127).long()
b = torch.randn(64, 1, 127, 127).long()
ma = a & b
print(ma.shape[-1])

127


# Inference

In [None]:
# function to generate output sequence using greedy algorithm
def greedy_decode(model, src, src_mask, max_len, start_symbol):
    src = src.to(DEVICE)
    src_mask = src_mask.to(DEVICE)

    memory = model.encode(src, src_padding_mask=src_mask)
    ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).to(DEVICE)
    for i in range(max_len-1):
        memory = memory.to(DEVICE)
        tgt_padding_mask = torch.ones(1, ys.size(0)).type(torch.bool).to(DEVICE)  # target padding mask
        tgt_mask = (generate_square_subsequent_mask(ys.size(0)).type(torch.bool)).to(DEVICE)  # target causal mask
        out = model.decode(ys, memory, src_padding_mask=src_mask, tgt_padding_mask=tgt_padding_mask, tgt_future_mask=tgt_mask)
        out = out.transpose(0, 1)
        prob = model.generator(out[:, -1])
        _, next_word = torch.max(prob, dim=1)
        next_word = next_word.item()

        ys = torch.cat([ys, torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0)
        if next_word == EOS_IDX:
            break
    return ys


# actual function to translate input sentence into target language
def translate(model: torch.nn.Module, src_sentence: str, input_tokenizer, output_tokenizer):
    model.eval()
    sentence = input_tokenizer.encode(src_sentence)
    sentence = torch.tensor(sentence).view(-1, 1)
    num_tokens = sentence.shape[0]

    src_mask = torch.ones(1, num_tokens).type(torch.bool)  # source padding mask
    tgt_tokens = greedy_decode(model, sentence, src_mask, max_len=num_tokens + 5, start_symbol=BOS_IDX).flatten()
    output_sentence = output_tokenizer.decode(tgt_tokens, skip_special_tokens=True)
    return output_sentence

## Load best model

In [None]:
transformer = Seq2SeqNetwork(NUM_ENCODER_LAYERS, NUM_DECODER_LAYERS, EMB_SIZE,
                                 NHEAD, SRC_VOCAB_SIZE, TGT_VOCAB_SIZE, FFN_HID_DIM)
transformer.to(DEVICE)
transformer.load_state_dict(torch.load("model.ckpt"))

## Translation testing

In [None]:
sentence = "你好，欢迎来到中国"
ground_truth = 'Hello, Welcome to China'
predicted = translate(transformer, sentence, tokenizer_cn, tokenizer_en)

print(f'{"Input:":15s}: {sentence}')
print(f'{"Prediction":15s}: {predicted}')
print(f'{"Ground truth":15s}: {ground_truth}')
print("Bleu Score (1gram): ", bleu_score_func(predicted.lower(), ground_truth.lower(), 1).item())
print("Bleu Score (2gram): ", bleu_score_func(predicted.lower(), ground_truth.lower(), 2).item())
print("Bleu Score (3gram): ", bleu_score_func(predicted.lower(), ground_truth.lower(), 3).item())
print("Bleu Score (4gram): ", bleu_score_func(predicted.lower(), ground_truth.lower(), 4).item())

In [None]:
sentence = "早上好，很高心见到你"
ground_truth = 'Good Morning, nice to meet you'
predicted = translate(transformer, sentence, tokenizer_cn, tokenizer_en)

print(f'{"Input:":15s}: {sentence}')
print(f'{"Prediction":15s}: {predicted}')
print(f'{"Ground truth":15s}: {ground_truth}')
print("Bleu Score (1gram): ", bleu_score_func(predicted.lower(), ground_truth.lower(), 1).item())
print("Bleu Score (2gram): ", bleu_score_func(predicted.lower(), ground_truth.lower(), 2).item())
print("Bleu Score (3gram): ", bleu_score_func(predicted.lower(), ground_truth.lower(), 3).item())
print("Bleu Score (4gram): ", bleu_score_func(predicted.lower(), ground_truth.lower(), 4).item())

In [None]:
sentence = "祝您有个美好的一天"
ground_truth = 'Have a nice day'
predicted = translate(transformer, sentence, tokenizer_cn, tokenizer_en)

print(f'{"Input:":15s}: {sentence}')
print(f'{"Prediction":15s}: {predicted}')
print(f'{"Ground truth":15s}: {ground_truth}')
print("Bleu Score (1gram): ", bleu_score_func(predicted.lower(), ground_truth.lower(), 1).item())
print("Bleu Score (2gram): ", bleu_score_func(predicted.lower(), ground_truth.lower(), 2).item())
print("Bleu Score (3gram): ", bleu_score_func(predicted.lower(), ground_truth.lower(), 3).item())
print("Bleu Score (4gram): ", bleu_score_func(predicted.lower(), ground_truth.lower(), 4).item())