In [1]:
import tensorflow
import numpy
import matplotlib

print(tensorflow.__version__)
print(numpy.__version__)
print(matplotlib.__version__)

2.6.0
1.21.4
3.4.3


In [2]:
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import re

# 1. 데이터 로드 및 전처리

In [3]:
data_dir = os.getenv('HOME')+'/aiffel/transformer/data'
kor_path = data_dir+"/korean-english-park.train.ko"
eng_path = data_dir+"/korean-english-park.train.en"

# 데이터 정제 및 토큰화
def clean_corpus(kor_path, eng_path):
    with open(kor_path, "r") as f: kor = f.read().splitlines()
    with open(eng_path, "r") as f: eng = f.read().splitlines()
    assert len(kor) == len(eng)

    # [[YOUR CODE]]
    cleaned_corpus = set(zip(kor, eng))
    cleaned_corpus = [kor+"\t"+eng for kor, eng in cleaned_corpus]
    
    return cleaned_corpus

cleaned_corpus = clean_corpus(kor_path, eng_path)

In [4]:
def preprocess_sentence(sentence):   
    sentence = sentence.lower()
    sentence = re.sub(r"[^a-zA-Z0-9가-힣?.!,]+", " ", sentence)
    sentence = re.sub(r"([?.!,])", r" \1 ", sentence)
    sentence = sentence.strip()
    
    return sentence

In [5]:
import sentencepiece as spm

# Sentencepiece를 활용하여 학습한 tokenizer를 생성합니다.
def generate_tokenizer(corpus,
                        vocab_size,
                        lang="ko",
                        pad_id=0,
                        bos_id=1,
                        eos_id=2,
                        unk_id=3):
    # [[YOUR CODE]]
    temp_file = f"{lang}_corpus.txt"
    with open(temp_file, 'w') as f:
        for row in corpus:
            f.write(str(row) + '\n')
    
    spm.SentencePieceTrainer.train(
        input = temp_file,
        model_prefix = f"{lang}_corpus",
        vocab_size = vocab_size,
        pad_id = pad_id,
        bos_id = bos_id,
        eos_id = eos_id,
        unk_id = unk_id,
        model_type = "unigram"
    )
    
    tokenizer = spm.SentencePieceProcessor()
    tokenizer.load(f"{lang}_corpus.model")
    
    return tokenizer
    

SRC_VOCAB_SIZE = TGT_VOCAB_SIZE = 20000

eng_corpus = []
kor_corpus = []

for pair in cleaned_corpus:
    k, e = pair.split("\t")

    kor_corpus.append(preprocess_sentence(k))
    eng_corpus.append(preprocess_sentence(e))

ko_tokenizer = generate_tokenizer(kor_corpus, SRC_VOCAB_SIZE, "ko")
en_tokenizer = generate_tokenizer(eng_corpus, TGT_VOCAB_SIZE, "en")
en_tokenizer.set_encode_extra_options("bos:eos")

sentencepiece_trainer.cc(77) LOG(INFO) Starts training with : 
trainer_spec {
  input: ko_corpus.txt
  input_format: 
  model_prefix: ko_corpus
  model_type: UNIGRAM
  vocab_size: 20000
  self_test_sample_size: 0
  character_coverage: 0.9995
  input_sentence_size: 0
  shuffle_input_sentence: 1
  seed_sentencepiece_size: 1000000
  shrinking_factor: 0.75
  max_sentence_length: 4192
  num_threads: 16
  num_sub_iterations: 2
  max_sentencepiece_length: 16
  split_by_unicode_script: 1
  split_by_number: 1
  split_by_whitespace: 1
  split_digits: 0
  treat_whitespace_as_suffix: 0
  allow_whitespace_only_pieces: 0
  required_chars: 
  byte_fallback: 0
  vocabulary_output_piece_score: 1
  train_extremely_large_corpus: 0
  hard_vocab_limit: 1
  use_all_vocab: 0
  unk_id: 3
  bos_id: 1
  eos_id: 2
  pad_id: 0
  unk_piece: <unk>
  bos_piece: <s>
  eos_piece: </s>
  pad_piece: <pad>
  unk_surface:  ⁇ 
}
normalizer_spec {
  name: nmt_nfkc
  add_dummy_prefix: 1
  remove_extra_whitespaces: 1
  escape

True

gram_model_trainer.cc(143) LOG(INFO) Extracting frequent sub strings...
unigram_model_trainer.cc(194) LOG(INFO) Initialized 83783 seed sentencepieces
trainer_interface.cc(526) LOG(INFO) Tokenizing input sentences with whitespace: 78957
trainer_interface.cc(537) LOG(INFO) Done! 46275
unigram_model_trainer.cc(489) LOG(INFO) Using 46275 sentences for EM training
unigram_model_trainer.cc(505) LOG(INFO) EM sub_iter=0 size=35194 obj=9.97208 num_tokens=87100 num_tokens/piece=2.47485
unigram_model_trainer.cc(505) LOG(INFO) EM sub_iter=1 size=26446 obj=8.1277 num_tokens=87543 num_tokens/piece=3.31025
unigram_model_trainer.cc(505) LOG(INFO) EM sub_iter=0 size=21986 obj=8.04698 num_tokens=88961 num_tokens/piece=4.04626
unigram_model_trainer.cc(505) LOG(INFO) EM sub_iter=1 size=21887 obj=8.02745 num_tokens=89134 num_tokens/piece=4.07246
trainer_interface.cc(615) LOG(INFO) Saving model: en_corpus.model
trainer_interface.cc(626) LOG(INFO) Saving vocabs: en_corpus.vocab


In [6]:
from tqdm import tqdm    # Process 과정을 보기 위해
import tensorflow as tf

src_corpus = []
tgt_corpus = []

assert len(kor_corpus) == len(eng_corpus)

# 토큰의 길이가 50 이하인 문장만 남깁니다. 
for idx in tqdm(range(len(kor_corpus)), desc = "Processing data"):
    # [[YOUR CODE]]
    src_tokenized = ko_tokenizer.encode_as_ids(kor_corpus[idx])
    tgt_tokenized = en_tokenizer.encode_as_ids(eng_corpus[idx])
    
    if len(src_tokenized) <= 50 and len(tgt_tokenized) <= 50:
        src_corpus.append(src_tokenized)
        tgt_corpus.append(tgt_tokenized)
    

# 패딩처리를 완료하여 학습용 데이터를 완성합니다. 
enc_train = tf.keras.preprocessing.sequence.pad_sequences(src_corpus, padding='post')
dec_train = tf.keras.preprocessing.sequence.pad_sequences(tgt_corpus, padding='post')

print(len(src_corpus), len(tgt_corpus))

Processing data: 100%|██████████| 78968/78968 [00:04<00:00, 16703.45it/s]


71288 71288


In [7]:
enc_train.shape

(71288, 50)

# 2. 모델 설계 

In [8]:
# positional encoding
def positional_encoding(pos, d_model):
    def cal_angle(position, i):
        return position / np.power(10000, int(i) / d_model)
    
    def get_posi_angle_vec(position):
        return [cal_angle(position, i) for i in range(d_model)]
    
    sinusoid_table = np.array([get_posi_angle_vec(pos_i) for pos_i in range(pos)])
    sinusoid_table[:, 0::2] = np.sin(sinusoid_table[:, 0::2])
    sinusoid_table[:, 1::2] = np.cos(sinusoid_table[:, 1::2])
    
    return sinusoid_table

In [9]:
# MultiHeadAttention

class MultiHeadAttention(tf.keras.layers.Layer):
    def __init__(self, d_model, num_heads):
        super(MultiHeadAttention, self).__init__()
        self.num_heads = num_heads
        self.d_model = d_model
        
        self.depth = self.d_model // self.num_heads
        
        self.W_q = tf.keras.layers.Dense(d_model)
        self.W_k = tf.keras.layers.Dense(d_model)
        self.W_v = tf.keras.layers.Dense(d_model)
        
        self.linear = tf.keras.layers.Dense(d_model)
    
    def scaled_dot_product_attention(self, Q, K, V, mask):
        d_k = tf.cast(K.shape[-1], tf.float32)
        
        #ScaledQK
        qk = tf.matmul(Q, K, transpose_b = True)
        scaled_qk = qk / tf.math.sqrt(d_k)
        
        #print("scaled_qk Shape:", scaled_qk.shape)
        #print("mask shape: ", mask.shape)
        
        if mask is not None: scaled_qk += (mask * -1e9)
        
        attentions = tf.nn.softmax(scaled_qk, axis = -1)
        out = tf.matmul(attentions, V)
        
        return out, attentions
    
    def split_heads(self, x):
        # Embedding을 Head 수로 분할
        # x: [batch x length x emb]
        # return: [batch x heads x length x self.depth]
        
        batch_size = x.shape[0]
        split_x = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth))
        split_x = tf.transpose(split_x, perm = [0, 2, 1, 3])
        #print(split_x.shape, "shape of X after split_heads")
        
        return split_x
    
    def combine_heads(self, x):
        # split된 embedding을 하나로 결합
        # x: [batch x heads x length x depth]
        # return: [batch x length x emb]
        batch_size = x.shape[0]
        #print(x.shape, "Shape of X before combine_heads")
        combined_x = tf.transpose(x, perm = [0, 2, 1, 3]) # batch x length x heads x depth
        #print(combined_x.shape, "Shape of X after transpose before reshape")
        combined_x = tf.reshape(combined_x, (batch_size, -1, self.d_model))
        #print(combined_x.shape, "Shape of X after reshape")
        
        return combined_x
    
    def call(self, Q, K, V, mask):
        '''
        Step 1: Linear_in(Q, K, V) -> WQ, WK, WV
        Step 2: Split Heads(WQ, WK, WV) -> WQ_split, WK_split, WV_split
        Step 3: Scaled Dot Product Attention(WQ_split, WK_split, WV_split)
                 -> out, attention_weights
        Step 4: Combine Heads(out) -> out
        Step 5: Linear_out(out) -> out
        '''
        # Step 1
        WQ = self.W_q(Q)
        WK = self.W_k(K)
        WV = self.W_v(V)
        #print("Step 1 in Mutli-head attention completed")
        
        # Step 2
        WQ_splits = self.split_heads(WQ)
        WK_splits = self.split_heads(WK)
        WV_splits = self.split_heads(WV)
        #print("Step 2 in Mutli-head attention completed")

        
        # Step 3
        out, attention_weights = self.scaled_dot_product_attention(
            WQ_splits, WK_splits, WV_splits, mask)
        #print("Step 3 in Mutli-head attention completed")
        
        # Step 4
        out = self.combine_heads(out)
        #print("Step 4 in Mutli-head attention completed")
        
        # Step 5
        out = self.linear(out)
        #print("Step 5 in Mutli-head attention completed")
        
        return out, attention_weights
        

In [10]:
# Positionwise FeedForward

class PoswiseFeedForwardNet(tf.keras.layers.Layer):
    def __init__(self, d_model, d_ff):
        super(PoswiseFeedForwardNet, self).__init__()
        self.w_1 = tf.keras.layers.Dense(d_ff, activation = "relu")
        self.w_2 = tf.keras.layers.Dense(d_model)
        
    def call(self, x):
        out = self.w_1(x)
        out = self.w_2(out)
        
        return out

In [11]:
# EncoderLayer
class EncoderLayer(tf.keras.layers.Layer):
    def __init__(self, d_model, n_heads, d_ff, dropout):
        super(EncoderLayer, self).__init__()
        
        self.enc_self_attn = MultiHeadAttention(d_model, n_heads)
        self.ffn = PoswiseFeedForwardNet(d_model, d_ff)
        
        self.norm_1 = tf.keras.layers.LayerNormalization(epsilon = 1e-6)
        self.norm_2 = tf.keras.layers.LayerNormalization(epsilon = 1e-6)
        
        self.dropout = tf.keras.layers.Dropout(dropout)
    
    def call(self, x, mask):
        # Multi-head Attention
        residual = x
        out = self.norm_1(x)
        out, enc_attn = self.enc_self_attn(out, out, out, mask)
        out = self.dropout(out)
        out += residual
        
        # Position-wise FeedForward Network
        residual = out
        out = self.norm_2(out)
        out = self.ffn(out)
        out = self.dropout(out)
        out += residual
        
        return out, enc_attn

In [12]:
# DecoderLayer
class DecoderLayer(tf.keras.layers.Layer):
    def __init__(self, d_model, n_heads, d_ff, dropout):
        super(DecoderLayer, self).__init__()
        
        self.dec_self_attn = MultiHeadAttention(d_model, n_heads)
        self.enc_dec_attn = MultiHeadAttention(d_model, n_heads)
        
        self.ffn = PoswiseFeedForwardNet(d_model, d_ff)
        
        self.norm_1 = tf.keras.layers.LayerNormalization(epsilon = 1e-6)
        self.norm_2 = tf.keras.layers.LayerNormalization(epsilon = 1e-6)        
        self.norm_3 = tf.keras.layers.LayerNormalization(epsilon = 1e-6)
        
        self.dropout = tf.keras.layers.Dropout(dropout)
        
    def call(self, x, enc_out, causality_mask, padding_mask):
        # Maksed Multi-head Attention
        residual = x
        out = self.norm_1(x)
        out, dec_attn = self.dec_self_attn(out, out, out, causality_mask)
        out = self.dropout(out)
        out += residual
        
        # Encoder-Decoder Multi-head Attention
        residual = out
        out = self.norm_2(out)
        out, dec_enc_attn = self.enc_dec_attn(out, enc_out, enc_out, padding_mask)
        out = self.dropout(out)
        out += residual
        
        # Position-wise FeedForward Netword
        residual = out
        out = self.norm_3(out)
        out = self.ffn(out)
        out = self.dropout(out)
        out += residual
        
        return out, dec_attn, dec_enc_attn

In [13]:
class Encoder(tf.keras.Model):
    def __init__(self, n_layers, d_model, n_heads, d_ff, dropout):
        super(Encoder, self).__init__()
        self.n_layers = n_layers
        self.enc_layers = [EncoderLayer(d_model, n_heads, d_ff, dropout) for _ in range(n_layers)]
    
    def call(self, x, mask):
        out = x
        enc_attns = list()
        for i in range(self.n_layers):
            out, enc_attn = self.enc_layers[i](out, mask)
            enc_attns.append(enc_attn)
        
        return out, enc_attns
        

In [14]:
class Decoder(tf.keras.Model):
    def __init__(self, n_layers, d_model, n_heads, d_ff, dropout):
        super(Decoder, self).__init__()
        self.n_layers = n_layers
        self.dec_layers = [DecoderLayer(d_model, n_heads, d_ff, dropout) for _ in range(self.n_layers)]
        
    def call(self, x, enc_out, causality_mask, padding_mask):
        out = x
        
        dec_attns = list()
        dec_enc_attns = list()
        
        for i in range(self.n_layers):
            out, dec_attn, dec_enc_attn = self.dec_layers[i](out, enc_out, causality_mask, padding_mask)
            dec_attns.append(dec_attn)
            dec_enc_attns.append(dec_enc_attn)
        
        return out, dec_attns, dec_enc_attns

In [15]:
class Transformer(tf.keras.Model):
    def __init__(self,
                    n_layers,
                    d_model,
                    n_heads,
                    d_ff,
                    src_vocab_size,
                    tgt_vocab_size,
                    pos_len,
                    dropout = 0.2,
                    shared = True):
        super(Transformer, self).__init__()
        self.d_model = tf.cast(d_model, tf.float32)
        
        self.enc_emb = tf.keras.layers.Embedding(src_vocab_size, d_model)
        self.dec_emb = tf.keras.layers.Embedding(tgt_vocab_size, d_model)
        
        self.pos_encoding = positional_encoding(pos_len, d_model)
        
        self.dropout = tf.keras.layers.Dropout(dropout)
        
        self.encoder = Encoder(n_layers, d_model, n_heads, d_ff, dropout)
        self.decoder = Decoder(n_layers, d_model, n_heads, d_ff, dropout)
        
        self.fc = tf.keras.layers.Dense(tgt_vocab_size)
        
        # decoder embedding 층과 출력층의 weight sharing
        self.shared = shared
        if shared: self.fc.set_weights(tf.transpose(self.dec_emb.weights))
            
    def embedding(self, emb, x):
        seq_len = x.shape[1] # x: batch x seq_length
        out = emb(x) # batch_size x seq_length x d_model
        
        if self.shared: out *= tf.math.sqrt(self.d_model)
        
        out += self.pos_encoding[np.newaxis, ...][:, :seq_len, :]
        # self.pos_encoding[np.newaxis, ...][:, :seq_len, :].shape = [1, seq_len, d_model]
        
        out = self.dropout(out) # batch_size x seq_len x d_model
        
        return out
    
    def call(self, enc_in, dec_in, enc_mask, causality_mask, padding_mask):
        enc_in = self.embedding(self.enc_emb, enc_in)
        dec_in = self.embedding(self.dec_emb, dec_in)
        
        enc_out, enc_attns = self.encoder(enc_in, enc_mask)
        dec_out, dec_attns, dec_enc_attns = self.decoder(dec_in, enc_out, causality_mask, padding_mask)
        
        logits = self.fc(dec_out)
        
        return logits, enc_attns, dec_attns, dec_enc_attns
    
        

In [16]:
# mask 함수 정의하기
# 주석 처리 코드 -> LMS 원본 코드
# train_step은 돌아가는데 evaluate 할 때 scaled_qk와 shape 에러 발생함..

def generate_padding_mask(seq):
    seq = tf.cast(tf.math.equal(seq, 0), tf.float32)
    return seq[:, tf.newaxis, tf.newaxis, :]

# def generate_causality_mask(size):
#     mask = 1 - tf.linalg.band_part(tf.ones((size, size)), -1, 0)
#     return mask

def generate_causality_mask(src_len, tgt_len):
    mask = 1 - np.cumsum(np.eye(src_len, tgt_len), 0)
    return tf.cast(mask, tf.float32)

# def generate_masks(src, tgt):
#     enc_mask = generate_padding_mask(src)
#     dec_mask = generate_causality_mask(tgt.shape[1])
#     dec_enc_mask = generate_padding_mask(tgt)
    
#     return enc_mask, dec_mask, dec_enc_mask

def generate_masks(src, tgt):
    enc_mask = generate_padding_mask(src)
    dec_mask = generate_padding_mask(tgt)
    
    dec_enc_causality_mask = generate_causality_mask(tgt.shape[1], src.shape[1])
    dec_enc_mask = tf.maximum(enc_mask, dec_enc_causality_mask)
    
    dec_causality_mask = generate_causality_mask(tgt.shape[1], tgt.shape[1])
    dec_mask = tf.maximum(dec_mask, dec_causality_mask)
    
    return enc_mask, dec_mask, dec_enc_mask

# 3. 훈련하기 

In [17]:
# 하이퍼파라미터값 조정
n_layers = 2
d_model = 512
n_heads = 8
d_ff = 2048
max_len = 50
dropout = 0.2

In [18]:
transformer = Transformer(
    n_layers = n_layers,
    d_model = d_model,
    n_heads = n_heads,
    d_ff = d_ff,
    src_vocab_size = SRC_VOCAB_SIZE,
    tgt_vocab_size = TGT_VOCAB_SIZE,
    pos_len = max_len,
    dropout = 0.2,
    shared = True
)

In [19]:
# Learning Rate Scheduler & Optimizer
class LearningRateScheduler(tf.keras.optimizers.schedules.LearningRateSchedule):
    def __init__(self, d_model, warmup_steps = 4000):
        super(LearningRateScheduler, self).__init__()
        self.d_model = d_model
        self.warmup_steps = warmup_steps
        
    def __call__(self, step):
        arg1 = step ** -0.5
        arg2 = step * (self.warmup_steps ** -1.5)
        
        return (self.d_model ** -0.5) * tf.math.minimum(arg1, arg2)

    
learning_rate = LearningRateScheduler(512)
optimizer = tf.keras.optimizers.Adam(learning_rate, beta_1 = 0.9, beta_2 = 0.98, epsilon = 1e-9)

In [20]:
# Loss Function
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits = True, reduction = 'none')

def loss_function(real, pred):
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    loss_ = loss_object(real, pred)
    
    mask = tf.cast(mask, dtype = loss_.dtype)
    loss_ *= mask
    
    return tf.reduce_sum(loss_)/tf.reduce_sum(mask)

In [21]:
# Train Step 함수 정의

@tf.function()
def train_step(src, tgt, model, optimizer):
    gold = tgt[:, 1:] # bos token 제외하고 ~
    
    enc_mask, dec_mask, dec_enc_mask = generate_masks(src, tgt)
    
    with tf.GradientTape() as tape:
        predictions, enc_attns, dec_attns, dec_enc_attns = \
        model(src, tgt, enc_mask, dec_mask, dec_enc_mask)
        loss = loss_function(gold, predictions[:, :-1])
        
    variables = model.trainable_variables
    gradients = tape.gradient(loss, variables)
    optimizer.apply_gradients(zip(gradients, variables))
    
    return loss, enc_attns, dec_attns, dec_enc_attns
    

In [22]:
# 번역 생성 함수

def evaluate(sentence, model, src_tokenizer, tgt_tokenizer):
    sentence = preprocess_sentence(sentence)

    pieces = src_tokenizer.encode_as_pieces(sentence)
    tokens = src_tokenizer.encode_as_ids(sentence)

    _input = tf.keras.preprocessing.sequence.pad_sequences([tokens],
                                                           maxlen=enc_train.shape[-1],
                                                           padding='post')
    
    ids = []
    output = tf.expand_dims([tgt_tokenizer.bos_id()], 0)
    for i in range(dec_train.shape[-1]):
        enc_padding_mask, dec_causality_mask, dec_padding_mask = \
        generate_masks(_input, output)

        predictions, enc_attns, dec_attns, dec_enc_attns =\
        model(_input, 
              output,
              enc_padding_mask,
              dec_causality_mask,
              dec_padding_mask)
        predicted_id = \
        tf.argmax(tf.math.softmax(predictions, axis=-1)[0, -1]).numpy().item()

        if tgt_tokenizer.eos_id() == predicted_id:
            result = tgt_tokenizer.decode_ids(ids)
            return pieces, result, enc_attns, dec_attns, dec_enc_attns

        ids.append(predicted_id)
        output = tf.concat([output, tf.expand_dims([predicted_id], 0)], axis=-1)

    result = tgt_tokenizer.decode_ids(ids)

    return pieces, result, enc_attns, dec_attns, dec_enc_attns

In [23]:
# Attention 시각화 함수

def visualize_attention(src, tgt, enc_attns, dec_attns, dec_enc_attns):
    def draw(data, ax, x="auto", y="auto"):
        import seaborn
        seaborn.heatmap(data, 
                        square=True,
                        vmin=0.0, vmax=1.0, 
                        cbar=False, ax=ax,
                        xticklabels=x,
                        yticklabels=y)
        
    for layer in range(0, 2, 1):
        fig, axs = plt.subplots(1, 4, figsize=(20, 10))
        print("Encoder Layer", layer + 1)
        for h in range(4):
            draw(enc_attns[layer][0, h, :len(src), :len(src)], axs[h], src, src)
        plt.show()
        
    for layer in range(0, 2, 1):
        fig, axs = plt.subplots(1, 4, figsize=(20, 10))
        print("Decoder Self Layer", layer+1)
        for h in range(4):
            draw(dec_attns[layer][0, h, :len(tgt), :len(tgt)], axs[h], tgt, tgt)
        plt.show()

        print("Decoder Src Layer", layer+1)
        fig, axs = plt.subplots(1, 4, figsize=(20, 10))
        for h in range(4):
            draw(dec_enc_attns[layer][0, h, :len(tgt), :len(src)], axs[h], src, tgt)
        plt.show()

In [24]:
# 번역 생성 및 Attention 시각화 결합

def translate(sentence, model, src_tokenizer, tgt_tokenizer, plot_attention=False):
    pieces, result, enc_attns, dec_attns, dec_enc_attns = \
    evaluate(sentence, model, src_tokenizer, tgt_tokenizer)
    
    print('Input: %s' % (sentence))
    print('Predicted translation: {}'.format(result))

    if plot_attention:
        visualize_attention(pieces, result.split(), enc_attns, dec_attns, dec_enc_attns)

In [25]:
sample_text= [
    '오바마는 대통령이다.',
    '시민들은 도시 속에 산다.',
    '커피는 필요 없다.',
    '일곱 명의 사망자가 발생했다.'
]

In [26]:
from tqdm import tqdm
import random

EPOCHS = 20
BATCH_SIZE = 64

for epoch in range(EPOCHS):
    
    total_loss = 0
    idx_list = list(range(0, enc_train.shape[0], BATCH_SIZE))
    random.shuffle(idx_list)
    t = tqdm(idx_list)
    
    for (batch, idx) in enumerate(t):
        batch_loss, _, _, _ = train_step(enc_train[idx: idx + BATCH_SIZE],
                                dec_train[idx: idx + BATCH_SIZE],
                                transformer,
                                optimizer,)
        total_loss += batch_loss
        
        t.set_description_str("EPOCH %2d" % (epoch + 1))
        t.set_postfix_str('LOSS %.4f' % (total_loss.numpy() / (batch + 1)))
    
    print("Translations")   
    for idx, text in enumerate(sample_text):
        
        pieces, result, enc_attns, dec_attns, dec_enc_attns = \
        evaluate(text, transformer, ko_tokenizer, en_tokenizer)
        
        print(f"> input : {idx+1}. {text}")
        print(f"> output: {idx+1}. {result}")
        
    print("\nHyperparameters")
    print("n_layers: ", n_layers)
    print("d_model: ", d_model)
    print("n_heads: ", n_heads)
    print("d_ff: ", d_ff)
    print("dropout: ", dropout)
        
    print("\nTraining Parameters")
    print("Warmup Steps: ", learning_rate.warmup_steps)
    print("Batch Size: ", BATCH_SIZE)
    print("Epoch At: ", epoch+1)
    print("-------------------------------------------")
    
    

EPOCH  1: 100%|██████████| 1114/1114 [03:37<00:00,  5.12it/s, LOSS 5.9608]


Translations
> input : 1. 오바마는 대통령이다.
> output: 1. obama is the president .
> input : 2. 시민들은 도시 속에 산다.
> output: 2. the blasts were not to the deaths .
> input : 3. 커피는 필요 없다.
> output: 3. the new york is not to be a lot of the new york .
> input : 4. 일곱 명의 사망자가 발생했다.
> output: 4. the blasts were killed .

Hyperparameters
n_layers:  2
d_model:  512
n_heads:  8
d_ff:  2048
dropout:  0.2

Training Parameters
Warmup Steps:  4000
Batch Size:  64
Epoch At:  1
-------------------------------------------


EPOCH  2: 100%|██████████| 1114/1114 [03:31<00:00,  5.26it/s, LOSS 4.4040]


Translations
> input : 1. 오바마는 대통령이다.
> output: 1. obama is a president .
> input : 2. 시민들은 도시 속에 산다.
> output: 2. the city city is the city of urban city .
> input : 3. 커피는 필요 없다.
> output: 3. if you don t have any excuse .
> input : 4. 일곱 명의 사망자가 발생했다.
> output: 4. the deadly deadly deadly .

Hyperparameters
n_layers:  2
d_model:  512
n_heads:  8
d_ff:  2048
dropout:  0.2

Training Parameters
Warmup Steps:  4000
Batch Size:  64
Epoch At:  2
-------------------------------------------


EPOCH  3: 100%|██████████| 1114/1114 [03:31<00:00,  5.27it/s, LOSS 3.7878]


Translations
> input : 1. 오바마는 대통령이다.
> output: 1. obama is a lot of president .
> input : 2. 시민들은 도시 속에 산다.
> output: 2. the people are the living .
> input : 3. 커피는 필요 없다.
> output: 3. there is no sign of no .
> input : 4. 일곱 명의 사망자가 발생했다.
> output: 4. a tornado hit the dead , killing at least one person died .

Hyperparameters
n_layers:  2
d_model:  512
n_heads:  8
d_ff:  2048
dropout:  0.2

Training Parameters
Warmup Steps:  4000
Batch Size:  64
Epoch At:  3
-------------------------------------------


EPOCH  4: 100%|██████████| 1114/1114 [03:31<00:00,  5.28it/s, LOSS 3.3773]


Translations
> input : 1. 오바마는 대통령이다.
> output: 1. obama is the president .
> input : 2. 시민들은 도시 속에 산다.
> output: 2. the citizens are everywhere .
> input : 3. 커피는 필요 없다.
> output: 3. coffee needs , which are not a coffee .
> input : 4. 일곱 명의 사망자가 발생했다.
> output: 4. the death toll was killed by a local toll .

Hyperparameters
n_layers:  2
d_model:  512
n_heads:  8
d_ff:  2048
dropout:  0.2

Training Parameters
Warmup Steps:  4000
Batch Size:  64
Epoch At:  4
-------------------------------------------


EPOCH  5: 100%|██████████| 1114/1114 [03:30<00:00,  5.29it/s, LOSS 2.8294]


Translations
> input : 1. 오바마는 대통령이다.
> output: 1. obama is a president .
> input : 2. 시민들은 도시 속에 산다.
> output: 2. citizens are huges .
> input : 3. 커피는 필요 없다.
> output: 3. no need for the need .
> input : 4. 일곱 명의 사망자가 발생했다.
> output: 4. the seventh died .

Hyperparameters
n_layers:  2
d_model:  512
n_heads:  8
d_ff:  2048
dropout:  0.2

Training Parameters
Warmup Steps:  4000
Batch Size:  64
Epoch At:  5
-------------------------------------------


EPOCH  6: 100%|██████████| 1114/1114 [03:30<00:00,  5.28it/s, LOSS 2.2110]


Translations
> input : 1. 오바마는 대통령이다.
> output: 1. he is a president .
> input : 2. 시민들은 도시 속에 산다.
> output: 2. citizens are looking for their urban .
> input : 3. 커피는 필요 없다.
> output: 3. no need for .
> input : 4. 일곱 명의 사망자가 발생했다.
> output: 4. seven people were killed in the event .

Hyperparameters
n_layers:  2
d_model:  512
n_heads:  8
d_ff:  2048
dropout:  0.2

Training Parameters
Warmup Steps:  4000
Batch Size:  64
Epoch At:  6
-------------------------------------------


EPOCH  7: 100%|██████████| 1114/1114 [03:30<00:00,  5.28it/s, LOSS 1.6656]


Translations
> input : 1. 오바마는 대통령이다.
> output: 1. obama is a state . . . . . . .
> input : 2. 시민들은 도시 속에 산다.
> output: 2. citizens are proud of their city .
> input : 3. 커피는 필요 없다.
> output: 3. the need need for . . . needs .
> input : 4. 일곱 명의 사망자가 발생했다.
> output: 4. the seven crew killed seven people .

Hyperparameters
n_layers:  2
d_model:  512
n_heads:  8
d_ff:  2048
dropout:  0.2

Training Parameters
Warmup Steps:  4000
Batch Size:  64
Epoch At:  7
-------------------------------------------


EPOCH  8: 100%|██████████| 1114/1114 [03:31<00:00,  5.28it/s, LOSS 1.2190]


Translations
> input : 1. 오바마는 대통령이다.
> output: 1. obama is a real person .
> input : 2. 시민들은 도시 속에 산다.
> output: 2. they re just caught in the city .
> input : 3. 커피는 필요 없다.
> output: 3. no need for faa need .
> input : 4. 일곱 명의 사망자가 발생했다.
> output: 4. seven people are watching television .

Hyperparameters
n_layers:  2
d_model:  512
n_heads:  8
d_ff:  2048
dropout:  0.2

Training Parameters
Warmup Steps:  4000
Batch Size:  64
Epoch At:  8
-------------------------------------------


EPOCH  9: 100%|██████████| 1114/1114 [03:31<00:00,  5.28it/s, LOSS 0.8825]


Translations
> input : 1. 오바마는 대통령이다.
> output: 1. obama is a president .
> input : 2. 시민들은 도시 속에 산다.
> output: 2. citizens are drawn into the city .
> input : 3. 커피는 필요 없다.
> output: 3. the protect youed explorer .
> input : 4. 일곱 명의 사망자가 발생했다.
> output: 4. seven people died in the hospital .

Hyperparameters
n_layers:  2
d_model:  512
n_heads:  8
d_ff:  2048
dropout:  0.2

Training Parameters
Warmup Steps:  4000
Batch Size:  64
Epoch At:  9
-------------------------------------------


EPOCH 10: 100%|██████████| 1114/1114 [03:31<00:00,  5.28it/s, LOSS 0.6509]


Translations
> input : 1. 오바마는 대통령이다.
> output: 1. obama is a matter of u . s .
> input : 2. 시민들은 도시 속에 산다.
> output: 2. citizens are busy considers destroying urban violent .
> input : 3. 커피는 필요 없다.
> output: 3. the need needed to make a fight .
> input : 4. 일곱 명의 사망자가 발생했다.
> output: 4. the seven death force is making the remainder .

Hyperparameters
n_layers:  2
d_model:  512
n_heads:  8
d_ff:  2048
dropout:  0.2

Training Parameters
Warmup Steps:  4000
Batch Size:  64
Epoch At:  10
-------------------------------------------


EPOCH 11: 100%|██████████| 1114/1114 [03:31<00:00,  5.28it/s, LOSS 0.5036]


Translations
> input : 1. 오바마는 대통령이다.
> output: 1. obama is a test .
> input : 2. 시민들은 도시 속에 산다.
> output: 2. citizens are located about our city .
> input : 3. 커피는 필요 없다.
> output: 3. we need need to need need , or a doctor neededal needs .
> input : 4. 일곱 명의 사망자가 발생했다.
> output: 4. the seven was among the dead .

Hyperparameters
n_layers:  2
d_model:  512
n_heads:  8
d_ff:  2048
dropout:  0.2

Training Parameters
Warmup Steps:  4000
Batch Size:  64
Epoch At:  11
-------------------------------------------


EPOCH 12: 100%|██████████| 1114/1114 [03:31<00:00,  5.28it/s, LOSS 0.4066]


Translations
> input : 1. 오바마는 대통령이다.
> output: 1. obama is a real man .
> input : 2. 시민들은 도시 속에 산다.
> output: 2. citizens are located .
> input : 3. 커피는 필요 없다.
> output: 3. the need for religious democracy .
> input : 4. 일곱 명의 사망자가 발생했다.
> output: 4. the seven death toll is in the hospital .

Hyperparameters
n_layers:  2
d_model:  512
n_heads:  8
d_ff:  2048
dropout:  0.2

Training Parameters
Warmup Steps:  4000
Batch Size:  64
Epoch At:  12
-------------------------------------------


EPOCH 13: 100%|██████████| 1114/1114 [03:30<00:00,  5.28it/s, LOSS 0.3394]


Translations
> input : 1. 오바마는 대통령이다.
> output: 1. obama is a test .
> input : 2. 시민들은 도시 속에 산다.
> output: 2. they detained on the city .
> input : 3. 커피는 필요 없다.
> output: 3. the need needed to protect you .
> input : 4. 일곱 명의 사망자가 발생했다.
> output: 4. seven people died in the violence .

Hyperparameters
n_layers:  2
d_model:  512
n_heads:  8
d_ff:  2048
dropout:  0.2

Training Parameters
Warmup Steps:  4000
Batch Size:  64
Epoch At:  13
-------------------------------------------


EPOCH 14: 100%|██████████| 1114/1114 [03:30<00:00,  5.28it/s, LOSS 0.2901]


Translations
> input : 1. 오바마는 대통령이다.
> output: 1. obama is a response for president bush .
> input : 2. 시민들은 도시 속에 산다.
> output: 2. they are showing aware of anti cities .
> input : 3. 커피는 필요 없다.
> output: 3. the need for you , did not need to speak to your average .
> input : 4. 일곱 명의 사망자가 발생했다.
> output: 4. the seventh death toll is reported .

Hyperparameters
n_layers:  2
d_model:  512
n_heads:  8
d_ff:  2048
dropout:  0.2

Training Parameters
Warmup Steps:  4000
Batch Size:  64
Epoch At:  14
-------------------------------------------


EPOCH 15: 100%|██████████| 1114/1114 [03:31<00:00,  5.28it/s, LOSS 0.2508]


Translations
> input : 1. 오바마는 대통령이다.
> output: 1. obama is a real .
> input : 2. 시민들은 도시 속에 산다.
> output: 2. citizens are showing the life race .
> input : 3. 커피는 필요 없다.
> output: 3. so we need to go with .
> input : 4. 일곱 명의 사망자가 발생했다.
> output: 4. seven people died in the blast .

Hyperparameters
n_layers:  2
d_model:  512
n_heads:  8
d_ff:  2048
dropout:  0.2

Training Parameters
Warmup Steps:  4000
Batch Size:  64
Epoch At:  15
-------------------------------------------


EPOCH 16: 100%|██████████| 1114/1114 [03:30<00:00,  5.28it/s, LOSS 0.2207]


Translations
> input : 1. 오바마는 대통령이다.
> output: 1. the president elect is a president .
> input : 2. 시민들은 도시 속에 산다.
> output: 2. they want to name town town .
> input : 3. 커피는 필요 없다.
> output: 3. coffee needs .
> input : 4. 일곱 명의 사망자가 발생했다.
> output: 4. the seven died of injuries .

Hyperparameters
n_layers:  2
d_model:  512
n_heads:  8
d_ff:  2048
dropout:  0.2

Training Parameters
Warmup Steps:  4000
Batch Size:  64
Epoch At:  16
-------------------------------------------


EPOCH 17: 100%|██████████| 1114/1114 [03:30<00:00,  5.28it/s, LOSS 0.1971]


Translations
> input : 1. 오바마는 대통령이다.
> output: 1. obama is a response from the united states .
> input : 2. 시민들은 도시 속에 산다.
> output: 2. citizens are free to visitors .
> input : 3. 커피는 필요 없다.
> output: 3. no need for . . . . . . . . .
> input : 4. 일곱 명의 사망자가 발생했다.
> output: 4. the seventh died in that number .

Hyperparameters
n_layers:  2
d_model:  512
n_heads:  8
d_ff:  2048
dropout:  0.2

Training Parameters
Warmup Steps:  4000
Batch Size:  64
Epoch At:  17
-------------------------------------------


EPOCH 18: 100%|██████████| 1114/1114 [03:31<00:00,  5.27it/s, LOSS 0.1762]


Translations
> input : 1. 오바마는 대통령이다.
> output: 1. the president is real .
> input : 2. 시민들은 도시 속에 산다.
> output: 2. they want to protect the city .
> input : 3. 커피는 필요 없다.
> output: 3. no arrests or needed .
> input : 4. 일곱 명의 사망자가 발생했다.
> output: 4. the seven people were killed .

Hyperparameters
n_layers:  2
d_model:  512
n_heads:  8
d_ff:  2048
dropout:  0.2

Training Parameters
Warmup Steps:  4000
Batch Size:  64
Epoch At:  18
-------------------------------------------


EPOCH 19: 100%|██████████| 1114/1114 [03:31<00:00,  5.27it/s, LOSS 0.1584]


Translations
> input : 1. 오바마는 대통령이다.
> output: 1. obama is a real man .
> input : 2. 시민들은 도시 속에 산다.
> output: 2. some citizens are caught in the city .
> input : 3. 커피는 필요 없다.
> output: 3. no need for coffee .
> input : 4. 일곱 명의 사망자가 발생했다.
> output: 4. the seven people are watching the deaths .

Hyperparameters
n_layers:  2
d_model:  512
n_heads:  8
d_ff:  2048
dropout:  0.2

Training Parameters
Warmup Steps:  4000
Batch Size:  64
Epoch At:  19
-------------------------------------------


EPOCH 20: 100%|██████████| 1114/1114 [03:31<00:00,  5.27it/s, LOSS 0.1444]


Translations
> input : 1. 오바마는 대통령이다.
> output: 1. obama is a real guy .
> input : 2. 시민들은 도시 속에 산다.
> output: 2. citizens are showing 20 people located in the city .
> input : 3. 커피는 필요 없다.
> output: 3. the need forness is needed .
> input : 4. 일곱 명의 사망자가 발생했다.
> output: 4. the seven people died in the attack .

Hyperparameters
n_layers:  2
d_model:  512
n_heads:  8
d_ff:  2048
dropout:  0.2

Training Parameters
Warmup Steps:  4000
Batch Size:  64
Epoch At:  20
-------------------------------------------


# 회고 

- 학습 속도가 1 epoch당 3분 30초 걸렸으며, 이전 S2S 모델이 약 10분정도 걸렸던 것에 비하면 속도는 훨씬 빠른 것 같음
- Loss값도 5.9608 -> 0.1444 로 안정적으로 떨어진 모습이 나타남
- 하지만 모델의 성능 자체는 epoch이 늘어남에 따라 비례해서 올라가는 것은 아닌 것 같고, 변동성있는 모습이 보임
- Generate Mask 쪽이 원본 코드와 변경되었는데, 추가적인 이해가 필요할 것 같다! 