# 트랜스포머를 이용한 한-영 번역 모델

In [60]:
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import os

In [11]:
pd.set_option('display.max_rows', None)

## 데이터 전처리

In [12]:
!pip install openpyxl

Collecting openpyxl
  Downloading openpyxl-3.1.2-py2.py3-none-any.whl (249 kB)
     |████████████████████████████████| 249 kB 6.0 MB/s            
[?25hCollecting et-xmlfile
  Downloading et_xmlfile-1.1.0-py3-none-any.whl (4.7 kB)
Installing collected packages: et-xmlfile, openpyxl
Successfully installed et-xmlfile-1.1.0 openpyxl-3.1.2


LMS 노드의 데이터 대신 AI Hub의 한국어-영어 번역(병렬) 말뭉치(https://www.aihub.or.kr/aihubdata/data/view.do?currMenu=115&topMenu=100&aihubDataSe=data&dataSetSn=126)를 사용한다.

In [13]:
dfdict = dict()

for filename in os.listdir('./'):
    if filename.endswith('.xlsx'):
        print(filename)
        dfdict[filename] = pd.read_excel(filename)

1_구어체(1).xlsx
4_문어체_한국문화.xlsx


  warn("Workbook contains no default style, apply openpyxl's default")


3_문어체_뉴스(3).xlsx
2_대화체.xlsx
3_문어체_뉴스(2).xlsx
3_문어체_뉴스(1)_200226.xlsx
6_문어체_지자체웹사이트.xlsx
5_문어체_조례.xlsx
3_문어체_뉴스(4).xlsx
1_구어체(2).xlsx


In [14]:
for df in dfdict.values():
    print(df.columns)

Index(['SID', '원문', '번역문'], dtype='object')
Index(['ID', '키워드', '원문', '번역문'], dtype='object')
Index(['ID', '날짜', '자동분류1', '자동분류2', '자동분류3', 'URL', '언론사', '원문', '번역문'], dtype='object')
Index(['대분류', '소분류', '상황', 'Set Nr.', '발화자', '원문', '번역문'], dtype='object')
Index(['ID', '날짜', '자동분류1', '자동분류2', '자동분류3', 'URL', '언론사', '원문', '번역문'], dtype='object')
Index(['ID', '날짜', '자동분류1', '자동분류2', '자동분류3', 'URL', '언론사', '원문', '번역문'], dtype='object')
Index(['ID', '지자체', '원문', '번역문'], dtype='object')
Index(['ID', '지자체', '원문', '번역문'], dtype='object')
Index(['ID', '날짜', '자동분류1', '자동분류2', '자동분류3', 'URL', '언론사', '원문', '번역문'], dtype='object')
Index(['SID', '원문', '번역문'], dtype='object')


In [15]:
df_total = pd.concat(list(dfdict.values()))[['원문', '번역문']]
df_total.reset_index(drop=True, inplace=True)
df_total.head()

Unnamed: 0,원문,번역문
0,'Bible Coloring'은 성경의 아름다운 이야기를 체험 할 수 있는 컬러링 ...,Bible Coloring' is a coloring application that...
1,씨티은행에서 일하세요?,Do you work at a City bank?
2,푸리토의 베스트셀러는 해외에서 입소문만으로 4차 완판을 기록하였다.,"PURITO's bestseller, which recorded 4th rough ..."
3,11장에서는 예수님이 이번엔 나사로를 무덤에서 불러내어 죽은 자 가운데서 살리셨습니다.,In Chapter 11 Jesus called Lazarus from the to...
4,"6.5, 7, 8 사이즈가 몇 개나 더 재입고 될지 제게 알려주시면 감사하겠습니다.",I would feel grateful to know how many stocks ...


In [16]:
df_total.iloc[:1000000].to_excel('corpus_01.xlsx')
df_total.iloc[1000000:].to_excel('corpus_02.xlsx')

In [19]:
df_total.rename(columns={'원문':'kor', '번역문':'eng'}, inplace=True)
df_total.head()

Unnamed: 0,kor,eng
0,'Bible Coloring'은 성경의 아름다운 이야기를 체험 할 수 있는 컬러링 ...,Bible Coloring' is a coloring application that...
1,씨티은행에서 일하세요?,Do you work at a City bank?
2,푸리토의 베스트셀러는 해외에서 입소문만으로 4차 완판을 기록하였다.,"PURITO's bestseller, which recorded 4th rough ..."
3,11장에서는 예수님이 이번엔 나사로를 무덤에서 불러내어 죽은 자 가운데서 살리셨습니다.,In Chapter 11 Jesus called Lazarus from the to...
4,"6.5, 7, 8 사이즈가 몇 개나 더 재입고 될지 제게 알려주시면 감사하겠습니다.",I would feel grateful to know how many stocks ...


한국어 문장 기준으로 중복치를 제거한다. 영어 문장은 서로 다른 한국어 문장이 같은 영어 문장으로 번역될 수 있음을 고려하여 중복치를 남겨둔다.

In [21]:
# 중복치 확인 및 제거
dup_kor = df_total[df_total.duplicated(['kor'], keep=False)].sort_values('kor')
dup_kor

Unnamed: 0,kor,eng
298371,"1,000여 명의 시민들의 참석한 가운데 초청 가수 공연을 비롯하여 민족춤패의 쟁강...","In the presence of more than 1,000 citizens, t..."
793054,"1,000여 명의 시민들의 참석한 가운데 초청 가수 공연을 비롯하여 민족춤패의 쟁강...","In the presence of more than 1,000 citizens, t..."
235125,100여 종의 국내외 다양한 연들이 다대포 앞바다를 수놓아 장관을 이루었다.,Over 100 kinds of kites from various countries...
654816,100여 종의 국내외 다양한 연들이 다대포 앞바다를 수놓아 장관을 이루었다.,Over 100 kinds of kites from various countries...
227292,10리 벚꽃으로 널리 알려진 하동군 화개의 꽃길은 사랑하는 청춘 남녀가 두 손을 꼭...,"The flower road of Hwagae in Hadong-gun, widel..."
776257,10리 벚꽃으로 널리 알려진 하동군 화개의 꽃길은 사랑하는 청춘 남녀가 두 손을 꼭...,"The flower road of Hwagae in Hadong-gun, widel..."
235149,"10월 26일에는 인기 한류 스타가 참여하는 K-POP 콘서트를 개최하였고, 27일...","On October 26, K-POP concerts were held with p..."
762683,"10월 26일에는 인기 한류 스타가 참여하는 K-POP 콘서트를 개최하였고, 27일...","On October 26, K-POP concerts were held with p..."
747704,"10월 중 2일 간 개최되며 댕이골 내 30개 음식점이 참여하여 17,500여 명의...","The festival was held for two days in October,..."
297051,"10월 중 2일 간 개최되며 댕이골 내 30개 음식점이 참여하여 17,500여 명의...","The festival was held for two days in October,..."


In [22]:
df_total.drop_duplicates(subset=['kor'], keep='first', inplace=True, ignore_index=True)

In [23]:
len(df_total)

1599568

In [18]:
# 텍스트 정제

import re

def preprocess_sentence(sentence):
    sentence = sentence.lower().strip()

    sentence = re.sub(r"([?.!,])", r" \1 ", sentence)
    sentence = re.sub(r'[" "]+', " ", sentence)
    sentence = re.sub(r"[^a-zA-Zㄱ-ㅎ가-힣0-9?.!,]+", " ", sentence)

    sentence = sentence.strip()
    
    return sentence

In [26]:
df_prep = pd.DataFrame()
df_prep['kor'] = df_total.apply(lambda row: preprocess_sentence(row['kor']), axis=1)
df_prep['eng'] = df_total.apply(lambda row: preprocess_sentence(row['eng']), axis=1)
df_prep.head()

Unnamed: 0,kor,eng
0,bible coloring 은 성경의 아름다운 이야기를 체험 할 수 있는 컬러링 앱...,bible coloring is a coloring application that ...
1,씨티은행에서 일하세요 ?,do you work at a city bank ?
2,푸리토의 베스트셀러는 해외에서 입소문만으로 4차 완판을 기록하였다 .,"purito s bestseller , which recorded 4th rough..."
3,11장에서는 예수님이 이번엔 나사로를 무덤에서 불러내어 죽은 자 가운데서 살리셨습니다 .,in chapter 11 jesus called lazarus from the to...
4,"6 . 5 , 7 , 8 사이즈가 몇 개나 더 재입고 될지 제게 알려주시면 감사하겠...",i would feel grateful to know how many stocks ...


In [28]:
df_prep.iloc[:1000000].to_excel('corpus_preprocessed_01.xlsx')
df_prep.iloc[1000000:].to_excel('corpus_preprocessed_02.xlsx')

In [29]:
# Sentencepiece를 활용하여 학습한 tokenizer를 생성합니다.
import sentencepiece as spm

def generate_tokenizer(corpus, vocab_size, lang="ko",
                       pad_id=0, bos_id=1, eos_id=2, unk_id=3):
    # 말뭉치를 텍스트 파일로 저장합니다.
    temp_file = f'{lang}_corpus.txt'
    with open(temp_file, 'w', encoding='utf-8') as f:
        for line in corpus:
            f.write(f'{line}\n')

    # SentencePiece 모델을 학습하는 데 사용되는 매개변수를 설정합니다.
    spm_args = f"--input={temp_file} --model_prefix={lang}_spm " \
               f"--vocab_size={vocab_size} --pad_id={pad_id} " \
               f"--bos_id={bos_id} --eos_id={eos_id} --unk_id={unk_id} " \
               f"--user_defined_symbols=<SEP>,<CLS>,<MASK> --model_type=bpe"

    # SentencePiece 모델을 학습합니다.
    spm.SentencePieceTrainer.Train(spm_args)

    # 학습된 모델을 로드합니다.
    tokenizer = spm.SentencePieceProcessor()
    tokenizer.Load(f"{lang}_spm.model")

    # 임시 파일을 삭제합니다.
    os.remove(temp_file)

    return tokenizer
    


In [31]:
SRC_VOCAB_SIZE = TGT_VOCAB_SIZE = 20000

eng_corpus = df_prep['eng'].to_list()
kor_corpus = df_prep['kor'].to_list()

ko_tokenizer = generate_tokenizer(kor_corpus, SRC_VOCAB_SIZE, "ko")
en_tokenizer = generate_tokenizer(eng_corpus, TGT_VOCAB_SIZE, "en")
en_tokenizer.set_encode_extra_options("bos:eos")

True

In [35]:
ko_tokenizer.encode_as_pieces('작은아버지가 방에 들어가신다.')

['▁작은', '아버', '지가', '▁방에', '▁들어가', '신', '다', '.']

In [44]:
ko_tokenizer.encode_as_ids('작은아버지가 방에 들어가신다.')

[1719, 6209, 641, 12990, 2538, 18821, 18750, 18751]

In [46]:
from tqdm import tqdm    # Process 과정을 보기 위해

src_corpus = []
tgt_corpus = []

assert len(kor_corpus) == len(eng_corpus)

# 토큰의 길이가 50 이하인 문장만 남깁니다. 
for idx in tqdm(range(len(kor_corpus))):
    kor_tokens = ko_tokenizer.encode_as_ids(kor_corpus[idx])
    eng_tokens = en_tokenizer.encode_as_ids(eng_corpus[idx])
    if (len(kor_tokens) <= 50) and (len(eng_tokens) <= 50): 
        src_corpus.append(kor_tokens)
        tgt_corpus.append(eng_tokens)  



100%|██████████| 1599568/1599568 [04:36<00:00, 5780.21it/s] 


In [47]:
# 패딩처리를 완료하여 학습용 데이터를 완성합니다. 
enc_train = tf.keras.preprocessing.sequence.pad_sequences(src_corpus, padding='post')
dec_train = tf.keras.preprocessing.sequence.pad_sequences(tgt_corpus, padding='post')

## 모델 정의

In [48]:
def positional_encoding(pos, d_model):
    def cal_angle(position, i):
        return position / np.power(10000, int(i) / d_model)

    def get_posi_angle_vec(position):
        return [cal_angle(position, i) for i in range(d_model)]

    sinusoid_table = np.array([get_posi_angle_vec(pos_i) for pos_i in range(pos)])
    sinusoid_table[:, 0::2] = np.sin(sinusoid_table[:, 0::2])
    sinusoid_table[:, 1::2] = np.cos(sinusoid_table[:, 1::2])
    return sinusoid_table


In [49]:
class MultiHeadAttention(tf.keras.layers.Layer):
    def __init__(self, d_model, num_heads):
        super(MultiHeadAttention, self).__init__()
        self.num_heads = num_heads
        self.d_model = d_model
            
        self.depth = d_model // self.num_heads
            
        self.W_q = tf.keras.layers.Dense(d_model)
        self.W_k = tf.keras.layers.Dense(d_model)
        self.W_v = tf.keras.layers.Dense(d_model)
            
        self.linear = tf.keras.layers.Dense(d_model)

    def scaled_dot_product_attention(self, Q, K, V, mask):
        d_k = tf.cast(K.shape[-1], tf.float32)
        QK = tf.matmul(Q, K, transpose_b=True)

        scaled_qk = QK / tf.math.sqrt(d_k)

        if mask is not None: scaled_qk += (mask * -1e9)  

        attentions = tf.nn.softmax(scaled_qk, axis=-1)
        out = tf.matmul(attentions, V)

        return out, attentions
            

    def split_heads(self, x):
        batch_size = x.shape[0]
        split_x = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth))
        split_x = tf.transpose(split_x, perm=[0, 2, 1, 3])

        return split_x

    def combine_heads(self, x):
        batch_size = x.shape[0]
        combined_x = tf.transpose(x, perm=[0, 2, 1, 3])
        combined_x = tf.reshape(combined_x, (batch_size, -1, self.d_model))

        return combined_x

        
    def call(self, Q, K, V, mask):
        WQ = self.W_q(Q)
        WK = self.W_k(K)
        WV = self.W_v(V)
        
        WQ_splits = self.split_heads(WQ)
        WK_splits = self.split_heads(WK)
        WV_splits = self.split_heads(WV)
            
        out, attention_weights = self.scaled_dot_product_attention(
            WQ_splits, WK_splits, WV_splits, mask)
    				        
        out = self.combine_heads(out)
        out = self.linear(out)
                
        return out, attention_weights

In [50]:
class PoswiseFeedForwardNet(tf.keras.layers.Layer):
    def __init__(self, d_model, d_ff):
        super(PoswiseFeedForwardNet, self).__init__()
        self.w_1 = tf.keras.layers.Dense(d_ff, activation='relu')
        self.w_2 = tf.keras.layers.Dense(d_model)

    def call(self, x):
        out = self.w_1(x)
        out = self.w_2(out)
            
        return out

In [51]:
class EncoderLayer(tf.keras.layers.Layer):
    def __init__(self, d_model, n_heads, d_ff, dropout):
        super(EncoderLayer, self).__init__()

        self.enc_self_attn = MultiHeadAttention(d_model, n_heads)
        self.ffn = PoswiseFeedForwardNet(d_model, d_ff)

        self.norm_1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.norm_2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)

        self.dropout = tf.keras.layers.Dropout(dropout)
        
    def call(self, x, mask):

        """
        Multi-Head Attention
        """
        residual = x
        out = self.norm_1(x)
        out, enc_attn = self.enc_self_attn(out, out, out, mask)
        out = self.dropout(out)
        out += residual
        
        """
        Position-Wise Feed Forward Network
        """
        residual = out
        out = self.norm_2(out)
        out = self.ffn(out)
        out = self.dropout(out)
        out += residual
        
        return out, enc_attn

In [52]:
class DecoderLayer(tf.keras.layers.Layer):
    def __init__(self, d_model, num_heads, d_ff, dropout):
        super(DecoderLayer, self).__init__()

        self.dec_self_attn = MultiHeadAttention(d_model, num_heads)
        self.enc_dec_attn = MultiHeadAttention(d_model, num_heads)

        self.ffn = PoswiseFeedForwardNet(d_model, d_ff)

        self.norm_1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.norm_2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.norm_3 = tf.keras.layers.LayerNormalization(epsilon=1e-6)

        self.dropout = tf.keras.layers.Dropout(dropout)
    
    def call(self, x, enc_out, causality_mask, padding_mask):

        """
        Masked Multi-Head Attention
        """
        residual = x
        out = self.norm_1(x)
        out, dec_attn = self.dec_self_attn(out, out, out, padding_mask)
        out = self.dropout(out)
        out += residual

        """
        Multi-Head Attention
        """
        residual = out
        out = self.norm_2(out)
        out, dec_enc_attn = self.enc_dec_attn(out, enc_out, enc_out, causality_mask)
        out = self.dropout(out)
        out += residual
        
        """
        Position-Wise Feed Forward Network
        """
        residual = out
        out = self.norm_3(out)
        out = self.ffn(out)
        out = self.dropout(out)
        out += residual

        return out, dec_attn, dec_enc_attn

In [53]:
class Encoder(tf.keras.Model):
    def __init__(self,
                 n_layers,
                 d_model,
                 n_heads,
                 d_ff,
                 dropout):
        super(Encoder, self).__init__()
        self.n_layers = n_layers
        self.enc_layers = [EncoderLayer(d_model, n_heads, d_ff, dropout) 
                        for _ in range(n_layers)]
        
    def call(self, x, mask):
        out = x
    
        enc_attns = list()
        for i in range(self.n_layers):
            out, enc_attn = self.enc_layers[i](out, mask)
            enc_attns.append(enc_attn)
        
        return out, enc_attns


In [54]:
class Decoder(tf.keras.Model):
    def __init__(self,
                 n_layers,
                 d_model,
                 n_heads,
                 d_ff,
                 dropout):
        super(Decoder, self).__init__()
        self.n_layers = n_layers
        self.dec_layers = [DecoderLayer(d_model, n_heads, d_ff, dropout) 
                            for _ in range(n_layers)]
                            
                            
    def call(self, x, enc_out, causality_mask, padding_mask):
        out = x
    
        dec_attns = list()
        dec_enc_attns = list()
        for i in range(self.n_layers):
            out, dec_attn, dec_enc_attn = \
            self.dec_layers[i](out, enc_out, causality_mask, padding_mask)

            dec_attns.append(dec_attn)
            dec_enc_attns.append(dec_enc_attn)

        return out, dec_attns, dec_enc_attns

In [55]:
class Transformer(tf.keras.Model):
    def __init__(self,
                    n_layers,
                    d_model,
                    n_heads,
                    d_ff,
                    src_vocab_size,
                    tgt_vocab_size,
                    pos_len,
                    dropout=0.2,
                    shared=True):
        super(Transformer, self).__init__()
        self.d_model = tf.cast(d_model, tf.float32)

        self.enc_emb = tf.keras.layers.Embedding(src_vocab_size, d_model)
        self.dec_emb = tf.keras.layers.Embedding(tgt_vocab_size, d_model)

        self.pos_encoding = positional_encoding(pos_len, d_model)
        self.dropout = tf.keras.layers.Dropout(dropout)

        self.encoder = Encoder(n_layers, d_model, n_heads, d_ff, dropout)
        self.decoder = Decoder(n_layers, d_model, n_heads, d_ff, dropout)

        self.fc = tf.keras.layers.Dense(tgt_vocab_size)

        self.shared = shared

        if shared: self.fc.set_weights(tf.transpose(self.dec_emb.weights))

    def embedding(self, emb, x):
        seq_len = x.shape[1]
        out = emb(x)

        if self.shared: out *= tf.math.sqrt(self.d_model)

        out += self.pos_encoding[np.newaxis, ...][:, :seq_len, :]
        out = self.dropout(out)

        return out

        
    def call(self, enc_in, dec_in, enc_mask, causality_mask, dec_mask):
        enc_in = self.embedding(self.enc_emb, enc_in)
        dec_in = self.embedding(self.dec_emb, dec_in)

        enc_out, enc_attns = self.encoder(enc_in, enc_mask)
        
        dec_out, dec_attns, dec_enc_attns = \
        self.decoder(dec_in, enc_out, causality_mask, dec_mask)
        
        logits = self.fc(dec_out)
        
        return logits, enc_attns, dec_attns, dec_enc_attns

In [56]:
def generate_padding_mask(seq):
    seq = tf.cast(tf.math.equal(seq, 0), tf.float32)
    return seq[:, tf.newaxis, tf.newaxis, :]

def generate_causality_mask(src_len, tgt_len):
    mask = 1 - np.cumsum(np.eye(src_len, tgt_len), 0)
    return tf.cast(mask, tf.float32)

def generate_masks(src, tgt):
    enc_mask = generate_padding_mask(src)
    dec_mask = generate_padding_mask(tgt)

    dec_enc_causality_mask = generate_causality_mask(tgt.shape[1], src.shape[1])
    dec_enc_mask = tf.maximum(enc_mask, dec_enc_causality_mask)

    dec_causality_mask = generate_causality_mask(tgt.shape[1], tgt.shape[1])
    dec_mask = tf.maximum(dec_mask, dec_causality_mask)

    return enc_mask, dec_enc_mask, dec_mask

In [57]:
class LearningRateScheduler(tf.keras.optimizers.schedules.LearningRateSchedule):
    def __init__(self, d_model, warmup_steps=4000):
        super(LearningRateScheduler, self).__init__()
        self.d_model = d_model
        self.warmup_steps = warmup_steps
    
    def __call__(self, step):
        arg1 = step ** -0.5
        arg2 = step * (self.warmup_steps ** -1.5)
        
        return (self.d_model ** -0.5) * tf.math.minimum(arg1, arg2)

learning_rate = LearningRateScheduler(512)
optimizer = tf.keras.optimizers.Adam(learning_rate,
                                     beta_1=0.9,
                                     beta_2=0.98, 
                                     epsilon=1e-9)

print("슝=3")

슝=3


## 모델 학습

In [75]:
n_layers=2
d_model=256
n_heads=8
d_ff=512
pos_len=50

transformer = Transformer(n_layers=n_layers,
                         d_model=d_model,
                         n_heads=n_heads,
                         d_ff=d_ff,
                         src_vocab_size=SRC_VOCAB_SIZE,
                         tgt_vocab_size=TGT_VOCAB_SIZE,
                         pos_len=pos_len)

In [76]:
learning_rate = LearningRateScheduler(d_model=d_model)
optimizer = tf.keras.optimizers.Adam(
    learning_rate=0.005,
    beta_1=0.9,
    beta_2=0.98,
    epsilon=1e-9
)

In [77]:
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction='none')

def loss_function(real, pred):
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    loss_ = loss_object(real, pred)

    # Masking 되지 않은 입력의 개수로 Scaling하는 과정
    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask

    return tf.reduce_sum(loss_)/tf.reduce_sum(mask)

In [78]:
@tf.function()
def train_step(src, tgt, model, optimizer):
    gold = tgt[:, 1:]
        
    enc_mask, dec_enc_mask, dec_mask = generate_masks(src, tgt)

    # 계산된 loss에 tf.GradientTape()를 적용해 학습을 진행합니다.
    with tf.GradientTape() as tape:
        predictions, enc_attns, dec_attns, dec_enc_attns = \
        model(src, tgt, enc_mask, dec_enc_mask, dec_mask)
        loss = loss_function(gold, predictions[:, :-1])

    # 최종적으로 optimizer.apply_gradients()가 사용됩니다. 
    gradients = tape.gradient(loss, model.trainable_variables)
    
    # Apply the gradients to the optimizer
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))
    
    return loss, enc_attns, dec_attns, dec_enc_attns

In [79]:
# Attention 시각화 함수

def visualize_attention(src, tgt, enc_attns, dec_attns, dec_enc_attns):
    def draw(data, ax, x="auto", y="auto"):
        import seaborn
        seaborn.heatmap(data, 
                        square=True,
                        vmin=0.0, vmax=1.0, 
                        cbar=False, ax=ax,
                        xticklabels=x,
                        yticklabels=y)
        
    for layer in range(0, 2, 1):
        fig, axs = plt.subplots(1, 4, figsize=(20, 10))
        print("Encoder Layer", layer + 1)
        for h in range(4):
            draw(enc_attns[layer][0, h, :len(src), :len(src)], axs[h], src, src)
        plt.show()
        
    for layer in range(0, 2, 1):
        fig, axs = plt.subplots(1, 4, figsize=(20, 10))
        print("Decoder Self Layer", layer+1)
        for h in range(4):
            draw(dec_attns[layer][0, h, :len(tgt), :len(tgt)], axs[h], tgt, tgt)
        plt.show()

        print("Decoder Src Layer", layer+1)
        fig, axs = plt.subplots(1, 4, figsize=(20, 10))
        for h in range(4):
            draw(dec_enc_attns[layer][0, h, :len(tgt), :len(src)], axs[h], src, tgt)
        plt.show()

In [80]:
# 번역 생성 함수

def evaluate(sentence, model, src_tokenizer, tgt_tokenizer):
    sentence = preprocess_sentence(sentence)

    pieces = src_tokenizer.encode_as_pieces(sentence)
    tokens = src_tokenizer.encode_as_ids(sentence)

    _input = tf.keras.preprocessing.sequence.pad_sequences([tokens],
                                                           maxlen=enc_train.shape[-1],
                                                           padding='post')
    
    ids = []
    output = tf.expand_dims([tgt_tokenizer.bos_id()], 0)
    for i in range(dec_train.shape[-1]):
        enc_padding_mask, combined_mask, dec_padding_mask = \
        generate_masks(_input, output)

        predictions, enc_attns, dec_attns, dec_enc_attns =\
        model(_input, 
              output,
              enc_padding_mask,
              combined_mask,
              dec_padding_mask)

        predicted_id = \
        tf.argmax(tf.math.softmax(predictions, axis=-1)[0, -1]).numpy().item()

        if tgt_tokenizer.eos_id() == predicted_id:
            result = tgt_tokenizer.decode_ids(ids)
            return pieces, result, enc_attns, dec_attns, dec_enc_attns

        ids.append(predicted_id)
        output = tf.concat([output, tf.expand_dims([predicted_id], 0)], axis=-1)

    result = tgt_tokenizer.decode_ids(ids)

    return pieces, result, enc_attns, dec_attns, dec_enc_attns

In [81]:
# 번역 생성 및 Attention 시각화 결합

def translate(sentence, model, src_tokenizer, tgt_tokenizer, plot_attention=False):
    pieces, result, enc_attns, dec_attns, dec_enc_attns = \
    evaluate(sentence, model, src_tokenizer, tgt_tokenizer)
    
    print('Input: %s' % (sentence))
    print('Predicted translation: {}'.format(result))

    if plot_attention:
        visualize_attention(pieces, result.split(), enc_attns, dec_attns, dec_enc_attns)

In [None]:
# 학습

import random
from tqdm import tqdm_notebook 

BATCH_SIZE = 64
EPOCHS = 10

examples = [
            "오바마는 대통령이다.",
            "시민들은 도시 속에 산다.",
            "커피는 필요 없다.",
            "일곱 명의 사망자가 발생했다."
]

for epoch in range(EPOCHS):
    total_loss = 0
    
    idx_list = list(range(0, enc_train.shape[0], BATCH_SIZE))
    random.shuffle(idx_list)
    t = tqdm_notebook(idx_list)

    for (batch, idx) in enumerate(t):
        batch_loss, enc_attns, dec_attns, dec_enc_attns = \
        train_step(enc_train[idx:idx+BATCH_SIZE],
                    dec_train[idx:idx+BATCH_SIZE],
                    transformer,
                    optimizer)

        total_loss += batch_loss
        
        t.set_description_str('Epoch %2d' % (epoch + 1))
        t.set_postfix_str('Loss %.4f' % (total_loss.numpy() / (batch + 1)))

    for example in examples:
        translate(example, transformer, ko_tokenizer, en_tokenizer)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  t = tqdm_notebook(idx_list)


  0%|          | 0/22769 [00:00<?, ?it/s]

Input: 오바마는 대통령이다.
Predicted translation: president obama is not sure to have president obama .
Input: 시민들은 도시 속에 산다.
Predicted translation: happiness is the person who is not good .
Input: 커피는 필요 없다.
Predicted translation: the coffee is not not not easy .
Input: 일곱 명의 사망자가 발생했다.
Predicted translation: the father of the father of the father of the father of his father , who died , was murdered .


  0%|          | 0/22769 [00:00<?, ?it/s]

Input: 오바마는 대통령이다.
Predicted translation: the head of the united states is the first time .
Input: 시민들은 도시 속에 산다.
Predicted translation: i hope you wish you wish you wish you wish you wish to come true .
Input: 커피는 필요 없다.
Predicted translation: i want to be a coffee .
Input: 일곱 명의 사망자가 발생했다.
Predicted translation: the time of death , the death of the death of the death of the death of the death of the death of king .


  0%|          | 0/22769 [00:00<?, ?it/s]

Input: 오바마는 대통령이다.
Predicted translation: the term property means a corporation in the head of the tong .
Input: 시민들은 도시 속에 산다.
Predicted translation: where residents are located in front of the city hall .
Input: 커피는 필요 없다.
Predicted translation: i can t use coffee .
Input: 일곱 명의 사망자가 발생했다.
Predicted translation: the house is a student died .


  0%|          | 0/22769 [00:00<?, ?it/s]

Input: 오바마는 대통령이다.
Predicted translation: the proposal of the us , the us , the us , the us , the us , and the us , the us , the us , the us , and the us , who does not receive the proposal .
Input: 시민들은 도시 속에 산다.
Predicted translation: in 2008 , senior adviser kim gyeong burns behind behind behind the chapel cup cup cup cup cup cup cup cup cup cup cup cup cup cup cup cup cup cup cup cup cup cup cup cup cup cup cup cup cup cup cup cup cup cup cup cup cup
Input: 커피는 필요 없다.
Predicted translation: this proposal was been to the world cup .
Input: 일곱 명의 사망자가 발생했다.
Predicted translation: this is the highest in the state policy policy policy policy policy policy policy policy policy policy policy policy policy policy policy policy policy policy policy policy policy policy .


  0%|          | 0/22769 [00:00<?, ?it/s]

Input: 오바마는 대통령이다.
Predicted translation: according to the same system .
Input: 시민들은 도시 속에 산다.
Predicted translation: the cathedral is trapped in vegetable vegetable vegetable vegetable vegetable vegetable vegetable vegetable vegetable vegetable vegetable vegetable vegetable vegetable vegetable vegetable vegetable vegetable vegetable vegetable vegetable vegetable vegetable vegetable vegetable vegetable vegetable vegetable vegetable vegetable vegetable vegetable vegetable vegetable vegetable vegetable vegetable vegetable vegetable vegetable vegetable vegetable vegetable vegetable vegetable
Input: 커피는 필요 없다.
Predicted translation: coffee decreases and coffee decreases .
Input: 일곱 명의 사망자가 발생했다.
Predicted translation: according to the same period .


  0%|          | 0/22769 [00:00<?, ?it/s]

Input: 오바마는 대통령이다.
Predicted translation: it is validation to stay here .
Input: 시민들은 도시 속에 산다.
Predicted translation: i m gladify your resorts .
Input: 커피는 필요 없다.
Predicted translation: spa props shaped props shaped props shaped props shaped props shaped props shaped props shaped props shaped props shaped props shaped props shaped props shaped props shaped props shaped props shaped props shaped props shaped props shaped props shaped props shaped props .
Input: 일곱 명의 사망자가 발생했다.
Predicted translation: the resorts emitted diesel cars are emitted by emitted by emitted by emitted by emitted by emitted by emitted by emitted by emitted by emitted by emitted by emitted by emitted by emitted by emitted by emitted by emitted by emitted by emitted by emitted by emitted by emitted by


  0%|          | 0/22769 [00:00<?, ?it/s]

커널이 중단되었다.ㅠㅠ 일단 여기서 마무리...

## 회고
- 학습 노드에서 주어진 데이터의 품질이 좋지 않아서 다른 데이터로 대체했다. 기존 데이터의 품질을 확인하는 작업도 노트북에 추가할걸...
- 토크나이저 학습 단계에서 너무 많은 시간이 소요되고 그럼에도 학습이 이루어지지 않았는데, 작업 중단 후 다시 시도해보니 금방 완료되었다. 시간이 오래 걸리는 단계에서 이것이 원래 시간이 많이 드는 작업인지, 에러에 의한 것인지 확인할 방법이 없을까?
- 번역 품질은 학습이 전부 진행되지 않아서 확실히 이야기할 수는 없으나 그리 좋아 보이진 않는다. 같은 표현이 불필요하게 많이 반복되는 경우도 있고, 그럴 듯한 문장을 출력한 경우에도 원래 한국어 문장과 의미가 유사하지 않은 경우가 많다.