In [1]:
import tensorflow as tf
import matplotlib.pyplot as plt
plt.rcParams['axes.unicode_minus'] = False

import numpy as np
import pandas as pd
import time
import re


file_path = 'C:/Users/Myeong/dding/data/kor-eng_AIHUB.csv'
data = pd.read_csv(file_path)

data.drop_duplicates(inplace=True)
data.reset_index(inplace=True, drop=True)

data = data.sample(100000)
data.reset_index(inplace=True, drop=True)
data

Unnamed: 0,en,ko
0,I was hoping to buy some plastic furniture for...,플라스틱 가구를 도매로 사고 싶었습니다.
1,You eat yours. Let's not force each other.,"너는 너 거 먹어, 강요하지 말자, 서로."
2,>Be careful when you flip it over.,>넘기는 거 조심해.
3,"Well, we just really wanted a bigger dining an...","글쎄, 우리는 더 큰 식당과 거실 공간을 정말로 원했습니다."
4,How long has your company been doing business ...,당신의 회사는 얼마나 오랫동안 사업을 해왔나요?
...,...,...
99995,"I badly need the money, because I will use it ...","돈이 절실히 필요합니다, 그 돈이 사업에 쓰일 것이기 때문이다."
99996,"Yes, I would love to arrange a meeting with you.","네, 당신과의 회의를 잡고 싶습니다."
99997,> It's so touching.,> 너무 따뜻해 이런 거.
99998,The price ranges from 729 dollars to 850 dollars.,가격은 729달러에서 850달러까지 다양합니다.


In [2]:
# 1. 소문자화
# 2. 구두점과 단어 사이에 공백 추가
# 3. 공백 2개 이상 -> 공백 1개
# 4. <start>, <end> 토큰 추가
from konlpy.tag import Mecab
mecab = Mecab(dicpath = 'C:/Users/Myeong/anaconda3/envs/jin/etc/mecab-ko-dic')

def preprocessing(s, ko=False):
    # s = unidecode(s)
    s = s.lower().strip()
    s = re.sub(r"([?.!,¿])", r" \1 ", s)
    s = re.sub(r'[" "]+', " ", s)
    s = re.sub(r"[^ㄱ-ㅎ가-힣ㅏ-ㅣa-zA-Z0-9?.,!¿]+", " ", s)

    # 한국어는 따로 형태소 분석을 진행해줌
    # 한국어를 target 으로 할 예정이기 때문에 
    if ko:
        s = mecab.morphs(s)
        s = ' '.join(s)
        s = '<start> ' + s + ' <end>'
    
    s = s.strip()

    return s


# 각 컬럼 전처리
data['en_pre'] = data['en'].apply(preprocessing)
data['ko_pre'] = data['ko'].apply(lambda x : preprocessing(x, ko=True))

data

Unnamed: 0,en,ko,en_pre,ko_pre
0,I was hoping to buy some plastic furniture for...,플라스틱 가구를 도매로 사고 싶었습니다.,i was hoping to buy some plastic furniture for...,<start> 플라스틱 가구 를 도매 로 사 고 싶 었 습니다 . <end>
1,You eat yours. Let's not force each other.,"너는 너 거 먹어, 강요하지 말자, 서로.",you eat yours . let s not force each other .,"<start> 너 는 너 거 먹 어 , 강요 하 지 말 자 , 서로 . <end>"
2,>Be careful when you flip it over.,>넘기는 거 조심해.,be careful when you flip it over .,<start> 넘기 는 거 조심 해 . <end>
3,"Well, we just really wanted a bigger dining an...","글쎄, 우리는 더 큰 식당과 거실 공간을 정말로 원했습니다.","well , we just really wanted a bigger dining a...","<start> 글쎄 , 우리 는 더 큰 식당 과 거실 공간 을 정말로 원했 습니다 ..."
4,How long has your company been doing business ...,당신의 회사는 얼마나 오랫동안 사업을 해왔나요?,how long has your company been doing business ...,<start> 당신 의 회사 는 얼마나 오랫동안 사업 을 해 왔 나요 ? <end>
...,...,...,...,...
99995,"I badly need the money, because I will use it ...","돈이 절실히 필요합니다, 그 돈이 사업에 쓰일 것이기 때문이다.","i badly need the money , because i will use it...","<start> 돈 이 절실히 필요 합니다 , 그 돈 이 사업 에 쓰일 것 이 기 때..."
99996,"Yes, I would love to arrange a meeting with you.","네, 당신과의 회의를 잡고 싶습니다.","yes , i would love to arrange a meeting with y...","<start> 네 , 당신 과 의 회의 를 잡 고 싶 습니다 . <end>"
99997,> It's so touching.,> 너무 따뜻해 이런 거.,it s so touching .,<start> 너무 따뜻 해 이런 거 . <end>
99998,The price ranges from 729 dollars to 850 dollars.,가격은 729달러에서 850달러까지 다양합니다.,the price ranges from 729 dollars to 850 dolla...,<start> 가격 은 729 달러 에서 850 달러 까지 다양 합니다 . <end>


In [3]:
ko_max = 0
en_max = 0
for i in range(len(data)):
    ko_max = max(ko_max, len(data['ko_pre'][i].split()))
    en_max = max(en_max, len(data['en_pre'][i].split()))

print(ko_max, en_max)
max_len = max(ko_max, en_max)

106 76


In [4]:
def tokenize(corpus, maxlen, padding='post', target=False,):
    tokenizer = tf.keras.preprocessing.text.Tokenizer(filters=' ')
    tokenizer.fit_on_texts(corpus)

    tensor = tokenizer.texts_to_sequences(corpus)
    tensor = tf.keras.preprocessing.sequence.pad_sequences(tensor, padding=padding, maxlen=maxlen)

    print(tensor, tokenizer)
    return tensor, tokenizer

en_tensor, en_tokenizer = tokenize(data['en_pre'], maxlen=max_len)
ko_tensor, ko_tokenizer = tokenize(data['ko_pre'], maxlen=max_len)    

[[   5   53  899 ...    0    0    0]
 [   6  189 1901 ...    0    0    0]
 [  23 1094   66 ...    0    0    0]
 ...
 [   9   19   28 ...    0    0    0]
 [   3  132 6544 ...    0    0    0]
 [  50  817   17 ...    0    0    0]] <keras.preprocessing.text.Tokenizer object at 0x000002873833A280>
[[   1  816  910 ...    0    0    0]
 [   1  331    5 ...    0    0    0]
 [   1 5792    5 ...    0    0    0]
 ...
 [   1  166 1046 ...    0    0    0]
 [   1  168   13 ...    0    0    0]
 [   1    6 4191 ...    0    0    0]] <keras.preprocessing.text.Tokenizer object at 0x00000287939E45B0>


In [5]:
ko_tensor.shape, en_tensor.shape

((100000, 106), (100000, 106))

In [6]:
en_tensor

array([[   5,   53,  899, ...,    0,    0,    0],
       [   6,  189, 1901, ...,    0,    0,    0],
       [  23, 1094,   66, ...,    0,    0,    0],
       ...,
       [   9,   19,   28, ...,    0,    0,    0],
       [   3,  132, 6544, ...,    0,    0,    0],
       [  50,  817,   17, ...,    0,    0,    0]])

In [7]:
print(ko_tokenizer.word_index['<start>'])
print(ko_tokenizer.word_index['<end>'])

en_vocab_size = len(en_tokenizer.word_index) 
ko_vocab_size = len(ko_tokenizer.word_index) 

print(en_vocab_size, ko_vocab_size)

1
2
23212 26504


## Transformer 모델 설계

In [8]:
def positional_encoding(pos, d_model):
    def cal_angle(position, i):
        return position / np.power(10000, (2*int(i))/d_model)
    
    def get_posi_angle_vec(position):
        return [cal_angle(position, i) for i in range(d_model)]
    
    sinusoid_table = np.array([get_posi_angle_vec(pos_i) for pos_i in range(pos)])
    # 짝수 인덱스 sin함수
    sinusoid_table[:, 0::2] = np.sin(sinusoid_table[:, 0::2])
    # 홀수 인덱스 cos함수
    sinusoid_table[:, 1::2] = np.cos(sinusoid_table[:, 1::2])
    return sinusoid_table

class MultiHeadAttention(tf.keras.layers.Layer):
    def __init__(self, d_model, num_heads):
        super(MultiHeadAttention, self).__init__()
        # 헤드의 개수
        self.num_heads = num_heads
        self.d_model = d_model

        # 헤드당 차원의 크기
        self.depth = d_model // self.num_heads

        # Q, K, V 가중치
        self.wq = tf.keras.layers.Dense(d_model)
        self.wk = tf.keras.layers.Dense(d_model)
        self.wv = tf.keras.layers.Dense(d_model)

        self.linear = tf.keras.layers.Dense(d_model)
    
    def scaled_dot_product_attention(self, q, k, v, mask):
        # query shape : (batch_size, num_heads, length, d_model/num_heads)
        # key shape : (batch_size, num_heads, length, depth)
        # value shape : (batch_size, num_heads, length, depth)
        # padding mask shape : (batch_size, 1, 1, key length)
        d_k = tf.cast(k.shape[-1], tf.float32)

        # q, k dot product
        qk = tf.matmul(q, k, transpose_b=True)
        scaled_qk = qk / tf.math.sqrt(d_k)

        if mask is not None:
            scaled_qk += (mask * -1e9)

        # attention weights shape : (batch_size, num_heads, query length, key length)
        attentions = tf.nn.softmax(scaled_qk, axis=-1)

        # output shape : (batch_size, num_heads, query length, d_model/num_heads)
        out = tf.matmul(attentions, v)

        return out, attentions
    
    def split_heads(self, x):
        # x shape : (batch_size, length, d_model)
        batch_size = x.shape[0]

        # split_x shape : (batch_size, length, num_heads, d_model/num_heads)
        split_x = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth))

        # 각 배치의 각 헤드당 (length X depth)가 있도록 변경
        # split_x shape : (batch_size, num_heads, length, depth)
        split_x = tf.transpose(split_x, perm = [0, 2, 1, 3])

        return split_x
    
    def combine_heads(self, x):
        # x shape : (batch_size, num_heads, length, depth)
        batch_size = x.shape[0]
        # x shape to (batch_size, length, num_heads, depth)
        x = tf.transpose(x, perm = [0, 2, 1, 3])

        # combined shape : (batch_size, length, d_model)
        combined_x = tf.reshape(x, (batch_size, -1, self.d_model))
        return combined_x
    
    def call(self, q, k, v, mask):
        wq = self.wq(q)
        wk = self.wk(k)
        wv = self.wv(v)

        # split heads
        wq_split = self.split_heads(wq)
        wk_split = self.split_heads(wk)
        wv_split = self.split_heads(wv)

        # scaled dot product attention
        out, attention_weights = self.scaled_dot_product_attention(wq_split, wk_split, wv_split, mask)

        # combine heads
        out = self.combine_heads(out)

        # linear out
        out = self.linear(out)

        return out, attention_weights

In [9]:
class PoswiseFeedForwardNet(tf.keras.layers.Layer):
    def __init__(self, d_model, d_ff):
        super(PoswiseFeedForwardNet, self).__init__()
        self.w1 = tf.keras.layers.Dense(d_ff, activation='relu')
        self.w2 = tf.keras.layers.Dense(d_model)
    
    def call(self, x):
        out = self.w1(x)
        out = self.w2(out)

        return out

In [10]:
class EncoderLayer(tf.keras.layers.Layer):
    def __init__(self, d_model, num_heads, d_ff, dropout):
        super(EncoderLayer, self).__init__()
        
        self.enc_self_attn = MultiHeadAttention(d_model, num_heads)
        self.ffn = PoswiseFeedForwardNet(d_model, d_ff)

        self.norm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.norm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)

        self.dropout = tf.keras.layers.Dropout(dropout)
    
    def call(self, x, mask):

        # Multihead Attention
        residual = x
        out = self.norm1(x)
        out, attention = self.enc_self_attn(out, out, out, mask)
        out = self.dropout(out)
        out += residual

        # Position-Wise Feed Forward Network
        residual = out
        out = self.norm2(out)
        out = self.ffn(out)
        out = self.dropout(out)
        out += residual

        return out, attention

In [11]:
class DecoderLayer(tf.keras.layers.Layer):
    def __init__(self, d_model, num_heads, d_ff, dropout):
        super(DecoderLayer, self).__init__()
        
        self.dec_self_attn = MultiHeadAttention(d_model, num_heads)
        self.enc_dec_attn = MultiHeadAttention(d_model, num_heads)

        self.ffn = PoswiseFeedForwardNet(d_model, d_ff)

        self.norm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.norm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.norm3 = tf.keras.layers.LayerNormalization(epsilon=1e-6)

        self.dropout = tf.keras.layers.Dropout(dropout)

    def call(self, x, enc_out, causality_mask, padding_mask):
        # Masked Multihead Attention
        residual = x
        out = self.norm1(x)
        out, dec_attn = self.dec_self_attn(out, out, out, padding_mask)
        out = self.dropout(out)
        out += residual

        # Encoder-Decoder Multihead Attention
        residual = out
        out = self.norm2(out)
        # Encoder output과 decoder output을 입력으로 넣어주는 부분
        # 마스킹을 통하여 leftward information flow를 유지해줌
        out, enc_dec_attn = self.enc_dec_attn(out, enc_out, enc_out, causality_mask)
        out = self.dropout(out)
        out += residual

        # Position-Wise Feed Forward Network
        residual = out
        out = self.norm3(out)
        out = self.ffn(out)
        out = self.dropout(out)
        out += residual

        return out, dec_attn, enc_dec_attn

In [12]:
class Encoder(tf.keras.Model):
    def __init__(self,
                 n_layers,
                 d_model,
                 num_heads,
                 d_ff,
                 dropout):
        super(Encoder, self).__init__()
        self.n_layers = n_layers
        self.enc_layers = [EncoderLayer(d_model, num_heads, d_ff, dropout) for _ in range(n_layers)]

    def call(self, x, mask):
        out = x

        enc_attns = list()
        for i in range(self.n_layers):
            out, enc_attn = self.enc_layers[i](out, mask)
            enc_attns.append(enc_attn)
        
        return out, enc_attns

In [13]:
class Decoder(tf.keras.Model):
    def __init__(self,
                 n_layers,
                 d_model,
                 num_heads,
                 d_ff,
                 dropout):
        super(Decoder, self).__init__()
        self.n_layers = n_layers
        self.dec_layers = [DecoderLayer(d_model, num_heads, d_ff, dropout) for _ in range(n_layers)]

    def call(self, x, enc_out, causality_mask, padding_mask):
        out = x

        dec_attns = list()
        dec_enc_attns = list()
        for i in range(self.n_layers):
            out, dec_attn, dec_enc_attn = self.dec_layers[i](out, enc_out, causality_mask, padding_mask)

            dec_attns.append(dec_attn)
            dec_enc_attns.append(dec_enc_attn)

        return out, dec_attns, dec_enc_attns

In [14]:
class Transformer(tf.keras.Model):
    def __init__(self,
                 n_layers,
                 d_model,
                 num_heads,
                 d_ff,
                 src_vocab_size,
                 tgt_vocab_size,
                 pos_len,
                 dropout=0.1,
                 shared=True):
        super(Transformer, self).__init__()
        self.d_model = tf.cast(d_model, tf.float32)

        self.enc_emb = tf.keras.layers.Embedding(src_vocab_size, d_model)
        self.dec_emb = tf.keras.layers.Embedding(tgt_vocab_size, d_model)

        self.pos_encoding = positional_encoding(pos_len, d_model)
        self.dropout = tf.keras.layers.Dropout(dropout)

        self.encoder = Encoder(n_layers, d_model, num_heads, d_ff, dropout)
        self.decoder = Decoder(n_layers, d_model, num_heads, d_ff, dropout)

        self.fc = tf.keras.layers.Dense(tgt_vocab_size)
        
        self.shared = shared
        if shared: self.fc.set_weights(tf.transpose(self.dec_emb.weights))
        
    def embedding(self, emb, x):
        seq_len = x.shape[1]
        out = emb(x)

        if self.shared: out *= tf.math.sqrt(self.d_model)

        out += self.pos_encoding[np.newaxis, ...][:, :seq_len, :]
        out = self.dropout(out)
        return out
    
    def call(self, enc_in, dec_in, enc_mask, causality_mask, dec_mask):
        enc_in = self.embedding(self.enc_emb, enc_in)
        dec_in = self.embedding(self.dec_emb, dec_in)

        enc_out, enc_attns = self.encoder(enc_in, enc_mask)

        dec_out, dec_attns, dec_enc_attns = self.decoder(dec_in, enc_out, causality_mask, dec_mask)

        logits = self.fc(dec_out)

        return logits, enc_attns, dec_attns, dec_enc_attns



In [15]:
def generate_padding_mask(seq):
    seq = tf.cast(tf.math.equal(seq, 0), tf.float32)
    return seq[:, tf.newaxis, tf.newaxis, :]

def generate_causality_mask(src_len, tgt_len):
    mask = 1 - np.cumsum(np.eye(src_len, tgt_len), 0)
    return tf.cast(mask, tf.float32)

def generate_masks(src, tgt):
    enc_mask = generate_padding_mask(src)
    dec_mask = generate_padding_mask(tgt)

    dec_enc_causality_mask = generate_causality_mask(tgt.shape[1], src.shape[1])
    dec_enc_mask = tf.maximum(enc_mask, dec_enc_causality_mask)

    dec_causality_mask = generate_causality_mask(tgt.shape[1], tgt.shape[1])
    dec_mask = tf.maximum(dec_mask, dec_causality_mask)

    return enc_mask, dec_enc_mask, dec_mask

In [16]:
ko_vocab_size

26504

In [17]:
# import matplotlib.pyplot as plt

# batch, length = 16, 20
# src_padding = 5
# tgt_padding = 15

# src_pad = tf.zeros(shape=(batch, src_padding))
# tgt_pad = tf.zeros(shape=(batch, tgt_padding))

# sample_data2 = tf.ones(shape=(batch, length))

# sample_src = tf.concat([sample_data2, src_pad], axis=-1)
# sample_tgt = tf.concat([sample_data2, tgt_pad], axis=-1)

# enc_mask, dec_enc_mask, dec_mask = generate_mask(sample_src, sample_tgt)

# fig = plt.figure(figsize=(7,7))

# ax1 = fig.add_subplot(131)
# ax2 = fig.add_subplot(132)
# ax3 = fig.add_subplot(133)

# ax1.set_title('1) Encoder Mask')
# ax2.set_title('2) Encoder-Decoder Mask')
# ax3.set_title('3) Decoder Mask')

# ax1.imshow(enc_mask[:3, 0, 0].numpy(), cmap='Dark2')
# ax2.imshow(dec_enc_mask[0, 0].numpy(), cmap='Dark2')
# ax3.imshow(dec_mask[0, 0].numpy(), cmap='Dark2')

# plt.show()


In [18]:
class LearningRateScheduler(tf.keras.optimizers.schedules.LearningRateSchedule):
    def __init__(self, d_model, warmup_steps=4000):
        super(LearningRateScheduler, self).__init__()
        self.d_model = d_model
        self.warmup_steps = warmup_steps

    def __call__(self, step):
        arg1 = step ** -0.5
        arg2 = step * (self.warmup_steps ** -1.5)
        return (self.d_model ** -0.5) * tf.math.minimum(arg1, arg2)

learning_rate = LearningRateScheduler(512)
optimizer = tf.keras.optimizers.Adam(learning_rate,
                                     beta_1 = 0.9,
                                     beta_2 = 0.98,
                                     epsilon=1e-9)

transformer = Transformer(n_layers=4,
                          d_model=512,
                          num_heads=8,
                          d_ff = 2048,
                          src_vocab_size = len(en_tokenizer.word_index),
                          tgt_vocab_size = len(ko_tokenizer.word_index),
                          pos_len = 200,
                          dropout = 0.1,
                          shared=True)

In [19]:
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')

def loss_function(real, pred):
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    loss_ = loss_object(real, pred)

    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask

    return tf.reduce_sum(loss_)/tf.reduce_sum(mask)

@tf.function()
def train_step(src, tgt, model, optimizer):
    gold = tgt[:, 1:]
    enc_mask, dec_enc_mask, dec_mask = generate_masks(src, tgt)

    with tf.GradientTape() as tape:
        predictions, enc_attns, dec_attns, dec_enc_attns = model(src, tgt, enc_mask, dec_enc_mask, dec_mask)
        loss = loss_function(gold, predictions[:, :-1])
        
    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))

    return loss, enc_attns, dec_attns, dec_enc_attns

In [23]:
# Attention 시각화 함수

def visualize_attention(src, tgt, enc_attns, dec_attns, dec_enc_attns):
    def draw(data, ax, x="auto", y="auto"):
        import seaborn
        seaborn.heatmap(data, 
                        square=True,
                        vmin=0.0, 
                        vmax=1.0, 
                        cbar=False, 
                        ax=ax,
                        xticklabels=x,
                        yticklabels=y)
        
    for layer in range(0, 2, 1):
        fig, axs = plt.subplots(1, 4, figsize=(20, 10))
        print("Encoder Layer", layer + 1)
        for h in range(4):
            draw(enc_attns[layer][0, h, :len(src), :len(src)], axs[h], src, src)
        plt.show()
        
    for layer in range(0, 2, 1):
        fig, axs = plt.subplots(1, 4, figsize=(20, 10))
        print("Decoder Self Layer", layer+1)
        for h in range(4):
            draw(dec_attns[layer][0, h, :len(tgt), :len(tgt)], axs[h], tgt, tgt)
        plt.show()

        print("Decoder Src Layer", layer+1)
        fig, axs = plt.subplots(1, 4, figsize=(20, 10))
        for h in range(4):
            draw(dec_enc_attns[layer][0, h, :len(tgt), :len(src)], axs[h], src, tgt)
        plt.show()

# 번역 생성 함수
def evaluate(sentence, model, src_tokenizer, tgt_tokenizer):
    sentence = preprocessing(sentence)

    pieces = sentence.split()
    tokens = en_tokenizer.texts_to_sequences(sentence)

    _input = tf.keras.preprocessing.sequence.pad_sequences([tokens],
                                                           maxlen=en_tensor.shape[-1],
                                                           padding='post')
    ids = []
    output = tf.expand_dims([ko_tokenizer.word_index['<start>']], 0)
    for i in range(ko_tensor.shape[-1]):
        enc_padding_mask, combined_mask, dec_padding_mask = generate_masks(_input, output)

        predictions, enc_attns, dec_attns, dec_enc_attns = model(_input,
                                                                 output,
                                                                 enc_padding_mask,
                                                                 combined_mask,
                                                                 dec_padding_mask)
        predicted_id = tf.argmax(tf.math.softmax(predictions, axis=-1)[0,-1]).numpy().item()

        if ko_tokenizer.word_index['<end>'] == predicted_id:
            result = ko_tokenizer.sequences_to_texts(ids)
            return pieces, result, enc_attns, dec_attns, dec_enc_attns
        
        ids.append(predicted_id)
        output = tf.concat([output, tf.expand_dims([predicted_id], 0)], axis=-1)
    
    result = ko_tokenizer.sequences_to_texts(ids)
    return pieces, result, enc_attns, dec_attns, dec_enc_attns


In [24]:
def translate(sentence, model, src_tokenizer, tgt_tokenizer, plot_attention=False):
    pieces, result, enc_attns, dec_attns, dec_enc_attns = \
    evaluate(sentence, model, src_tokenizer, tgt_tokenizer)

    print("Input : %s"%(sentence))
    print("Predicted translation : {}".format(result))

    if plot_attention:
        visualize_attention(pieces, result.split(), enc_attns, dec_attns, dec_enc_attns)

In [22]:
from tqdm import tqdm
import random

BATCH_SIZE = 64
EPOCHS = 20

examples = list(data['en_pre'].sample(5).values)

loss_ = []
for epoch in range(EPOCHS):
    total_loss = 0

    idx_list = list(range(0, en_tensor.shape[0], BATCH_SIZE))
    random.shuffle(idx_list)
    t = tqdm(idx_list)

    for (batch, idx) in enumerate(t):
        batch_loss, enc_attns, dec_attns, dec_enc_attns = train_step(en_tensor[idx : idx + BATCH_SIZE],
                                                                     ko_tensor[idx : idx + BATCH_SIZE],
                                                                     transformer,
                                                                     optimizer)
        loss_.append(batch_loss)
        total_loss += batch_loss

        t.set_description_str("Epoch %2d" %(epoch + 1))
        t.set_postfix_str("Loss %.4f" %(total_loss.numpy() / (batch+1)))
    
    for example in examples:
        translate(example, transformer, en_tokenizer, ko_tokenizer)

Epoch  1: 100%|██████████| 1563/1563 [04:40<00:00,  5.56it/s, Loss nan]   
  sample_shape = np.asarray(x).shape[1:]


ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 1 dimensions. The detected shape was (62,) + inhomogeneous part.

In [25]:
for example in examples:
        translate(example, transformer, en_tokenizer, ko_tokenizer)

TypeError: 'int' object is not iterable

In [None]:
transformer.summary()

Model: "transformer"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       multiple                  11837952  
                                                                 
 embedding_1 (Embedding)     multiple                  21606400  
                                                                 
 dropout (Dropout)           multiple                  0         
                                                                 
 encoder (Encoder)           multiple                  788992    
                                                                 
 decoder (Decoder)           multiple                  0 (unused)
                                                                 
 dense_64 (Dense)            multiple                  0 (unused)
                                                                 
Total params: 34,233,344
Trainable params: 34,233,344
N

In [None]:
examples = list(data['en_pre'].sample(5))
examples

['<start> it is an immersive product that serves a different purpose than a traditional product .  <end>',
 '<start> for accurate cost calculation , it would be better for your employees and our employees to check the facility usage status at the site .  <end>',
 '<start> thank you for your interest in our company s products , and please feel free to contact us if you have any questions before this contract .  <end>',
 '<start> no , that is not the case .  <end>',
 '<start> we are not sure whether we should accompany you with the data you have sent us .  <end>']