## Going Deeper(NLP)_RS5
## 12. Seq2seq으로 번역기 만들기

In [1]:
import matplotlib as mpl
import matplotlib.pyplot as plt
 
%config InlineBackend.figure_format = 'retina'
 
import matplotlib.font_manager as fm
fontpath = '/usr/share/fonts/truetype/nanum/NanumBarunGothic.ttf'
font = fm.FontProperties(fname=fontpath, size=9)
plt.rc('font', family='NanumBarunGothic') 
mpl.font_manager.findfont(font)

print("완료!")

완료!


### 1. 데이터 전처리
#### 1. 데이터 준비하기

In [9]:
# 라이브러리 import

import tensorflow as tf
import numpy as np

from sklearn.model_selection import train_test_split
from konlpy.tag import Mecab

import matplotlib.ticker as ticker
import matplotlib.pyplot as plt

import time
import re
import os
import io

print(tf.__version__)

2.6.0


In [10]:
data_dir = os.getenv('HOME')+'/aiffel/transformer/data'
path_kor = data_dir+"/korean-english-park.train.ko"
path_eng = data_dir+"/korean-english-park.train.en"

def clean_corpus(path_kor, path_eng):
    with open(path_kor, "r") as f:
        kor = f.read().splitlines()
    with open(path_eng, "r") as f:
        eng = f.read().splitlines()
    assert len(kor) == len(eng)

    raw = zip(kor, eng)
    corpus_cleaned = set(raw)

    return corpus_cleaned

corpus_cleaned = clean_corpus(path_kor, path_eng)

In [11]:
corpus_kor, corpus_eng = zip(*corpus_cleaned)
print(len(corpus_kor), len(corpus_eng))

78968 78968


#### 2. 데이터 정제하기
* 영문 대문자 -> 소문자
* 알파벳, 문장부호, 한글 외 제거
* 문장부호 양 옆 공백
* 불필요한 공백은 제거
* 전처리 과정에서 문장의 시작 문자 <start>, 종료 문자 <end> 를 붙여줌
###### Encoder에 들어갈 입력 문장의 전처리에는 굳이 필요하지 않지만, Decoder의 입력 문장과 라벨로 사용할 출력 문장에는 꼭 필요
###### Decoder는 첫 입력으로 사용할 시작 토큰과 문장생성 종료를 알리는 끝 토큰이 반드시 필요
* set 데이터형이 중복을 허용하지 않는다는 것을 활용해 중복된 데이터를 제거
###### 데이터의 병렬 쌍이 흐트러지지 않게 주의
###### 중복을 제거한 데이터를 cleaned_corpus 에 저장
* 한글에 적용할 수 있는 정규식을 추가
* 타겟 언어인 영문엔 <start> 토큰과 <end> 토큰을 추가하고 split() 함수를 이용하여 토큰화
* 한글 토큰화는 KoNLPy의 mecab 클래스를 사용
* cleaned_corpus로부터 토큰의 길이가 40 이하인 데이터를 선별하여 eng_corpus와 kor_corpus를 각각 구축 

In [17]:
def preprocess_sentence(sentence, token_s=False, token_e=False):

    sentence = sentence.lower().strip()

    sentence = re.sub(r"([?.!,])", r" \1 ", sentence)
    sentence = re.sub(r'[" "]+', " ", sentence)
    sentence = re.sub(r"[^a-zA-Z?.!가-힣ㄱ-ㅎㅏ-ㅣ0-9]+", " ", sentence)

    sentence = sentence.strip()

    if token_s:
        sentence = '<start> ' + sentence

    if token_e:
        sentence += ' <end>'
    
    return sentence

In [18]:
corpus_enc = []
corpus_dec = []
mecab = Mecab()

for kor, eng in zip(corpus_kor, corpus_eng):
    temp_kor = preprocess_sentence(kor)
    temp_eng = preprocess_sentence(eng, token_s=True, token_e=True)
    temp_kor = mecab.morphs(temp_kor)
    temp_eng = temp_eng.split()

    corpus_enc.append(temp_kor)
    corpus_dec.append(temp_eng)
    

print('Korean data:', len(corpus_enc))
print('English data:', len(corpus_dec))
print("Korean:", corpus_enc[500])
print("English:", corpus_dec[500])

Korean data: 78968
English data: 78968
Korean: ['남반구', '에', '위치', '한', '호주', '는', '12', '월', '부터', '본격', '적', '인', '여름', '시즌', '이', '다', '.']
English: ['<start>', 'peak', 'beach', 'season', 'is', 'december', 'february', 'summer', 'in', 'the', 'southern', 'hemisphere', '.', '<end>']


#### 3. 토큰화
* 정제된 텍스트를 tokenize() 함수를 사용해 토큰화하고 텐서로 변환
* 변환된 텐서를 80%의 훈련 데이터와 20%의 검증 데이터로 분리

In [19]:
def tokenize(corpus):
    tokenizer = tf.keras.preprocessing.text.Tokenizer(
        num_words=10000,
        filters='',
        oov_token="<unk>")
    
    tokenizer.fit_on_texts(corpus)

    tensor = tokenizer.texts_to_sequences(corpus)

    tensor = tf.keras.preprocessing.sequence.pad_sequences(tensor, padding='post')

    print(tensor, tokenizer)
    return tensor, tokenizer

In [20]:
tensor_enc, tokenizer_enc = tokenize(corpus_enc)

tensor_dec, tokenizer_dec = tokenize(corpus_dec)

for idx in tokenizer_enc.index_word:
    print(idx, ":", tokenizer_enc.index_word[idx])

    if idx >= 10: break

for idx in tokenizer_dec.index_word:
    print(idx, ":", tokenizer_dec.index_word[idx])

    if idx >= 10: break


[[  91    1 1616 ...    0    0    0]
 [ 107    7 3048 ...    0    0    0]
 [ 666    7    1 ...    0    0    0]
 ...
 [1380   21 3014 ...    0    0    0]
 [  26    5 2666 ...    0    0    0]
 [  26    5 1489 ...    0    0    0]] <keras_preprocessing.text.Tokenizer object at 0x7fa8b1f45f70>
[[   4   31 3495 ...    0    0    0]
 [   4    8 2174 ...    0    0    0]
 [   4    1    1 ...    0    0    0]
 ...
 [   4 1421   16 ...    0    0    0]
 [   4   22 3141 ...    0    0    0]
 [   4  633  157 ...    0    0    0]] <keras_preprocessing.text.Tokenizer object at 0x7fa893bc0a90>
1 : <unk>
2 : .
3 : 다
4 : 이
5 : 는
6 : 을
7 : 의
8 : 에
9 : 은
10 : 를
1 : <unk>
2 : the
3 : .
4 : <start>
5 : <end>
6 : to
7 : of
8 : a
9 : in
10 : and


#### Step 4. 모델 설계
* Attention 기반 Seq2seq 모델을 설계
* Dropout 모듈을 추가하면 성능이 더 좋아짐
* Embedding Size와 Hidden Size는 실험을 통해 적당한 값을 지정

In [21]:
class BahdanauAttention(tf.keras.layers.Layer):
    def __init__(self, units):
        super(BahdanauAttention, self).__init__()
        self.w_dec = tf.keras.layers.Dense(units)
        self.w_enc = tf.keras.layers.Dense(units)
        self.w_com = tf.keras.layers.Dense(1)
    
    def call(self, h_enc, h_dec):
        # h_enc shape: [batch x length x units]
        # h_dec shape: [batch x units]

        h_enc = self.w_enc(h_enc)
        h_dec = tf.expand_dims(h_dec, 1)
        h_dec = self.w_dec(h_dec)

        score = self.w_com(tf.nn.tanh(h_dec + h_enc))
        
        attn = tf.nn.softmax(score, axis=1)

        context_vec = attn * h_enc
        context_vec = tf.reduce_sum(context_vec, axis=1)

        return context_vec, attn

print("슝~")

슝~


In [22]:
class Encoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, enc_units):
        super(Encoder, self).__init__()
        
        self.enc_units = enc_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru = tf.keras.layers.GRU(enc_units,
                                       return_sequences=True)
        
    def call(self, x):
        out = self.embedding(x)
        out = self.gru(out)
        
        return out

In [23]:
class Decoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, dec_units):
        super(Decoder, self).__init__()
        self.dec_units = dec_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru = tf.keras.layers.GRU(dec_units,
                                       return_sequences=True,
                                       return_state=True)
        self.fc = tf.keras.layers.Dense(vocab_size)

        self.attention = BahdanauAttention(self.dec_units)

    def call(self, x, h_dec, enc_out):
        context_vec, attn = self.attention(enc_out, h_dec)

        out = self.embedding(x)
        out = tf.concat([tf.expand_dims(context_vec, 1), out], axis=-1)
        
        out, h_dec = self.gru(out)
        out = tf.reshape(out, (-1, out.shape[2]))
        out = self.fc(out)

        return out, h_dec, attn

In [30]:

BATCH_SIZE     = 64
SRC_VOCAB_SIZE = len(tokenizer_enc.index_word) + 1
TGT_VOCAB_SIZE = len(tokenizer_dec.index_word) + 1

units         = 128
embedding_dim = 64

encoder = Encoder(SRC_VOCAB_SIZE, embedding_dim, units)
decoder = Decoder(TGT_VOCAB_SIZE, embedding_dim, units)

# sample input
sequence_len = 30

sample_enc = tf.random.uniform((BATCH_SIZE, sequence_len))
sample_output = encoder(sample_enc)

print ('Encoder Output:', sample_output.shape)

sample_state = tf.random.uniform((BATCH_SIZE, units))

sample_logits, h_dec, attn = decoder(tf.random.uniform((BATCH_SIZE, 1)),
                                     sample_state, sample_output)

print ('Decoder Output:', sample_logits.shape)
print ('Decoder Hidden State:', h_dec.shape)
print ('Attention:', attn.shape)

ResourceExhaustedError: OOM when allocating tensor with shape[52343,64] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc [Op:RandomUniform]

* 훈련하기: 옵티마이저 & Loss

In [25]:
optimizer = tf.keras.optimizers.Adam()
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction='none')

def loss_function(real, pred):
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    loss = loss_object(real, pred)
    
    mask = tf.cast(mask, dtype=loss.dtype)
    loss *= mask
    
    return tf.reduce_mean(loss)

print("슝~")

슝~


#### Step 5. 훈련하기
* 훈련하기: train_step 구현하기

In [28]:
# @tf.function
def train_step(src, tgt, encoder, decoder, optimizer, tokenizer_dec):
    bsz = src.shape[0]
    loss = 0

    with tf.GradientTape() as tape:
        enc_out = encoder(src)
        h_dec = enc_out[:, -1]
        
        dec_src = tf.expand_dims([tokenizer_dec.word_index['<start>']] * bsz, 1)

        for t in range(1, tgt.shape[1]):
            pred, h_dec, _ = decoder(dec_src, h_dec, enc_out)

            loss += loss_function(tgt[:, t], pred)
            dec_src = tf.expand_dims(tgt[:, t], 1)
        
    batch_loss = (loss / int(tgt.shape[1]))

    variables = encoder.trainable_variables + decoder.trainable_variables
    gradients = tape.gradient(loss, variables)
    optimizer.apply_gradients(zip(gradients, variables))
    
    return batch_loss

print("슝~")

슝~


* 훈련 시작

In [29]:
from tqdm import tqdm    # tqdm
import random

EPOCHS = 10

for epoch in range(EPOCHS):
    total_loss = 0
    
    idx_list = list(range(0, tensor_enc.shape[0], BATCH_SIZE))
    random.shuffle(idx_list)
    t = tqdm(idx_list)    # tqdm

    for (batch, idx) in enumerate(t):
        batch_loss = train_step(tensor_enc[idx:idx+BATCH_SIZE],
                                tensor_dec[idx:idx+BATCH_SIZE],
                                encoder,
                                decoder,
                                optimizer,
                                tokenizer_dec)
    
        total_loss += batch_loss
        
        t.set_description_str('Epoch %2d' % (epoch + 1))    # tqdm
        t.set_postfix_str('Loss %.4f' % (total_loss.numpy() / (batch + 1)))    # tqdm

  0%|          | 0/1234 [00:10<?, ?it/s]


ResourceExhaustedError: OOM when allocating tensor with shape[1024,1024] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc [Op:Split] name: split