## 데이터 불러오기

In [235]:
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
import numpy as np
import re    
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense, Masking, Dropout
from tensorflow.keras.models import Model

In [236]:
import os
file_path = os.getenv('HOME')+'/aiffel/translator_seq2seq/data/fra.txt'
lines = pd.read_csv(file_path, names=['eng', 'fra', 'cc'], sep='\t')
print('전체 샘플의 수:', len(lines))
lines.sample(5)

전체 샘플의 수: 178009


Unnamed: 0,eng,fra,cc
82037,Are you going to share that?,Est-ce que vous allez partager ça ?,CC-BY 2.0 (France) Attribution: tatoeba.org #4...
91168,I helped my father yesterday.,"Hier, j’ai aidé mon père.",CC-BY 2.0 (France) Attribution: tatoeba.org #2...
164093,I always have a couple of beach towels in my car.,J'ai toujours une paire de serviettes de plage...,CC-BY 2.0 (France) Attribution: tatoeba.org #2...
76062,He was desperate to escape.,Il voulait désespérément s'échapper.,CC-BY 2.0 (France) Attribution: tatoeba.org #3...
138489,He seemed disappointed at the results.,Il avait l'air déçu des résultats.,CC-BY 2.0 (France) Attribution: tatoeba.org #2...


In [237]:
lines = lines[['eng', 'fra']][60000:93000]
lines.sample(5)

Unnamed: 0,eng,fra
66315,What's your real purpose?,Quel est votre réel objectif ?
60795,Don't use too much water.,N'utilisez pas trop d'eau.
67135,Your French is excellent.,Votre français est excellent.
92347,I'm still intimidated by you.,Tu m'intimides toujours.
91436,I only know what you tell me.,Je ne sais que ce que vous me dites.


In [238]:
lines_np_eng= lines['eng'].to_numpy()
lines_np_fra= lines['fra'].to_numpy()
lines_np_eng

array(["You've got to help them.", "You've got to help them.",
       "You've got to stop this.", ..., "Obviously, there's a problem.",
       'Oil is extracted from olives.', 'Old habits are hard to break.'],
      dtype=object)

## 정제, 정규화, 전처리 하기

### 소문자 변경 후 구두점 분리 함수

In [239]:
sos_token = '<start> '
eos_token = ' <end>'

def preprocess_line(line, plus_token = True):
    # 소문자로 변경하기
    line = line.lower().strip()
    # 구두점(Punctuation)을 단어와 분리하기
    line = re.sub(r"([?.!,¿])", r" \1 ", line)
    line = re.sub(r'[" "]+', " ", line)
    line = re.sub(r"[^a-zA-Z?.!,¿]+", " ", line)

    line = line.strip()
    
    if plus_token == True:
        line = sos_token + line + eos_token
    
    return line

### 띄어쓰기 단위로 토큰화 함수

In [240]:
def tokenize(corpus):
    tokenizer = Tokenizer(
        num_words=7000,  
        filters=' ',   
        oov_token="<unk>"  
    )
    tokenizer.fit_on_texts(corpus)  

    tensor = tokenizer.texts_to_sequences(corpus)   

    return tensor, tokenizer

### 영어, 프랑스어 전처리하기

In [241]:
eng_lines = []
fra_lines = []

# eng_lines.append(lines.eng.apply(lambda x : preprocess_line(x,plus_token = False)))
# fra_lines.append(lines.fra.apply(lambda x : preprocess_line(x),))

for eng, fra in zip(lines.eng, lines.fra):
    if len(eng) == 0: continue
    if len(fra) == 0: continue   
        
    eng_lines.append(preprocess_line(eng, plus_token = False))
    fra_lines.append(preprocess_line(fra))

In [242]:
np.shape(eng_lines)

(33000,)

In [243]:
eng_tensor, eng_tokenizer = tokenize(eng_lines)
fra_tensor, fra_tokenizer = tokenize(fra_lines)
fra_tensor[:10]

[[2, 8, 313, 36, 135, 4, 3],
 [2, 15, 115, 36, 135, 4, 3],
 [2, 15, 115, 159, 229, 12, 4, 3],
 [2, 8, 313, 159, 229, 12, 4, 3],
 [2, 15, 115, 1760, 50, 3],
 [2, 8, 313, 1760, 50, 3],
 [2, 15, 19, 52, 83, 27, 206, 4, 3],
 [2, 8, 19, 55, 83, 27, 206, 4, 3],
 [2, 8, 55, 47, 223, 4, 3],
 [2, 15, 52, 47, 223, 4, 3]]

### input, target 설정

In [245]:
encoder_input = eng_tensor
# 종료 토큰 제거
decoder_input = [[char for char in line if char != fra_tokenizer.word_index['<end>']] for line in fra_tensor]
# 시작 토큰 제거
decoder_target =[[char for char in line if char != fra_tokenizer.word_index['<start>']] for line in fra_tensor]

### padding 추가

In [246]:
def pad_tensor(tensor):
    total_data_text = list(tensor)
    num_tokens = [len(tokens) for tokens in total_data_text]
    max_tokens = max(num_tokens)
#     max_tokens = np.mean(num_tokens) + 2 * np.std(num_tokens)
    maxlen = int(max_tokens)
    tensor = pad_sequences(tensor, padding='post', maxlen=maxlen)  
    return tensor

In [247]:
encoder_input = pad_tensor(encoder_input)
decoder_input = pad_tensor(decoder_input)
decoder_target = pad_tensor(decoder_target)
print('영어 데이터의 크기(shape) :',np.shape(encoder_input))
print('프랑스어 입력데이터의 크기(shape) :',np.shape(decoder_input))
print('프랑스어 출력데이터의 크기(shape) :',np.shape(decoder_target))

영어 데이터의 크기(shape) : (33000, 11)
프랑스어 입력데이터의 크기(shape) : (33000, 20)
프랑스어 출력데이터의 크기(shape) : (33000, 20)


In [248]:
eng_vocab_size = len(eng_tokenizer.word_index)+1
fra_vocab_size = len(fra_tokenizer.word_index)+1

max_eng_seq_len = encoder_input.shape[1]
max_fra_seq_len = decoder_input.shape[1]

11
20


In [249]:
print('전체 샘플의 수 :',len(lines))
print('영어 단어장의 크기 :', eng_vocab_size)
print('프랑스어 단어장의 크기 :', fra_vocab_size)
print('영어 시퀀스의 최대 길이', max_eng_seq_len)
print('프랑스어 시퀀스의 최대 길이', max_fra_seq_len)

전체 샘플의 수 : 33000
영어 단어장의 크기 : 5932
프랑스어 단어장의 크기 : 8507
영어 시퀀스의 최대 길이 11
프랑스어 시퀀스의 최대 길이 20


## train, test dataset 나누기

In [250]:
indices = np.arange(encoder_input.shape[0])
np.random.shuffle(indices)

encoder_input = encoder_input[indices]
decoder_input = decoder_input[indices]
decoder_target = decoder_target[indices]

In [251]:
n_of_val = 3000

encoder_input_train = encoder_input[:-n_of_val]
decoder_input_train = decoder_input[:-n_of_val]
decoder_target_train = decoder_target[:-n_of_val]

encoder_input_test = encoder_input[-n_of_val:]
decoder_input_test = decoder_input[-n_of_val:]
decoder_target_test = decoder_target[-n_of_val:]

print(encoder_input_train.shape)
print(decoder_input_train.shape)
print(decoder_target_train.shape)
print(encoder_input_test.shape)
print(decoder_input_test.shape)
print(decoder_target_test.shape)

(30000, 11)
(30000, 20)
(30000, 20)
(3000, 11)
(3000, 20)
(3000, 20)


## 임베딩 층(Embedding layer) 사용하기

### 인코더 

In [257]:
embedding_size = 512
hidden_size = 512
# 인코더에서 사용할 임베딩 층 사용 예시
encoder_inputs = Input(shape=(None, ), name='encoder_input')
enc_emb =  Embedding(eng_vocab_size, embedding_size,
                    input_length=max_eng_seq_len)(encoder_inputs)
enc_masking = Masking(mask_value=0.0)(enc_emb)
encoder_lstm = LSTM(hidden_size, dropout = 0.5, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(enc_masking)
encoder_states = [state_h, state_c]

### 디코더

In [258]:
decoder_inputs = Input(shape=(None, ), name='decoder_input')
dec_emb =  Embedding(fra_vocab_size, embedding_size)(decoder_inputs)
dec_masking = Masking(mask_value=0.0)(dec_emb)
decoder_lstm = LSTM(hidden_size, dropout = 0.5, return_sequences = True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(dec_masking, initial_state = encoder_states)

In [259]:
decoder_softmax_layer = Dense(fra_vocab_size, activation='softmax')
decoder_outputs = decoder_softmax_layer(decoder_outputs)

In [260]:
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.compile(optimizer='rmsprop', loss='sparse_categorical_crossentropy')

In [261]:
model.summary()

Model: "model_14"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
encoder_input (InputLayer)      [(None, None)]       0                                            
__________________________________________________________________________________________________
decoder_input (InputLayer)      [(None, None)]       0                                            
__________________________________________________________________________________________________
embedding_21 (Embedding)        (None, None, 512)    3037184     encoder_input[0][0]              
__________________________________________________________________________________________________
embedding_22 (Embedding)        (None, None, 512)    4355584     decoder_input[0][0]              
___________________________________________________________________________________________

In [262]:
model.fit(x=[encoder_input_train, decoder_input_train], 
          y=decoder_target_train, 
          validation_data = ([encoder_input_test, decoder_input_test], 
                             decoder_target_test),
          batch_size=32, 
          epochs=50)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<tensorflow.python.keras.callbacks.History at 0x7f790b6ffa10>

## 모델 구현하기

### 인코더

In [263]:
encoder_model = Model(inputs = encoder_inputs, outputs = encoder_states)
encoder_model.summary()

Model: "model_15"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
encoder_input (InputLayer)   [(None, None)]            0         
_________________________________________________________________
embedding_21 (Embedding)     (None, None, 512)         3037184   
_________________________________________________________________
masking_18 (Masking)         (None, None, 512)         0         
_________________________________________________________________
lstm_18 (LSTM)               [(None, 512), (None, 512) 2099200   
Total params: 5,136,384
Trainable params: 5,136,384
Non-trainable params: 0
_________________________________________________________________


### 디코더

In [267]:
decoder_state_input_h = Input(shape=(embedding_size,))
decoder_state_input_c = Input(shape=(embedding_size,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

dec_emb2 = Embedding(fra_vocab_size, embedding_size)(decoder_inputs)
decoder_outputs2, state_h2, state_c2 = decoder_lstm(dec_emb2, initial_state = decoder_states_inputs)
decoder_states2 = [state_h2, state_c2]

decoder_outputs2 = decoder_softmax_layer(decoder_outputs2)

In [268]:
eng2idx = eng_tokenizer.word_index
fra2idx = fra_tokenizer.word_index
idx2eng = eng_tokenizer.index_word
idx2fra = fra_tokenizer.index_word

In [269]:
decoder_model = Model(inputs=[decoder_inputs] + decoder_states_inputs, outputs=[decoder_outputs2] + decoder_states2)
decoder_model.summary()

Model: "model_16"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
decoder_input (InputLayer)      [(None, None)]       0                                            
__________________________________________________________________________________________________
embedding_26 (Embedding)        (None, None, 512)    4355584     decoder_input[0][0]              
__________________________________________________________________________________________________
input_13 (InputLayer)           [(None, 512)]        0                                            
__________________________________________________________________________________________________
input_14 (InputLayer)           [(None, 512)]        0                                            
___________________________________________________________________________________________

In [270]:
def decode_sequence(input_seq):
    # 입력으로부터 인코더의 상태를 얻음
    states_value = encoder_model.predict(input_seq)

    # <start>에 해당하는 원-핫 벡터 생성
    target_seq = np.zeros((1,1))
    target_seq[0, 0] = fra2idx['<start>']
    
    stop_condition = False
    decoded_sentence = ""

    # stop_condition이 True가 될 때까지 루프 반복
    while not stop_condition:
        # 이점 시점의 상태 states_value를 현 시점의 초기 상태로 사용
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)

        # 예측 결과를 문자로 변환
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_char = idx2fra[sampled_token_index]

        # 현재 시점의 예측 문자를 예측 문장에 추가
        decoded_sentence += ' '+sampled_char

        # <eos>에 도달하거나 최대 길이를 넘으면 중단.
        if (sampled_char == '<end>' or
           len(decoded_sentence) > max_fra_seq_len):
            stop_condition = True

        # 현재 시점의 예측 결과를 다음 시점의 입력으로 사용하기 위해 저장
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index

        # 현재 시점의 상태를 다음 시점의 상태로 사용하기 위해 저장
        states_value = [h, c]

    return decoded_sentence

In [271]:
# 원문의 정수 시퀀스를 텍스트 시퀀스로 변환
def seq2src(input_seq):
    temp=''
    for i in input_seq:
        if(i!=0):
            temp = temp + idx2eng[i]+' '
    return temp

In [272]:
# 번역문의 정수 시퀀스를 텍스트 시퀀스로 변환
def seq2tar(input_seq):
    temp=''
    for i in input_seq:
        if((i!=0 and i!=fra2idx['<start>']) and i!=fra2idx['<end>']):
            temp = temp + idx2fra[i] + ' '
    return temp

## 모델 평가하기

In [273]:
for seq_index in [1,201,501,1004,2015]:
    input_seq = encoder_input_test[seq_index: seq_index + 1]
    decoded_sentence = decode_sequence(input_seq)
    print(35 * "-")
    print('입력 문장:', seq2src(encoder_input_test[seq_index]))
    print('정답 문장:', seq2tar(decoder_input_test[seq_index]))
    print('번역기가 번역한 문장:', decoded_sentence[:len(decoded_sentence)-1])

-----------------------------------
입력 문장: i want to ask you something . 
정답 문장: je veux vous demander quelque chose . 
번역기가 번역한 문장:  je veux te vous quelqu
-----------------------------------
입력 문장: stop playing hard to get . 
정답 문장: cessez de faire ceux qui ne sont pas int ress s ! 
번역기가 번역한 문장:  arr de de faire fair
-----------------------------------
입력 문장: you never have any money . 
정답 문장: tu ne disposes jamais d aucun argent . 
번역기가 번역한 문장:  tu n jamais d d d d 
-----------------------------------
입력 문장: tom slipped and nearly fell . 
정답 문장: tom <unk> et <unk> tomber . 
번역기가 번역한 문장:  tom a les et en en 
-----------------------------------
입력 문장: tom kept the window closed . 
정답 문장: tom garda la fen tre ferm e . 
번역기가 번역한 문장:  tom tom la la la la l


## 마무리

입력 문장: i want to ask you something .   
정답 문장: je veux vous demander quelque chose .   
번역기가 번역한 문장:  je veux te vous quelqu  

입력 문장: stop playing hard to get .   
정답 문장: cessez de faire ceux qui ne sont pas int ress s !  <br>
번역기가 번역한 문장:  arr de de faire fair  

입력 문장: you never have any money .   
정답 문장: tu ne disposes jamais d aucun argent .   
번역기가 번역한 문장:  tu n jamais d d d d   

입력 문장: tom slipped and nearly fell .   
정답 문장: tom <unk> et <unk> tomber .   
번역기가 번역한 문장:  tom a les et en en   

입력 문장: tom kept the window closed .   
정답 문장: tom garda la fen tre ferm e .   
번역기가 번역한 문장:  tom tom la la la la l  

번역기가 번역한 문장이 완벽하지는 않지만, 어느정도 비슷한 결과를 보이고 있습니다.  
학습 데이터가 더 많거나, 전처리 부분에 신경을 많이 쓴다면 더 좋은 결과를 얻을 것이라고 생각 됩니다.  