In [1]:
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
import numpy as np

In [2]:
import os

# Locate the dataset under the user's home directory. expanduser('~') still
# resolves a home directory when the HOME environment variable is unset, where
# os.getenv('HOME') returns None and the string concatenation raises TypeError.
file_path = os.path.join(os.path.expanduser('~'), 'aiffel', 'translator_seq2seq', 'data', 'fra.txt')
# Read the tab-separated file and name its three columns eng / fra / cc.
lines = pd.read_csv(file_path, names=['eng', 'fra', 'cc'], sep='\t')
print('전체 샘플의 수:', len(lines))
lines.sample(5)

전체 샘플의 수: 178009


Unnamed: 0,eng,fra,cc
129384,What kind of car were they driving?,Quel genre de voiture conduisaient-elles ?,CC-BY 2.0 (France) Attribution: tatoeba.org #3...
45903,Who built the snowman?,Qui a fait le bonhomme de neige ?,CC-BY 2.0 (France) Attribution: tatoeba.org #2...
148573,He proposed that we should play baseball.,Il a proposé que nous jouions au base-ball.,CC-BY 2.0 (France) Attribution: tatoeba.org #3...
89538,Don't ask too many questions.,Ne posez pas trop de questions.,CC-BY 2.0 (France) Attribution: tatoeba.org #1...
82471,Do you want me to paint you?,Veux-tu que je te peigne ?,CC-BY 2.0 (France) Attribution: tatoeba.org #4...


In [3]:
# Keep only the sentence-pair columns and the first 50,000 rows.
# .copy() detaches the result from the intermediate frame so that the later
# assignment to the 'fra' column does not write into a slice
# (which pandas reports as SettingWithCopyWarning).
lines = lines[['eng', 'fra']][:50000].copy()
lines.sample(5)

Unnamed: 0,eng,fra
34205,Do you know that guy?,Connaissez-vous ce type ?
22833,You're very timid.,Vous êtes très timide.
5745,I stole a gun.,J'ai volé une arme à feu.
5652,I miss my cat.,Mon chat me manque.
19021,I can handle this.,Je sais m'y prendre.


In [4]:
# Start-of-sequence and end-of-sequence markers for the French (decoder) text.
sos_token = '\t'
eos_token = '\n'
# Wrap every French sentence as <sos> sentence <eos>.
lines['fra'] = lines['fra'].apply(lambda sentence: sos_token + sentence + eos_token)
print('전체 샘플의 수:', len(lines))
lines.sample(5)

전체 샘플의 수: 50000


Unnamed: 0,eng,fra
36819,I've got one for you.,\tJ'en ai un pour vous.\n
15896,Keep me informed.,\tTiens-moi informé !\n
22903,Are they all ready?,\tSont-ils tous prêts ?\n
49653,I've broken my glasses.,\tJ'ai cassé mes lunettes.\n
36352,I work in a pharmacy.,\tJe travaille dans une pharmacie.\n


In [5]:
# Character-level tokenizer for the English (encoder) sentences.
eng_tokenizer = Tokenizer(char_level=True)
eng_sentences = lines['eng']
eng_tokenizer.fit_on_texts(eng_sentences)
# Convert each sentence into a list of integer character ids.
input_text = eng_tokenizer.texts_to_sequences(eng_sentences)
input_text[:3]

[[19, 3, 8], [10, 5, 8], [10, 5, 8]]

In [6]:
# Character-level tokenizer for the French (decoder) sentences,
# which already carry the start/end markers.
fra_tokenizer = Tokenizer(char_level=True)
fra_sentences = lines['fra']
fra_tokenizer.fit_on_texts(fra_sentences)
# Convert each sentence into a list of integer character ids.
target_text = fra_tokenizer.texts_to_sequences(fra_sentences)
target_text[:3]

[[11, 19, 4, 1, 33, 12],
 [11, 3, 4, 13, 7, 5, 1, 33, 12],
 [11, 3, 4, 13, 7, 5, 14, 12]]

In [7]:
# Vocabulary size = number of distinct characters + 1; the extra slot leaves
# room for index 0, which pad_sequences uses as the padding value.
eng_vocab_size = 1 + len(eng_tokenizer.word_index)
fra_vocab_size = 1 + len(fra_tokenizer.word_index)
print('영어 단어장 크기:', eng_vocab_size)
print('프랑스어 단어장 크기:', fra_vocab_size)

영어 단어장 크기: 51
프랑스어 단어장 크기: 73


In [8]:
# Longest sequence on each side; used as the padding length.
max_eng_seq_len = max(map(len, input_text))
max_fra_seq_len = max(map(len, target_text))
print('영어 시퀀스의 최대 길이', max_eng_seq_len)
print('프랑스어 시퀀스의 최대 길이', max_fra_seq_len)

영어 시퀀스의 최대 길이 23
프랑스어 시퀀스의 최대 길이 74


In [9]:
encoder_input = input_text
# Teacher forcing needs the target shifted by one position, so the markers are
# removed by position instead of filtering every character against a
# dictionary lookup (which was repeated per character and would also drop any
# matching token inside a sentence).
# Decoder input: drop the trailing end-of-sequence token.
decoder_input = [line[:-1] for line in target_text]
# Decoder target: drop the leading start-of-sequence token.
decoder_target = [line[1:] for line in target_text]

In [10]:
# Show the first three decoder inputs, then the first three decoder targets.
for sequences in (decoder_input, decoder_target):
    print(sequences[:3])

[[11, 19, 4, 1, 33], [11, 3, 4, 13, 7, 5, 1, 33], [11, 3, 4, 13, 7, 5, 14]]
[[19, 4, 1, 33, 12], [3, 4, 13, 7, 5, 1, 33, 12], [3, 4, 13, 7, 5, 14, 12]]


In [11]:
# Right-pad ('post') every sequence with zeros up to the longest length on its side.
encoder_input = pad_sequences(encoder_input, padding='post', maxlen=max_eng_seq_len)
decoder_input = pad_sequences(decoder_input, padding='post', maxlen=max_fra_seq_len)
decoder_target = pad_sequences(decoder_target, padding='post', maxlen=max_fra_seq_len)
for label, padded in (
    ('영어 데이터의 크기(shape) :', encoder_input),
    ('프랑스어 입력데이터의 크기(shape) :', decoder_input),
    ('프랑스어 출력데이터의 크기(shape) :', decoder_target),
):
    print(label, np.shape(padded))

영어 데이터의 크기(shape) : (50000, 23)
프랑스어 입력데이터의 크기(shape) : (50000, 74)
프랑스어 출력데이터의 크기(shape) : (50000, 74)


In [12]:
# Show the first padded English sequence; the trailing zeros are the 'post' padding.
print(encoder_input[0])

[19  3  8  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]


In [13]:
# One-hot encode the integer sequences. num_classes is passed explicitly so the
# last dimension always equals the vocabulary size that the model's Input
# layers are built with, instead of being inferred from the largest index that
# happens to occur in the array.
encoder_input = to_categorical(encoder_input, num_classes=eng_vocab_size)
decoder_input = to_categorical(decoder_input, num_classes=fra_vocab_size)
decoder_target = to_categorical(decoder_target, num_classes=fra_vocab_size)
print('영어 데이터의 크기(shape) :',np.shape(encoder_input))
print('프랑스어 입력데이터의 크기(shape) :',np.shape(decoder_input))
print('프랑스어 출력데이터의 크기(shape) :',np.shape(decoder_target))

영어 데이터의 크기(shape) : (50000, 23, 51)
프랑스어 입력데이터의 크기(shape) : (50000, 74, 73)
프랑스어 출력데이터의 크기(shape) : (50000, 74, 73)


In [14]:
n_of_val = 3000

# Everything except the last n_of_val samples becomes the training portion.
encoder_input_train, decoder_input_train, decoder_target_train = (
    data[:-n_of_val] for data in (encoder_input, decoder_input, decoder_target)
)
# The last n_of_val samples are held out.
encoder_input_test, decoder_input_test, decoder_target_test = (
    data[-n_of_val:] for data in (encoder_input, decoder_input, decoder_target)
)

for label, split in (
    ('영어 학습데이터의 크기(shape) :', encoder_input_train),
    ('프랑스어 학습 입력데이터의 크기(shape) :', decoder_input_train),
    ('프랑스어 학습 출력데이터의 크기(shape) :', decoder_target_train),
):
    print(label, np.shape(split))

영어 학습데이터의 크기(shape) : (47000, 23, 51)
프랑스어 학습 입력데이터의 크기(shape) : (47000, 74, 73)
프랑스어 학습 출력데이터의 크기(shape) : (47000, 74, 73)


## 모델 훈련하기

In [15]:
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense
from tensorflow.keras.models import Model

In [22]:
# Encoder: consumes one-hot English characters and exposes its final LSTM states.
encoder_inputs = Input(name='encoder_input', shape=(None, eng_vocab_size))
encoder_lstm = LSTM(256, return_state=True)
# With return_state=True the layer returns (output, hidden state, cell state).
encoder_outputs, *encoder_states = encoder_lstm(encoder_inputs)
state_h, state_c = encoder_states

In [23]:
# Decoder: consumes one-hot French characters and starts from the encoder's final states.
decoder_inputs = Input(name='decoder_input', shape=(None, fra_vocab_size))
decoder_lstm = LSTM(256, return_state=True, return_sequences=True)
# Only the per-timestep outputs are needed for training; the returned states are discarded.
decoder_outputs, _, _ = decoder_lstm(decoder_inputs, initial_state=encoder_states)

In [24]:
# Project every decoder timestep onto a softmax over the French character vocabulary.
decoder_softmax_layer = Dense(units=fra_vocab_size, activation='softmax')
decoder_outputs = decoder_softmax_layer(decoder_outputs)

In [25]:
# Training model: (encoder input, decoder input) -> per-timestep character distribution.
model = Model(inputs=[encoder_inputs, decoder_inputs], outputs=decoder_outputs)
model.compile(loss="categorical_crossentropy", optimizer="rmsprop")

In [26]:
# Print the layer-by-layer summary of the training model.
model.summary()

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
encoder_input (InputLayer)      [(None, None, 51)]   0                                            
__________________________________________________________________________________________________
decoder_input (InputLayer)      [(None, None, 73)]   0                                            
__________________________________________________________________________________________________
lstm_2 (LSTM)                   [(None, 256), (None, 315392      encoder_input[0][0]              
__________________________________________________________________________________________________
lstm_3 (LSTM)                   [(None, None, 256),  337920      decoder_input[0][0]              
                                                                 lstm_2[0][1]               

In [127]:
# Train on the training split only and validate on the held-out split created
# earlier. Fitting on the full arrays with validation_split would also train on
# the held-out samples and leave the *_train / *_test slices unused.
model.fit(
    x=[encoder_input_train, decoder_input_train],
    y=decoder_target_train,
    validation_data=([encoder_input_test, decoder_input_test], decoder_target_test),
    batch_size=64,
    epochs=50,
)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<tensorflow.python.keras.callbacks.History at 0x7f962c099690>

## 모델 테스트하기

In [128]:
# Inference-time encoder: maps an input sentence to the LSTM's final [hidden, cell] states.
encoder_model = Model(encoder_inputs, encoder_states)
encoder_model.summary()

Model: "model_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_6 (InputLayer)         [(None, None, 51)]        0         
_________________________________________________________________
lstm_5 (LSTM)                [(None, 256), (None, 256) 315392    
Total params: 315,392
Trainable params: 315,392
Non-trainable params: 0
_________________________________________________________________


In [129]:
# Placeholders for the decoder's previous hidden and cell states (256 units each).
decoder_state_input_h, decoder_state_input_c = Input(shape=(256,)), Input(shape=(256,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

# Reuse the trained decoder LSTM, but start it from the supplied states and
# keep the new states so they can be fed into the next decoding step.
decoder_outputs, *decoder_states = decoder_lstm(decoder_inputs, initial_state=decoder_states_inputs)
state_h, state_c = decoder_states

In [130]:
# Inference-time decoder: (decoder input, h, c) -> (character distribution, new h, new c).
decoder_outputs = decoder_softmax_layer(decoder_outputs)
decoder_model = Model(
    inputs=[decoder_inputs, *decoder_states_inputs],
    outputs=[decoder_outputs, *decoder_states],
)
decoder_model.summary()

Model: "model_6"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_7 (InputLayer)            [(None, None, 73)]   0                                            
__________________________________________________________________________________________________
input_14 (InputLayer)           [(None, 256)]        0                                            
__________________________________________________________________________________________________
input_15 (InputLayer)           [(None, 256)]        0                                            
__________________________________________________________________________________________________
lstm_6 (LSTM)                   [(None, None, 256),  337920      input_7[0][0]                    
                                                                 input_14[0][0]             

In [134]:
# Character <-> index lookup tables taken from the fitted tokenizers.
eng2idx, idx2eng = eng_tokenizer.word_index, eng_tokenizer.index_word
fra2idx, idx2fra = fra_tokenizer.word_index, fra_tokenizer.index_word

In [135]:
def decode_sequence(input_seq):
    """Greedily decode one encoded English sentence into a French string.

    Args:
        input_seq: One-hot encoder input for a single sentence. The caller in
            this notebook passes a one-sample slice of ``encoder_input``.

    Returns:
        The decoded characters as a string. It ends with the newline
        end-of-sequence character when the decoder produced one; there is no
        trailing newline when decoding stopped on the length limit or on a
        padding prediction.
    """
    # The encoder's final hidden and cell states seed the decoder.
    states_value = encoder_model.predict(input_seq)

    # First decoder input: one-hot vector of the start-of-sequence character.
    target_seq = np.zeros((1, 1, fra_vocab_size))
    target_seq[0, 0, fra2idx['\t']] = 1.

    decoded_sentence = ""

    while True:
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)

        # Greedy choice: most probable character at the last timestep.
        sampled_token_index = int(np.argmax(output_tokens[0, -1, :]))

        # Index 0 is the padding slot and has no character in idx2fra, so a
        # direct lookup would raise KeyError. Treat it as the end of the sentence.
        sampled_char = idx2fra.get(sampled_token_index)
        if sampled_char is None:
            break

        decoded_sentence += sampled_char

        # Stop on the end-of-sequence character or once the output exceeds
        # the longest French sequence seen in the data.
        if sampled_char == '\n' or len(decoded_sentence) > max_fra_seq_len:
            break

        # Feed the chosen character and the updated states into the next step.
        target_seq = np.zeros((1, 1, fra_vocab_size))
        target_seq[0, 0, sampled_token_index] = 1.
        states_value = [h, c]

    return decoded_sentence

In [136]:
# Inspect a single encoder sample; slicing 8:9 (rather than indexing 8) keeps the leading sample axis.
encoder_input[8: 9]

array([[[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 1., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.]]], dtype=float32)

In [1]:
import numpy as np

# This cell needs encoder_input, lines and decode_sequence from the cells above
# to exist in the same kernel session; run those cells first.
for seq_index in [3, 50, 100, 300, 1001]:  # indices of the input sentences (pick any you like)
    input_seq = encoder_input[seq_index: seq_index + 1]
    decoded_sentence = decode_sequence(input_seq)
    print(35 * "-")
    print('입력 문장:', lines.eng[seq_index])
    # Drop the leading '\t' and the trailing '\n' before printing.
    print('정답 문장:', lines.fra[seq_index][1:-1])
    # Strip only a trailing '\n'. When decoding stopped on the length limit
    # there is no '\n', and cutting the last character unconditionally would
    # discard a real character.
    print('번역기가 번역한 문장:', decoded_sentence.rstrip('\n'))

NameError: name 'encoder_input' is not defined