In [1]:
import os
import shutil
import urllib3
import zipfile
import pandas as pd

In [2]:
http = urllib3.PoolManager()
url = 'http://www.manythings.org/anki/fra-eng.zip'
filename = 'fra-eng.zip'
path = os.getcwd()
zipfilename= os.path.join(path, filename)

with http.request('Get', url, preload_content=False) as r, open(zipfilename, 'wb') as out_file:
    shutil.copyfileobj(r, out_file)

with zipfile.ZipFile(zipfilename, 'r') as zip_ref:
    zip_ref.extractall(path)

In [3]:
lines = pd.read_csv('fra.txt', names = ['src', 'tar', 'lic'], sep='\t')
del lines['lic']
len(lines)

194513

In [4]:
lines

Unnamed: 0,src,tar
0,Go.,Va !
1,Go.,Marche.
2,Go.,Bouge !
3,Hi.,Salut !
4,Hi.,Salut.
...,...,...
194508,A carbon footprint is the amount of carbon dio...,Une empreinte carbone est la somme de pollutio...
194509,Death is something that we're often discourage...,La mort est une chose qu'on nous décourage sou...
194510,Since there are usually multiple websites on a...,Puisqu'il y a de multiples sites web sur chaqu...
194511,If someone who doesn't know your background sa...,Si quelqu'un qui ne connaît pas vos antécédent...


In [5]:
sos = '\t'
eos = '\n'
lines = lines.loc[:, 'src' : 'tar']
lines = lines[0:30000]
lines.tar = lines.tar.apply(lambda x: sos + x + eos)

lines[:10]

Unnamed: 0,src,tar
0,Go.,\tVa !\n
1,Go.,\tMarche.\n
2,Go.,\tBouge !\n
3,Hi.,\tSalut !\n
4,Hi.,\tSalut.\n
5,Run!,\tCours !\n
6,Run!,\tCourez !\n
7,Run!,\tPrenez vos jambes à vos cous !\n
8,Run!,\tFile !\n
9,Run!,\tFilez !\n


In [6]:
src_vocab = set()

for line in lines.src:
    for char in line:
        src_vocab.add(char)

tar_vocab = set()

for line in lines.tar:
    for char in line:
        tar_vocab.add(char)

In [7]:
print(src_vocab)

{'o', '.', 'e', '3', '?', 'D', 'B', 'N', 'j', 'a', 'g', 'G', '7', 'l', 'v', 'H', 'n', 'w', 'x', '9', 'p', 'P', 'R', 'é', '/', 'Q', 'u', ' ', 'y', '0', 'O', ':', 'Z', 'm', 'b', 'Y', '!', 'r', 'c', 'h', '-', '"', '6', "'", '4', ',', '&', 'L', 'K', 's', '1', '$', 'M', 'd', 'A', 'E', 'W', 'U', 't', '8', 'k', 'T', 'z', 'i', 'S', 'F', 'f', '2', 'X', 'I', '5', 'J', '%', 'C', 'V', 'q'}


In [8]:
src_vocab = sorted(list(src_vocab))
tar_vocab = sorted(list(tar_vocab))

src_vocab_size = len(src_vocab) + 1
tar_vocab_size = len(tar_vocab) + 1

src_to_idx = dict([(word, i+1) for i, word in enumerate(src_vocab)])
tar_to_idx = dict([(word, i+1) for i, word in enumerate(tar_vocab)])

print(src_to_idx)
print(tar_to_idx)

{' ': 1, '!': 2, '"': 3, '$': 4, '%': 5, '&': 6, "'": 7, ',': 8, '-': 9, '.': 10, '/': 11, '0': 12, '1': 13, '2': 14, '3': 15, '4': 16, '5': 17, '6': 18, '7': 19, '8': 20, '9': 21, ':': 22, '?': 23, 'A': 24, 'B': 25, 'C': 26, 'D': 27, 'E': 28, 'F': 29, 'G': 30, 'H': 31, 'I': 32, 'J': 33, 'K': 34, 'L': 35, 'M': 36, 'N': 37, 'O': 38, 'P': 39, 'Q': 40, 'R': 41, 'S': 42, 'T': 43, 'U': 44, 'V': 45, 'W': 46, 'X': 47, 'Y': 48, 'Z': 49, 'a': 50, 'b': 51, 'c': 52, 'd': 53, 'e': 54, 'f': 55, 'g': 56, 'h': 57, 'i': 58, 'j': 59, 'k': 60, 'l': 61, 'm': 62, 'n': 63, 'o': 64, 'p': 65, 'q': 66, 'r': 67, 's': 68, 't': 69, 'u': 70, 'v': 71, 'w': 72, 'x': 73, 'y': 74, 'z': 75, 'é': 76}
{'\t': 1, '\n': 2, ' ': 3, '!': 4, '"': 5, '$': 6, '%': 7, '&': 8, "'": 9, '(': 10, ')': 11, ',': 12, '-': 13, '.': 14, '0': 15, '1': 16, '2': 17, '3': 18, '4': 19, '5': 20, '6': 21, '7': 22, '8': 23, '9': 24, ':': 25, '?': 26, 'A': 27, 'B': 28, 'C': 29, 'D': 30, 'E': 31, 'F': 32, 'G': 33, 'H': 34, 'I': 35, 'J': 36, 'K': 3

In [9]:
encoder_input = []
for line in lines.src:
    encoder_input.append([src_to_idx[w] for w in line])

print(encoder_input[:5])

[[30, 64, 10], [30, 64, 10], [30, 64, 10], [31, 58, 10], [31, 58, 10]]


In [10]:
print(encoder_input[:5])
print(lines.src[:5])

[[30, 64, 10], [30, 64, 10], [30, 64, 10], [31, 58, 10], [31, 58, 10]]
0    Go.
1    Go.
2    Go.
3    Hi.
4    Hi.
Name: src, dtype: object


In [11]:
decoder_input = []
for line in lines.tar:
    decoder_input.append([tar_to_idx[w] for w in line])
print(decoder_input[:5])

[[1, 48, 52, 3, 4, 2], [1, 39, 52, 69, 54, 59, 56, 14, 2], [1, 28, 66, 72, 58, 56, 3, 4, 2], [1, 45, 52, 63, 72, 71, 3, 4, 2], [1, 45, 52, 63, 72, 71, 14, 2]]


In [12]:
decoder_target = []
for line in lines.tar:
    decoder_target.append([tar_to_idx[w] for w in line if w != sos ])
print(decoder_target[:5])

[[48, 52, 3, 4, 2], [39, 52, 69, 54, 59, 56, 14, 2], [28, 66, 72, 58, 56, 3, 4, 2], [45, 52, 63, 72, 71, 3, 4, 2], [45, 52, 63, 72, 71, 14, 2]]


In [13]:
print(decoder_input[:5])
print(lines.tar[:5])

[[1, 48, 52, 3, 4, 2], [1, 39, 52, 69, 54, 59, 56, 14, 2], [1, 28, 66, 72, 58, 56, 3, 4, 2], [1, 45, 52, 63, 72, 71, 3, 4, 2], [1, 45, 52, 63, 72, 71, 14, 2]]
0       \tVa !\n
1    \tMarche.\n
2    \tBouge !\n
3    \tSalut !\n
4     \tSalut.\n
Name: tar, dtype: object


In [14]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

max_src_len = max(len(line) for line in lines.src)
max_tar_len = max(len(line) for line in lines.tar)

encoder_input = pad_sequences(encoder_input, maxlen = max_src_len, padding='post')
decoder_input = pad_sequences(decoder_input, maxlen = max_tar_len, padding='post')
decoder_target = pad_sequences(decoder_target, maxlen = max_tar_len, padding='post')

In [15]:
from tensorflow.keras.utils import to_categorical

encoder_input = to_categorical(encoder_input)
decoder_input = to_categorical(decoder_input)
decoder_target = to_categorical(decoder_target)

In [16]:
from keras.layers import Input, LSTM

encoder_inputs = Input(shape=(None, src_vocab_size))
encoder_lstm = LSTM(256, return_state=True)

encoder_outputs, state_h, state_c = encoder_lstm(encoder_inputs)
encoder_states = [state_h, state_c]

In [17]:
from keras.layers import Dense
decoder_inputs = Input(shape=(None, tar_vocab_size))
decoder_lstm = LSTM(256, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_inputs, initial_state=encoder_states)
decoder_softmax_layer = Dense(tar_vocab_size, activation='softmax')
decoder_outputs = decoder_softmax_layer(decoder_outputs)

In [18]:
from keras.models import Model
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.compile('rmsprop', 'categorical_crossentropy', ['accuracy'])

In [19]:
model.fit(x=[encoder_input, decoder_input], y=decoder_target,
          batch_size=128,
          epochs=25,
          validation_split=0.2)

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


<keras.callbacks.History at 0x7f8d7e997dd0>

In [20]:
encoder_model = Model(inputs=encoder_inputs, outputs=encoder_states)

In [21]:
decoder_state_input_h = Input(shape=(256))
decoder_state_input_c = Input(shape=(256))

decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
decoder_outputs, state_h, state_c = decoder_lstm(decoder_inputs, initial_state=decoder_states_inputs)

decoder_states = [state_h, state_c]

decoder_outputs = decoder_softmax_layer(decoder_outputs)
decoder_model = Model(inputs=[decoder_inputs] + decoder_states_inputs,
                      outputs=[decoder_outputs] + decoder_states)

In [22]:
idx_to_src = dict((i, char) for char, i in src_to_idx.items())
idx_to_tar = dict((i, char) for char, i in tar_to_idx.items())

In [23]:
import numpy as np
def predict_decode(input_seq):
    states_value = encoder_model.predict(input_seq)

    # 교사 강요를 위한 빈값 + 시작 토큰
    target_seq = np.zeros((1, 1, tar_vocab_size))
    target_seq[0, 0, tar_to_idx[sos]] = 1

    stop = False
    decoded_sentence = ""
    while not stop:
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)

        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_char = idx_to_tar[sampled_token_index]
        
        decoded_sentence += sampled_char
        
        if sampled_char == eos or len(decoded_sentence) > max_tar_len:
            stop = True

        target_seq = np.zeros((1, 1, tar_vocab_size))
        target_seq[0, 0, sampled_token_index] = 1. 
        # 교사강요

        states_value = [h, c]

    return decoded_sentence

In [24]:
for seq_index in [100, 200, 300, 400]:
    input_seq = encoder_input[seq_index : seq_index + 1]
    decoded_sentence = predict_decode(input_seq)

    print('입력 :', lines.src[seq_index])
    print('정답 :', lines.tar[seq_index][1:len(lines.tar[seq_index]) -1])
    print('번역 :', decoded_sentence[1: len(decoded_sentence) -1], '\n')

입력 : Hop in.
정답 : Montez.
번역 : ours. 

입력 : Call us.
정답 : Appelle-nous !
번역 : ppelle-moi ! 

입력 : Help me!
정답 : Aide-moi !
번역 : idez-nous ! 

입력 : Sign up.
정답 : Inscrivez-vous.
번역 : idez-vous. 



# attention

In [37]:
encoder_inputs = Input(shape=(None, src_vocab_size))
encoder_lstm = LSTM(256, return_state=True)

encoder_outputs, state_h, state_c = encoder_lstm(encoder_inputs)
encoder_states = [state_h, state_c]

In [38]:
import tensorflow as tf
from keras.layers import Attention

decoder_inputs = Input(shape=(None, tar_vocab_size))
decoder_lstm = LSTM(256, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_inputs, initial_state=encoder_states)

# tf.newaixs 는 축 추가해줌
# S_ 는 은닉상태와 디코더의 최종 출력을 연결해줌
# -1은 마지막 끝맺음 기호 넣은거 때문인듯?
S_ = tf.concat([state_h[:, tf.newaxis, :], decoder_outputs[:, :-1, :]], axis=1)

attention = Attention()
context_vector = attention([S_, encoder_outputs])
concat = tf.concat([decoder_outputs, context_vector], axis=-1)
decoder_softmax_layer = Dense(tar_vocab_size, activation='softmax')
decoder_outputs = decoder_softmax_layer(concat)

In [39]:
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.compile(optimizer='rmsprop', loss='categorical_crossentropy')

In [40]:
model.fit(x = [encoder_input, decoder_input], y=decoder_target,
         batch_size = 128,
         epochs=25,
         validation_split=0.2)

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


<keras.callbacks.History at 0x7f8c82415150>

In [41]:
encoder_model = Model(inputs=encoder_inputs,
                      outputs=[encoder_outputs, encoder_states])

In [42]:
decoder_state_input_h = Input(shape=(256))
decoder_state_input_c = Input(shape=(256))

estate_h = Input(shape=(256))
encoder_outputs = Input(shape=(256))

decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
decoder_outputs, state_h, state_c = decoder_lstm(decoder_inputs, initial_state=decoder_states_inputs)

decoder_states = [state_h, state_c]

# attention context_vector
S_ = tf.concat([estate_h[:, tf.newaxis, :], decoder_outputs[:, :-1, :]], axis=1)
# EOS 고려인듯?
context_vector = attention([S_, encoder_outputs])
decoder_concat = tf.concat([decoder_outputs, context_vector], axis=-1)

decoder_outputs = decoder_softmax_layer(decoder_concat)

decoder_model = Model(inputs=[decoder_inputs, estate_h, encoder_outputs] + decoder_states_inputs,
                      outputs=[decoder_outputs] + decoder_states)

In [50]:
def predict_decode(input_seq):
    outputs_input, states_value = encoder_model.predict(input_seq)

    target_seq = np.zeros((1, 1, tar_vocab_size))
    target_seq[0, 0, tar_to_idx[sos]] = 1

    stop=False
    decoded_sentence = ""
    
    while not stop:
        # estate는 state_h만 쓰므로 states_value[0]과 동일
        # 처음에는 decoder_states값이 없으므로 encoder_states 넣어주고 추후 변경
        output_tokens, h, c = decoder_model.predict([target_seq, states_value[0], outputs_input] + states_value)

        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_char = idx_to_tar[sampled_token_index]

        decoded_sentence += sampled_char

        if sampled_char == eos or len(decoded_sentence) > max_tar_len:
            stop = True

        target_seq = np.zeros((1, 1, tar_vocab_size))
        target_seq[0, 0, sampled_token_index] = 1.

        states_value = h, c

    return decoded_sentence

In [51]:
import numpy as np
for seq_index in [100, 200, 300, 400]:
    input_seq = encoder_input[seq_index : seq_index + 1]
    decoded_sentence = predict_decode(input_seq)

    print('입력:', lines.src[seq_index])
    print('정답:', lines.tar[seq_index][1:len(lines.tar[seq_index])-1])
    print('번역:', decoded_sentence[:len(decoded_sentence)-1], '\n')

AssertionError: ignored