# 인코더 LSTM과 디코더 LSTM으로 스마트 번역기 만들기

In [27]:
# 1. Import packages
import numpy as np
import pandas as pd
from time import time

from tensorflow.keras.layers import Input, LSTM, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.utils import to_categorical

# Hyperparameters (values that affect accuracy or training speed)
MY_HIDDEN = 128  # number of units in both the encoder and the decoder LSTM
MY_EPOCH = 500  # number of epochs passed to model.fit

In [5]:
# 2. Load the translation data
# header=None: the first CSV row is treated as data, not as column names.
raw = pd.read_csv('trans_dataset/translate.csv', header=None)
eng_kor = raw.values.tolist() # convert the DataFrame to a list of [English, Korean] rows
print(len(eng_kor))
eng_kor[:3]

110


[['cold', '감기'], ['come', '오다'], ['cook', '요리']]

In [17]:
# 3. Build the English and Korean character lists

# Three extra symbols ('S', 'E', 'P') come first, followed by lowercase a-z.
e_alpha = list('SEPabcdefghijklmnopqrstuvwxyz')
# Column 0 of the header-less CSV supplies the Korean characters.
k_alpha = pd.read_csv('trans_dataset/korean.csv', header=None)[0].tolist()

# English characters occupy the low ids; Korean characters follow them.
alpha = e_alpha + k_alpha
alpha_total_size = len(alpha)  # width of every one-hot vector

print('영어와 한글 알파벳 : ', alpha)
print('알파벳 갯수(원핫인코딩할 size) :', alpha_total_size)
print('영어 알파벳 갯수 :', len(e_alpha))
print('한글 글자수 갯수 :', len(k_alpha))

영어와 한글 알파벳 :  ['S', 'E', 'P', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '가', '각', '간', '감', '개', '거', '것', '게', '계', '고', '관', '광', '구', '굴', '규', '그', '금', '기', '깊', '나', '날', '남', '내', '넓', '녀', '노', '놀', '농', '높', '뉴', '늦', '다', '단', '도', '동', '들', '람', '랑', '래', '램', '류', '름', '릎', '리', '많', '망', '매', '머', '먼', '멍', '메', '명', '모', '목', '무', '물', '미', '바', '반', '방', '번', '복', '부', '분', '붕', '비', '뿌', '사', '상', '색', '생', '서', '선', '소', '손', '수', '쉽', '스', '시', '식', '실', '싸', '아', '약', '얇', '어', '언', '얼', '여', '연', '오', '옥', '왼', '요', '용', '우', '운', '움', '위', '유', '은', '을', '음', '의', '이', '익', '인', '읽', '입', '자', '작', '장', '적', '제', '좋', '주', '지', '짜', '쪽', '찾', '책', '출', '칙', '크', '키', '탈', '택', '통', '파', '팔', '편', '피', '핑', '한', '합', '해', '행', '험', '회', '획', '휴', '흐']
알파벳 갯수(원핫인코딩할 size) : 171
영어 알파벳 갯수 : 29
한글 글자수 갯수 : 142


In [18]:
# Build a dict mapping each character to its integer id (its position in alpha)
char_to_num = {c:i for i, c in enumerate(alpha)}
print(char_to_num)

{'S': 0, 'E': 1, 'P': 2, 'a': 3, 'b': 4, 'c': 5, 'd': 6, 'e': 7, 'f': 8, 'g': 9, 'h': 10, 'i': 11, 'j': 12, 'k': 13, 'l': 14, 'm': 15, 'n': 16, 'o': 17, 'p': 18, 'q': 19, 'r': 20, 's': 21, 't': 22, 'u': 23, 'v': 24, 'w': 25, 'x': 26, 'y': 27, 'z': 28, '가': 29, '각': 30, '간': 31, '감': 32, '개': 33, '거': 34, '것': 35, '게': 36, '계': 37, '고': 38, '관': 39, '광': 40, '구': 41, '굴': 42, '규': 43, '그': 44, '금': 45, '기': 46, '깊': 47, '나': 48, '날': 49, '남': 50, '내': 51, '넓': 52, '녀': 53, '노': 54, '놀': 55, '농': 56, '높': 57, '뉴': 58, '늦': 59, '다': 60, '단': 61, '도': 62, '동': 63, '들': 64, '람': 65, '랑': 66, '래': 67, '램': 68, '류': 69, '름': 70, '릎': 71, '리': 72, '많': 73, '망': 74, '매': 75, '머': 76, '먼': 77, '멍': 78, '메': 79, '명': 80, '모': 81, '목': 82, '무': 83, '물': 84, '미': 85, '바': 86, '반': 87, '방': 88, '번': 89, '복': 90, '부': 91, '분': 92, '붕': 93, '비': 94, '뿌': 95, '사': 96, '상': 97, '색': 98, '생': 99, '서': 100, '선': 101, '소': 102, '손': 103, '수': 104, '쉽': 105, '스': 106, '시': 107, '식': 108, '실': 109, '싸': 110,

In [25]:
# Sanity check on the first pair: look each character up by hand, then do the
# same with a comprehension; each pair of printed lines should match.
eng_kor[0]  # not the last expression of the cell, so nothing is displayed
print(char_to_num['c'], char_to_num['o'], char_to_num['l'], char_to_num['d'])
print([char_to_num[c] for c in eng_kor[0][0]])
print(char_to_num['감'], char_to_num['기'])
print([char_to_num[c] for c in eng_kor[0][1]])

5 17 14 6
[5, 17, 14, 6]
32 46
[32, 46]


In [28]:
# One-hot encoding, method 1: pd.get_dummies -- hard to apply in this code.
#   display(pd.get_dummies([5,3,6]))
# One-hot encoding, method 2: Keras to_categorical
to_categorical([5,3,6], num_classes=10) # one-hot encode over indices 0-9

array([[0., 0., 0., 0., 0., 1., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 1., 0., 0., 0.]], dtype=float32)

In [31]:
# One-hot encoding, method 3: pick rows of an identity matrix
# (row i of np.eye(10) is the one-hot vector for index i)
np.eye(10)[[5,3,6]]

array([[0., 0., 0., 0., 0., 1., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 1., 0., 0., 0.]])

In [50]:
# 4. Encoder input  : English characters -> ids -> one-hot
#    Decoder input  : 'S' + Korean characters -> ids -> one-hot
#    Decoder output : Korean characters + 'E' -> ids (kept as integers)
def encoding(eng_kor=eng_kor):
    """Convert (English, Korean) word pairs into seq2seq training sequences.

    Each character is mapped to its integer id through the module-level
    ``char_to_num`` table. Encoder and decoder inputs are one-hot encoded
    with width ``alpha_total_size``; decoder targets stay as integer ids.

    Args:
        eng_kor: Iterable of pairs where element 0 is the English word and
            element 1 is the Korean word. The default is the module-level
            ``eng_kor`` object as it existed when this function was defined.

    Returns:
        A tuple ``(enc_in, dec_in, dec_out)`` of lists with one entry per
        pair: ``enc_in`` holds the one-hot matrix of the English word,
        ``dec_in`` the one-hot matrix of ``'S' + korean``, and ``dec_out``
        the list of ids of ``korean + 'E'``.

    Raises:
        KeyError: If a character is not present in ``char_to_num``.
    """
    # Build the identity matrix once instead of twice per word; row i is the
    # one-hot vector for id i. Indexing with a list of ids copies the selected
    # rows, so the returned matrices never alias this one.
    one_hot = np.eye(alpha_total_size)

    enc_in = []
    dec_in = []
    dec_out = []
    for data in eng_kor:
        # Encoder input: English word -> ids -> one-hot rows
        eng = [char_to_num[c] for c in data[0]]
        enc_in.append(one_hot[eng])
        # Decoder input: 'S' + Korean word -> ids -> one-hot rows
        kor_in = [char_to_num[c] for c in 'S' + data[1]]
        dec_in.append(one_hot[kor_in])
        # Decoder output: Korean word + 'E' -> ids (not one-hot encoded)
        kor_out = [char_to_num[c] for c in data[1] + 'E']
        dec_out.append(kor_out)
    return enc_in, dec_in, dec_out

In [47]:
# sample = [['wood', '나무'], ['word', '단어']]
# encoding(sample)

[[1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 

In [53]:
# 6. Encode the whole translation data set
X_enc, X_dec, Y_dec = encoding(eng_kor)

# Convert the per-word lists to numpy arrays
X_enc = np.array(X_enc)
X_dec = np.array(X_dec)
# Add a trailing axis of size 1 to the integer targets. expand_dims takes the
# sequence length from the data itself, so a different target length can never
# be silently reshuffled the way a hard-coded reshape(-1, 3, 1) could.
Y_dec = np.expand_dims(np.array(Y_dec), axis=-1)

X_enc.shape, X_dec.shape, Y_dec.shape

((110, 4, 171), (110, 3, 171), (110, 3, 1))

In [58]:
# 7. Build the model
# Encoder LSTM
# NOTE(review): the sequence lengths 4 (here) and 3 (decoder) are hard-coded
# and match the X_enc / X_dec shapes displayed by the previous cell.
ENC_IN = Input(shape=(4,alpha_total_size))
# With return_state=True the layer returns (output, state_h, state_c);
# the output is discarded and only the two states are kept.
_, state_h, state_c = LSTM(units=MY_HIDDEN, # 128
                           return_state=True)(ENC_IN)
# Link encoder and decoder: the encoder states seed the decoder LSTM
link = [state_h, state_c]

# Decoder
DEC_IN = Input(shape=(3, alpha_total_size))
DEC_MID = LSTM(units=MY_HIDDEN,
               return_sequences=True)(DEC_IN, initial_state=link)
# Final output layer: softmax over all alpha_total_size characters
DEC_OUT = Dense(units=alpha_total_size,
                activation='softmax')(DEC_MID)
# Assemble the model: two inputs (encoder, decoder) and one output
model = Model(inputs=[ENC_IN, DEC_IN], 
              outputs=[DEC_OUT])

In [61]:
# 8. Configure training, then fit the model
# The targets are integer ids, hence the sparse variant of cross-entropy.
model.compile(
    optimizer='rmsprop',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

# Measure the wall-clock time of the whole fit call.
begin = time()
model.fit(
    x=[X_enc, X_dec],
    y=Y_dec,
    epochs=MY_EPOCH,
    verbose=2,
)
end = time()
print('학습시간 :', end-begin)

Epoch 1/500
4/4 - 3s - loss: 1.3474e-07 - accuracy: 1.0000 - 3s/epoch - 685ms/step
Epoch 2/500
4/4 - 0s - loss: 1.3258e-07 - accuracy: 1.0000 - 45ms/epoch - 11ms/step
Epoch 3/500
4/4 - 0s - loss: 1.2860e-07 - accuracy: 1.0000 - 41ms/epoch - 10ms/step
Epoch 4/500
4/4 - 0s - loss: 1.2932e-07 - accuracy: 1.0000 - 33ms/epoch - 8ms/step
Epoch 5/500
4/4 - 0s - loss: 1.3583e-07 - accuracy: 1.0000 - 52ms/epoch - 13ms/step
Epoch 6/500
4/4 - 0s - loss: 1.3655e-07 - accuracy: 1.0000 - 40ms/epoch - 10ms/step
Epoch 7/500
4/4 - 0s - loss: 1.3366e-07 - accuracy: 1.0000 - 38ms/epoch - 10ms/step
Epoch 8/500
4/4 - 0s - loss: 1.3185e-07 - accuracy: 1.0000 - 27ms/epoch - 7ms/step
Epoch 9/500
4/4 - 0s - loss: 1.3041e-07 - accuracy: 1.0000 - 38ms/epoch - 10ms/step
Epoch 10/500
4/4 - 0s - loss: 1.3366e-07 - accuracy: 1.0000 - 32ms/epoch - 8ms/step
Epoch 11/500
4/4 - 0s - loss: 1.3041e-07 - accuracy: 1.0000 - 39ms/epoch - 10ms/step
Epoch 12/500
4/4 - 0s - loss: 1.3113e-07 - accuracy: 1.0000 - 42ms/epoch - 11m

Epoch 98/500
4/4 - 0s - loss: 1.2065e-07 - accuracy: 1.0000 - 30ms/epoch - 8ms/step
Epoch 99/500
4/4 - 0s - loss: 1.1487e-07 - accuracy: 1.0000 - 40ms/epoch - 10ms/step
Epoch 100/500
4/4 - 0s - loss: 1.1307e-07 - accuracy: 1.0000 - 46ms/epoch - 11ms/step
Epoch 101/500
4/4 - 0s - loss: 1.1632e-07 - accuracy: 1.0000 - 45ms/epoch - 11ms/step
Epoch 102/500
4/4 - 0s - loss: 1.1343e-07 - accuracy: 1.0000 - 42ms/epoch - 10ms/step
Epoch 103/500
4/4 - 0s - loss: 1.1126e-07 - accuracy: 1.0000 - 40ms/epoch - 10ms/step
Epoch 104/500
4/4 - 0s - loss: 1.1379e-07 - accuracy: 1.0000 - 35ms/epoch - 9ms/step
Epoch 105/500
4/4 - 0s - loss: 1.1379e-07 - accuracy: 1.0000 - 45ms/epoch - 11ms/step
Epoch 106/500
4/4 - 0s - loss: 1.1307e-07 - accuracy: 1.0000 - 38ms/epoch - 10ms/step
Epoch 107/500
4/4 - 0s - loss: 1.1054e-07 - accuracy: 1.0000 - 38ms/epoch - 10ms/step
Epoch 108/500
4/4 - 0s - loss: 1.1524e-07 - accuracy: 1.0000 - 38ms/epoch - 9ms/step
Epoch 109/500
4/4 - 0s - loss: 1.1379e-07 - accuracy: 1.000

Epoch 194/500
4/4 - 0s - loss: 1.0079e-07 - accuracy: 1.0000 - 33ms/epoch - 8ms/step
Epoch 195/500
4/4 - 0s - loss: 1.0440e-07 - accuracy: 1.0000 - 52ms/epoch - 13ms/step
Epoch 196/500
4/4 - 0s - loss: 1.0368e-07 - accuracy: 1.0000 - 36ms/epoch - 9ms/step
Epoch 197/500
4/4 - 0s - loss: 1.0295e-07 - accuracy: 1.0000 - 48ms/epoch - 12ms/step
Epoch 198/500
4/4 - 0s - loss: 1.0115e-07 - accuracy: 1.0000 - 34ms/epoch - 8ms/step
Epoch 199/500
4/4 - 0s - loss: 9.8619e-08 - accuracy: 1.0000 - 48ms/epoch - 12ms/step
Epoch 200/500
4/4 - 0s - loss: 1.0368e-07 - accuracy: 1.0000 - 37ms/epoch - 9ms/step
Epoch 201/500
4/4 - 0s - loss: 1.0042e-07 - accuracy: 1.0000 - 39ms/epoch - 10ms/step
Epoch 202/500
4/4 - 0s - loss: 1.0476e-07 - accuracy: 1.0000 - 28ms/epoch - 7ms/step
Epoch 203/500
4/4 - 0s - loss: 9.8619e-08 - accuracy: 1.0000 - 33ms/epoch - 8ms/step
Epoch 204/500
4/4 - 0s - loss: 1.0259e-07 - accuracy: 1.0000 - 53ms/epoch - 13ms/step
Epoch 205/500
4/4 - 0s - loss: 1.0079e-07 - accuracy: 1.0000

Epoch 290/500
4/4 - 0s - loss: 9.1755e-08 - accuracy: 1.0000 - 37ms/epoch - 9ms/step
Epoch 291/500
4/4 - 0s - loss: 9.3200e-08 - accuracy: 1.0000 - 43ms/epoch - 11ms/step
Epoch 292/500
4/4 - 0s - loss: 9.2477e-08 - accuracy: 1.0000 - 37ms/epoch - 9ms/step
Epoch 293/500
4/4 - 0s - loss: 9.2116e-08 - accuracy: 1.0000 - 37ms/epoch - 9ms/step
Epoch 294/500
4/4 - 0s - loss: 9.3922e-08 - accuracy: 1.0000 - 38ms/epoch - 9ms/step
Epoch 295/500
4/4 - 0s - loss: 8.8504e-08 - accuracy: 1.0000 - 25ms/epoch - 6ms/step
Epoch 296/500
4/4 - 0s - loss: 8.9226e-08 - accuracy: 1.0000 - 40ms/epoch - 10ms/step
Epoch 297/500
4/4 - 0s - loss: 9.0310e-08 - accuracy: 1.0000 - 40ms/epoch - 10ms/step
Epoch 298/500
4/4 - 0s - loss: 9.3200e-08 - accuracy: 1.0000 - 30ms/epoch - 7ms/step
Epoch 299/500
4/4 - 0s - loss: 8.8504e-08 - accuracy: 1.0000 - 50ms/epoch - 13ms/step
Epoch 300/500
4/4 - 0s - loss: 8.9226e-08 - accuracy: 1.0000 - 42ms/epoch - 11ms/step
Epoch 301/500
4/4 - 0s - loss: 9.2116e-08 - accuracy: 1.0000

Epoch 386/500
4/4 - 0s - loss: 8.0918e-08 - accuracy: 1.0000 - 39ms/epoch - 10ms/step
Epoch 387/500
4/4 - 0s - loss: 8.3085e-08 - accuracy: 1.0000 - 49ms/epoch - 12ms/step
Epoch 388/500
4/4 - 0s - loss: 7.9112e-08 - accuracy: 1.0000 - 44ms/epoch - 11ms/step
Epoch 389/500
4/4 - 0s - loss: 8.2002e-08 - accuracy: 1.0000 - 36ms/epoch - 9ms/step
Epoch 390/500
4/4 - 0s - loss: 7.9834e-08 - accuracy: 1.0000 - 50ms/epoch - 12ms/step
Epoch 391/500
4/4 - 0s - loss: 8.0195e-08 - accuracy: 1.0000 - 37ms/epoch - 9ms/step
Epoch 392/500
4/4 - 0s - loss: 8.2002e-08 - accuracy: 1.0000 - 41ms/epoch - 10ms/step
Epoch 393/500
4/4 - 0s - loss: 8.5253e-08 - accuracy: 1.0000 - 39ms/epoch - 10ms/step
Epoch 394/500
4/4 - 0s - loss: 7.9473e-08 - accuracy: 1.0000 - 41ms/epoch - 10ms/step
Epoch 395/500
4/4 - 0s - loss: 8.2363e-08 - accuracy: 1.0000 - 39ms/epoch - 10ms/step
Epoch 396/500
4/4 - 0s - loss: 8.4891e-08 - accuracy: 1.0000 - 39ms/epoch - 10ms/step
Epoch 397/500
4/4 - 0s - loss: 8.4530e-08 - accuracy: 1.

Epoch 482/500
4/4 - 0s - loss: 8.3085e-08 - accuracy: 1.0000 - 40ms/epoch - 10ms/step
Epoch 483/500
4/4 - 0s - loss: 7.8389e-08 - accuracy: 1.0000 - 40ms/epoch - 10ms/step
Epoch 484/500
4/4 - 0s - loss: 7.7667e-08 - accuracy: 1.0000 - 39ms/epoch - 10ms/step
Epoch 485/500
4/4 - 0s - loss: 7.8389e-08 - accuracy: 1.0000 - 29ms/epoch - 7ms/step
Epoch 486/500
4/4 - 0s - loss: 7.6222e-08 - accuracy: 1.0000 - 51ms/epoch - 13ms/step
Epoch 487/500
4/4 - 0s - loss: 7.4777e-08 - accuracy: 1.0000 - 38ms/epoch - 10ms/step
Epoch 488/500
4/4 - 0s - loss: 7.3332e-08 - accuracy: 1.0000 - 40ms/epoch - 10ms/step
Epoch 489/500
4/4 - 0s - loss: 7.1887e-08 - accuracy: 1.0000 - 39ms/epoch - 10ms/step
Epoch 490/500
4/4 - 0s - loss: 7.9834e-08 - accuracy: 1.0000 - 40ms/epoch - 10ms/step
Epoch 491/500
4/4 - 0s - loss: 7.4777e-08 - accuracy: 1.0000 - 33ms/epoch - 8ms/step
Epoch 492/500
4/4 - 0s - loss: 7.6222e-08 - accuracy: 1.0000 - 49ms/epoch - 12ms/step
Epoch 493/500
4/4 - 0s - loss: 7.6944e-08 - accuracy: 1.

In [None]:
# 9. 모델 사용