# 필요한 라이브러리 설치

In [1]:
!pip install jamo
!pip install tensorflow_addons

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting jamo
  Downloading jamo-0.4.1-py3-none-any.whl (9.5 kB)
Installing collected packages: jamo
Successfully installed jamo-0.4.1
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting tensorflow_addons
  Downloading tensorflow_addons-0.17.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.1 MB)
[K     |████████████████████████████████| 1.1 MB 7.1 MB/s 
Installing collected packages: tensorflow-addons
Successfully installed tensorflow-addons-0.17.0


In [2]:
!wget https://github.com/kaniblu/hangul-utils/raw/master/hangul_utils/unicode.py

--2022-05-31 06:42:25--  https://github.com/kaniblu/hangul-utils/raw/master/hangul_utils/unicode.py
Resolving github.com (github.com)... 192.30.255.113
Connecting to github.com (github.com)|192.30.255.113|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/kaniblu/hangul-utils/master/hangul_utils/unicode.py [following]
--2022-05-31 06:42:26--  https://raw.githubusercontent.com/kaniblu/hangul-utils/master/hangul_utils/unicode.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 8775 (8.6K) [text/plain]
Saving to: ‘unicode.py’


2022-05-31 06:42:26 (87.5 MB/s) - ‘unicode.py’ saved [8775/8775]



# 라이브러리 로드

In [3]:
# 딱히 쓰이지 않는 import가 많습니다
import os
import numpy as np
import pandas as pd
import tensorflow
import tensorflow as tf
import tensorflow_addons as tfa
from tensorflow import keras
from tensorflow.keras.models import Sequential,load_model,Model
from tensorflow.keras.regularizers import l1_l2
from tensorflow.keras.layers import Dense,ELU,MaxPooling2D,LSTM,Activation,Dropout,Concatenate,Average,Input,Permute,BatchNormalization,Multiply,SimpleRNN,Flatten,Reshape,Conv2D,Conv1D,Conv2DTranspose,UpSampling2D,LocallyConnected2D,AveragePooling2D,Conv1D,MaxPooling1D,Add,LeakyReLU,Lambda,Multiply,Attention
from tensorflow.keras.preprocessing.sequence import TimeseriesGenerator
from tensorflow.keras import backend as K
from tensorflow.keras.losses import mse, binary_crossentropy, kl_divergence
from tensorflow.keras.utils import plot_model
from tensorflow.keras.optimizers import Adam, RMSprop,Nadam
import json
import random
import pickle
from jamo import h2j, j2hcj
import gc
he = tf.keras.initializers.HeNormal()
from unicode import join_jamos # from https://github.com/kaniblu/hangul-utils

# 모델 정의

In [4]:
LEN=18 # 단어의 자모음 배열의 최대 길이(최대 길이가 길수록 메모리 요구량이 커짐)
VEC=15 # 자모음 벡터의 길이
CEL=128 # 인코딩된 벡터의 길이
BAT=256 # 미니배치 크기

In [5]:
def Attention2D(ilay):
    lay = Permute([2,1])(ilay)
    lay = Dense(lay.shape[-1], activation='softmax')(lay)
    lay = Dropout(0.2)(lay)
    lay = Permute([2,1])(lay)
    lay = Multiply()([lay,ilay])
    return lay

In [6]:
def MultiBlock(ilay,hidden,out,num):
    lay = ilay
    lay = Permute([2,1])(lay)
    laylist = []
    for i in range(num):
        hlay = lay
        hlay = Conv1D(hidden,kernel_size=3,padding='same',kernel_initializer=he)(hlay)
        hlay = BatchNormalization()(hlay)
        rlay = hlay = Activation('elu')(hlay)
        for _ in range(1):
          hlay = Conv1D(hidden,kernel_size=3,padding='same', kernel_initializer=he)(hlay)
          hlay = BatchNormalization()(hlay)
          hlay = Dropout(0.5)(hlay)
          hlay = ELU()(hlay)
          hlay = Add()([hlay,rlay])
          hlay = Attention2D(hlay)
        laylist.append(hlay)
    lay = Average()(laylist)
    lay = Permute([2,1])(lay)
    lay = Dense(out,kernel_initializer=he)(lay)
    lay = BatchNormalization()(lay)
    lay = ELU()(lay)
    return lay

In [7]:
def Encoder(Cel,Len,Vec):
    lay = ilay = Input(shape=(Len,Vec,),name='encoder_input')
    lay = MultiBlock(lay,8,12,3)
    lay = MultiBlock(lay,12,20,2)
    lay = MultiBlock(lay,20,32,2)
    lay = BatchNormalization()(lay)
    lay = Flatten()(lay)
    lay = Dense(Cel, activation='linear',kernel_initializer=he,name='encoder_output')(lay)
    m = Model(inputs=ilay,outputs=lay,name='encoder')
    m.summary()
    return m

In [8]:
def Decoder(Len,Cel,Vec):
    lay = ilay = Input(shape=(Cel,), name='decoder_input')
    models = []
    lay = Dense(Len * Len * 2, activation='elu',kernel_initializer=he)(ilay)
    lay = Reshape((Len,Len * 2,))(lay)
    rlay = lay = BatchNormalization()(lay)
    for _ in range(4): # 앙상블
      lay = Dropout(0.3)(lay)
      lay = Dense(Len * 2, activation='elu',kernel_initializer=he)(lay)
      lay = BatchNormalization()(lay)
      
      lay = Permute([2,1])(lay)
      lay = Dropout(0.3)(lay)
      lay = Dense(Len, activation='elu',kernel_initializer=he)(lay)
      lay = BatchNormalization()(lay)
      lay = Permute([2,1])(lay)

      lay = Dropout(0.3)(lay)
      lay = Add()([rlay, lay])
      lay = Dense(Len * 2, activation='elu',kernel_initializer=he)(lay)
      lay = BatchNormalization()(lay)
      lay = Dropout(0.3)(lay)
      models.append(lay)
    
    lay = Average()(models)
    lay = Dropout(0.3)(lay)
    lay = Dense(Len,kernel_initializer=he)(lay)
    lay = ELU()(lay)
    lay = Permute([2,1])(lay)
    lay = BatchNormalization()(lay)

    lay = Dropout(0.3)(lay)
    lay = Dense(Vec,kernel_initializer=he)(lay)
    lay = ELU()(lay)
    lay = BatchNormalization()(lay)
    lay = Dense(Vec,activation='sigmoid')(lay)
    m = Model(inputs=ilay,outputs=lay,name='decoder_output')
    m.summary()
    return m

In [9]:
class Network:
    def __init__(self,Len,Cel,Vec):
        if os.path.isfile("./encoder.h5"):
            self.encoder = load_model("./encoder.h5")
            self.encoder.trainable = True
        else:
            self.encoder = Encoder(Cel,Len,Vec)
        if os.path.isfile("./decoder.h5"):
            self.decoder = load_model("./decoder.h5")
            self.decoder.trainable = True
        else:
            self.decoder = Decoder(Len,Cel,Vec)
        if(os.path.isfile('./model.h5')):
            self.model = load_model('./model.h5')
        else:
            ilay = Input(Len,Vec,)
            encoder = self.encoder(ilay)
            decoder = self.decoder(encoder)
            self.model = Model(inputs=ilay,outputs=decoder)
        self.model.compile(optimizer=tfa.optimizers.AdaBelief(lr=0.03),loss='log_cosh',metrics=['mae','mse', 'acc','binary_crossentropy'])

In [10]:
net = Network(LEN,CEL,VEC)
net.model.summary()

Model: "encoder"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 encoder_input (InputLayer)     [(None, 18, 15)]     0           []                               
                                                                                                  
 permute (Permute)              (None, 15, 18)       0           ['encoder_input[0][0]']          
                                                                                                  
 conv1d (Conv1D)                (None, 15, 8)        440         ['permute[0][0]']                
                                                                                                  
 conv1d_2 (Conv1D)              (None, 15, 8)        440         ['permute[0][0]']                
                                                                                            

  super().__init__(name, **kwargs)


# jamodict 정의

## 겹받침을 포함하지 않은 경우

In [None]:
'''
jamodict = {#[양순음,치조음,경구개음,연구개음,후음,예사소리(0)된소리(0.5)거센소리(1),파열음,파찰음,마찰음,비음,유음,ㅓ(0.33))ㅣ(0.67)ㅏ(1), ㅜ(0.33)ㅡ(0.67)ㅗ(1), ㅣ추가, ·추가] 길이 15
    'ㄱ':[0.0, 0.0, 0.0, 1.0, 0.0, 0.0,1.0,0.0,0.0, 0.0, 0.0, 0.0, 0.0, 0.0 ,0.0],
    'ㄲ':[0.0, 0.0, 0.0, 1.0, 0.0, 0.5,1.0,0.0,0.0, 0.0, 0.0, 0.0, 0.0, 0.0 ,0.0],
    'ㄴ':[0.0, 1.0, 0.0, 0.0, 0.0, 0.0,0.0,0.0,0.0, 1.0, 0.0, 0.0, 0.0, 0.0 ,0.0],
    'ㄷ':[0.0, 1.0, 0.0, 0.0, 0.0, 0.0,1.0,0.0,0.0, 0.0, 0.0, 0.0, 0.0, 0.0 ,0.0],
    'ㄸ':[0.0, 1.0, 0.0, 0.0, 0.0, 0.5,1.0,0.0,0.0, 0.0, 0.0, 0.0, 0.0, 0.0 ,0.0],
    'ㄹ':[0.0, 1.0, 0.0, 0.0, 0.0, 0.0,0.0,0.0,0.0, 0.0, 1.0, 0.0, 0.0, 0.0 ,0.0],
    'ㅁ':[1.0, 0.0, 0.0, 0.0, 0.0, 0.0,0.0,0.0,0.0, 1.0, 0.0, 0.0, 0.0, 0.0 ,0.0],
    'ㅂ':[1.0, 0.0, 0.0, 0.0, 0.0, 0.0,1.0,0.0,0.0, 0.0, 0.0, 0.0, 0.0, 0.0 ,0.0],
    'ㅂ':[1.0, 0.0, 0.0, 0.0, 0.0, 0.5,1.0,0.0,0.0, 0.0, 0.0, 0.0, 0.0, 0.0 ,0.0],
    'ㅅ':[0.0, 1.0, 0.0, 0.0, 0.0, 0.0,0.0,0.0,1.0, 0.0, 0.0, 0.0, 0.0, 0.0 ,0.0],
    'ㅆ':[0.0, 1.0, 0.0, 0.0, 0.0, 0.5,0.0,0.0,1.0, 0.0, 0.0, 0.0, 0.0, 0.0 ,0.0],
    'ㅇ':[0.0, 0.0, 0.0, 1.0, 0.0, 0.0,0.0,0.0,0.0, 1.0, 0.0, 0.0, 0.0, 0.0 ,0.0],
    'ㅈ':[0.0, 0.0, 1.0, 0.0, 0.0, 0.0,0.0,1.0,0.0, 0.0, 0.0, 0.0, 0.0, 0.0 ,0.0],
    'ㅉ':[0.0, 0.0, 1.0, 0.0, 0.0, 0.5,0.0,1.0,0.0, 0.0, 0.0, 0.0, 0.0, 0.0 ,0.0],
    'ㅊ':[0.0, 0.0, 1.0, 0.0, 0.0, 1.0,0.0,1.0,0.0, 0.0, 0.0, 0.0, 0.0, 0.0 ,0.0],
    'ㅋ':[0.0, 0.0, 0.0, 1.0, 0.0, 1.0,1.0,0.0,0.0, 0.0, 0.0, 0.0, 0.0, 0.0 ,0.0],
    'ㅌ':[0.0, 1.0, 0.0, 0.0, 0.0, 1.0,1.0,0.0,0.0, 0.0, 0.0, 0.0, 0.0, 0.0 ,0.0],
    'ㅍ':[1.0, 0.0, 0.0, 0.0, 0.0, 1.0,1.0,0.0,0.0, 0.0, 0.0, 0.0, 0.0, 0.0 ,0.0],
    'ㅎ':[0.0, 0.0, 0.0, 0.0, 1.0, 1.0,0.0,0.0,1.0, 0.0, 0.0, 0.0, 0.0, 0.0 ,0.0],
    'ㅏ':[0.0, 0.0, 0.0, 0.0, 0.0, 0.0,0.0,0.0,0.0, 0.0, 0.0, 1.0, 0.0, 0.0 ,0.0],
    'ㅑ':[0.0, 0.0, 0.0, 0.0, 0.0, 0.0,0.0,0.0,0.0, 0.0, 0.0, 1.0, 0.0, 0.0 ,1.0],
    'ㅐ':[0.0, 0.0, 0.0, 0.0, 0.0, 0.0,0.0,0.0,0.0, 0.0, 0.0, 1.0, 0.0, 1.0 ,0.0],
    'ㅒ':[0.0, 0.0, 0.0, 0.0, 0.0, 0.0,0.0,0.0,0.0, 0.0, 0.0, 1.0, 0.0, 1.0 ,1.0],
    'ㅓ':[0.0, 0.0, 0.0, 0.0, 0.0, 0.0,0.0,0.0,0.0, 0.0, 0.0, 1/3, 0.0, 0.0 ,0.0],
    'ㅕ':[0.0, 0.0, 0.0, 0.0, 0.0, 0.0,0.0,0.0,0.0, 0.0, 0.0, 1/3, 0.0, 0.0 ,1.0],
    'ㅔ':[0.0, 0.0, 0.0, 0.0, 0.0, 0.0,0.0,0.0,0.0, 0.0, 0.0, 1/3, 0.0, 1.0 ,0.0],
    'ㅖ':[0.0, 0.0, 0.0, 0.0, 0.0, 0.0,0.0,0.0,0.0, 0.0, 0.0, 1/3, 0.0, 1.0 ,1.0],
    'ㅗ':[0.0, 0.0, 0.0, 0.0, 0.0, 0.0,0.0,0.0,0.0, 0.0, 0.0, 0.0, 1.0, 0.0 ,0.0],
    'ㅛ':[0.0, 0.0, 0.0, 0.0, 0.0, 0.0,0.0,0.0,0.0, 0.0, 0.0, 0.0, 1.0, 0.0 ,1.0],
    'ㅜ':[0.0, 0.0, 0.0, 0.0, 0.0, 0.0,0.0,0.0,0.0, 0.0, 0.0, 0.0, 1/3, 0.0 ,0.0],
    'ㅠ':[0.0, 0.0, 0.0, 0.0, 0.0, 0.0,0.0,0.0,0.0, 0.0, 0.0, 0.0, 1/3, 0.0 ,1.0],
    'ㅡ':[0.0, 0.0, 0.0, 0.0, 0.0, 0.0,0.0,0.0,0.0, 0.0, 0.0, 0.0, 2/3, 0.0 ,0.0],
    'ㅣ':[0.0, 0.0, 0.0, 0.0, 0.0, 0.0,0.0,0.0,0.0, 0.0, 0.0, 2/3, 0.0, 0.0 ,0.0],
    'ㅘ':[0.0, 0.0, 0.0, 0.0, 0.0, 0.0,0.0,0.0,0.0, 0.0, 0.0, 1.0, 1.0, 0.0 ,0.0],
    'ㅙ':[0.0, 0.0, 0.0, 0.0, 0.0, 0.0,0.0,0.0,0.0, 0.0, 0.0, 1.0, 1.0, 1.0 ,0.0],
    'ㅚ':[0.0, 0.0, 0.0, 0.0, 0.0, 0.0,0.0,0.0,0.0, 0.0, 0.0, 2/3, 1.0, 1.0 ,0.0],
    'ㅝ':[0.0, 0.0, 0.0, 0.0, 0.0, 0.0,0.0,0.0,0.0, 0.0, 0.0, 1/3, 1/3, 0.0 ,0.0],
    'ㅞ':[0.0, 0.0, 0.0, 0.0, 0.0, 0.0,0.0,0.0,0.0, 0.0, 0.0, 1/3, 1/3, 1.0 ,0.0],
    'ㅝ':[0.0, 0.0, 0.0, 0.0, 0.0, 0.0,0.0,0.0,0.0, 0.0, 0.0, 2/3, 1/3, 0.0 ,0.0],
    'ㅢ':[0.0, 0.0, 0.0, 0.0, 0.0, 0.0,0.0,0.0,0.0, 0.0, 0.0, 2/3, 2/3, 0.0 ,0.0],
}

jamodict[''] = np.mean(np.array(list(jamodict.values()),dtype=np.float64), axis=0).tolist() # 공백은 각 열의 평균으로 처리하여 데이터의 편향을 방지
print(jamodict[''])
'''

[0.07894736842105263, 0.18421052631578946, 0.07894736842105263, 0.10526315789473684, 0.02631578947368421, 0.19736842105263158, 0.21052631578947367, 0.07894736842105263, 0.07894736842105263, 0.07894736842105263, 0.02631578947368421, 0.27192982456140347, 0.2017543859649123, 0.18421052631578946, 0.15789473684210525]


## 겹받침을 포함한 경우

In [11]:
jamodict = {#[양순음,치조음,경구개음,연구개음,후음,예사소리(0)된소리(0.5)거센소리(1),파열음,파찰음,마찰음,비음,유음,ㅓ(0.33))ㅣ(0.67)ㅏ(1), ㅜ(0.33)ㅡ(0.67)ㅗ(1), ㅣ추가, ·추가] 길이 15
    'ㄱ':[0.0, 0.0, 0.0, 1.0, 0.0, 0.0,1.0,0.0,0.0, 0.0, 0.0, 0.0, 0.0, 0.0 ,0.0],
    'ㄲ':[0.0, 0.0, 0.0, 1.0, 0.0, 0.5,1.0,0.0,0.0, 0.0, 0.0, 0.0, 0.0, 0.0 ,0.0],
    'ㄴ':[0.0, 1.0, 0.0, 0.0, 0.0, 0.0,0.0,0.0,0.0, 1.0, 0.0, 0.0, 0.0, 0.0 ,0.0],
    'ㄷ':[0.0, 1.0, 0.0, 0.0, 0.0, 0.0,1.0,0.0,0.0, 0.0, 0.0, 0.0, 0.0, 0.0 ,0.0],
    'ㄸ':[0.0, 1.0, 0.0, 0.0, 0.0, 0.5,1.0,0.0,0.0, 0.0, 0.0, 0.0, 0.0, 0.0 ,0.0],
    'ㄹ':[0.0, 1.0, 0.0, 0.0, 0.0, 0.0,0.0,0.0,0.0, 0.0, 1.0, 0.0, 0.0, 0.0 ,0.0],
    'ㅁ':[1.0, 0.0, 0.0, 0.0, 0.0, 0.0,0.0,0.0,0.0, 1.0, 0.0, 0.0, 0.0, 0.0 ,0.0],
    'ㅂ':[1.0, 0.0, 0.0, 0.0, 0.0, 0.0,1.0,0.0,0.0, 0.0, 0.0, 0.0, 0.0, 0.0 ,0.0],
    'ㅂ':[1.0, 0.0, 0.0, 0.0, 0.0, 0.5,1.0,0.0,0.0, 0.0, 0.0, 0.0, 0.0, 0.0 ,0.0],
    'ㅅ':[0.0, 1.0, 0.0, 0.0, 0.0, 0.0,0.0,0.0,1.0, 0.0, 0.0, 0.0, 0.0, 0.0 ,0.0],
    'ㅆ':[0.0, 1.0, 0.0, 0.0, 0.0, 0.5,0.0,0.0,1.0, 0.0, 0.0, 0.0, 0.0, 0.0 ,0.0],
    'ㅇ':[0.0, 0.0, 0.0, 1.0, 0.0, 0.0,0.0,0.0,0.0, 1.0, 0.0, 0.0, 0.0, 0.0 ,0.0],
    'ㅈ':[0.0, 0.0, 1.0, 0.0, 0.0, 0.0,0.0,1.0,0.0, 0.0, 0.0, 0.0, 0.0, 0.0 ,0.0],
    'ㅉ':[0.0, 0.0, 1.0, 0.0, 0.0, 0.5,0.0,1.0,0.0, 0.0, 0.0, 0.0, 0.0, 0.0 ,0.0],
    'ㅊ':[0.0, 0.0, 1.0, 0.0, 0.0, 1.0,0.0,1.0,0.0, 0.0, 0.0, 0.0, 0.0, 0.0 ,0.0],
    'ㅋ':[0.0, 0.0, 0.0, 1.0, 0.0, 1.0,1.0,0.0,0.0, 0.0, 0.0, 0.0, 0.0, 0.0 ,0.0],
    'ㅌ':[0.0, 1.0, 0.0, 0.0, 0.0, 1.0,1.0,0.0,0.0, 0.0, 0.0, 0.0, 0.0, 0.0 ,0.0],
    'ㅍ':[1.0, 0.0, 0.0, 0.0, 0.0, 1.0,1.0,0.0,0.0, 0.0, 0.0, 0.0, 0.0, 0.0 ,0.0],
    'ㅎ':[0.0, 0.0, 0.0, 0.0, 1.0, 1.0,0.0,0.0,1.0, 0.0, 0.0, 0.0, 0.0, 0.0 ,0.0],
    'ㅏ':[0.0, 0.0, 0.0, 0.0, 0.0, 0.0,0.0,0.0,0.0, 0.0, 0.0, 1.0, 0.0, 0.0 ,0.0],
    'ㅑ':[0.0, 0.0, 0.0, 0.0, 0.0, 0.0,0.0,0.0,0.0, 0.0, 0.0, 1.0, 0.0, 0.0 ,1.0],
    'ㅐ':[0.0, 0.0, 0.0, 0.0, 0.0, 0.0,0.0,0.0,0.0, 0.0, 0.0, 1.0, 0.0, 1.0 ,0.0],
    'ㅒ':[0.0, 0.0, 0.0, 0.0, 0.0, 0.0,0.0,0.0,0.0, 0.0, 0.0, 1.0, 0.0, 1.0 ,1.0],
    'ㅓ':[0.0, 0.0, 0.0, 0.0, 0.0, 0.0,0.0,0.0,0.0, 0.0, 0.0, 1/3, 0.0, 0.0 ,0.0],
    'ㅕ':[0.0, 0.0, 0.0, 0.0, 0.0, 0.0,0.0,0.0,0.0, 0.0, 0.0, 1/3, 0.0, 0.0 ,1.0],
    'ㅔ':[0.0, 0.0, 0.0, 0.0, 0.0, 0.0,0.0,0.0,0.0, 0.0, 0.0, 1/3, 0.0, 1.0 ,0.0],
    'ㅖ':[0.0, 0.0, 0.0, 0.0, 0.0, 0.0,0.0,0.0,0.0, 0.0, 0.0, 1/3, 0.0, 1.0 ,1.0],
    'ㅗ':[0.0, 0.0, 0.0, 0.0, 0.0, 0.0,0.0,0.0,0.0, 0.0, 0.0, 0.0, 1.0, 0.0 ,0.0],
    'ㅛ':[0.0, 0.0, 0.0, 0.0, 0.0, 0.0,0.0,0.0,0.0, 0.0, 0.0, 0.0, 1.0, 0.0 ,1.0],
    'ㅜ':[0.0, 0.0, 0.0, 0.0, 0.0, 0.0,0.0,0.0,0.0, 0.0, 0.0, 0.0, 1/3, 0.0 ,0.0],
    'ㅠ':[0.0, 0.0, 0.0, 0.0, 0.0, 0.0,0.0,0.0,0.0, 0.0, 0.0, 0.0, 1/3, 0.0 ,1.0],
    'ㅡ':[0.0, 0.0, 0.0, 0.0, 0.0, 0.0,0.0,0.0,0.0, 0.0, 0.0, 0.0, 2/3, 0.0 ,0.0],
    'ㅣ':[0.0, 0.0, 0.0, 0.0, 0.0, 0.0,0.0,0.0,0.0, 0.0, 0.0, 2/3, 0.0, 0.0 ,0.0],
    'ㅘ':[0.0, 0.0, 0.0, 0.0, 0.0, 0.0,0.0,0.0,0.0, 0.0, 0.0, 1.0, 1.0, 0.0 ,0.0],
    'ㅙ':[0.0, 0.0, 0.0, 0.0, 0.0, 0.0,0.0,0.0,0.0, 0.0, 0.0, 1.0, 1.0, 1.0 ,0.0],
    'ㅚ':[0.0, 0.0, 0.0, 0.0, 0.0, 0.0,0.0,0.0,0.0, 0.0, 0.0, 2/3, 1.0, 1.0 ,0.0],
    'ㅝ':[0.0, 0.0, 0.0, 0.0, 0.0, 0.0,0.0,0.0,0.0, 0.0, 0.0, 1/3, 1/3, 0.0 ,0.0],
    'ㅞ':[0.0, 0.0, 0.0, 0.0, 0.0, 0.0,0.0,0.0,0.0, 0.0, 0.0, 1/3, 1/3, 1.0 ,0.0],
    'ㅝ':[0.0, 0.0, 0.0, 0.0, 0.0, 0.0,0.0,0.0,0.0, 0.0, 0.0, 2/3, 1/3, 0.0 ,0.0],
    'ㅢ':[0.0, 0.0, 0.0, 0.0, 0.0, 0.0,0.0,0.0,0.0, 0.0, 0.0, 2/3, 2/3, 0.0 ,0.0],
}

jamodict[''] = np.mean(np.array(list(jamodict.values()),dtype=np.float64), axis=0).tolist() # 공백은 각 열의 평균으로 처리하여 데이터의 편향을 방지
print(jamodict[''])

jamodict['ㄵ'] = list(np.add(jamodict['ㄴ'],jamodict['ㅈ'])/2)
jamodict['ㄺ'] = list(np.add(jamodict['ㄹ'],jamodict['ㄱ'])/2)
jamodict['ㅀ'] = list(np.add(jamodict['ㄹ'],jamodict['ㅎ'])/2)
jamodict['ㄻ'] = list(np.add(jamodict['ㄹ'],jamodict['ㅁ'])/2)
jamodict['ㄼ'] = list(np.add(jamodict['ㄹ'],jamodict['ㅂ'])/2)
jamodict['ㄾ'] = list(np.add(jamodict['ㄹ'],jamodict['ㅌ'])/2)
jamodict['ㄶ'] = list(np.add(jamodict['ㄴ'],jamodict['ㅎ'])/2)
jamodict['ㅄ'] = list(np.add(jamodict['ㅂ'],jamodict['ㅅ'])/2)
jamodict['ㄳ'] = list(np.add(jamodict['ㄱ'],jamodict['ㅅ'])/2)

[0.07894736842105263, 0.18421052631578946, 0.07894736842105263, 0.10526315789473684, 0.02631578947368421, 0.19736842105263158, 0.21052631578947367, 0.07894736842105263, 0.07894736842105263, 0.07894736842105263, 0.02631578947368421, 0.27192982456140347, 0.2017543859649123, 0.18421052631578946, 0.15789473684210525]


# 데이터 전처리

In [12]:
if not os.path.isfile('NIADic.csv'):
  rawdata = pd.read_excel('NIADic.xlsx') # 원본 데이터는 xlsx
  rawdata = rawdata.drop(['tag','category'],axis=1) # 불필요한 컬럼 제거
  rawdata.to_csv('NIADic.csv') # csv로 저장

rawdata = pd.read_csv('NIADic.csv')
print(len(rawdata)) # 전체 단어 수
exclude = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~ \t\n\r\x0b\x0c' # 여기에 포함된 문자가 있는 단어는 제외
exclude = [char for char in exclude]
rawdata = rawdata[rawdata['term'].map(lambda x: not any([char in x for char in exclude]))]
print(len(rawdata)) # 제외하고 남은 단어 수
rawdata = rawdata['term'].map(lambda x: j2hcj(h2j(x)))
rawdata = rawdata[rawdata.map(lambda x: len(x) < LEN+1)]
print(len(rawdata)) # LEN보다 긴 자모음 길이를 제외하고 남은 단어 수
jamodata = rawdata = rawdata[rawdata.map(lambda x: all([char in jamodict for char in x]))]
jamodata = np.array(jamodata)
print(len(rawdata)) # jamodict에 존재하지 않는 자음, 모음이 포함된 단어를 제외하고 남은 단어 수
rawdata = rawdata.map(lambda x: [jamodict[char] for char in x])
rawdata = rawdata.map(lambda x: (x + [jamodict[''] for _ in range(LEN - len(x))]))
data = np.array(list(rawdata),dtype=np.float64)

928677
920726
917246
885003


In [None]:
data.shape

(885003, 18, 15)

In [13]:
valididx = np.random.choice(len(data), 20000, False) # validation 데이터는 2만개
validdata = data[valididx]
data = np.delete(data, valididx, 0)
jamodata = np.delete(jamodata,valididx, 0)

trainidx = np.random.choice(len(data), 100000, False) # train 데이터는 10만개
traindata = data[trainidx]
data = np.delete(data, trainidx, 0)
jamodata = np.delete(jamodata,trainidx, 0)

In [None]:
print(jamodata.shape, data.shape)

(765003,) (765003, 18, 15)


In [None]:
print(traindata.shape, validdata.shape)

(100000, 18, 15) (20000, 18, 15)


# 학습

In [None]:
def scheduler(epoch, lr):
  if epoch % 30: # 30 epoch 마다 LR 변화
    return lr
  else:
    return lr * tf.math.exp(-0.1)
callback = tf.keras.callbacks.LearningRateScheduler(scheduler)

net.model.fit(traindata,traindata,epochs=1000,verbose=1,batch_size=BAT,validation_data=(validdata,validdata),callbacks=[callback])

Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000
Epoch 23/1000
Epoch 24/1000
Epoch 25/1000
Epoch 26/1000

KeyboardInterrupt: ignored

# 결과 테스트

In [13]:
# log cosh가 가장 적은 자모음을 선택
def returnnearest(dic, target):
  arr = []
  for k, v in dic.items():
    #arr.append(np.dot(v, target)/(np.linalg.norm(v)*np.linalg.norm(target))) #cosine_similarity
    arr.append(np.mean(np.log(np.cosh(np.subtract(v,target))))) #log_cosh
  return list(dic.keys())[np.argmin(arr)]

# 50000~50099번째 데이터로 테스트

In [16]:
predicteddata = net.model.predict(data[800000:800100]).tolist()
for i in predicteddata:
  for j,k in enumerate(i):
    i[j] = returnnearest(jamodict,i[j])

## 인코딩된 벡터 표시

In [None]:
import sys
np.set_printoptions(threshold=sys.maxsize)
vectors = net.encoder.predict(data[50000:50010])
vectors

array([[ 1.09091248e+02, -2.12875481e+01, -6.77406693e+01,
        -1.82579666e+02,  1.41653763e+02, -7.00715866e+01,
         1.58648672e+01, -2.21967499e+02, -9.61407928e+01,
        -8.15706482e+01,  2.31148834e+02,  1.26993698e+02,
        -1.56641159e+02, -1.06779388e+02,  2.30158508e+02,
        -7.45728149e+01,  1.86986980e+01,  3.56227600e+02,
         3.30851860e+01,  1.12808693e+02, -7.53944492e+00,
         1.41281998e+02, -2.31783340e+02,  1.87513153e+02,
         1.37185379e+02,  2.63655701e+02, -5.35008606e+02,
         8.44048615e+01, -6.71052647e+00, -2.74159241e+01,
         5.55314453e+02,  2.28748505e+02, -2.23186661e+02,
        -3.02713959e+02,  1.10337669e+02, -2.26198761e+02,
         2.03014908e+02, -1.59353714e+02, -9.77824326e+01,
        -2.10758316e+02,  4.54281235e+01,  1.43453476e+02,
        -3.68187897e+02, -2.26850082e+02, -6.27441483e+01,
         2.17245941e+02, -4.56791687e+01, -2.84845734e+02,
        -2.19686356e+02, -2.86347260e+02,  2.22052673e+0

## 기존 데이터와 비교

In [17]:
for i,v in enumerate(predicteddata):
 print(join_jamos(''.join(v)), '|', join_jamos(jamodata[i +800000]))

남해해물나라 | 남해해물나라
남해화학 | 남해화학
남해환경 | 남해환경
남해활어센퉈 | 남해활어센타
남해횟집수서 | 남해횟집수산
남해갈치찜 | 남해갈치찜
남해굴해장거 | 남해굴해장국
남해낙지골 | 남해낙지골
남해더왕대게 | 남해더왕대게
남해돌수산 | 남해돌수산
남해썬비치 | 남해썬비치
남해안활어 | 남해안활어
남해양곱차어 | 남해양곱창
남해찜나라 | 남해찜나라
남해찜마을 | 남해찜마을
남해활어회 | 남해활어회
남해회초밥 | 남해회초밥
남향통다 | 남향통닭
남향레미콘 | 남향레미콘
남현무역 | 남현무역
남형상사 | 남형상사
남호섬유 | 남호섬유
남호전기 | 남호전기
남화개발 | 남화개발
남화산업 | 남화산업
남화토건 | 남화토건
남화통상 | 남화통상
남화해운 | 남화해운
남흥건설 | 남흥건설
남흥삼겹사 | 남흥삼겹살
남흥여객 | 남흥여객
남흥토건 | 남흥토건
남희공조산어 | 남희공조산업
남희청국장 | 남희청국장
납작만두 | 납작만두
낫소골프 | 낫소골프
낭가맥주캠풔 | 낭가맥주캠프
낭낭실비마치 | 낭낭실비마차
낭독의발견 | 낭독의발견
낭랑구이 | 낭랑구이
낭만곱창 | 낭만곱창
낭만구이 | 낭만구이
낭만다발 | 낭만닭발
낭만돼지 | 낭만돼지
낭만부대짜개 | 낭만부대찌개
낭만순두브 | 낭만순두부
낭만우동 | 낭만우동
낭만자객 | 낭만자객
낭만쭈꾸미 | 낭만쭈꾸미
낭만콘서트 | 낭만콘서트
낭만바베큐숶 | 낭만바베큐숲
낭만소주방 | 낭만소주방
낭만을브탁 | 낭만을부탁해
낭유리곱창 | 낭유리곱창
낭주골낙자춰 | 낭주골낙지촌
낮술 | 낮술
낮젠밥밤엔 | 낮젠밥밤엔술
낯선엡 | 낯선웹
낯선조류 | 낯선조류
내곁에잇어 | 내곁에있어
내곁에잇어지 | 내곁에있어줘
내고향스페셜 | 내고향스페셜
내남자의여자 | 내남자의여자
내눈에콩각저 | 내눈에콩깍지
내딸꽃님이 | 내딸꽃님이
내딸서영이 | 내딸서영이
내마음와풍구 | 내마음의풍금
내사랑 | 내사랑
내사랑내겨ㄸ에 | 내사랑내곁에
내사랑뚱 | 내사랑뚱
내사랑못난어 | 내사랑못난이
내사랑싸가지 | 내사랑싸가지
내사랑웬수 | 내사랑웬수
내사

# 모델 저장용

In [None]:
net.encoder.save("encoder.h5")
net.decoder.save("decoder.h5")
net.model.save("model.h5")



In [None]:
from google.colab import files
files.download('encoder.h5') 
files.download('decoder.h5') 
files.download('model.h5')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# 전체 데이터로 테스트(오래걸림)

In [18]:
count = 0
allcount = 0

for i in range(int(len(data)/10000)):
  try: 
    predicteddata = net.model.predict(data[i*10000:i * 10000 + 10000]).tolist()
    for idx, j in enumerate(predicteddata):
      for k,l in enumerate(j):
        j[k] = returnnearest(jamodict,j[k])
      #print(join_jamos(''.join(j)), join_jamos(jamodata[idx + i*10000]))
      allcount += 1
      if join_jamos(''.join(j)) == join_jamos(jamodata[idx + i*10000]):
        count += 1
    print(count / allcount)
  except ValueError as e:
    print(data[i*10000:i * 10000 + 10000])

print(count / allcount)

0.9394
0.9432
0.9454333333333333
0.948725
0.95096
0.95145
0.9511714285714286
0.9493375
0.9477222222222222
0.94703
0.9461727272727273
0.94555
0.946176923076923
0.9454857142857143
0.9446933333333334
0.94498125
0.9450470588235295
0.9453944444444444
0.9452052631578948
0.944755
0.9451714285714286
0.9456227272727272
0.9460608695652174
0.9455333333333333
0.945444
0.9458423076923077
0.9464407407407407
0.9467821428571429
0.9472551724137931
0.9472866666666667
0.9463258064516129
0.94544375
0.9453
0.9447588235294118
0.9451771428571428
0.9447416666666667
0.9446513513513514
0.945028947368421
0.9447846153846153
0.9447575
0.9453390243902439
0.945897619047619
0.9462069767441861
0.9465477272727273
0.9466088888888889
0.9465282608695652
0.9468446808510639
0.9470583333333333
0.9467510204081633
0.94689
0.9469901960784314
0.9469884615384615
0.9470603773584906
0.9473759259259259
0.9468418181818182
0.9462625
0.9447701754385965
0.9388362068965517
0.9316830508474576
0.9245116666666666
0.9201754098360656
0.917683