### 0. Import module required|

In [1]:
import re
import pickle
from collections import Counter
import numpy as np

from keras.models import Sequential, Model
from keras.layers import Dense, Input, Flatten, Embedding, Concatenate, Conv1D, BatchNormalization, TimeDistributed, GRU, Reshape
from keras.preprocessing.sequence import pad_sequences

Using TensorFlow backend.


### 1. Data preprocessing and build word2idx 

In [2]:
MAX_SEQUENCE_LENGTH=200
def hangulExtractor(str):
    hangul = re.compile('[^ !?.$ㄱ-ㅎㅣ가-힣|a-z|A-Z]+') # 한글과 영어 띄어쓰기 중요 문장부호를 제외한 모든 글자
    # hangul = re.compile('[^ \u3131-\u3163\uac00-\ud7a3]+')  
    result = hangul.sub('', str)
    return result
rawfile = 'raw_spacing_corpus.txt'

def load_obj(name ):
    with open(name + '.pkl', 'rb') as f:
        return pickle.load(f)

def generate_word2idx():
    tmp_corpus=[]
    word2idx={}
    with open(rawfile,'r') as fp:
        for line in fp:
            line=hangulExtractor(line)
            tmp_corpus.extend(list(line.rstrip().replace(' ','')))
        
    for idx, (char, freq) in enumerate(Counter(tmp_corpus).most_common(),1):
        word2idx[char]=idx
    
    def save_obj(obj, word2idx_file):
        with open(word2idx_file,'wb') as f:
            pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)
            
    save_obj(word2idx,'word2idx.pkl')
    
    return word2idx

word2idx = generate_word2idx()
EMBEDDING_DIM = 100
DIC_SIZE = len(word2idx)
embedding_matrix = np.random.random((len(word2idx)+1, EMBEDDING_DIM))

### 3. Data loading and and convert

In [3]:
def raw_corpus():
    Filename = 'raw_spacing_corpus.txt'
    raw_data=[]
    with open(Filename,'r') as f:
        for line in f:
            raw_data.append(hangulExtractor(line.rstrip()))
    return raw_data

def generate_trainset(raw_data):
    X,Y=[],[]
    for sent in raw_data:
        sent=sent.replace(' ','^')
        segmented = list(sent)
        
        if len(segmented)<=MAX_SEQUENCE_LENGTH:
            tmp_x, tmp_y = [],[]
            tmp_length=len(segmented)
            for idx in range(tmp_length):
            
                if idx < tmp_length-1:
                    if segmented[idx] != '^':
                        tmp_x.append(word2idx[segmented[idx]])
                    
                        if idx+1 < tmp_length:
                            if segmented[idx+1] == '^':
                                tmp_y.append(1)
                            else:
                                tmp_y.append(0)
                        else:
                            tmp_y.append(0)
                elif idx == tmp_length-1:
                    tmp_x.append(word2idx[segmented[idx]])
                    tmp_y.append(1)
            Y.append(tmp_y)
            X.append(tmp_x)
            tmp_x, tmp_x = [],[]        
    return X,Y

In [4]:
raw_data = raw_corpus()
X,Y = generate_trainset(raw_data)

In [5]:
X_train = pad_sequences(X, maxlen=MAX_SEQUENCE_LENGTH)

### Model Construction 

In [6]:
def build_model():
    
    seq_input = Input(shape=(MAX_SEQUENCE_LENGTH,))
    embedded_input = Embedding(DIC_SIZE, EMBEDDING_DIM, input_length = MAX_SEQUENCE_LENGTH)(seq_input)

    convolution1= Conv1D(kernel_size=1, filters=32, padding = 'same')(embedded_input)
    convolution2= Conv1D(kernel_size=2, filters=64, padding = 'same')(embedded_input)
    convolution3= Conv1D(kernel_size=3, filters=128, padding = 'same')(embedded_input)
    convolution4= Conv1D(kernel_size=4, filters=256, padding = 'same')(embedded_input)
    
    concatenate = Concatenate(axis=2)([convolution1,convolution2,convolution3,convolution4])
    batchnormalization = BatchNormalization()(concatenate)
    timedistributed1 = TimeDistributed(Dense(300))(batchnormalization)
    timedistributed2 = TimeDistributed(Dense(150))(timedistributed1)
    gru = GRU(units=50, return_sequences=True)(timedistributed2)
    timedistributed3 = TimeDistributed(Dense(1))(gru)
    b = Reshape((200,))(timedistributed3)
    
    model = Model(inputs=seq_input, output=b)
    return model 

In [7]:
model= build_model()



In [8]:
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 200)          0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 200, 100)     194700      input_1[0][0]                    
__________________________________________________________________________________________________
conv1d_1 (Conv1D)               (None, 200, 32)      3232        embedding_1[0][0]                
__________________________________________________________________________________________________
conv1d_2 (Conv1D)               (None, 200, 64)      12864       embedding_1[0][0]                
__________________________________________________________________________________________________
conv1d_3 (