### 0. Import required modules

In [1]:
import re
import pickle
from collections import Counter
import numpy as np

from keras.models import Sequential, Model
from keras.layers import Dense, Input, Flatten, Embedding, Concatenate, Conv1D, BatchNormalization, TimeDistributed, GRU, Reshape
from keras.preprocessing.sequence import pad_sequences
from keras.utils.vis_utils import plot_model
from model import CNNBasedRNN
from model import RNN
from keras.utils import to_categorical

Using TensorFlow backend.


### 1. Preprocess data and build word2idx

In [2]:
# Longest sentence (in characters, spaces included) that is kept for training;
# also passed as `maxlen` when the sequences are padded.
MAX_SEQUENCE_LENGTH = 200
def hangulExtractor(str):
    """Remove every character that is not on the whitelist from ``str``.

    Kept characters: space, ``!``, ``?``, ``.``, ``$``, the Hangul consonant
    jamo ㄱ-ㅎ, the vowel jamo ㅣ, the Hangul syllables 가-힣 and the ASCII
    letters a-z / A-Z.  The ``|`` characters sit inside the character class,
    where they are matched literally, so pipe characters are kept as well.
    Everything else (digits, newlines, other punctuation, ...) is dropped.

    Args:
        str: Text to clean.  The name shadows the builtin but is part of the
            existing signature, so it is left unchanged.

    Returns:
        The text with all non-whitelisted characters removed.
    """
    # Alternative pattern left by the original author (all compatibility jamo
    # plus Hangul syllables, no Latin letters or punctuation):
    # '[^ \u3131-\u3163\uac00-\ud7a3]+'
    return re.sub('[^ !?.$ㄱ-ㅎㅣ가-힣|a-z|A-Z]+', '', str)
# Path of the raw corpus text file that generate_word2idx() reads.
rawfile = 'raw_spacing_corpus.txt'

def load_obj(name):
    """Load and return the object pickled in ``<name>.pkl``.

    Args:
        name: Path of the pickle file without its ``.pkl`` suffix.

    Returns:
        The unpickled object.
    """
    # SECURITY: unpickling can execute arbitrary code; only load trusted files.
    pickle_path = name + '.pkl'
    with open(pickle_path, 'rb') as handle:
        loaded = pickle.load(handle)
    return loaded

def generate_word2idx():
    """Build the character -> index vocabulary from the raw corpus.

    Reads ``rawfile`` line by line, cleans each line with ``hangulExtractor``,
    removes spaces and counts the remaining characters.  Characters are then
    numbered from 1 in order of descending frequency (index 0 is never
    assigned).  The resulting mapping is also pickled to ``word2idx.pkl``.

    Returns:
        dict mapping each character to its integer index (>= 1).
    """
    # Count per line instead of materialising every character of the corpus
    # in one list; first-seen order (used to break ties) is unchanged.
    char_counts = Counter()
    with open(rawfile, 'r') as corpus:
        for raw_line in corpus:
            cleaned = hangulExtractor(raw_line)
            char_counts.update(cleaned.rstrip().replace(' ', ''))

    word2idx = {
        char: rank
        for rank, (char, _count) in enumerate(char_counts.most_common(), 1)
    }

    # Persist the vocabulary so it can be reloaded without the corpus.
    with open('word2idx.pkl', 'wb') as out_file:
        pickle.dump(word2idx, out_file, pickle.HIGHEST_PROTOCOL)

    return word2idx

# Embedding width used for every character vector.
EMBEDDING_DIM = 100

# Build (and pickle) the vocabulary, then record its size.
word2idx = generate_word2idx()
DIC_SIZE = len(word2idx)

# Randomly initialised embedding table; one extra row because vocabulary
# indices start at 1, leaving row 0 unused by any character.
embedding_matrix = np.random.random((DIC_SIZE + 1, EMBEDDING_DIM))

### 3. Data loading and conversion

In [3]:
def raw_corpus():
    """Read the raw corpus and return its cleaned lines.

    Every line of ``raw_spacing_corpus.txt`` is right-stripped and then passed
    through ``hangulExtractor``.

    Returns:
        list of cleaned sentence strings, one entry per line of the file.
    """
    corpus_path = 'raw_spacing_corpus.txt'
    with open(corpus_path, 'r') as corpus:
        return [hangulExtractor(row.rstrip()) for row in corpus]

def generate_trainset(raw_data, vocab=None, max_len=None):
    """Convert sentences into character-index and spacing-label sequences.

    Spaces are removed from the input; each remaining character gets the
    label 1 when it is followed by a space or is the last character of the
    sentence, and 0 otherwise.  Example:

        sentence: "I have an apple"
        X chars : I h a v e a n a p p l e   (as vocabulary indices)
        Y labels: 1 0 0 0 1 0 1 0 0 0 0 1

    No padding is done here; the sequences have variable length and the
    caller is expected to pad them.

    NOTE(review): label 0 ("no space follows") has the same value as a zero
    padding value, so padded positions and in-word characters are not
    distinguishable in the labels.  The values are kept as they are so that
    already-trained models stay valid -- confirm whether 1/2 labels with 0
    reserved for padding were intended.

    Args:
        raw_data: Iterable of sentence strings.
        vocab: Mapping from character to integer index.  Defaults to the
            module-level ``word2idx``.
        max_len: Sentences longer than this (spaces included) are skipped.
            Defaults to the module-level ``MAX_SEQUENCE_LENGTH``.

    Returns:
        Tuple ``(X, Y)`` of two parallel lists; ``X[i]`` holds the character
        indices of the i-th kept sentence and ``Y[i]`` the matching labels.
        A sentence without any non-space character yields two empty lists.

    Raises:
        KeyError: If a non-space character is missing from ``vocab``.
    """
    if vocab is None:
        vocab = word2idx
    if max_len is None:
        max_len = MAX_SEQUENCE_LENGTH

    space_mark = '^'  # stand-in for a space, as in the original encoding

    X, Y = [], []
    for sent in raw_data:
        marked = sent.replace(' ', space_mark)
        if len(marked) > max_len:
            continue  # over-long sentences are dropped, not truncated

        last = len(marked) - 1
        tmp_x, tmp_y = [], []
        for idx, char in enumerate(marked):
            # Spaces never become inputs.  This also covers a trailing space,
            # which used to be looked up in the vocabulary and raise KeyError.
            if char == space_mark:
                continue
            tmp_x.append(vocab[char])
            ends_word = idx == last or marked[idx + 1] == space_mark
            tmp_y.append(1 if ends_word else 0)

        X.append(tmp_x)
        Y.append(tmp_y)
    return X, Y

In [4]:
# Load the cleaned corpus, then convert it to index / label sequences.
raw_data = raw_corpus()
X, Y = generate_trainset(raw_data)

In [5]:
# Bring every input sequence to MAX_SEQUENCE_LENGTH.
X_train = pad_sequences(X, maxlen=MAX_SEQUENCE_LENGTH)

# Pad the labels the same way and one-hot encode them in a single step.
# NOTE(review): num_classes=3 although generate_trainset() only emits the
# labels 0 and 1 -- confirm whether a third class is intended.
Y_train = to_categorical(
    pad_sequences(Y, maxlen=MAX_SEQUENCE_LENGTH),
    num_classes=3,
)

### Model Construction 

In [6]:
# Build the network with the project-local `RNN` factory imported from model.py
# (its architecture is defined there, not in this notebook).
model = RNN(MAX_SEQUENCE_LENGTH=MAX_SEQUENCE_LENGTH, DIC_SIZE=DIC_SIZE, EMBEDDING_DIM=EMBEDDING_DIM)

  model = Model(inputs=seq_input, output=b)


In [7]:
# Write a diagram of the model to model_plot.png, including layer shapes and names.
plot_model(model, to_file='model_plot.png', show_shapes=True, show_layer_names=True)

In [8]:
# Configure training: Adam optimizer, categorical cross-entropy on the one-hot
# labels, accuracy reported as metric.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

# Train on the padded data set.
model.fit(
    X_train,
    Y_train,
    batch_size=1024,
    epochs=10,
)
# TODO (original author): make padded dataset for both X, Y.
# NOTE(review): X and Y are already padded with pad_sequences in an earlier
# cell -- confirm whether this TODO is still open.

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7fdcbb8ff5f8>

In [10]:
# Save the fitted model to spacing_model.h5 in the working directory.
model.save('spacing_model.h5')

In [11]:
!ls -lha 

합계 17M
drwxrwxr-x 4 sghan sghan 4.0K 10월 26 01:07 .
drwxrwxr-x 6 sghan sghan 4.0K 10월 24 18:27 ..
drwxrwxr-x 2 sghan sghan 4.0K 10월 25 22:54 .ipynb_checkpoints
-rw-r--r-- 1 sghan sghan  12K 10월 26 00:58 .model.py.swp
-rw-rw-r-- 1 sghan sghan 2.0K 10월 24 18:43 Untitled.ipynb
-rw-rw-r-- 1 sghan sghan 1.7K 10월 26 00:58 Untitled1.ipynb
drwxrwxr-x 2 sghan sghan 4.0K 10월 26 00:58 __pycache__
-rw-rw-r-- 1 sghan sghan 2.0K 10월 26 00:58 model.py
-rw-rw-r-- 1 sghan sghan  30K 10월 26 00:58 model_plot.png
-rw-rw-r-- 1 sghan sghan  13M 10월 24 18:27 raw_spacing_corpus.txt
-rw-rw-r-- 1 sghan sghan 2.8K 10월 24 18:27 sample.txt
-rw-rw-r-- 1 sghan sghan 8.1K 10월 26 01:06 spacing.ipynb
-rw-rw-r-- 1 sghan sghan 4.2M 10월 26 01:07 spacing_model.h5
-rw-rw-r-- 1 sghan sghan  17K 10월 24 18:27 word2idx
-rw-rw-r-- 1 sghan sghan  17K 10월 26 00:58 word2idx.pkl
