[View in Colaboratory](https://colab.research.google.com/github/TTaEE/Thai_NLP/blob/master/word_segmentation.ipynb)

In [0]:
!pip install -U -q PyDrive
import os
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# Authenticate and create the PyDrive client.
# This only needs to be done once per notebook.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

# Mirror the Drive folder whose id is hard-coded below into
# ~/BEST2010/<sub-folder title>/<file title>.
local_download_path = os.path.expanduser('~/BEST2010/')
# exist_ok=True keeps the cell re-runnable, while real failures (permissions,
# a regular file in the way, ...) still surface instead of being swallowed
# by a bare `except: pass`.
os.makedirs(local_download_path, exist_ok=True)

best2010 = drive.ListFile({'q': "'1GxDYvKHO6LZjh5w_npqAfDdkx-s7IH-E' in parents"}).GetList()
for article in best2010:
  # Each child of the root folder becomes a local directory named after it.
  article_path = os.path.join(local_download_path, article['title'])
  os.makedirs(article_path, exist_ok=True)
  file_list = drive.ListFile({'q': "'{}' in parents".format(article['id'])}).GetList()
  # `drive_file` rather than `file`, to avoid shadowing a common builtin-like name.
  for drive_file in file_list:
    file_path = os.path.join(article_path, drive_file['title'])
    f_ = drive.CreateFile({'id': drive_file['id']})
    f_.GetContentFile(file_path)

In [2]:
import numpy as np 
import pandas as pd 

import os
import re
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score
from keras.models import Model
from keras.layers import Input, Dense, Embedding, \
                         Flatten, Dropout, \
                         GRU, Bidirectional
from keras.optimizers import Adam

Using TensorFlow backend.


In [0]:
# Character vocabulary used to encode text as integer ids.
# Contents, in order: newline, space, ASCII punctuation / digits / letters,
# Thai consonants, vowels, tone marks and digits, the curly single quotes,
# the byte-order mark (u'\ufeff') and, as the final entry, the u'unk'
# placeholder. The position of a character in this list is its id.
CHARS_LIST = [
    u'\n', u' ', u'!', u'"', u'#', u'$', u'%', u'&', "'", u'(', u')', u'*', u'+',
    u',', u'-', u'.', u'/', u'0', u'1', u'2', u'3', u'4', u'5', u'6', u'7', u'8',
    u'9', u':', u';', u'<', u'=', u'>', u'?', u'@', u'A', u'B', u'C', u'D', u'E',
    u'F', u'G', u'H', u'I', u'J', u'K', u'L', u'M', u'N', u'O', u'P', u'Q', u'R',
    u'S', u'T', u'U', u'V', u'W', u'X', u'Y', u'Z', u'[', u'\\', u']', u'^', u'_',
    u'a', u'b', u'c', u'd', u'e', u'f', u'g', u'h', u'i', u'j', u'k', u'l', u'm',
    u'n', u'o', u'p', u'q', u'r', u's', u't', u'u', u'v', u'w', u'x', u'y',
    u'z', u'}', u'~', u'ก', u'ข', u'ฃ', u'ค', u'ฅ', u'ฆ', u'ง', u'จ', u'ฉ', u'ช',
    u'ซ', u'ฌ', u'ญ', u'ฎ', u'ฏ', u'ฐ', u'ฑ', u'ฒ', u'ณ', u'ด', u'ต', u'ถ', u'ท',
    u'ธ', u'น', u'บ', u'ป', u'ผ', u'ฝ', u'พ', u'ฟ', u'ภ', u'ม', u'ย', u'ร', u'ฤ',
    u'ล', u'ว', u'ศ', u'ษ', u'ส', u'ห', u'ฬ', u'อ', u'ฮ', u'ฯ', u'ะ', u'ั', u'า',
    u'ำ', u'ิ', u'ี', u'ึ', u'ื', u'ุ', u'ู', u'ฺ', u'เ', u'แ', u'โ', u'ใ', u'ไ',
    u'ๅ', u'ๆ', u'็', u'่', u'้', u'๊', u'๋', u'์', u'ํ', u'๐', u'๑', u'๒', u'๓',
    u'๔', u'๕', u'๖', u'๗', u'๘', u'๙', u'‘', u'’', u'\ufeff', u'unk'
]

# Reverse lookup: character -> its position (id) in CHARS_LIST.
CHARS_MAP = {v: k for k, v in enumerate(CHARS_LIST)}


In [0]:
# Root of the downloaded corpus. Anchored to the home directory so it is the
# same location the download cell writes to (`~/BEST2010/`), regardless of the
# kernel's current working directory.
best_train_set = os.path.expanduser('~/BEST2010')

In [0]:
def build_target_feature(text, n_pad=21):
    """Turn '|'-delimited text into per-character rows with a boundary flag.

    Anything matching ``<...>`` is stripped first, then the text is padded
    with ``int((n_pad - 1) / 2)`` spaces on each side. Every ``'|'`` is
    dropped from the character stream and instead marks the character
    immediately before it.

    Args:
        text: Input string; ``'|'`` is treated as the delimiter.
        n_pad: Window width used to derive the amount of space padding.

    Returns:
        A ``pandas.DataFrame`` with column ``'n'`` (the characters, padding
        included) and column ``'f'`` (``True`` where the character was
        directly followed by ``'|'``, ``False`` otherwise).
    """
    pad = int((n_pad-1)/2)
    text = re.sub('<[^>]*>', '', text)
    padding = [' '] * pad

    chars = []
    flags = []
    for ch in padding + list(text) + padding:
        if ch == '|':
            # Flag the previous character. A delimiter with nothing before it
            # (possible only when there is no padding) is ignored rather than
            # popping from an empty list.
            if flags:
                flags[-1] = True
            continue
        chars.append(ch)
        flags.append(False)
    return pd.DataFrame(data = {'n': chars, 'f': flags})
    

def build_n_gram_df(df,n_pad):
    """Expand a per-character frame into fixed-width context windows.

    For ``pad = int((n_pad - 1) / 2)`` the result has the columns
    ``n-1 .. n-pad``, ``n``, ``n+1 .. n+pad`` and ``f`` (in that order), where
    ``n-k`` / ``n+k`` hold the value of ``n`` from k rows before / after.

    Args:
        df: Frame with at least the columns ``'n'`` and ``'f'``. It is not
            modified.
        n_pad: Total window width.

    Returns:
        A new frame without the first and last ``pad`` rows, i.e. without the
        rows whose window would run past either end of ``df``.
    """
    pad = int((n_pad-1)/2)
    # Work on a copy so the caller's frame does not silently gain the
    # shifted helper columns.
    ng = df.copy()
    for i in range(pad):
        ng['n+{0}'.format(i+1)] = ng['n'].shift(-i-1)
        ng['n{0}'.format(-i-1)] = ng['n'].shift(i+1)

    before = ['n-{0}'.format(i+1) for i in range(pad)]
    after = ['n+{0}'.format(i+1) for i in range(pad)]
    ng = ng[before + ['n'] + after + ['f']]
    # Explicit stop index: the former `ng[pad:-pad]` collapses to an empty
    # frame when pad is 0 because `-0 == 0`.
    return ng.iloc[pad:max(len(ng) - pad, pad)]
  
def evaluate(x_test, y_test, model):
    """Score a trained model on held-out data.

    The model's predictions are flattened and thresholded at 0.5 to obtain
    0/1 labels, which are then compared against ``y_test``.

    Returns:
        A ``(f1, precision, recall)`` tuple.
    """
    scores = model.predict([x_test]).ravel()
    labels = (scores > 0.5).astype(int)
    return (
        f1_score(y_test, labels),
        precision_score(y_test, labels),
        recall_score(y_test, labels),
    )

In [0]:
def bi_rnn_model(n_gram, v_size, em_size, hidden_size, dropout):
    """Build and compile the two-layer bidirectional GRU classifier.

    Architecture, in order: embedding (``v_size`` x ``em_size``) -> dropout
    0.2 -> two stacked bidirectional GRU layers that return full sequences ->
    flatten -> dense ReLU of width ``em_size`` -> single sigmoid unit.

    Args:
        n_gram: Length of each input window.
        v_size: Input dimension of the embedding.
        em_size: Embedding width, also used for the dense layer.
        hidden_size: Units per GRU direction.
        dropout: Used as both input and recurrent dropout of the GRUs.

    Returns:
        A model compiled with Adam, binary cross-entropy and accuracy.
    """
    window = Input(shape=(n_gram,))
    features = Dropout(0.2)(Embedding(v_size, em_size)(window))

    # Two identical recurrent layers stacked on top of each other.
    for _ in range(2):
        recurrent = GRU(
            hidden_size,
            recurrent_dropout=dropout,
            dropout=dropout,
            return_sequences=True,
        )
        features = Bidirectional(recurrent)(features)

    flat = Flatten()(features)
    hidden = Dense(em_size, activation='relu')(flat)
    probability = Dense(1, activation='sigmoid')(hidden)

    network = Model(inputs=window, outputs=probability)
    network.compile(
        optimizer=Adam(),
        loss='binary_crossentropy',
        metrics=['acc'],
    )
    return network

In [8]:
articles = ['novel','news','article','encyclopedia']
ngram = 21
# One embedding row per vocabulary entry; derived from CHARS_LIST so the two
# cannot drift apart (this evaluates to the previously hard-coded 178).
v_size = len(CHARS_LIST)
em_size = 128
hidden_size = 200
droupout = 0.5
# Characters missing from CHARS_MAP share the dedicated 'unk' entry. The old
# fallback id of 178 was one past the last embedding row (valid ids are
# 0 .. v_size - 1), i.e. an out-of-range lookup.
unk_index = CHARS_MAP['unk']

model = bi_rnn_model(ngram, v_size, em_size, hidden_size, droupout)


def _read_corpus(directory, file_names):
    """Return the concatenated contents of `file_names` inside `directory`."""
    parts = []
    for name in file_names:
        # Explicit encoding so decoding does not depend on the machine's
        # locale. NOTE(review): assumes the corpus files are UTF-8 - confirm.
        with open(os.path.join(directory, name), encoding='utf-8') as handle:
            parts.append(handle.read())
    return ''.join(parts)


# The same model instance is trained on one category after another, and is
# scored on that category's held-out files right after its training pass.
for article in articles:

    _path = os.path.join(best_train_set,article)
    _files = os.listdir(_path)
    print('train with {0}'.format(article))

    # load text: 90% of the files for training, 10% held out
    train_files, test_files = train_test_split(_files, test_size = 0.1, random_state=None)
    train_data = _read_corpus(_path, train_files)
    test_data = _read_corpus(_path, test_files)

    # prepare data (padding width is tied to the window width explicitly)
    train_df = build_target_feature(train_data, ngram)
    test_df = build_target_feature(test_data, ngram)

    train_df['n'] = train_df['n'].map(lambda ch: CHARS_MAP.get(ch, unk_index))
    test_df['n'] = test_df['n'].map(lambda ch: CHARS_MAP.get(ch, unk_index))

    pre_train_df = build_n_gram_df(train_df, ngram)
    pre_test_df = build_n_gram_df(test_df, ngram)

    # `.values` instead of `.as_matrix()`, which newer pandas no longer has.
    x_train = pre_train_df.drop(['f'], axis=1).values
    x_test = pre_test_df.drop(['f'], axis=1).values
    y_train = pre_train_df['f'].values
    y_test = pre_test_df['f'].values

    # train model
    model.fit(x_train,y_train,batch_size=4096,epochs=3)
    # test model (evaluate() runs the prediction itself, so it is not
    # computed a second time here)
    f1score, precision, recall = evaluate(x_test, y_test ,model)

    print ('score - \n F1 : {0} \n Precision : {1} \n Recall : {2}'.format(f1score, precision, recall))

train with novel
Epoch 1/3

Epoch 2/3

Epoch 3/3
 360448/5116411 [=>............................] - ETA: 19:09 - loss: 0.0483 - acc: 0.9823



score - 
 F1 : 0.9744298548721493 
 Precision : 0.9658050547306237 
 Recall : 0.9832100848339717
train with news
Epoch 1/3

Epoch 2/3
 294912/5839801 [>.............................] - ETA: 22:24 - loss: 0.0388 - acc: 0.9858



Epoch 3/3

score - 
 F1 : 0.9792475862834314 
 Precision : 0.9763839271600645 
 Recall : 0.9821280926031928
train with article
Epoch 1/3

Epoch 2/3
 700416/4051671 [====>.........................] - ETA: 13:29 - loss: 0.0268 - acc: 0.9907

Epoch 3/3
 557056/4051671 [===>..........................] - ETA: 14:04 - loss: 0.0226 - acc: 0.9923

score - 
 F1 : 0.9843264614515672 
 Precision : 0.9802728058016432 
 Recall : 0.9884137819279305
train with encyclopedia
Epoch 1/3
 299008/3955291 [=>............................] - ETA: 14:45 - loss: 0.0502 - acc: 0.9825

Epoch 2/3
 417792/3955291 [==>...........................] - ETA: 14:15 - loss: 0.0298 - acc: 0.9898

Epoch 3/3
 471040/3955291 [==>...........................] - ETA: 14:01 - loss: 0.0262 - acc: 0.9910

score - 
 F1 : 0.9735175216723956 
 Precision : 0.9661528955408328 
 Recall : 0.9809952858549508
