# Thai Named Entity Recognition Using Bi-LSTM + CRF
## The model uses a bidirectional LSTM with word/character representations and a CRF for sequence tagging

Implemented with Keras (TensorFlow backend).

- Word embedding: Thai2Fit 0.32, 400 dimensions
- Character embedding: LSTM trained from scratch, 32 dimensions

## Declare Path for Model and Dataset
Declare the user paths used to load/save the model and dataset.

This section defines:
- raw path
- model path
- word embedding path
- dictionary path (char2index / ner2index)

In [1]:
# Root directory; every other location below is derived from it.
DATA_PATH = './'

# Raw dataset files.
RAW_PATH = DATA_PATH + 'raw/'
# Saved Keras weights for the word + character model.
MODEL_PATH = DATA_PATH + 'model/Keras/WordCharModel/'
# Pretrained Thai2Fit word vectors.
W_MODEL_PATH = DATA_PATH + 'model/thai2fit/'
# Pickled lookup dictionaries (character -> index, NER tag -> index).
Dict_MODEL_PATH = DATA_PATH + 'model/dictionary/'

# Version

Keras 2.1.6 (pip install keras==2.1.6)

Python 3.5.2

In [2]:
import keras
# Show the installed Keras version; this notebook was written against Keras 2.1.6.
print(keras.__version__)

Using TensorFlow backend.


2.1.6


# Import

In [3]:
# Save / Load File
import dill
import pickle

# Plot Graph
import matplotlib.pyplot as plt

# Sklearn Report
import sklearn
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import LabelBinarizer
from itertools import chain

# Load Vectors
from gensim.models import KeyedVectors

# Utility
import numpy as np
import time

# Model Utility
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import pandas as pd

# Keras Model
from keras.models import load_model
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model, Input
from keras.utils import to_categorical
from keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Conv1D
from keras.layers import Bidirectional, concatenate, SpatialDropout1D, GlobalMaxPooling1D
from keras_contrib.layers import CRF
from keras.callbacks import ModelCheckpoint

from pythainlp.tokenize import word_tokenize

# ELMO Model
from allennlp.modules.elmo import Elmo, batch_to_ids

Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.


## Load Thai2Fit Word Embedding
Load the binary Thai2Fit (0.32) word vectors, trained on Wikipedia using the ULMFiT model.

The word-vector matrix has 55,677 rows × 400 dimensions.

credit: https://github.com/cstorm125/thai2fit

In [4]:
# Load the pretrained Thai2Fit vectors stored in binary word2vec format.
thai2fit_model = KeyedVectors.load_word2vec_format(
    W_MODEL_PATH + 'thai2vecNoSym.bin',
    binary=True,
)
# Raw embedding matrix; its rows follow the order of thai2fit_model.index2word.
thai2fit_weight = thai2fit_model.vectors

# token -> vector lookup (used for vocabulary membership checks).
thai2dict = {}

for word in thai2fit_model.index2word:
    thai2dict[word] = thai2fit_model[word]

# Alphabetically sorted vocabulary.
all_thai2dict = sorted(set(thai2dict))

# token -> row index. The indices are fed to an Embedding layer initialised
# with thai2fit_weight, so they MUST follow index2word order. Enumerating the
# plain dict `thai2dict` only gives that order on Python >= 3.7 (insertion
# order is not guaranteed on the Python 3.5.2 this notebook declares), so the
# mapping is built from index2word directly.
thai2dict_to_ix = {
    token: row for row, token in enumerate(thai2fit_model.index2word)
}
# row index -> token (inverse mapping).
ix_to_thai2dict = {row: token for token, row in thai2dict_to_ix.items()}

n_thai2dict = len(thai2dict_to_ix)

## Load NER Dictionary

In [5]:
# NOTE: pickle can execute arbitrary code while loading; only open
# dictionary files that come from a trusted source.
with open(Dict_MODEL_PATH + 'nerdict.pickle', 'rb') as nerdict:
    ner_to_ix = pickle.load(nerdict)

# Inverse lookup: tag index -> tag string.
ix_to_ner = {index: tag for tag, index in ner_to_ix.items()}
# Number of distinct NER tags.
n_tag = len(ner_to_ix)

## Load Character Dictionary

In [6]:
# Load the pickled character -> index dictionary.
# NOTE: pickle can execute arbitrary code while loading; only open trusted files.
with open(Dict_MODEL_PATH+'chardict.pickle', 'rb') as chardict:
    char2idx = pickle.load(chardict)

# Size of the character vocabulary (input_dim of the character Embedding layer).
n_chars = len(char2idx)

# Set Parameters and Hyperparameters

In [7]:
# Sequence sizes: every sentence is padded/truncated to max_len tokens and
# every token to max_len_char characters.
max_len = 250
max_len_char = 30

# Character-level encoder: LSTM units per direction and character embedding size.
character_LSTM_unit = 32
char_embedding_dim = 32
# Sentence-level encoder: LSTM units per direction.
main_lstm_unit = 256 ## Bidirectional 256 + 256 = 512
# Recurrent dropout rate shared by the character and main LSTMs.
lstm_recurrent_dropout = 0.5

# Training settings (not referenced by the inference cells visible here).
train_batch_size = 32
train_epochs = 50

## Mapping Function 

In [8]:
def prepare_sequence_word(input_text, word_to_ix=None, unk_token="unknown"):
    """Map a sequence of word tokens to their vocabulary indices.

    Parameters
    ----------
    input_text : iterable of str
        Tokenized sentence.
    word_to_ix : dict, optional
        Token -> index mapping. Defaults to the module-level
        ``thai2dict_to_ix`` built from the Thai2Fit vocabulary.
    unk_token : str, optional
        Vocabulary entry substituted for out-of-vocabulary tokens.

    Returns
    -------
    list
        One index per input token, in input order.

    Raises
    ------
    KeyError
        If an out-of-vocabulary token is met and ``unk_token`` itself is
        missing from the mapping.
    """
    if word_to_ix is None:
        word_to_ix = thai2dict_to_ix

    # Membership and lookup use the same mapping, so they can never disagree.
    # The unknown-token index is resolved lazily, only when it is needed.
    return [
        word_to_ix[word] if word in word_to_ix else word_to_ix[unk_token]
        for word in input_text
    ]

def prepare_sequence_target(input_label, tag_to_ix=None):
    """Map a sequence of NER tag strings to their integer indices.

    Parameters
    ----------
    input_label : iterable of str
        Tag strings, one per token.
    tag_to_ix : dict, optional
        Tag -> index mapping. Defaults to the module-level ``ner_to_ix``.

    Returns
    -------
    list
        One index per input tag, in input order.

    Raises
    ------
    KeyError
        If a tag is not present in the mapping.
    """
    if tag_to_ix is None:
        tag_to_ix = ner_to_ix

    idxs = []
    for tag in input_label:
        idxs.append(tag_to_ix[tag])
    return idxs

# Initialize Keras Model

In [9]:
# Word Input
word_in = Input(shape=(max_len,), name='word_input_')

# Word Embedding Using Thai2Fit
word_embeddings = Embedding(input_dim=n_thai2dict,
                            output_dim=400,
                            weights = [thai2fit_weight],input_length=max_len,
                            mask_zero=False,
                            name='word_embedding', trainable=False)(word_in)

# Character Input
char_in = Input(shape=(max_len, max_len_char,), name='char_input')

# Character Embedding
emb_char = TimeDistributed(Embedding(input_dim=n_chars, output_dim=char_embedding_dim, 
                           input_length=max_len_char, mask_zero=False))(char_in)

# Character Sequence to Vector via BiLSTM
char_enc = TimeDistributed(Bidirectional(LSTM(units=character_LSTM_unit, return_sequences=False, recurrent_dropout=lstm_recurrent_dropout)))(emb_char)


# Concatenate All Embedding
all_word_embeddings = concatenate([word_embeddings, char_enc])
all_word_embeddings = SpatialDropout1D(0.3)(all_word_embeddings)

# Main Model BiLSTM
main_lstm = Bidirectional(LSTM(units=main_lstm_unit, return_sequences=True,
                               recurrent_dropout=lstm_recurrent_dropout))(all_word_embeddings)
main_lstm = TimeDistributed(Dense(50, activation="relu"))(main_lstm)

# CRF
crf = CRF(n_tag)  # CRF layer
out = crf(main_lstm)  # output

# Model
model = Model([word_in, char_in], out)

model.compile(optimizer="adam", loss=crf.loss_function, metrics=[crf.accuracy])

model.summary()

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.




__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
char_input (InputLayer)         (None, 250, 30)      0                                            
__________________________________________________________________________________________________
word_input_ (InputLayer)        (None, 250)          0                                            
__________________________________________________________________________________________________
time_distributed_1 (TimeDistrib (None, 250, 30, 32)  12768       char_input[0][0]                 
__________________________________________________________________________________________________
word_embedding (Embedding)      (None, 250, 400)     22270800    word_input_[0][0]                
__________________________________________________________________________________________________
time_distr

## Load Model Weights

In [10]:
# Restore the trained weights from the saved checkpoint into the model built above.
load_filepath = f'{MODEL_PATH}weights-improvement-46-0.996.hdf5'
model.load_weights(load_filepath)

# Prepare Sentence

In [11]:
def convert_word_to_char(predict_word, char_to_ix=None, sent_len=None, word_len=None):
    """Encode a tokenized sentence as a padded grid of character indices.

    Every token is converted to ``word_len`` character indices and the
    sentence is padded (or truncated) to ``sent_len`` tokens.

    Parameters
    ----------
    predict_word : sequence of str
        The tokens of ONE sentence (not a batch).
    char_to_ix : dict, optional
        Character -> index mapping containing the "pad" and "unknown"
        entries. Defaults to the module-level ``char2idx``.
    sent_len : int, optional
        Number of token rows in the output. Defaults to ``max_len``.
    word_len : int, optional
        Number of character columns per token. Defaults to ``max_len_char``.

    Returns
    -------
    list
        A single-element list holding one array of shape
        ``(sent_len, word_len)``.
    """
    if char_to_ix is None:
        char_to_ix = char2idx
    if sent_len is None:
        sent_len = max_len
    if word_len is None:
        word_len = max_len_char

    pad_ix = char_to_ix.get("pad")
    unk_ix = char_to_ix.get("unknown")
    n_tokens = len(predict_word)

    sent_seq = []
    for i in range(sent_len):
        # Explicit bounds handling replaces the former bare ``except``, which
        # used IndexError as control flow and would also hide unrelated errors.
        token = predict_word[i] if i < n_tokens else ""
        word_seq = [char_to_ix.get(ch, unk_ix) for ch in token[:word_len]]
        # Right-pad short (or missing) tokens up to word_len.
        word_seq.extend([pad_ix] * (word_len - len(word_seq)))
        sent_seq.append(word_seq)

    # Wrapped in a list so the caller can treat it as a batch of one sentence.
    return [np.array(sent_seq)]

# Prepare Input for prediction

Tokenize the Thai text -> build the word-index list + padding -> build the character-index list + padding

In [12]:
# Example sentence to tag.
text = "นายธนาธรเจอนางสาวยิ่งลักษ์ที่มหาวิทยาลัยจุฬา เช้าวันนี้"

# Split the sentence into tokens and remember how many there are, so the
# padded predictions can be trimmed back later.
predict_sent = word_tokenize(text, engine='newmm')
len_word = len(predict_sent)

# Word-level input: token indices as a batch of one sentence, padded or
# truncated at the end to max_len with the index of the "pad" entry.
predict_word = pad_sequences(
    sequences=[prepare_sequence_word(predict_sent)],
    maxlen=max_len,
    padding='post',
    truncating='post',
    value=thai2dict_to_ix["pad"],
)

# Character-level input built from the same tokens.
predict_char = convert_word_to_char(predict_sent)

# Prediction

In [13]:
# Run the model on the word-level and character-level inputs; the character
# grid is reshaped to (batch, max_len, max_len_char).
result_tag = model.predict([
    predict_word,
    np.array(predict_char).reshape((len(predict_char), max_len, max_len_char)),
])

# Highest-scoring tag index at every position.
p = np.argmax(result_tag, axis=-1)

# Tag indices of the first (only) sentence, then the matching tag strings.
pred = list(p[0])
revert_pred = [ix_to_ner[tag_ix] for tag_ix in p[0]]

In [14]:
# Show the tokens next to their predicted tag indices and tag names. Only the
# first len_word predictions are shown; the remaining positions are padding.
print(predict_sent)
print(pred[:len_word])
print(revert_pred[:len_word])

['นาย', 'ธนา', 'ธร', 'เจอ', 'นางสาว', 'ยิ่ง', 'ลัก', 'ษ์', 'ที่', 'มหาวิทยาลัย', 'จุฬา', ' ', 'เช้า', 'วันนี้']
[8, 21, 21, 25, 8, 21, 21, 21, 25, 6, 19, 25, 10, 0]
['B-PERSON', 'I-PERSON', 'I-PERSON', 'O', 'B-PERSON', 'I-PERSON', 'I-PERSON', 'I-PERSON', 'O', 'B-ORGANIZATION', 'I-ORGANIZATION', 'O', 'B-TIME', 'B-DATE']
