In [2]:
import os

import numpy as np
from gensim.models import Word2Vec
from keras.models import load_model
from keras_contrib.layers import CRF
from keras_contrib.layers.crf import CRF,crf_loss,crf_viterbi_accuracy
from sklearn.model_selection import ShuffleSplit

from data_utils import ENTITIES, Documents, Dataset, SentenceExtractor, make_predictions
from data_utils import Evaluator
from models import build_lstm_crf_model

In [3]:
# Location of the annotated corpus.
data_dir = 'data/Civil'

# Entity label <-> integer id lookup tables. Ids start at 1, leaving 0 unused
# by any entity label.
ent2idx = {ent: idx for idx, ent in enumerate(ENTITIES, start=1)}
idx2ent = {idx: ent for ent, idx in ent2idx.items()}

In [4]:
# Load the corpus, then hold out 4 documents for testing using a single,
# seeded shuffle split.
docs = Documents(data_dir=data_dir)
rs = ShuffleSplit(test_size=4, n_splits=1, random_state=2018)
train_doc_ids, test_doc_ids = next(rs.split(docs))

# Index the corpus with each array of positions produced by the split.
train_docs = docs[train_doc_ids]
test_docs = docs[test_doc_ids]

In [31]:
# Sanity check: look at the training document at position 1 of the split.
# The object has no custom repr, so the notebook only shows its type/address.
train_docs[1]

<data_utils.data_utils.Document at 0x1d39b61aa48>

In [18]:
# --- Sentence windowing settings ---
sent_len = 60   # passed to SentenceExtractor as window_size
sent_pad = 10   # passed to SentenceExtractor as pad_size

# --- Vocabulary / embedding settings ---
vocab_size = 3000   # requested vocabulary size; replaced by the real size below
emb_size = 100      # embedding dimensionality

# Entity ids start at 1, so the number of output categories is the largest
# id plus one (index 0 is not assigned to any entity).
num_cates = 1 + max(ent2idx.values())

# Cut both document sets into sentence windows with the same extractor.
sent_extrator = SentenceExtractor(window_size=sent_len, pad_size=sent_pad)
train_sents = sent_extrator(train_docs)
test_sents = sent_extrator(test_docs)

# The vocabulary is built from the training sentences only ...
train_data = Dataset(train_sents, cate2idx=ent2idx)
train_data.build_vocab_dict(vocab_size=vocab_size)

# ... and reused unchanged for the test sentences.
test_data = Dataset(test_sents, word2idx=train_data.word2idx, cate2idx=ent2idx)

# From here on vocab_size is the actual number of entries in the vocabulary.
vocab_size = len(train_data.word2idx)

In [12]:
# Train word2vec on every document, one token list per document.
# `list(doc.text)` splits the text into its individual elements (characters,
# assuming `doc.text` is a string — TODO confirm).
# NOTE(review): `docs` also contains the held-out test documents, so the
# embeddings see test text (without labels) — confirm this is intended.
w2v_train_sents = [list(doc.text) for doc in docs]

# Fix the seed and use a single worker thread so the embeddings are
# reproducible between runs; 2018 is the same value used as random_state for
# the train/test split. (On Python 3, gensim additionally requires a fixed
# PYTHONHASHSEED for bit-for-bit identical results.)
w2v_model = Word2Vec(w2v_train_sents, size=emb_size, seed=2018, workers=1)

# Build the embedding matrix: row `char_idx` holds the word2vec vector of the
# corresponding vocabulary entry. Entries that word2vec did not keep in its
# own vocabulary remain all-zero rows.
w2v_embeddings = np.zeros((vocab_size, emb_size))
for char, char_idx in train_data.word2idx.items():
    if char in w2v_model.wv:
        w2v_embeddings[char_idx] = w2v_model.wv[char]

In [34]:
# Length of one model input: the sentence window plus pad_size on both sides.
seq_len = sent_len + 2 * sent_pad

model = build_lstm_crf_model(
    num_cates,
    seq_len=seq_len,
    vocab_size=vocab_size,
    model_opts={
        # Pre-trained word2vec vectors, kept frozen during training.
        'emb_matrix': w2v_embeddings,
        # Use the shared emb_size setting instead of a literal 100 so the
        # embedding layer always matches the width of w2v_embeddings.
        'emb_size': emb_size,
        'emb_trainable': False,
    },
)
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         (None, 80)                0         
_________________________________________________________________
embedding_2 (Embedding)      (None, 80, 100)           74900     
_________________________________________________________________
bidirectional_2 (Bidirection (None, 80, 512)           731136    
_________________________________________________________________
crf_2 (CRF)                  (None, 80, 146)           96506     
Total params: 902,542
Trainable params: 827,642
Non-trainable params: 74,900
_________________________________________________________________


In [35]:
# Slice the whole training dataset into its input array and label array.
# In the recorded run these were (386, 80) and (386, 80, 1) respectively.
train_X, train_y = train_data[:]
print('train_X.shape', train_X.shape)
print('train_y.shape', train_y.shape)

train_X.shape (386, 80)
train_y.shape (386, 80, 1)


In [36]:
# Train on the full training set; the returned History object is the cell output.
model.fit(
    x=train_X,
    y=train_y,
    batch_size=64,
    epochs=10,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1d39d79a748>

In [37]:
# Saving does not create missing parent directories, so make sure the output
# folder exists first; otherwise this cell fails on a fresh checkout.
os.makedirs('test', exist_ok=True)
model.save('test/model.h5')

In [42]:
# The saved model references the CRF layer and its loss/metric by name, so
# those objects are handed to Keras explicitly when reloading the file.
crf_custom_objects = {
    'CRF': CRF,
    'crf_loss': crf_loss,
    'crf_viterbi_accuracy': crf_viterbi_accuracy,
}
my_model = load_model('test/model.h5', custom_objects=crf_custom_objects)

In [43]:
# Take the input array of the test dataset; the second element of the pair
# (presumably the labels, as in the training cell) is discarded.
test_X, _ = test_data[:]
# Run the model reloaded from disk (not the in-memory `model`) on the test inputs.
preds = my_model.predict(test_X, batch_size=64, verbose=True)
# Turn the raw predictions back into documents. The helper receives the full
# corpus `docs`, the padding size and the id -> entity mapping.
# NOTE(review): exact semantics of make_predictions are defined in data_utils — confirm there.
pred_docs = make_predictions(preds, test_data, sent_pad, docs, idx2ent)



In [44]:
# Score the predicted documents against the gold test documents.
f_score, precision, recall = Evaluator.f1_score(test_docs, pred_docs)

# Report each metric on its own line, in the order f-score, precision, recall.
for _metric_label, _metric_value in (
    ('f_score: ', f_score),
    ('precision: ', precision),
    ('recall: ', recall),
):
    print(_metric_label, _metric_value)

f_score:  0.3901639344262295
precision:  0.4605263157894737
recall:  0.338452787258248


In [45]:
# Pick the first key of pred_docs as a sample and show the matching entry of
# the gold test documents (displayed as the cell output).
# NOTE(review): assumes test_docs can be indexed by the same ids pred_docs is keyed on — confirm.
sample_doc_id = list(pred_docs.keys())[0]
test_docs[sample_doc_id]

<data_utils.data_utils.Document at 0x1d39bb6cf08>

In [46]:
# Show the predicted document for the same sample id, for comparison with the gold one.
pred_docs[sample_doc_id]

<data_utils.data_utils.Document at 0x1d3a88a85c8>