In [121]:
import numpy as np  # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)
import tensorflow as tf
import tensorflow_addons as tfa

from math import nan
from tensorflow.keras.callbacks import ModelCheckpoint


In [122]:
dataset = pd.read_csv("./data/ner.csv")

In [123]:
dataset = dataset.dropna(axis=0)

In [124]:
class SentenceGetter(object):

    def __init__(self, dataset):
        self.n_sent = 1
        self.dataset = dataset
        self.empty = False
        def agg_func(s): return [(w, t) for w, t in zip(s["word"].values.tolist(),
                                                        s["tag"].values.tolist())]
        self.grouped = self.dataset.groupby("sentence_idx").apply(agg_func)
        self.sentences = [s for s in self.grouped]

    def get_next(self):
        try:
            s = self.grouped["Sentence: {}".format(self.n_sent)]
            self.n_sent += 1
            return s
        except:
            return None

In [125]:
getter = SentenceGetter(dataset)

In [126]:
sentences = getter.sentences

print(sentences[1])


[('Mayroon', 'O'), ('ding', 'O'), ('pagsikip', 'B-SYMPTOM'), ('sa', 'I-SYMPTOM'), ('aking', 'O'), ('dibdib', 'I-SYMPTOM'), ('at', 'O'), ('nakakaranas', 'O'), ('din', 'O'), ('ng', 'O'), ('pag-ubo', 'B-SYMPTOM'), ('.', 'O')]


In [127]:
maxlen = max([len(s) for s in sentences])
print('Maximum sequence length:', maxlen)


Maximum sequence length: 19


In [128]:
tags = []
for tag in set(dataset["tag"].values):
    if tag is nan or isinstance(tag, float):
        tags.append('UNKNOWN')
    else:
        tags.append(tag)
print(tags)


['I-SYMPTOM', 'I', 'B-SYMPTOM', 'O']


In [129]:
n_tags = len(tags)
n_tags


4

In [130]:
words = list(set(dataset["word"].values))
words.append("END")
words.append("UNKNOWN")

n_words = len(words)
n_words

190

In [131]:
from future.utils import iteritems
word2idx = {w: i for i, w in enumerate(words)}
tag2idx = {t: i for i, t in enumerate(tags)}
idx2tag = {v: k for k, v in iteritems(tag2idx)}


In [132]:
import json

ner_config = {
    "n_tags": n_tags,
    "n_words": n_words,
    "maxlen": maxlen
}

folder_name = 'cfg'

with open("{}/{}.json".format(folder_name, "word_list"), "w") as file_path:
    json.dump(word2idx, file_path)

with open("{}/{}.json".format(folder_name, "ner_config"), "w") as file_path:
    json.dump(ner_config, file_path)

with open("{}/{}.json".format(folder_name, "tags"), "w") as file_path:
    json.dump(tags, file_path)


In [133]:
from keras.utils.data_utils import pad_sequences
x = [[word2idx[w[0]] for w in s] for s in sentences]

In [134]:
x = pad_sequences(maxlen=maxlen, sequences=x, padding="post", value=n_words - 1)

In [135]:
y_idx = [[tag2idx[w[1]] for w in s] for s in sentences]

In [136]:
y = pad_sequences(maxlen=maxlen, sequences=y_idx, padding="post", value=tag2idx["O"])

In [137]:
from keras.utils import to_categorical
y = [to_categorical(i, num_classes=n_tags) for i in y]


In [138]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.5)

In [139]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import LSTM, GRU, Embedding, Dense, TimeDistributed, Bidirectional, Activation
from tensorflow.keras import Sequential


In [140]:
#x_train = tf.convert_to_tensor(x_train, dtype=tf.float32)
#y_train = tf.convert_to_tensor(y_train, dtype=tf.float32)
#x_test = tf.convert_to_tensor(x_test, dtype=tf.float32)
#y_test = tf.convert_to_tensor(y_test, dtype=tf.float32)


In [141]:
from tensorflow_addons.layers.crf import CRF
#from keras_crf import CRFModel

MODEL_TYPE = 'GRU'

word_embedding_size = 300

input = tf.keras.layers.Input(shape=(None,), dtype=tf.int32)
model = Sequential()
model.add(Embedding(input_dim=n_words,
                    output_dim=word_embedding_size, input_length=maxlen))

if (MODEL_TYPE == 'LSTM'):
    model.add(Bidirectional(LSTM(units=word_embedding_size,
            return_sequences=True,
            dropout=0.5,
            recurrent_dropout=0.5,
            kernel_initializer=tf.keras.initializers.he_normal())))
    model.add(LSTM(units=word_embedding_size * 2,
            return_sequences=True,
            dropout=0.5,
            recurrent_dropout=0.5,
            kernel_initializer=tf.keras.initializers.he_normal()))
else:
    model.add(Bidirectional(GRU(units=word_embedding_size,
            return_sequences=True,
            dropout=0.5,
            recurrent_dropout=0.5,
            kernel_initializer=tf.keras.initializers.he_normal())))
    model.add(GRU(units=word_embedding_size * 2,
            return_sequences=True,
            dropout=0.5,
            recurrent_dropout=0.5,
            kernel_initializer=tf.keras.initializers.he_normal()))
model.add(TimeDistributed(Dense(n_tags)))
model.add(Activation('softmax'))
#crf = CRF(n_tags + 1)
#model.add(crf)
# embedding = Embedding(input_dim=n_words,
#                   output_dim=word_embedding_size, input_length=maxlen)(input)
# bi_lstm = Bidirectional(LSTM(units=word_embedding_size,
#                            return_sequences=True,
#                            dropout=0.5,
#                            recurrent_dropout=0.5,
#                              kernel_initializer=tf.keras.initializers.he_normal()))(embedding)
# lstm = LSTM(units=word_embedding_size * 2,
#                             return_sequences=True,
#                             dropout=0.5,
#                             recurrent_dropout=0.5,
#                             kernel_initializer=tf.keras.initializers.he_normal())(bi_lstm)
# kernel = TimeDistributed(Dense(n_tags, activation="relu"))(lstm)
# crf = CRF(n_tags + 1)

#outputs = crf(kernel)

# model = Model(inputs=input, outputs=kernel)
# model.add_loss(tf.abs(tf.reduce_mean(kernel)))

#base = Model(inputs=input, outputs=kernel)
#model = CRFModel(base, n_tags)


In [142]:
adam = tf.keras.optimizers.Adam(learning_rate=0.0005, beta_1=0.9, beta_2=0.999)
model.compile(
    optimizer=adam,
    metrics=['acc'],
    loss="categorical_crossentropy"
)

In [143]:
model.summary()

Model: "sequential_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_5 (Embedding)     (None, 19, 300)           57000     
                                                                 
 bidirectional_5 (Bidirectio  (None, 19, 600)          1083600   
 nal)                                                            
                                                                 
 gru_5 (GRU)                 (None, 19, 600)           2163600   
                                                                 
 time_distributed_5 (TimeDis  (None, 19, 4)            2404      
 tributed)                                                       
                                                                 
 activation_5 (Activation)   (None, 19, 4)             0         
                                                                 
Total params: 3,306,604
Trainable params: 3,306,604
No

In [144]:
# Saving the best only
if (MODEL_TYPE == 'LSTM'):
    filepath = "bilstm.h5"
else:
    filepath = "bigru.h5"
checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True, mode='max')
callbacks_list = [checkpoint]

In [145]:
model_history = model.fit(x_train, np.array(y_train), batch_size=256, epochs=200,
                    validation_split=0.1, verbose=1, callbacks=callbacks_list)


Epoch 1/200
Epoch 1: val_acc improved from -inf to 0.80702, saving model to bigru.h5
Epoch 2/200
Epoch 2: val_acc improved from 0.80702 to 0.84211, saving model to bigru.h5
Epoch 3/200
Epoch 3: val_acc did not improve from 0.84211
Epoch 4/200
Epoch 4: val_acc did not improve from 0.84211
Epoch 5/200
Epoch 5: val_acc did not improve from 0.84211
Epoch 6/200
Epoch 6: val_acc did not improve from 0.84211
Epoch 7/200
Epoch 7: val_acc did not improve from 0.84211
Epoch 8/200
Epoch 8: val_acc did not improve from 0.84211
Epoch 9/200
Epoch 9: val_acc did not improve from 0.84211
Epoch 10/200
Epoch 10: val_acc did not improve from 0.84211
Epoch 11/200
Epoch 11: val_acc did not improve from 0.84211
Epoch 12/200
Epoch 12: val_acc did not improve from 0.84211
Epoch 13/200
Epoch 13: val_acc did not improve from 0.84211
Epoch 14/200
Epoch 14: val_acc did not improve from 0.84211
Epoch 15/200
Epoch 15: val_acc did not improve from 0.84211
Epoch 16/200
Epoch 16: val_acc did not improve from 0.84211
E

In [146]:
# model.save('bilstmcrf.h5')

In [147]:
x_test[1]


array([141,  64, 164,  35,  52, 139,  34, 189, 189, 189, 189, 189, 189,
       189, 189, 189, 189, 189, 189])

In [148]:
i = 3
p = model.predict(np.array([x_test[i]]))
print(p)
p = np.argmax(p, axis=-1)
gt = np.argmax(y_test[i], axis=-1)
print(gt)
print("{:15} {:15} {}".format("Word", "Tag", "Predicted"))
for idx, (w,pred) in enumerate(zip(x_test[i],p[0])):
    print("{:15} {:15} {}".format(words[w],idx2tag[gt[idx]],tags[pred]))

[[[1.09865526e-02 2.01196573e-03 9.29741740e-01 5.72597906e-02]
  [1.35992721e-01 1.26652303e-04 7.95209408e-02 7.84359634e-01]
  [1.79716170e-01 7.69876078e-06 1.48920089e-01 6.71356082e-01]
  [2.03707433e-08 1.42172052e-10 6.26801295e-07 9.99999404e-01]
  [1.11219242e-05 1.98743577e-09 9.99963880e-01 2.50016474e-05]
  [9.94894922e-01 1.29859060e-07 1.34750037e-04 4.97018034e-03]
  [9.99513507e-01 1.79517681e-08 1.29280306e-04 3.57214536e-04]
  [9.94127274e-01 3.00112504e-08 4.06301479e-05 5.83199598e-03]
  [2.42056927e-07 5.03081223e-11 1.36549785e-08 9.99999762e-01]
  [2.35419628e-10 2.77804841e-12 4.56723626e-07 9.99999523e-01]
  [1.78725836e-11 4.51562100e-13 4.06795664e-08 1.00000000e+00]
  [1.18065930e-11 2.19476862e-13 1.51463766e-08 1.00000000e+00]
  [9.28063182e-12 1.59068654e-13 8.83606699e-09 1.00000000e+00]
  [7.67166972e-12 1.40210082e-13 6.45302389e-09 1.00000000e+00]
  [7.09521634e-12 1.42344198e-13 5.29465449e-09 1.00000000e+00]
  [6.62959661e-12 1.54843599e-13 4.34221

In [149]:
y_pred = model.predict(x_test, batch_size=1, verbose=1)




In [150]:
def pred2label(pred):
    out = []
    for pred_i in pred:
        out_i = []
        for p in pred_i:
            p_i = np.argmax(p)
            out_i.append(idx2tag[p_i].replace("PAD", "O"))
        out.append(out_i)
    return out


x_labels = pred2label(y_pred)
y_labels = pred2label(y_test)

In [151]:
from seqeval.metrics import precision_score, recall_score, f1_score, classification_report
from sklearn.metrics import multilabel_confusion_matrix

print("Precision: {:.1%}".format(precision_score(y_labels, x_labels)))
print("Recall: {:.1%}".format(recall_score(y_labels, x_labels)))
print("F1-score: {:.1%}".format(f1_score(y_labels, x_labels)))


Precision: 57.1%
Recall: 67.6%
F1-score: 61.9%


In [152]:
#print(np.array(y_test))
#print(np.round(y_pred))

print(classification_report(y_labels, x_labels))
print(multilabel_confusion_matrix(y_labels[0], x_labels[0]))


              precision    recall  f1-score   support

     SYMPTOM       0.57      0.69      0.62        70
           _       0.00      0.00      0.00         1

   micro avg       0.57      0.68      0.62        71
   macro avg       0.29      0.34      0.31        71
weighted avg       0.56      0.68      0.61        71

[[[15  1]
  [ 1  2]]

 [[15  2]
  [ 0  2]]

 [[ 5  0]
  [ 2 12]]]


  _warn_prf(average, modifier, msg_start, len(result))
