In [175]:
import numpy as np  # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)
import tensorflow as tf
import tensorflow_addons as tfa

from math import nan
from tensorflow.keras.callbacks import ModelCheckpoint


In [176]:
dataset = pd.read_csv("./data/ner.csv")

In [177]:
dataset = dataset.dropna(axis=0)

In [178]:
class SentenceGetter(object):

    def __init__(self, dataset):
        self.n_sent = 1
        self.dataset = dataset
        self.empty = False
        def agg_func(s): return [(w, t) for w, t in zip(s["word"].values.tolist(),
                                                        s["tag"].values.tolist())]
        self.grouped = self.dataset.groupby("sentence_idx").apply(agg_func)
        self.sentences = [s for s in self.grouped]

    def get_next(self):
        try:
            s = self.grouped["Sentence: {}".format(self.n_sent)]
            self.n_sent += 1
            return s
        except:
            return None

In [179]:
getter = SentenceGetter(dataset)

In [180]:
sentences = getter.sentences

print(sentences[1])


[('din', 'O'), ('sikip', 'B-SYMPTOM'), ('dibdib', 'I-SYMPTOM'), ('ranas', 'O'), ('ubo', 'B-SYMPTOM')]


In [181]:
maxlen = max([len(s) for s in sentences])
print('Maximum sequence length:', maxlen)


Maximum sequence length: 8


In [182]:
tags = []
for tag in set(dataset["tag"].values):
    if tag is nan or isinstance(tag, float):
        tags.append('UNKNOWN')
    else:
        tags.append(tag)
print(tags)


['O', 'B-SYMPTOM', 'I-SYMPTOM']


In [183]:
n_tags = len(tags)
n_tags


3

In [184]:
words = list(set(dataset["word"].values))
words.append("END")
words.append("UNKNOWN")

n_words = len(words)
n_words

77

In [185]:
from future.utils import iteritems
word2idx = {w: i for i, w in enumerate(words)}
tag2idx = {t: i for i, t in enumerate(tags)}
idx2tag = {v: k for k, v in iteritems(tag2idx)}


In [186]:
from keras.utils.data_utils import pad_sequences
x = [[word2idx[w[0]] for w in s] for s in sentences]

In [187]:
x = pad_sequences(maxlen=maxlen, sequences=x, padding="post", value=n_words - 1)

In [188]:
y_idx = [[tag2idx[w[1]] for w in s] for s in sentences]

In [189]:
y = pad_sequences(maxlen=maxlen, sequences=y_idx, padding="post", value=tag2idx["O"])

In [191]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state = 0)

In [192]:
from keras.utils import to_categorical
y_test_categorical = [to_categorical(i, num_classes=n_tags) for i in y_test]


In [193]:
x_train.shape


(33, 8)

In [194]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import LSTM, GRU, Embedding, Dense, TimeDistributed, Bidirectional, Activation
from tensorflow.keras import Sequential


In [195]:
#x_train = tf.convert_to_tensor(x_train, dtype=tf.float32)
#y_train = tf.convert_to_tensor(y_train, dtype=tf.float32)
#x_test = tf.convert_to_tensor(x_test, dtype=tf.float32)
#y_test = tf.convert_to_tensor(y_test, dtype=tf.float32)


In [196]:
#from tensorflow_addons.layers.crf import CRF
#from keras_crf import CRFModel
from tf2crf import CRF, ModelWithCRFLoss

MODEL_TYPE = 'GRU'

word_embedding_size = 300

inputs = tf.keras.layers.Input(shape=(None,), dtype='int32')
output = Embedding(n_words, word_embedding_size, trainable=True, mask_zero=True)(inputs)
if (MODEL_TYPE == 'LSTM'):
  bi_rnn = Bidirectional(LSTM(units=word_embedding_size,
                            return_sequences=True,
                            dropout=0.5,
                            recurrent_dropout=0.5,
                              kernel_initializer=tf.keras.initializers.he_normal()))(output)
  rnn = LSTM(units=word_embedding_size * 2,
                              return_sequences=True,
                              dropout=0.5,
                              recurrent_dropout=0.5,
                              kernel_initializer=tf.keras.initializers.he_normal())(bi_rnn)
else:
  bi_rnn = Bidirectional(GRU(units=word_embedding_size,
                              return_sequences=True,
                              dropout=0.5,
                              recurrent_dropout=0.5,
                              kernel_initializer=tf.keras.initializers.he_normal()))(output)
  rnn = GRU(units=word_embedding_size * 2,
             return_sequences=True,
             dropout=0.5,
             recurrent_dropout=0.5,
             kernel_initializer=tf.keras.initializers.he_normal())(bi_rnn)
crf = CRF(units=n_tags, dtype='float32')
output = crf(rnn)
base_model = Model(inputs, output)
model = ModelWithCRFLoss(base_model, sparse_target=True)


In [197]:
adam = tf.keras.optimizers.Adam(learning_rate=0.0005, beta_1=0.9, beta_2=0.999)
model.compile(
    optimizer=adam,
    metrics=['acc'],
    loss="categorical_crossentropy"
)

In [198]:
model.build(x.shape)
model.summary()

Model: "model_with_crf_loss_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 model_5 (Functional)        ((None, None),            3272112   
                              (None, None, 3),                   
                              (None,),                           
                              (3, 3))                            
                                                                 
Total params: 3,272,116
Trainable params: 3,272,112
Non-trainable params: 4
_________________________________________________________________


In [199]:
# Saving the best only
# if (MODEL_TYPE == 'LSTM'):
#     filepath = "bilstm.h5"
# else:
#     filepath = "bigru.h5"
# checkpoint = ModelCheckpoint(filepath, monitor='val_val_accuracy', verbose=1, save_best_only=True, mode='max')
# callbacks_list = [checkpoint]

In [200]:
model_history = model.fit(x_train, np.array(y_train), batch_size=256, epochs=200,
                    validation_split=0.1, verbose=1, callbacks=None)

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78

In [201]:
if (MODEL_TYPE == 'LSTM'):
    model.save_weights('bilstm')
else:
    model.save_weights('bigru')

In [190]:
# Save all auxiliary model data to json
import json

ner_config = {
    "n_tags": n_tags,
    "n_words": n_words,
    "maxlen": maxlen,
    "shape": x.shape,
    "word_embedding_size": word_embedding_size
}

folder_name = 'cfg'

with open("{}/{}.json".format(folder_name, "word_list"), "w") as file_path:
    json.dump(word2idx, file_path)

with open("{}/{}.json".format(folder_name, "ner_config"), "w") as file_path:
    json.dump(ner_config, file_path)

with open("{}/{}.json".format(folder_name, "tags"), "w") as file_path:
    json.dump(tags, file_path)


In [203]:
if (MODEL_TYPE == 'LSTM'):
    model.load_weights('bilstm')
else:
    model.load_weights('bigru')


In [204]:
x_test[1]

array([52, 57, 62, 59, 44, 38, 76, 76])

In [205]:
i = 0
p = model.predict(np.array([x_test[i]]))
print(p[0])
#p = np.argmax(p, axis=-1)
gt = np.argmax(y_test_categorical[i], axis=-1)
print("{:15} {:15} {}".format("Word", "Tag", "Predicted"))
for i, actual_tag, pred_tag in zip(x_test[i], gt, p[0]):
    print("{:15} {:15} {}".format(words[i], idx2tag[actual_tag], idx2tag[pred_tag]))
            
# for idx, (w,pred) in enumerate(zip(x_test[i],p)):
#     print("{:15} {:15} {}".format(words[w],idx2tag[gt[idx]],tags[pred]))

[1 2 0 0 1 0 0 0]
Word            Tag             Predicted
sakit           B-SYMPTOM       B-SYMPTOM
ulo             I-SYMPTOM       I-SYMPTOM
rin             O               O
lagi            O               O
lamig           B-SYMPTOM       B-SYMPTOM
UNKNOWN         O               O
UNKNOWN         O               O
UNKNOWN         O               O


In [206]:
y_pred = model.predict(x_test, batch_size=1, verbose=1)




In [207]:
def pred2label(pred):
    out = []
    for pred_i in pred:
        out_i = []
        for tag in pred_i:
            out_i.append(idx2tag[tag].replace("PAD", "O"))
        out.append(out_i)
    return out


x_labels = pred2label(y_pred)
y_labels = pred2label(y_test)


In [208]:
from seqeval.metrics import precision_score, recall_score, f1_score, classification_report
from sklearn.metrics import multilabel_confusion_matrix

print("Precision: {:.1%}".format(precision_score(y_labels, x_labels)))
print("Recall: {:.1%}".format(recall_score(y_labels, x_labels)))
print("F1-score: {:.1%}".format(f1_score(y_labels, x_labels)))


Precision: 77.8%
Recall: 65.6%
F1-score: 71.2%


In [209]:
#print(np.array(y_test))
#print(np.round(y_pred))

print(classification_report(y_labels, x_labels, zero_division=0))
print(multilabel_confusion_matrix(y_labels[1], x_labels[1]))


              precision    recall  f1-score   support

     SYMPTOM       0.78      0.66      0.71        32

   micro avg       0.78      0.66      0.71        32
   macro avg       0.78      0.66      0.71        32
weighted avg       0.78      0.66      0.71        32

[[[3 0]
  [1 4]]

 [[7 1]
  [0 0]]

 [[5 0]
  [0 3]]]
