<a href="https://colab.research.google.com/github/Sion1225/Study-Deeplearning-NLP/blob/master/NER%20using%20BiLSTM-CRF.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
pip install keras-crf

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting keras-crf
  Downloading keras_crf-0.3.0-py3-none-any.whl (8.3 kB)
Collecting tensorflow-addons
  Downloading tensorflow_addons-0.19.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m46.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tensorflow-addons, keras-crf
Successfully installed keras-crf-0.3.0 tensorflow-addons-0.19.0


## DataSet: Annotated Corpus for Named Entity Recognition 
(https://www.kaggle.com/abhinavwalia95/entity-annotated-corpus)

The pre-processing steps below are taken from "BIO NER using BiLSTM & F1-score.ipynb".

In [2]:
import pandas as pd
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical

In [35]:
# Load the annotated NER corpus as a DataFrame (the file is read as latin1).
# NOTE(review): the path lives under /content/drive, so it presumably needs
# Google Drive mounted first — no mount cell is visible here; confirm.
data = pd.read_csv(
    "/content/drive/MyDrive/GitHub/Study-Deeplearning-NLP/DataSet/ner_dataset.csv",
    encoding="latin1",
)

In [36]:
# Forward-fill missing cells from the previous row (presumably to carry each
# "Sentence #" label down to the token rows that follow it — verify against the CSV).
# DataFrame.ffill() is the supported spelling; fillna(method="ffill") is
# deprecated in pandas >= 2.1.
data = data.ffill()
# Lower-case every word so the vocabulary is case-insensitive.
data["Word"] = data["Word"].str.lower()

In [37]:
def func(temp):
    """Return the (word, tag) pairs of one sentence group as a list of tuples."""
    words = temp["Word"].values.tolist()
    tags = temp["Tag"].values.tolist()
    return list(zip(words, tags))


# One list of (word, tag) pairs per "Sentence #" group.
tagged_sentences = list(data.groupby("Sentence #").apply(func))

In [38]:
# Split every sentence's (word, tag) pairs into two parallel lists:
# sentences[k] holds the words and ner_tags[k] the tags of sentence k.
sentences = []
ner_tags = []

for pairs in tagged_sentences:
    words, tags = zip(*pairs)
    sentences.append(list(words))
    ner_tags.append(list(tags))

In [39]:
# Word tokenizer: configured with an explicit "OOV" token.
src_tokenizer = Tokenizer(oov_token="OOV")
src_tokenizer.fit_on_texts(sentences)

# Tag tokenizer: lower=False so the tag strings are not lower-cased.
tar_tokenizer = Tokenizer(lower=False)
tar_tokenizer.fit_on_texts(ner_tags)

In [40]:
# Sizes are one larger than the fitted index tables; the extra slot is
# index 0, which this notebook uses for padding.
vocab_size = 1 + len(src_tokenizer.word_index)
tag_size = 1 + len(tar_tokenizer.word_index)

In [41]:
# Encode words and tags as integer index sequences using the fitted tokenizers.
X_data, y_data = (
    src_tokenizer.texts_to_sequences(sentences),
    tar_tokenizer.texts_to_sequences(ner_tags),
)

In [42]:
# Lookup tables in both directions for words and for NER tags.
word_to_index, index_to_word = src_tokenizer.word_index, src_tokenizer.index_word
ner_to_index, index_to_ner = tar_tokenizer.word_index, tar_tokenizer.index_word

# Index 0 stands for padding. index_to_ner is the tokenizer's own dict (no copy
# is made), so this entry is also visible through tar_tokenizer.index_word.
index_to_ner.update({0: "PAD"})

In [43]:
# Fixed sequence length fed to the model.
max_len = 70
# Pad at the end ("post") up to max_len; no `truncating` argument is passed,
# so longer sequences are cut according to the pad_sequences default.
X_data = pad_sequences(X_data, maxlen=max_len, padding="post")
y_data = pad_sequences(y_data, maxlen=max_len, padding="post")

In [44]:
# Hold out 20% of the sentences for testing; random_state is fixed so the
# split is the same on every run.
X_train, X_test, y_train_int, y_test_int = train_test_split(
    X_data,
    y_data,
    test_size=0.2,
    random_state=1225,
)

In [45]:
# One-hot encode the integer tag sequences (last axis has tag_size entries).
y_train, y_test = (
    to_categorical(int_labels, num_classes=tag_size)
    for int_labels in (y_train_int, y_test_int)
)

In [46]:
# Report the shapes of the padded inputs and integer labels for both splits.
for caption, array in (
    ("Shape of training sentences sample : ", X_train),
    ("Shape of training label sample : ", y_train_int),
    ("Shape of test sentences sample : ", X_test),
    ("Shape of test label sample : ", y_test_int),
):
    print(caption, array.shape)

Shape of training sentences sample :  (38367, 70)
Shape of training label sample :  (38367, 70)
Shape of test sentences sample :  (9592, 70)
Shape of test label sample :  (9592, 70)


In [47]:
# Shapes of the one-hot label tensors (train, then test).
for one_hot in (y_train, y_test):
    print(one_hot.shape)

(38367, 70, 18)
(9592, 70, 18)


# Model (BiLSTM-CRF)

In [15]:
pip install seqeval

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 KB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16179 sha256=50df1b59cb7a2c075b0b02b671b21d0b47c6f93fa43e55e27a36dbaaec9d74f4
  Stored in directory: /root/.cache/pip/wheels/ad/5c/ba/05fa33fa5855777b7d686e843ec07452f22a66a138e290e732
Successfully built seqeval
Installing collected packages: seqeval
Successfully installed seqeval-1.2.2


In [16]:
import tensorflow as tf
from tensorflow.keras import Model
from tensorflow.keras.layers import Dense, LSTM, Input, Bidirectional, TimeDistributed, Embedding, Dropout
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from keras_crf import CRFModel
from seqeval.metrics import f1_score, classification_report

In [17]:
# Model hyperparameters: embedding width, LSTM units per direction, dropout rate.
embedding_dim, hidden_units, dropout_ratio = 128, 64, 0.3

In [18]:
# Base network: token ids -> embedding -> bidirectional LSTM (one output per
# time step) -> dropout -> per-token dense layer with tag_size units.
token_ids = Input(shape=(max_len,), dtype=tf.int32, name="sequence_input")
embedded = Embedding(
    input_dim=vocab_size,
    output_dim=embedding_dim,
    input_length=max_len,
)(token_ids)
encoded = Bidirectional(LSTM(units=hidden_units, return_sequences=True))(embedded)
regularized = TimeDistributed(Dropout(dropout_ratio))(encoded)
bilstm_outputs = TimeDistributed(Dense(tag_size, activation='relu'))(regularized)

# Wrap the base network with keras_crf's CRFModel for tag_size tags.
base = Model(token_ids, bilstm_outputs)
model = CRFModel(base, tag_size)

# No loss is passed to compile here.
model.compile(
    optimizer=tf.keras.optimizers.Nadam(0.001),
    metrics="accuracy",
)

Instructions for updating:
Lambda fuctions will be no more assumed to be used in the statement where they are used, or at least in the same block. https://github.com/tensorflow/tensorflow/issues/56089


In [19]:
# Stop training once val_loss has failed to improve for 4 consecutive epochs.
es = EarlyStopping(
    monitor="val_loss",
    mode="min",
    patience=4,
    verbose=1,
)
# Save weights only, and only when val_decode_sequence_accuracy improves.
# Note that the two callbacks watch different validation metrics.
mc = ModelCheckpoint(
    "/content/drive/MyDrive/GitHub/Study-Deeplearning-NLP/Models/BiLSTM_CRF.ckpt",
    monitor="val_decode_sequence_accuracy",
    mode="max",
    save_best_only=True,
    save_weights_only=True,
    verbose=1,
)

In [20]:
# Train on the integer tag sequences (not the one-hot labels), holding out
# 10% of the training data for validation; checkpointing and early stopping
# are driven by the callbacks.
history = model.fit(
    X_train,
    y_train_int,
    batch_size=128,
    epochs=15,
    validation_split=0.1,
    callbacks=[mc, es],
)

Epoch 1/15
Epoch 1: val_decode_sequence_accuracy improved from -inf to 0.96019, saving model to /content/drive/MyDrive/GitHub/Study-Deeplearning-NLP/Models/BiLSTM_CRF.ckpt
Epoch 2/15
Epoch 2: val_decode_sequence_accuracy improved from 0.96019 to 0.98047, saving model to /content/drive/MyDrive/GitHub/Study-Deeplearning-NLP/Models/BiLSTM_CRF.ckpt
Epoch 3/15
Epoch 3: val_decode_sequence_accuracy improved from 0.98047 to 0.98452, saving model to /content/drive/MyDrive/GitHub/Study-Deeplearning-NLP/Models/BiLSTM_CRF.ckpt
Epoch 4/15
Epoch 4: val_decode_sequence_accuracy improved from 0.98452 to 0.98548, saving model to /content/drive/MyDrive/GitHub/Study-Deeplearning-NLP/Models/BiLSTM_CRF.ckpt
Epoch 5/15
Epoch 5: val_decode_sequence_accuracy improved from 0.98548 to 0.98588, saving model to /content/drive/MyDrive/GitHub/Study-Deeplearning-NLP/Models/BiLSTM_CRF.ckpt
Epoch 6/15
Epoch 6: val_decode_sequence_accuracy improved from 0.98588 to 0.98599, saving model to /content/drive/MyDrive/GitHub

In [21]:
# Restore the weights written by the ModelCheckpoint callback (same path),
# i.e. those of the epoch with the best monitored validation metric.
model.load_weights("/content/drive/MyDrive/GitHub/Study-Deeplearning-NLP/Models/BiLSTM_CRF.ckpt")

<tensorflow.python.checkpoint.checkpoint.CheckpointLoadStatus at 0x7fc36cfdbd60>

In [48]:
# Index of the test sentence to inspect.
i = 10
# The one-row slice keeps the batch axis; [0] keeps only the first model output.
y_predicted = model.predict(X_test[i:i + 1])[0]
# Recover the integer tag indices of the gold labels from their one-hot rows.
labels = y_test[i].argmax(axis=-1)



In [49]:
# Predicted tag indices for the selected test sentence (first row of the batch).
print(y_predicted[0])

[ 1  1  6  1  6  5  4  7  1  1  1  1  1  1  1  3 10 10 10  1  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]


In [50]:
# Side-by-side table of each word, its gold tag and its predicted tag.
print("{:15}|{:12}|{}".format("Voca","Real Value", "Predicted"))
for word_id, gold_id, pred_id in zip(X_test[i], labels, y_predicted[0]):
    # Word index 0 is padding; skip those positions.
    if word_id == 0:
        continue
    print("{:15}|{:12}|{}".format(index_to_word[word_id], index_to_ner[gold_id], index_to_ner[pred_id]))

Voca           |Real Value  |Predicted
the            |O           |O
"              |O           |O
roe            |B-per       |B-per
versus         |O           |O
wade           |B-per       |B-per
"              |O           |I-per
supreme        |B-org       |B-org
court          |I-org       |I-org
decision       |O           |O
legalizing     |O           |O
abortion       |O           |O
was            |O           |O
handed         |O           |O
down           |O           |O
on             |O           |O
january        |B-tim       |B-tim
22             |I-tim       |I-tim
,              |I-tim       |I-tim
1973           |I-tim       |I-tim
.              |O           |O


# F1-score

$$F_1 = 2 \cdot \frac{\text{precision} \cdot \text{recall}}{\text{precision} + \text{recall}}$$

In [58]:
from seqeval.metrics import f1_score, classification_report

In [71]:
# Run the model over the whole test set and keep only its first output.
# Note: this rebinds y_predicted, which earlier held the single-sentence prediction.
y_predicted = model.predict(X_test)[0]



In [51]:
def sequences_to_tag(sequences, index_to_tag=None):
    """Convert sequences of per-token score vectors into tag strings.

    Args:
        sequences: Iterable of sequences. Each element of a sequence is a
            vector (e.g. a one-hot row) whose argmax is the tag index.
        index_to_tag: Optional mapping from tag index to tag string. When
            omitted, the notebook-level ``index_to_ner`` mapping is used.

    Returns:
        A list containing one list of tag strings per input sequence. The
        padding tag "PAD" is reported as "O".

    Raises:
        KeyError: If an argmax index is missing from the mapping.
    """
    if index_to_tag is None:
        index_to_tag = index_to_ner

    # Translate the padding tag once, by exact match, so that a tag which
    # merely contains the substring "PAD" is left untouched.
    labels = {
        index: ("O" if tag == "PAD" else tag)
        for index, tag in index_to_tag.items()
    }

    return [
        [labels[np.argmax(scores)] for scores in sequence]
        for sequence in sequences
    ]

In [52]:
def sequences_to_tag_for_crf(sequences, index_to_tag=None):
    """Convert sequences of integer tag indices into tag strings.

    Args:
        sequences: Iterable of sequences of tag indices.
        index_to_tag: Optional mapping from tag index to tag string. When
            omitted, the notebook-level ``index_to_ner`` mapping is used.

    Returns:
        A list containing one list of tag strings per input sequence. The
        padding tag "PAD" is reported as "O".

    Raises:
        KeyError: If an index is missing from the mapping.
    """
    if index_to_tag is None:
        index_to_tag = index_to_ner

    # Translate the padding tag once, by exact match, so that a tag which
    # merely contains the substring "PAD" is left untouched.
    labels = {
        index: ("O" if tag == "PAD" else tag)
        for index, tag in index_to_tag.items()
    }

    return [[labels[index] for index in sequence] for sequence in sequences]

In [72]:
# Gold tags are decoded from the one-hot labels (the helper takes the argmax);
# the predictions are already integer tag indices.
test_tags = sequences_to_tag(y_test)
pred_tags = sequences_to_tag_for_crf(y_predicted)

In [75]:
# Raw prediction (tag indices) and raw gold label (one-hot rows) for test sentence 10.
print(y_predicted[10], y_test[10], sep="\n")

[ 1  1  6  1  6  5  4  7  1  1  1  1  1  1  1  3 10 10 10  1  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
[[0. 1. 0. ... 0. 0. 0.]
 [0. 1. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [1. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]]


In [76]:
# Decoded tag strings for test sentence 10: prediction first, then gold.
for tag_lists in (pred_tags, test_tags):
    print(tag_lists[10])

['O', 'O', 'B-per', 'O', 'B-per', 'I-per', 'B-org', 'I-org', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-tim', 'I-tim', 'I-tim', 'I-tim', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
['O', 'O', 'B-per', 'O', 'B-per', 'O', 'B-org', 'I-org', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-tim', 'I-tim', 'I-tim', 'I-tim', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']


In [77]:
# Overall F1 from seqeval, followed by the per-entity-type report.
overall_f1 = f1_score(test_tags, pred_tags)
print("F1-score: {:.2%}".format(overall_f1))
print(classification_report(test_tags, pred_tags))

F1-score: 79.42%


  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

         art       0.00      0.00      0.00        91
         eve       0.00      0.00      0.00        59
         geo       0.82      0.87      0.84      7390
         gpe       0.94      0.92      0.93      3166
         nat       1.00      0.07      0.12        30
         org       0.67      0.56      0.61      4004
         per       0.73      0.69      0.71      3336
         tim       0.87      0.83      0.85      4032

   micro avg       0.81      0.78      0.79     22108
   macro avg       0.63      0.49      0.51     22108
weighted avg       0.80      0.78      0.79     22108

