<a href="https://colab.research.google.com/github/Sion1225/Study-Deeplearning-NLP/blob/master/NER%20using%20BiLSTM-CRF.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
pip install keras-crf

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting keras-crf
  Downloading keras_crf-0.3.0-py3-none-any.whl (8.3 kB)
Collecting tensorflow-addons
  Downloading tensorflow_addons-0.19.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m46.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tensorflow-addons, keras-crf
Successfully installed keras-crf-0.3.0 tensorflow-addons-0.19.0


## DataSet: Annotated Corpus for Named Entity Recognition 
(https://www.kaggle.com/abhinavwalia95/entity-annotated-corpus)

The pre-processing steps below are taken from the notebook "BIO NER using BiLSTM & F1-score.ipynb".

In [2]:
import pandas as pd
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical

In [3]:
# Read the annotated NER corpus from the mounted Drive folder, decoding it as latin1.
data = pd.read_csv(
    "/content/drive/MyDrive/GitHub/Study-Deeplearning-NLP/DataSet/ner_dataset.csv",
    encoding="latin1",
)

In [4]:
# Forward-fill missing cells from the previous row. `DataFrame.ffill()` is the
# non-deprecated equivalent of `fillna(method="ffill")`.
# NOTE(review): presumably only the first row of each sentence carries a
# "Sentence #" value, which is why a forward fill is needed — confirm against the CSV.
data = data.ffill()

# Normalise the tokens to lower case; the tags are left untouched.
data["Word"] = data["Word"].str.lower()

In [5]:
def func(temp):
    """Return the rows of one sentence group as a list of (word, tag) tuples."""
    words = temp["Word"].values.tolist()
    tags = temp["Tag"].values.tolist()
    return list(zip(words, tags))


# One entry per "Sentence #" group, each a list of (word, tag) pairs.
tagged_sentences = list(data.groupby("Sentence #").apply(func))

In [6]:
# Transpose every sentence's (word, tag) pairs into a (words, tags) pair,
# then collect the words and the tags into two parallel lists of lists.
unzipped = [tuple(zip(*tagged_sentence)) for tagged_sentence in tagged_sentences]

sentences = [list(words) for words, _ in unzipped]
ner_tags = [list(tags) for _, tags in unzipped]

In [7]:
# Word vocabulary: words not seen during fitting are mapped to the "OOV" token.
src_tokenizer = Tokenizer(oov_token="OOV")
src_tokenizer.fit_on_texts(texts=sentences)

# Tag vocabulary: keep the original casing of the tag strings.
tar_tokenizer = Tokenizer(lower=False)
tar_tokenizer.fit_on_texts(texts=ner_tags)

In [8]:
# Sizes are one larger than the fitted vocabularies so that index 0
# (used for padding further down) is included.
vocab_size = 1 + len(src_tokenizer.word_index)
tag_size = 1 + len(tar_tokenizer.word_index)

In [9]:
# Encode every sentence and every tag sequence as a list of integer indices.
X_data = src_tokenizer.texts_to_sequences(texts=sentences)
y_data = tar_tokenizer.texts_to_sequences(texts=ner_tags)

In [10]:
# Lookup tables in both directions for words and for NER tags.
word_to_index, index_to_word = src_tokenizer.word_index, src_tokenizer.index_word
ner_to_index, index_to_ner = tar_tokenizer.word_index, tar_tokenizer.index_word

# Index 0 is the padding value, so give it a readable label when decoding.
# This writes into the tokenizer's own index_word dict (same object).
index_to_ner[0] = "PAD"

In [11]:
# Fixed sequence length fed to the model.
max_len = 70

# Pad with zeros at the end of each sequence up to max_len; sequences longer
# than max_len are cut down by pad_sequences' default truncation setting.
X_data = pad_sequences(X_data, maxlen=max_len, padding="post")
y_data = pad_sequences(y_data, maxlen=max_len, padding="post")

In [12]:
# Hold out 20% of the sentences for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train_int, y_test_int = train_test_split(
    X_data,
    y_data,
    test_size=0.2,
    random_state=1225,
)

In [13]:
# One-hot encode the integer tag labels over tag_size classes.
# Note: the CRF model in this notebook is fit on the integer labels
# (y_train_int), not on these one-hot arrays.
y_train = to_categorical(y_train_int, tag_size)
y_test = to_categorical(y_test_int, tag_size)

In [14]:
# Report the shape of each split as a quick sanity check.
for caption, array in (
    ("Shape of training sentences sample : ", X_train),
    ("Shape of training label sample : ", y_train_int),
    ("Shape of test sentences sample : ", X_test),
    ("Shape of test label sample : ", y_test_int),
):
    print(caption, array.shape)

Shape of training sentences sample :  (38367, 70)
Shape of training label sample :  (38367, 70)
Shape of test sentences sample :  (9592, 70)
Shape of test label sample :  (9592, 70)


# Model (BiLSTM-CRF)

In [15]:
pip install seqeval

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 KB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16179 sha256=50df1b59cb7a2c075b0b02b671b21d0b47c6f93fa43e55e27a36dbaaec9d74f4
  Stored in directory: /root/.cache/pip/wheels/ad/5c/ba/05fa33fa5855777b7d686e843ec07452f22a66a138e290e732
Successfully built seqeval
Installing collected packages: seqeval
Successfully installed seqeval-1.2.2


In [16]:
import tensorflow as tf
from tensorflow.keras import Model
from tensorflow.keras.layers import Dense, LSTM, Input, Bidirectional, TimeDistributed, Embedding, Dropout
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from keras_crf import CRFModel
from seqeval.metrics import f1_score, classification_report

In [17]:
# Model hyperparameters.
embedding_dim: int = 128    # output dimension of the Embedding layer
hidden_units: int = 64      # units of the LSTM wrapped by Bidirectional
dropout_ratio: float = 0.3  # rate passed to the Dropout layer

In [18]:
# Base network: token ids -> embeddings -> BiLSTM -> dropout -> per-token tag scores.
sequence_input = Input(name="sequence_input", shape=(max_len,), dtype=tf.int32)
hidden = Embedding(
    input_dim=vocab_size,
    output_dim=embedding_dim,
    input_length=max_len,
)(sequence_input)
hidden = Bidirectional(LSTM(hidden_units, return_sequences=True))(hidden)
hidden = TimeDistributed(Dropout(rate=dropout_ratio))(hidden)
BiLSTM_ouputs = TimeDistributed(Dense(units=tag_size, activation="relu"))(hidden)
base = Model(inputs=sequence_input, outputs=BiLSTM_ouputs)

# Wrap the base network with keras-crf's CRFModel over tag_size tags.
model = CRFModel(base, tag_size)

# No loss is passed to compile().
# NOTE(review): presumably CRFModel supplies its own CRF loss — confirm against the keras-crf docs.
model.compile(
    optimizer=tf.keras.optimizers.Nadam(learning_rate=0.001),
    metrics="accuracy",
)

Instructions for updating:
Lambda fuctions will be no more assumed to be used in the statement where they are used, or at least in the same block. https://github.com/tensorflow/tensorflow/issues/56089


In [19]:
# Stop training once the validation loss has not improved for 4 epochs.
es = EarlyStopping(
    monitor="val_loss",
    mode="min",
    patience=4,
    verbose=1,
)

# Keep only the weights of the epoch with the best validation decode accuracy.
mc = ModelCheckpoint(
    "/content/drive/MyDrive/GitHub/Study-Deeplearning-NLP/Models/BiLSTM_CRF.ckpt",
    monitor="val_decode_sequence_accuracy",
    mode="max",
    save_best_only=True,
    save_weights_only=True,
    verbose=1,
)

In [20]:
# Train on the integer-encoded tag labels, holding out 10% of the training
# data for validation; checkpointing and early stopping run as callbacks.
history = model.fit(
    X_train,
    y_train_int,
    batch_size=128,
    epochs=15,
    validation_split=0.1,
    callbacks=[mc, es],
)

Epoch 1/15
Epoch 1: val_decode_sequence_accuracy improved from -inf to 0.96019, saving model to /content/drive/MyDrive/GitHub/Study-Deeplearning-NLP/Models/BiLSTM_CRF.ckpt
Epoch 2/15
Epoch 2: val_decode_sequence_accuracy improved from 0.96019 to 0.98047, saving model to /content/drive/MyDrive/GitHub/Study-Deeplearning-NLP/Models/BiLSTM_CRF.ckpt
Epoch 3/15
Epoch 3: val_decode_sequence_accuracy improved from 0.98047 to 0.98452, saving model to /content/drive/MyDrive/GitHub/Study-Deeplearning-NLP/Models/BiLSTM_CRF.ckpt
Epoch 4/15
Epoch 4: val_decode_sequence_accuracy improved from 0.98452 to 0.98548, saving model to /content/drive/MyDrive/GitHub/Study-Deeplearning-NLP/Models/BiLSTM_CRF.ckpt
Epoch 5/15
Epoch 5: val_decode_sequence_accuracy improved from 0.98548 to 0.98588, saving model to /content/drive/MyDrive/GitHub/Study-Deeplearning-NLP/Models/BiLSTM_CRF.ckpt
Epoch 6/15
Epoch 6: val_decode_sequence_accuracy improved from 0.98588 to 0.98599, saving model to /content/drive/MyDrive/GitHub