# Imports

In [30]:
import collections
import csv
import glob
import json
import os

import matplotlib.pyplot as plt
import pandas as pd
import tensorflow as tf
import tensorflow_addons as tfa
from tensorflow.keras import backend as K
from tensorflow.keras.layers import (
    Embedding,
    Bidirectional,
    LSTM,
    TimeDistributed,
    Dense,
    Layer,
)
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.preprocessing.text import Tokenizer

 The versions of TensorFlow you are currently using is 2.4.0 and is not supported. 
Some things might work, some things might not.
If you were to encounter a bug, do not file an issue.
If you want to make sure you're using a tested and supported configuration, either change the TensorFlow version or the TensorFlow Addons's version. 
You can find the compatibility matrix in TensorFlow Addon's readme:
https://github.com/tensorflow/addons


# Get Data

In [2]:
data_root = "../data/raw/gmb-2.2.0/data/"

# Walk the corpus tree and keep the full path of every ".tags" file found.
fnames = [
    os.path.join(dirpath, name)
    for dirpath, _subdirs, names in os.walk(data_root)
    for name in names
    if name.endswith(".tags")
]

In [3]:
# Peek at the first two collected paths as a quick sanity check.
fnames[:2]

['../data/raw/gmb-2.2.0/data/p37/d0625/en.tags',
 '../data/raw/gmb-2.2.0/data/p37/d0686/en.tags']

# Set-up

In [4]:
# Frequency tables filled in during preprocessing: ner_tags counts the raw
# tags read from the corpus, iob_tags counts the tags after IOB conversion.
ner_tags, iob_tags = collections.Counter(), collections.Counter()

# Functions

In [5]:
def strip_ner_subcat(tag: str) -> str:
    """
    Return the main NER category of a tag, i.e. everything before the
    first "-" (the whole tag when it contains no "-").
    """
    category, _sep, _subcategory = tag.partition("-")
    return category

In [6]:
def iob_format(ners: list, tag_counter=None) -> list:
    """
    Converts IO tags into IOB format.

    A non-"O" tag becomes "I-<tag>" when the tag immediately before it in
    the input is identical, and "B-<tag>" otherwise (including at position
    0). "O" tags are passed through unchanged.

    Args:
        ners: Sequence of tag strings for one sentence (e.g. ["O", "geo"]).
        tag_counter: Optional counter-like mapping that is incremented once
            per emitted tag. When omitted, the module-level ``iob_tags``
            counter is updated, which is the original behaviour.

    Returns:
        A new list with one IOB tag per input tag; the input is not modified.
    """
    if tag_counter is None:
        # Default to the notebook-wide tally so existing callers keep working.
        tag_counter = iob_tags

    iob_tokens = []
    previous = None  # raw (unprefixed) tag of the preceding position
    for tag in ners:
        if tag == "O":
            iob_tag = tag
        elif tag == previous:
            iob_tag = "I-" + tag
        else:
            # First token of the sentence or start of a new entity.
            iob_tag = "B-" + tag

        iob_tokens.append(iob_tag)
        tag_counter[iob_tag] += 1
        previous = tag

    return iob_tokens

# Preparing data

1. A counter is set up for the **number of sentences**.
2. A list that collects the paths of the written files is also initialized.
3. As processed files are written out, their paths are appended to the
`outfiles` variable.
4. This list will be used later to load all the data and to train the
model.
5. Each file is read and split on two consecutive newline characters (a blank line), which is the marker
for the end of a sentence in the file. Only the actual words, **POS** tokens, and **NER**
tokens are used from the file.
6. Once these are collected, a new CSV file is written with
three columns: the sentence, a sequence of **NER** tags (in IOB format), and a sequence of **POS** tags.

In [None]:
total_sentences = 0
outfiles = []
path = "../data/preprocessed/ner/"
# Create the output directory up front so the first write cannot fail on a
# fresh checkout.
os.makedirs(path, exist_ok=True)

for idx, file in enumerate(fnames):
    # Read bytes and decode explicitly so parsing does not depend on the
    # platform's default encoding.
    with open(file, "rb") as content:
        data = content.read().decode("utf-8").strip()

    # A blank line separates consecutive sentences.
    sentences = data.split("\n\n")
    print(idx, file, len(sentences))
    total_sentences += len(sentences)

    # The index prefix keeps output names distinct when input files share a
    # base name.
    out_path = path + str(idx) + "-" + os.path.basename(file)

    # newline="" is what the csv module expects for files it writes, and the
    # explicit UTF-8 encoding matches how the input was decoded.
    with open(out_path, "w", newline="", encoding="utf-8") as outfile:
        outfiles.append(out_path)
        writer = csv.writer(outfile)

        for sentence in sentences:
            words, pos, ner = [], [], []

            # One token per line, tab-separated: column 0 is the word,
            # column 1 the POS tag and column 3 the NER tag.
            for tok in sentence.split("\n"):
                t = tok.split("\t")
                words.append(t[0])
                pos.append(t[1])
                ner_tags[t[3]] += 1
                ner.append(strip_ner_subcat(t[3]))

            # Row layout: text, IOB-formatted NER tags, POS tags.
            writer.writerow(
                [" ".join(words), " ".join(iob_format(ner)), " ".join(pos)]
            )

# Data Analysis

In [8]:
# Report how many sentences were counted across all input files.
print("total number of sentences: ", total_sentences)

total number of sentences:  62010


In [9]:
# Tag frequencies: the raw tags as read from the corpus, then the tags after
# IOB conversion.
print(ner_tags)
print(iob_tags)

Counter({'O': 1146068, 'geo-nam': 58388, 'org-nam': 48034, 'per-nam': 23790, 'gpe-nam': 20680, 'tim-dat': 12786, 'tim-dow': 11404, 'per-tit': 9800, 'per-fam': 8152, 'tim-yoc': 5290, 'tim-moy': 4262, 'per-giv': 2413, 'tim-clo': 891, 'art-nam': 866, 'eve-nam': 602, 'nat-nam': 300, 'tim-nam': 146, 'eve-ord': 107, 'org-leg': 60, 'per-ini': 60, 'per-ord': 38, 'tim-dom': 10, 'per-mid': 1, 'art-add': 1})
Counter({'O': 1146068, 'B-geo': 48876, 'B-tim': 26296, 'B-org': 26195, 'I-per': 22270, 'B-per': 21984, 'I-org': 21899, 'B-gpe': 20436, 'I-geo': 9512, 'I-tim': 8493, 'B-art': 503, 'B-eve': 391, 'I-art': 364, 'I-eve': 318, 'I-gpe': 244, 'B-nat': 238, 'I-nat': 62})


# Normalizing and vectorizing data

In [10]:
# glob returns paths in arbitrary, filesystem-dependent order. Sorting makes
# the row order (and therefore the positional train/test split further down)
# reproducible between runs and machines.
files = sorted(glob.glob("../data/preprocessed/ner/*.tags"))

# dtype=str and keep_default_na=False stop pandas from reinterpreting cell
# contents: without them a sentence such as "NA" or "2005" could be loaded as
# NaN or as a number instead of text.
data_pd = pd.concat(
    [
        pd.read_csv(
            f,
            header=None,
            names=["text", "label", "pos"],
            dtype=str,
            keep_default_na=False,
        )
        for f in files
    ],
    ignore_index=True,
)

In [11]:
# Summarise the combined frame: row count, columns, non-null counts, dtypes.
data_pd.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 62010 entries, 0 to 62009
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    62010 non-null  object
 1   label   62010 non-null  object
 2   pos     62010 non-null  object
dtypes: object(3)
memory usage: 1.4+ MB


In [12]:
# The text is already tokenised (one space between tokens), and every token
# has exactly one NER tag and one POS tag. The tokenizers must therefore never
# drop or split a token: filtering characters such as "[", "]", "\" or "^"
# from the text would remove or break those tokens in the word sequence while
# their tags stay in the label sequence, shifting words against labels.
# All three tokenizers use the same minimal filter set (tab/newline only).
text_tok = Tokenizer(filters="\t\n", lower=False, split=" ", oov_token="<OOV>")

pos_tok = Tokenizer(filters="\t\n", lower=False, split=" ", oov_token="<OOV>")

ner_tok = Tokenizer(filters="\t\n", lower=False, split=" ", oov_token="<OOV>")

In [13]:
# Build each tokenizer's vocabulary from its own column of the data frame.
for tokenizer, column in ((text_tok, "text"), (pos_tok, "pos"), (ner_tok, "label")):
    tokenizer.fit_on_texts(data_pd[column])

In [14]:
# Snapshot the fitted tokenizers' configuration dictionaries.
ner_config, text_config = ner_tok.get_config(), text_tok.get_config()

In [15]:
# Inspect the NER tokenizer's configuration (settings, counts, index maps).
print(ner_config)

{'num_words': None, 'filters': '\t\n', 'lower': False, 'split': ' ', 'char_level': False, 'oov_token': '<OOV>', 'document_count': 62010, 'word_counts': '{"B-gpe": 20436, "O": 1146068, "B-geo": 48876, "B-tim": 26296, "B-per": 21984, "I-per": 22270, "B-org": 26195, "I-org": 21899, "I-geo": 9512, "B-nat": 238, "I-nat": 62, "I-tim": 8493, "B-art": 503, "B-eve": 391, "I-eve": 318, "I-gpe": 244, "I-art": 364}', 'word_docs': '{"O": 61999, "B-gpe": 16565, "B-geo": 31660, "B-tim": 22345, "B-per": 17499, "I-per": 13805, "B-org": 20478, "I-org": 11011, "I-geo": 7738, "B-nat": 211, "I-nat": 50, "I-tim": 5526, "B-art": 425, "I-eve": 201, "B-eve": 361, "I-gpe": 224, "I-art": 207}', 'index_docs': '{"2": 61999, "9": 16565, "3": 31660, "4": 22345, "7": 17499, "6": 13805, "5": 20478, "8": 11011, "10": 7738, "17": 211, "18": 50, "11": 5526, "12": 425, "15": 201, "13": 361, "16": 224, "14": 207}', 'index_word': '{"1": "<OOV>", "2": "O", "3": "B-geo", "4": "B-tim", "5": "B-org", "6": "I-per", "7": "B-per",

In [16]:
# The "index_word" entry of a tokenizer config is a JSON-encoded string, so
# decode it with a JSON parser instead of eval(): eval executes arbitrary
# code and would fail on JSON-only literals such as null/true/false.
text_vocab = json.loads(text_config["index_word"])
ner_vocab = json.loads(ner_config["index_word"])

In [17]:
# Vocabulary sizes; both counts include the "<OOV>" entry at index 1.
print("Unique words in vocab:", len(text_vocab))
print("Unique NER tags in vocab:", len(ner_vocab))

Unique words in vocab: 39422
Unique NER tags in vocab: 18


In [18]:
# Encode each sentence and its tag string with the fitted tokenizers.
x_tok = text_tok.texts_to_sequences(data_pd["text"])
y_tok = ner_tok.texts_to_sequences(data_pd["label"])

In [19]:
max_len = 50

# Inputs and labels share identical padding settings so that every token
# stays aligned with its tag.
pad_kwargs = {"maxlen": max_len, "padding": "post"}
x_pad = sequence.pad_sequences(x_tok, **pad_kwargs)
y_pad = sequence.pad_sequences(y_tok, **pad_kwargs)

In [20]:
# Confirm inputs and labels were padded to the same shape.
x_pad.shape, y_pad.shape

((62010, 50), (62010, 50))

In [21]:
# Tokenizer indices start at 1, so one extra class covers the 0 used as padding.
num_classes = 1 + len(ner_vocab)

# One-hot encode the padded tag indices along a new trailing axis.
Y = tf.keras.utils.to_categorical(y_pad, num_classes)
Y.shape

(62010, 50, 19)

# Building and training the BiLSTM Model

In [22]:
# Embedding input size: tokenizer indices start at 1, so +1 makes room for
# index 0, which the Embedding layer masks (mask_zero=True).
vocab_size = len(text_vocab) + 1

# Width of each word embedding vector.
embedding_dim = 64

# Units per LSTM direction (also the width of the intermediate Dense layer).
rnn_units = 100

# Batch size baked into the model's input shape and used for fit/evaluate.
BATCH_SIZE = 90

# Number of output classes: NER tag vocabulary plus index 0.
num_classes = len(ner_vocab) + 1

# Dropout rate passed to the LSTM layer; read as a global by build_model_bilstm.
dropout = 0.2

In [23]:
def build_model_bilstm(
    vocab_size, embedding_dim, rnn_units, batch_size, classes
) -> tf.keras.Model:
    """
    Build an (uncompiled) BiLSTM sequence-tagging model.

    Architecture: masked Embedding -> Bidirectional LSTM returning the full
    sequence -> per-timestep Dense(relu) -> Dense(softmax) over the tag
    classes.

    Args:
        vocab_size: Size of the Embedding input vocabulary.
        embedding_dim: Dimension of the embedding vectors.
        rnn_units: Units per LSTM direction; also the width of the
            intermediate Dense layer.
        batch_size: Fixed batch dimension of the model input; the sequence
            length is left variable.
        classes: Number of output classes of the final softmax layer.

    Returns:
        The assembled ``tf.keras.Sequential`` model.

    Note:
        The LSTM dropout rate is read from the module-level ``dropout``
        variable.
    """
    model = tf.keras.Sequential(
        [
            Embedding(
                vocab_size,
                embedding_dim,
                # Index 0 is treated as padding and masked downstream.
                mask_zero=True,
                batch_input_shape=[batch_size, None],
            ),
            Bidirectional(
                LSTM(
                    units=rnn_units,
                    return_sequences=True,
                    dropout=dropout,
                    kernel_initializer=tf.keras.initializers.he_normal(),
                )
            ),
            TimeDistributed(Dense(rnn_units, activation="relu")),
            # Use the `classes` argument rather than the global `num_classes`,
            # so the function honours what the caller passes in.
            Dense(classes, activation="softmax"),
        ]
    )

    return model

In [24]:
# Instantiate the BiLSTM tagger from the hyperparameters defined above.
# NOTE(review): the following cell calls build_model_bilstm again with the
# same arguments and rebinds `model`, so this instance is discarded — this
# cell looks redundant.
model = build_model_bilstm(
    vocab_size=vocab_size,
    embedding_dim=embedding_dim,
    rnn_units=rnn_units,
    batch_size=BATCH_SIZE,
    classes=num_classes,
)

2023-10-23 16:56:52.219885: I tensorflow/compiler/jit/xla_cpu_device.cc:41] Not creating XLA devices, tf_xla_enable_xla_devices not set
2023-10-23 16:56:52.235463: W tensorflow/stream_executor/platform/default/dso_loader.cc:60] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2023-10-23 16:56:52.235492: W tensorflow/stream_executor/cuda/cuda_driver.cc:326] failed call to cuInit: UNKNOWN ERROR (303)
2023-10-23 16:56:52.235513: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (pop-os): /proc/driver/nvidia/version does not exist
2023-10-23 16:56:52.238473: I tensorflow/compiler/jit/xla_gpu_device.cc:99] Not creating XLA devices, tf_xla_enable_xla_devices not set


In [25]:
# Build the network, print its layer summary, then attach optimizer and loss.
model = build_model_bilstm(
    vocab_size,
    embedding_dim,
    rnn_units,
    BATCH_SIZE,
    num_classes,
)
model.summary()
model.compile(
    loss="categorical_crossentropy",
    optimizer="adam",
    metrics=["accuracy"],
)

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (90, None, 64)            2523072   
_________________________________________________________________
bidirectional_1 (Bidirection (90, None, 200)           132000    
_________________________________________________________________
time_distributed_1 (TimeDist (90, None, 100)           20100     
_________________________________________________________________
dense_3 (Dense)              (90, None, 19)            1919      
Total params: 2,677,091
Trainable params: 2,677,091
Non-trainable params: 0
_________________________________________________________________


In [26]:
X = x_pad

# create training and testing splits
# Take the sentence count from the data itself instead of a hard-coded 62010,
# so the split stays correct if the corpus or the preprocessing changes.
total_sentences = len(X)
# Number of whole batches (roughly 20% of the data) reserved for testing.
test_size = round(total_sentences / BATCH_SIZE * 0.2)
split_at = BATCH_SIZE * test_size

# The first `split_at` rows form the test set; the remainder is for training.
X_test, Y_test = X[:split_at], Y[:split_at]
X_train, Y_train = X[split_at:], Y[split_at:]

In [27]:
# Check the shapes of the train and test partitions.
X_train.shape, Y_train.shape, X_test.shape, Y_test.shape

((49590, 50), (49590, 50, 19), (12420, 50), (12420, 50, 19))

In [28]:
# Train on the training partition for 15 epochs using the model's batch size.
model.fit(X_train, Y_train, batch_size=BATCH_SIZE, epochs=15)

2023-10-23 16:56:55.723321: W tensorflow/core/framework/cpu_allocator_impl.cc:80] Allocation of 188442000 exceeds 10% of free system memory.
2023-10-23 16:56:56.008152: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:116] None of the MLIR optimization passes are enabled (registered 2)
2023-10-23 16:56:56.048415: I tensorflow/core/platform/profile_utils/cpu_utils.cc:112] CPU Frequency: 3393245000 Hz


Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<tensorflow.python.keras.callbacks.History at 0x7fc147ceae20>

In [29]:
# Evaluate on the held-out partition; returns the loss followed by the
# compiled "accuracy" metric.
model.evaluate(X_test, Y_test, batch_size=BATCH_SIZE)



[0.09432904422283173, 0.9620593190193176]

# Saving BiLSTM model

In [31]:
# Make sure the target directory exists before writing the HDF5 file, so the
# save does not fail on a fresh checkout without a ../models folder.
model_path = "../models/BiLSTM.h5"
os.makedirs(os.path.dirname(model_path), exist_ok=True)
model.save(model_path)