In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
# Show which TensorFlow version the kernel has loaded.
print('Tensorflow version: ', tf.__version__)

In [None]:
from google.colab import drive
# Mount Google Drive under /content/drive (only works inside Google Colab).
# NOTE(review): the CSV below is read via a relative path, not from the mount — confirm the mount is needed.
drive.mount('/content/drive')

In [None]:
# Load the token-level NER table (one row per token).
df = pd.read_csv("NER dataset.csv", encoding='utf-8')
# Forward-fill empty cells from the previous row. `DataFrame.ffill()` replaces the
# deprecated `fillna(method='ffill')` spelling and produces the same result.
# NOTE(review): presumably only 'Sentence Id' is sparse (given on the first token
# of each sentence) — confirm no other column is being filled unintentionally.
df = df.ffill()
# Preview: every row except the last five (pandas truncates the rendering).
df.head(-5)

In [None]:
# Number of distinct sentences, words and tags, in that order.
tuple(df[column].nunique() for column in ('Sentence Id', 'Word', 'Tag'))

In [None]:
# How many tokens carry each tag, as a two-column ('Tag', 'counts') table.
df.groupby('Tag').size().rename('counts').reset_index()

In [None]:
from sklearn import preprocessing
# Fit a label encoder on the tag column so every distinct tag string gets an integer id.
le = preprocessing.LabelEncoder()
le.fit(df['Tag'])

In [None]:
# Store the encoder's integer id for each token's tag in a new 'Enc_tag' column.
df['Enc_tag'] = le.transform(df['Tag'])

In [None]:
class SentenceGetter:
    """Collect the words of every sentence from a token-level dataframe.

    The dataframe must have a 'Sentence Id' column and a 'Word' column.

    Attributes:
        n_sent: set to 1 on construction; not used by this class itself.
        df: the dataframe that was passed in.
        grouped: Series indexed by 'Sentence Id' whose values are word lists.
        sentences: the same word lists as a plain Python list, in group order.
    """

    def __init__(self, df):
        self.n_sent = 1
        self.df = df
        # Select the column before applying: this avoids running `apply` over
        # whole sub-frames (deprecated when it touches the grouping column)
        # and yields the same Series of lists.
        self.grouped = self.df.groupby('Sentence Id')['Word'].apply(
            lambda column: column.tolist()
        )
        self.sentences = list(self.grouped)

In [None]:
# One list of words per sentence.
getter = SentenceGetter(df)
sentences = getter.sentences

In [None]:
class POSGetter:
    """Collect the POS tags of every sentence from a token-level dataframe.

    The dataframe must have a 'Sentence Id' column and a 'POS' column.

    Attributes:
        n_sent: set to 1 on construction; not used by this class itself.
        df: the dataframe that was passed in.
        grouped: Series indexed by 'Sentence Id' whose values are POS-tag lists.
        sentences: the same POS-tag lists as a plain Python list, in group order.
    """

    def __init__(self, df):
        self.n_sent = 1
        self.df = df
        # Select the column before applying: this avoids running `apply` over
        # whole sub-frames (deprecated when it touches the grouping column)
        # and yields the same Series of lists.
        self.grouped = self.df.groupby('Sentence Id')['POS'].apply(
            lambda column: column.tolist()
        )
        self.sentences = list(self.grouped)

In [None]:
# One list of POS tags per sentence (rebinds `getter`).
getter = POSGetter(df)
POS_ = getter.sentences

In [None]:
class TagGetter:
    """Collect the encoded tags of every sentence from a token-level dataframe.

    The dataframe must have a 'Sentence Id' column and an 'Enc_tag' column.

    Attributes:
        n_sent: set to 1 on construction; not used by this class itself.
        df: the dataframe that was passed in.
        grouped: Series indexed by 'Sentence Id' whose values are tag-id lists.
        sentences: the same tag-id lists as a plain Python list, in group order.
    """

    def __init__(self, df):
        self.n_sent = 1
        self.df = df
        # Select the column before applying: this avoids running `apply` over
        # whole sub-frames (deprecated when it touches the grouping column)
        # and yields the same Series of lists.
        self.grouped = self.df.groupby('Sentence Id')['Enc_tag'].apply(
            lambda column: column.tolist()
        )
        self.sentences = list(self.grouped)

In [None]:
# One list of encoded tag ids per sentence (rebinds `getter`).
getter = TagGetter(df)
Tags= getter.sentences

In [None]:
# Assemble one row per sentence. The Word / POS / Tag lists were produced by
# groupby('Sentence Id'), which emits groups in sorted-key order, so the id
# column must come from the same grouping. Using unique() (order of appearance)
# attaches ids to the wrong sentences whenever the ids are not already sorted,
# and mismatches in length if any id is missing.
sentence_ids = df.groupby('Sentence Id').size().index.to_numpy()
data = {'Sentence Id': sentence_ids, 'Word': sentences, 'POS': POS_, 'Tag': Tags}
df1 = pd.DataFrame(data=data)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the sentences for testing; the fixed random_state keeps the split repeatable.
training_dataset, testing_dataset = train_test_split(df1, test_size=0.2, random_state=2018)

# Conditional Random Fields classifier


In [None]:
!pip install -q tensorflow-addons  # version >= 0.15.0 is required
!pip install -q tensorflow
!pip install -q datasets

In [None]:
import pandas as pd
from datasets import Dataset

def dataframe_to_conll(df):
    """Serialise a tagged dataframe as CoNLL-style text.

    Args:
        df: dataframe with 'Word', 'POS' and 'Tag' columns. A cell may hold
            either a single value (token-level row) or a sequence of values
            (sentence-level row, one entry per token).

    Returns:
        A string with one ``word pos tag`` line per token. Sentence-level rows
        are separated from preceding output by a blank line; an empty dataframe
        gives an empty string.
    """
    def _is_token_sequence(cell):
        # Strings are iterable but represent a single token.
        return hasattr(cell, '__iter__') and not isinstance(cell, (str, bytes))

    conll_lines = []
    for _, row in df.iterrows():
        word, pos, ner = row['Word'], row['POS'], row['Tag']
        if _is_token_sequence(word):
            token_lines = [f"{w} {p} {n}" for w, p, n in zip(word, pos, ner)]
            if not token_lines:
                continue  # nothing to write for an empty sentence
            if conll_lines:
                conll_lines.append('')  # blank line marks the sentence boundary
            conll_lines.extend(token_lines)
        else:
            conll_lines.append(f"{word} {pos} {ner}")

    return '\n'.join(conll_lines)

# Text dump of the training split (not referenced again in the cells shown here).
conll_dataset = dataframe_to_conll(training_dataset)
# Wrap the training split in a `datasets.Dataset`; later cells read its 'Word' and 'Tag' columns.
dataset = Dataset.from_pandas(training_dataset)

In [None]:
import copy
import tensorflow_addons as tfa

The dataset also gives information about the mapping between NER tags and ids.

In [None]:
# Distinct tag strings found in the dataframe.
raw_tags = df.Tag.unique().tolist()
print(raw_tags)

Add a special tag `<PAD>` to the tag set which is used to represent a padding in the sequence. In NLP, 0 is usually used to mark padding.

In [None]:
# Put the padding tag at the front of the tag list.
tags = ['<PAD>'] + raw_tags
print(tags)

In [None]:
from sklearn import preprocessing
# Second encoder covering '<PAD>' plus the real tags; it is used later to turn
# model outputs back into tag strings.
# NOTE(review): the `tag_ids + 1` shift in dataset_preprocess only lines up with
# this encoder if '<PAD>' receives id 0 here and the remaining ids follow `le`
# in the same order — confirm against le2.classes_.
le2 = preprocessing.LabelEncoder()
le2.fit(tags)

Define some constants which will be used later.

In [None]:
# Number of tag classes including '<PAD>'; passed to the CRF layer when the model is built.
TAG_SIZE = len(tags)
# Cap on the token vocabulary; also passed as the embedding layer's input size.
VOCAB_SIZE = 20000

Building vocabulary lookup layer for tokens.

In [None]:
# Build a ragged string tensor from the training sentences and lowercase every token.
train_tokens = tf.ragged.constant(dataset['Word'])
train_tokens = tf.map_fn(tf.strings.lower, train_tokens)

# Learn the token vocabulary (at most VOCAB_SIZE entries, with explicit mask and
# out-of-vocabulary tokens) from the lowercased training tokens.
lookup_layer = tf.keras.layers.StringLookup(max_tokens=VOCAB_SIZE, mask_token="[MASK]", oov_token="[UNK]")
lookup_layer.adapt(train_tokens)

# Sanity check: vocabulary size and its first ten entries.
print(len(lookup_layer.get_vocabulary()))
print(lookup_layer.get_vocabulary()[:10])

Create the raw (not yet preprocessed) training dataset.

In [None]:
def create_data_generator(dataset):
    """Return a zero-argument generator function over `dataset`.

    Each call to the returned function walks `dataset` from the start and
    yields one ``(item['Word'], item['Tag'])`` pair per item.
    """
    def data_generator():
        yield from ((record['Word'], record['Tag']) for record in dataset)

    return data_generator

# Every generated element is a (tokens, tag ids) pair: two variable-length 1-D
# tensors, strings and int32 respectively.
data_signature= (
        tf.TensorSpec(shape=(None,), dtype=tf.string),
        tf.TensorSpec(shape=(None, ), dtype=tf.int32)
)

# Stream the training examples into a tf.data pipeline.
train_data = tf.data.Dataset.from_generator(
    create_data_generator(dataset),
    output_signature=data_signature
)

In [None]:
def dataset_preprocess(tokens, tag_ids):
    """Turn one raw (tokens, tag_ids) example into model-ready ids.

    Tokens go through `preprecess_tokens`; every tag id is shifted up by one
    because `<PAD>` was added as the first element of the tags list.
    """
    return preprecess_tokens(tokens), tag_ids + 1

def preprecess_tokens(tokens):
    """Lowercase `tokens` and map them to integer ids with `lookup_layer`."""
    return lookup_layer(tf.strings.lower(tokens))

# Number of sentences per training batch.
BATCH_SIZE = 64

# With `padded_batch()`, sequences are padded within each batch, so different
# batches may have different lengths.
# shape: (batch_size, None)
# NOTE(review): no padding value is given — presumably the default 0, which
# would coincide with the mask token id and the '<PAD>' tag id; confirm.
# `.cache()` keeps the preprocessed batches so later epochs skip the generator.
train_dataset = (
    train_data.map(dataset_preprocess)
    .padded_batch(batch_size=BATCH_SIZE).cache()
)

In [None]:
# Build the model
def build_embedding_bilstm_crf_model(
    vocab_size: int, embed_dims: int, lstm_unit: int, tag_size: int
) -> tf.keras.Model:
    """Assemble an Embedding -> bidirectional LSTM -> CRF tagging model.

    Args:
        vocab_size: input size of the embedding layer.
        embed_dims: width of each token embedding.
        lstm_unit: units of the LSTM wrapped by the bidirectional layer.
        tag_size: number of tags handled by the CRF layer.

    Returns:
        A Keras model taking int32 token ids (input named "x") and returning
        the CRF layer's four outputs: decoded sequence, potentials, sequence
        length and kernel.
    """
    token_ids = tf.keras.layers.Input(shape=(None,), dtype=tf.int32, name="x")

    # Id 0 is treated as padding and masked in the downstream layers.
    embedding = tf.keras.layers.Embedding(vocab_size, embed_dims, mask_zero=True)
    bilstm = tf.keras.layers.Bidirectional(
        tf.keras.layers.LSTM(lstm_unit, return_sequences=True)
    )
    crf = tfa.layers.CRF(tag_size)

    crf_outputs = crf(bilstm(embedding(token_ids)))
    decode_sequence, potentials, sequence_length, kernel = crf_outputs

    return tf.keras.Model(
        inputs=token_ids,
        outputs=[decode_sequence, potentials, sequence_length, kernel],
    )


# Instantiate the tagger with 32-dimensional embeddings and 128 LSTM units.
model = build_embedding_bilstm_crf_model(VOCAB_SIZE, 32, 128, TAG_SIZE)

### Define CRF loss function

Using the real y and some internal variables of the CRF layer, you can compute the log-likelihood of the real y. Use the negative log-likelihood as the loss to optimize.

In [None]:
@tf.function
def crf_loss_func(potentials, sequence_length, kernel, y):
    """Mean negative CRF log-likelihood of the true tag sequences `y`.

    `potentials`, `sequence_length` and `kernel` are the corresponding outputs
    of the model's CRF layer.
    """
    log_likelihood = tfa.text.crf_log_likelihood(
        potentials, y, sequence_length, kernel
    )[0]
    # Negate per sequence, then average.
    return tf.reduce_mean(-1 * log_likelihood)

### Define optimizer, metrics and train_step function

In [None]:
# Adam optimizer constructed with 0.02 as its first argument (the learning rate).
optimizer = tf.keras.optimizers.Adam(0.02)

# Running mean of the per-batch loss; the training loop resets it every epoch.
train_loss = tf.keras.metrics.Mean(name="train_loss")

@tf.function(experimental_relax_shapes=True)
def train_step(x, y):
    """Run one optimisation step on a batch and record its loss.

    Args:
        x: batch of token ids fed to `model`.
        y: batch of true tag ids used for the CRF loss.
    """
    with tf.GradientTape() as tape:
        _, potentials, sequence_length, kernel = model(x)
        # CRF negative log-likelihood plus any losses registered on the model.
        loss = crf_loss_func(potentials, sequence_length, kernel, y) + tf.reduce_sum(
            model.losses
        )

    variables = model.trainable_variables
    optimizer.apply_gradients(zip(tape.gradient(loss, variables), variables))

    train_loss(loss)

### Training model

In [None]:
# Number of full passes over the training batches.
EPOCHS = 40

for epoch in range(EPOCHS):
    # Reset the metrics at the start of the next epoch
    train_loss.reset_states()

    # One optimizer update per padded batch.
    for x, y in train_dataset:
        train_step(x, y)

    # Report the mean loss accumulated over this epoch.
    print(f"Epoch {epoch + 1}, " f"Loss: {train_loss.result()}")


### Making inference

In [None]:
# Pick one held-out sentence and show its raw tokens.
sample = testing_dataset.iloc[15]
test_text = sample['Word']
print("raw inputs: ", test_text)

# Tokens -> vocabulary ids, then add a leading batch dimension of size 1.
preprocessed_inputs = preprecess_tokens(test_text)
inputs = tf.reshape(preprocessed_inputs, shape=[1, -1])

# The first model output is the decoded tag-id sequence; the rest is ignored here.
outputs, *_ = model.predict(inputs)

# Decode ids to tag strings: predictions via le2 (includes '<PAD>'),
# stored ground-truth ids via le.
prediction = list(le2.inverse_transform(outputs[0]))
ground_true_tags = list(le.inverse_transform(sample['Tag']))

print("ground true tags: ", ground_true_tags)
print("predicted tags: ", prediction)

In [None]:
# Collect token-level true and predicted tag ids (in `le`'s id space) over the
# whole test split.
y_true = []
y_pred = []
for i in range(len(testing_dataset)):
    # preprocess
    preprocessed_inputs = preprecess_tokens(
        testing_dataset['Word'].iloc[i]
    )
    # expand the batch dim
    inputs = tf.reshape(preprocessed_inputs, shape=[1, -1])

    # verbose=0: predict() is called once per sentence and would otherwise
    # print a progress bar on every call.
    outputs, *_ = model.predict(inputs, verbose=0)
    prediction = le2.inverse_transform(outputs[0])
    # '<PAD>' exists only in le2, so le.transform() would raise if the model
    # ever emitted it; score such a prediction as the non-entity tag 'O'.
    # NOTE(review): assumes 'O' is one of the dataset's tags — confirm.
    prediction = ['O' if tag == '<PAD>' else tag for tag in prediction]
    prediction = list(le.transform(prediction))
    # The stored tags are already `le` ids, so no decode/encode round trip is needed.
    ground_true_tags = list(testing_dataset['Tag'].iloc[i])
    y_true.extend(ground_true_tags)
    y_pred.extend(prediction)

In [None]:
# Every tag id that occurs in either the truth or the predictions, decoded to its tag string.
myset = set(y_true) | set(y_pred)
lst = [tag_id for tag_id in myset]
labels = le.inverse_transform(lst).tolist()

In [None]:
# Convert the collected integer ids back to tag strings for the reports below.
# NOTE: y_true / y_pred are rebound in place, so this cell is not idempotent —
# re-running it without re-running the evaluation loop passes strings to
# inverse_transform.
y_true = le.inverse_transform(y_true)
y_pred = le.inverse_transform(y_pred)

In [None]:
from sklearn.metrics import classification_report
# Token-level precision / recall / F1 for each decoded tag string.
print(classification_report(y_true, y_pred))

In [None]:
# Reduce every tag to a coarser label: 'O' is kept as is, any other tag loses
# its first two characters (presumably a 'B-' / 'I-' style prefix).
p = [tag if tag == 'O' else tag[2:] for tag in y_pred]
t = [tag if tag == 'O' else tag[2:] for tag in y_true]

In [None]:
from sklearn.metrics import classification_report
# Same report, computed on the prefix-stripped labels.
print(classification_report(t, p))

In [None]:
# Pull the 'weighted avg' row out of the dictionary form of the report.
report = classification_report(t, p, digits=4, output_dict=True)
weighted_avg = report['weighted avg']

f1_weighted = weighted_avg['f1-score']
recall_weighted = weighted_avg['recall']
precision_weighted = weighted_avg['precision']

# Print the results
print('Weighted F1 Score: ', f1_weighted)
print('Weighted Recall: ', recall_weighted)
print('Weighted Precision: ', precision_weighted)

In [None]:
report = classification_report(t, p, digits=4, output_dict=True)
# Access the macro-averaged F1 score, recall, and precision.
# These get their own `*_macro` names so that variables called `*_weighted`
# never hold macro values or overwrite the weighted results.
f1_macro = report['macro avg']['f1-score']

recall_macro = report['macro avg']['recall']
precision_macro = report['macro avg']['precision']

# Print the results

print ('Macro F1 Score: ', f1_macro)
print ('Macro Recall: ', recall_macro)
print ('Macro Precision: ', precision_macro)

# Case Study

In [None]:
import nltk
# Fetch the NLTK resources: tokenizer models, stop-word lists and the POS tagger.
# NOTE(review): newer NLTK releases may also need the 'punkt_tab' resource for
# word_tokenize — confirm against the installed version.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
# English stop words as a set (not referenced in the cells shown here).
stop_words = set(stopwords.words('english'))

# NOTE(review): duplicate of the import a few lines up.
from nltk.tokenize import word_tokenize, sent_tokenize

In [None]:
# Free-text sentence for the case study. This rebinds `test_text`, which held a
# token list in the inference cell above.
test_text = """
Although the initial vector of compromise is unclear, the details of the next stage imply the use of spear phishing or similar methods.
 """

In [None]:
# Split the case-study text into tokens with NLTK.
Case = word_tokenize(test_text)

# Tokens -> vocabulary ids, with a leading batch dimension of size 1.
preprocessed_inputs = preprecess_tokens(Case)
inputs = tf.reshape(preprocessed_inputs, shape=[1, -1])

# Keep only the decoded tag-id sequence and map it back to tag strings.
outputs, *_ = model.predict(inputs)
prediction = list(le2.inverse_transform(outputs[0]))

# Print each token next to its predicted tag.
for token, label in zip(Case, prediction):
    print("{:20}\t{}".format(token, label))