In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
# Record the TensorFlow version so the outputs below can be tied to a specific release.
print('Tensorflow version: ', tf.__version__)

Tensorflow version:  2.13.0


In [5]:
# Colab-only: mount Google Drive so that files under /content/drive
# (where the dataset CSV is read from) become accessible.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [6]:
# Load the token-level NER corpus (one row per token, as shown in the preview below).
# NOTE(review): this is an absolute Drive path; consider moving it to a config constant.
df = pd.read_csv(r"/content/drive/MyDrive/Mini ProjectRisk Msc DS/Main Project/NER/Dataset/NER dataset.csv", encoding='utf-8')
# Forward-fill missing cells from the previous row.
# `fillna(method='ffill')` is deprecated since pandas 2.1; `ffill()` is the supported equivalent.
df = df.ffill()
df.head(20)

Unnamed: 0,Word,POS,Sentence Id,Tag
0,Chinese,JJ,1,O
1,tech,NN,1,O
2,giant,NN,1,O
3,Alibaba,NNP,1,B-IDENTITY
4,has,VBZ,1,O
5,reportedly,RB,1,O
6,been,VBN,1,O
7,shunned,VBN,1,O
8,by,IN,1,O
9,Chinas,NNP,1,B-LOCATION


In [42]:
# Number of token rows carrying each tag, shown as a two-column table.
rows_per_tag = df.groupby('Tag').size()
rows_per_tag.reset_index(name='counts')

Unnamed: 0,Tag,counts
0,B-ASSET,127
1,B-ATTACK_TYPE,715
2,B-CAMPAIGN,108
3,B-DATE_TIME,968
4,B-IDENTITY,3972
5,B-INDICATOR,345
6,B-INFRASTRUCTURE,348
7,B-LOCATION,1431
8,B-MALWARE,1111
9,B-THREAT_ACTOR,470


In [7]:
# Size of the raw word vocabulary and of the tag set.
unique_word_count = df['Word'].nunique()
unique_tag_count = df['Tag'].nunique()
print('Unique words in corpus: ', unique_word_count)
print('Unique tags in corpus: ', unique_tag_count)

Unique words in corpus:  16201
Unique tags in corpus:  25


In [8]:
from sklearn import preprocessing
# Fit the tag encoder and store the integer tag ids next to the string tags.
# `le` is used again later to decode the true tags during evaluation.
le = preprocessing.LabelEncoder()
df['Enc_tag'] = le.fit_transform(df['Tag'])

##  Retrieve sentences and corresponding tags


In [10]:
class SentenceGetter(object):
    """Group the token-level frame into one list of words per sentence.

    Attributes:
        n_sent: Always 1; not read in this class, kept for backward compatibility.
        df: The token-level DataFrame that was passed in.
        grouped: Series indexed by 'Sentence Id' (sorted, groupby default)
            whose values are the lists of words of each sentence.
        sentences: The values of ``grouped`` as a plain list of lists.
    """

    def __init__(self, df):
        self.n_sent = 1
        self.df = df
        # Select the column before aggregating: this avoids applying a lambda
        # to whole sub-frames (including the grouping column).
        self.grouped = self.df.groupby('Sentence Id')['Word'].apply(list)
        self.sentences = self.grouped.tolist()

In [11]:
# Collect the per-sentence word lists from the token-level frame.
getter = SentenceGetter(df)
sentences = getter.sentences

In [12]:
class POSGetter(object):
    """Group the token-level frame into one list of POS values per sentence.

    Attributes:
        n_sent: Always 1; not read in this class, kept for backward compatibility.
        df: The token-level DataFrame that was passed in.
        grouped: Series indexed by 'Sentence Id' (sorted, groupby default)
            whose values are the lists of 'POS' entries of each sentence.
        sentences: The values of ``grouped`` as a plain list of lists.
    """

    def __init__(self, df):
        self.n_sent = 1
        self.df = df
        # Select the column before aggregating: this avoids applying a lambda
        # to whole sub-frames (including the grouping column).
        self.grouped = self.df.groupby('Sentence Id')['POS'].apply(list)
        self.sentences = self.grouped.tolist()

In [13]:
# Collect the per-sentence POS lists (note: `getter` is rebound here).
getter = POSGetter(df)
POS_ = getter.sentences

In [14]:
class TagGetter(object):
    """Group the token-level frame into one list of encoded tags per sentence.

    Attributes:
        n_sent: Always 1; not read in this class, kept for backward compatibility.
        df: The token-level DataFrame that was passed in.
        grouped: Series indexed by 'Sentence Id' (sorted, groupby default)
            whose values are the lists of 'Enc_tag' ids of each sentence.
        sentences: The values of ``grouped`` as a plain list of lists.
    """

    def __init__(self, df):
        self.n_sent = 1
        self.df = df
        # Select the column before aggregating: this avoids applying a lambda
        # to whole sub-frames (including the grouping column).
        self.grouped = self.df.groupby('Sentence Id')['Enc_tag'].apply(list)
        self.sentences = self.grouped.tolist()

In [15]:
# Collect the per-sentence lists of encoded tag ids (note: `getter` is rebound here).
getter = TagGetter(df)
Tags= getter.sentences

In [16]:
# Assemble the sentence-level frame: one row per sentence, list-valued columns.
# The getter lists come from `groupby('Sentence Id')`, which orders groups by
# sorted key, whereas `unique()` returns ids in order of first appearance.
# Sorting the ids keeps the 'Sentence Id' column aligned with the lists even
# when the ids are not already ascending in the file.
data = {
    'Sentence Id': np.sort(df['Sentence Id'].unique()),
    'Word': sentences,
    'POS': POS_,
    'Tag': Tags,
}
df1 = pd.DataFrame(data = data)

In [18]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the sentences for evaluation; the fixed seed keeps the split reproducible.
training_dataset, testing_dataset = train_test_split(
    df1,
    test_size=0.2,
    random_state=2018,
)

In [19]:
!pip install -q datasets

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.6/519.6 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m11.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m294.9/294.9 kB[0m [31m12.0 MB/s[0m eta [36m0:00:00[0m
[?25h

In [20]:
import pandas as pd
from datasets import Dataset

def dataframe_to_conll(df, sep=' '):
    """Serialise a tagged DataFrame to CoNLL-style text.

    Two row layouts are supported:

    * token-level rows, where 'Word' and 'Tag' are scalars: each row becomes
      one ``word<sep>tag`` line;
    * sentence-level rows, where 'Word' and 'Tag' are equal-length sequences:
      each token becomes one ``word<sep>tag`` line and every sentence is
      followed by a blank line.

    Args:
        df: DataFrame with 'Word' and 'Tag' columns.
        sep: Separator written between the word and its tag.

    Returns:
        The lines joined with newlines, as a single string ('' for an empty frame).

    Raises:
        ValueError: If a sentence-level row has a different number of words and tags.
    """
    conll_lines = []
    for word, ner in zip(df['Word'], df['Tag']):
        # Strings have a length too, so they must be excluded explicitly
        # before treating the cell as a sequence of tokens.
        if isinstance(word, str) or not hasattr(word, '__len__'):
            conll_lines.append(f"{word}{sep}{ner}")
            continue

        if len(word) != len(ner):
            raise ValueError(
                f"Row has {len(word)} words but {len(ner)} tags"
            )
        conll_lines.extend(f"{token}{sep}{tag}" for token, tag in zip(word, ner))
        # A blank line marks the sentence boundary.
        conll_lines.append('')

    return '\n'.join(conll_lines)

# Text export of the training split; not referenced again in the visible cells.
conll_dataset = dataframe_to_conll(training_dataset)
# Wrap the training split as a `datasets.Dataset`; its rows are iterated later
# to feed the tf.data generator and its 'Word' column builds the vocabulary.
dataset = Dataset.from_pandas(training_dataset)

In [21]:
# Distinct tag strings, in order of first appearance in the corpus.
raw_tags = df['Tag'].unique().tolist()
print(raw_tags)

['O', 'B-IDENTITY', 'B-LOCATION', 'B-TOOL', 'B-DATE_TIME', 'I-IDENTITY', 'I-DATE_TIME', 'B-MALWARE', 'B-ASSET', 'I-ASSET', 'B-ATTACK_TYPE', 'B-VULNERABILITY', 'I-LOCATION', 'B-INFRASTRUCTURE', 'I-MALWARE', 'B-THREAT_ACTOR', 'I-THREAT_ACTOR', 'I-TOOL', 'B-INDICATOR', 'I-INFRASTRUCTURE', 'I-ATTACK_TYPE', 'I-INDICATOR', 'B-CAMPAIGN', 'I-CAMPAIGN', 'I-VULNERABILITY']


## Building the tag list, token vocabulary and padded training batches

In [22]:
# Full label set: the padding label followed by every tag found in the corpus.
tags = ['<PAD>', *raw_tags]
print(tags)

['<PAD>', 'O', 'B-IDENTITY', 'B-LOCATION', 'B-TOOL', 'B-DATE_TIME', 'I-IDENTITY', 'I-DATE_TIME', 'B-MALWARE', 'B-ASSET', 'I-ASSET', 'B-ATTACK_TYPE', 'B-VULNERABILITY', 'I-LOCATION', 'B-INFRASTRUCTURE', 'I-MALWARE', 'B-THREAT_ACTOR', 'I-THREAT_ACTOR', 'I-TOOL', 'B-INDICATOR', 'I-INFRASTRUCTURE', 'I-ATTACK_TYPE', 'I-INDICATOR', 'B-CAMPAIGN', 'I-CAMPAIGN', 'I-VULNERABILITY']


In [23]:
from sklearn import preprocessing
# Second encoder, fitted on the tag set that includes '<PAD>'; it is used later
# to turn predicted class ids back into tag strings.
# NOTE(review): LabelEncoder presumably orders classes by sorted value rather than
# by position in `tags` — confirm that '<PAD>' sorts first, so that le2 ids equal
# `le` ids + 1 (the shift applied to the tag ids in `dataset_preprocess`).
le2 = preprocessing.LabelEncoder()
le2.fit(tags)

In [24]:
# Number of output classes: every raw tag plus the '<PAD>' label.
TAG_SIZE = len(tags)
# Upper bound on the token vocabulary; passed as `max_tokens` to the lookup
# layer and as the vocabulary size of the embedding layer.
VOCAB_SIZE = 20000

In [25]:
import matplotlib.pyplot as plt
import copy

import numpy as np
import tensorflow as tf

In [26]:
# Lower-case the training tokens (kept ragged: sentences differ in length).
train_tokens = tf.ragged.constant(dataset['Word'])
train_tokens = tf.map_fn(tf.strings.lower, train_tokens)

# Learn the token -> id mapping from the training split only.
lookup_layer = tf.keras.layers.StringLookup(
    max_tokens=VOCAB_SIZE,
    mask_token="[MASK]",
    oov_token="[UNK]",
)
lookup_layer.adapt(train_tokens)

# Report the vocabulary size and its first ten entries.
vocabulary = lookup_layer.get_vocabulary()
print(len(vocabulary))
print(vocabulary[:10])

12813
['[MASK]', '[UNK]', 'the', ',', '.', 'to', 'and', 'of', 'a', 'in']


In [27]:
def create_data_generator(dataset):
    """Return a zero-argument generator function over ``dataset``.

    Each call of the returned function starts a new pass over ``dataset``
    and yields one ``(item['Word'], item['Tag'])`` pair per item.
    """
    def data_generator():
        yield from ((example['Word'], example['Tag']) for example in dataset)

    return data_generator

# Element spec of the generator output: a variable-length vector of token
# strings and a variable-length vector of int32 tag ids.
data_signature= (
        tf.TensorSpec(shape=(None,), dtype=tf.string),
        tf.TensorSpec(shape=(None, ), dtype=tf.int32)
)

# Unbatched tf.data pipeline over the training sentences.
train_data = tf.data.Dataset.from_generator(
    create_data_generator(dataset),
    output_signature=data_signature
)

In [28]:
def dataset_preprocess(tokens, tag_ids):
    """Encode one (tokens, tag_ids) example for the model.

    Tokens are converted by `preprecess_tokens`. Every tag id is shifted up
    by one so that the lowest id is left free for the `<PAD>` label that was
    added to the tag set.

    Returns:
        Tuple ``(encoded_tokens, shifted_tag_ids)``.
    """
    return preprecess_tokens(tokens), tag_ids + 1

def preprecess_tokens(tokens):
    """Lower-case ``tokens`` and map them to ids with the fitted `lookup_layer`."""
    lowered = tf.strings.lower(tokens)
    token_ids = lookup_layer(lowered)
    return token_ids

# Batch size used when building the training pipeline below.
BATCH_SIZE = 128

# Encode every example, group examples into padded batches, then cache the batches.
# NOTE(review): no shuffle step is applied here, so presumably every epoch sees
# the same batches in the same order — confirm that this is intended.
train_dataset = (
    train_data.map(dataset_preprocess)
    .padded_batch(batch_size=BATCH_SIZE).cache()
)

## Build and compile a Bidirectional LSTM model


In [29]:
def build_embedding_bilstm_model(
    vocab_size: int, embed_dims: int, lstm_units: int, tag_size: int
) -> tf.keras.Model:
    """Build an Embedding -> BiLSTM -> per-token softmax tagger.

    Args:
        vocab_size: Input dimension of the embedding layer.
        embed_dims: Size of each token embedding.
        lstm_units: Units of the LSTM wrapped by the bidirectional layer.
        tag_size: Number of output classes per token.

    Returns:
        An uncompiled Keras model taking int64 token ids of variable length.
    """
    token_ids = tf.keras.layers.Input(shape=(None,), dtype=tf.int64, name="x")

    # mask_zero=True: id 0 is treated as padding by the downstream layers.
    embedding = tf.keras.layers.Embedding(vocab_size, embed_dims, mask_zero=True)
    encoder = tf.keras.layers.Bidirectional(
        tf.keras.layers.LSTM(lstm_units, return_sequences=True)
    )
    classifier = tf.keras.layers.Dense(tag_size, activation='softmax')

    tag_probabilities = classifier(encoder(embedding(token_ids)))
    return tf.keras.Model(inputs=token_ids, outputs=tag_probabilities)


# 64-dimensional embeddings and 128 LSTM units per direction.
model = build_embedding_bilstm_model(
    vocab_size=VOCAB_SIZE,
    embed_dims=64,
    lstm_units=128,
    tag_size=TAG_SIZE,
)

In [30]:
# Sparse cross-entropy: the targets are integer tag ids, not one-hot vectors.
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

## Train the model


In [31]:
# Display the element spec of the batched training pipeline as a sanity check.
train_dataset

<CacheDataset element_spec=(TensorSpec(shape=(None, None), dtype=tf.int64, name=None), TensorSpec(shape=(None, None), dtype=tf.int32, name=None))>

In [32]:
EPOCHS = 40
# NOTE: `train_dataset` was already batched by `padded_batch` when it was built,
# so this value does not change the training batches. It is kept because the
# evaluation pipeline built later reads BATCH_SIZE.
BATCH_SIZE = 32

# With a tf.data.Dataset input the batch size comes from the dataset itself, so
# `batch_size` is not passed to `fit`.
history = model.fit(train_dataset, epochs=EPOCHS)

Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40


## Evaluate Named Entity Recognition model


In [33]:
# Text export of the test split; not referenced again in the visible cells.
test_conll_dataset = dataframe_to_conll(testing_dataset)
# Separate name for the row-wise dataset, so `test_dataset` only ever refers
# to the batched tf.data pipeline.
test_hf_dataset = Dataset.from_pandas(testing_dataset)

# `create_data_generator` and `data_signature` are reused from the training
# pipeline cell instead of being redefined as identical copies here.
test_data = tf.data.Dataset.from_generator(
    create_data_generator(test_hf_dataset),
    output_signature=data_signature
)

# Same token/tag encoding and padded batching as the training pipeline.
test_dataset = (
    test_data.map(dataset_preprocess)
    .padded_batch(batch_size=BATCH_SIZE).cache()
)

In [34]:
# Evaluate on the held-out sentences; the result lists the loss followed by the
# accuracy metric configured in `compile`.
# NOTE(review): this accuracy is token-level and the 'O' class dominates the
# test tokens (see the classification reports below), so read it together with
# the per-class scores.
model.evaluate(test_dataset)



[0.20131328701972961, 0.9592112302780151]

In [None]:
import numpy as np
# Flat, token-aligned lists of predicted and true tag strings over the test split.
predicted_tags_list = []
true_tags_list = []

for words, tag_ids in zip(testing_dataset['Word'], testing_dataset['Tag']):
    # True tags were encoded with `le` (no '<PAD>' class).
    true_tags_list.extend(le.inverse_transform(tag_ids))

    # Same token preprocessing as during training, with a batch axis added.
    token_ids = preprecess_tokens(words)
    input_sequence = np.expand_dims(np.array(token_ids), axis=0)

    # Call the model directly: for a single small input this avoids the
    # per-call overhead and progress-bar output of `model.predict` in a loop.
    predictions = model(input_sequence, training=False).numpy()

    # Most probable class per token, for the single sentence in the batch.
    predicted_ids = np.argmax(predictions, axis=-1)[0]

    # Predictions use the '<PAD>'-inclusive id space, hence `le2`.
    predicted_tags_list.extend(le2.inverse_transform(predicted_ids))

# Both lists must stay token-aligned for the classification reports below.
assert len(predicted_tags_list) == len(true_tags_list)

In [37]:
from sklearn.metrics import classification_report
# Per-tag scores with the B-/I- prefixes kept as separate classes.
per_tag_report = classification_report(true_tags_list, predicted_tags_list)
print(per_tag_report)

                  precision    recall  f1-score   support

         B-ASSET       0.57      0.35      0.43        23
   B-ATTACK_TYPE       0.74      0.64      0.69        92
      B-CAMPAIGN       0.83      0.86      0.84        22
     B-DATE_TIME       0.82      0.85      0.83       167
      B-IDENTITY       0.69      0.71      0.70       821
     B-INDICATOR       0.47      0.15      0.23       117
B-INFRASTRUCTURE       0.51      0.46      0.48        61
      B-LOCATION       0.69      0.83      0.75       241
       B-MALWARE       0.81      0.85      0.83       183
  B-THREAT_ACTOR       0.58      0.73      0.64        78
          B-TOOL       0.58      0.68      0.63       149
 B-VULNERABILITY       1.00      0.31      0.47        13
         I-ASSET       0.50      0.36      0.42        22
   I-ATTACK_TYPE       0.44      0.34      0.39        35
      I-CAMPAIGN       0.85      0.85      0.85        20
     I-DATE_TIME       0.72      0.77      0.74        77
      I-IDENT

In [38]:
def strip_bio_prefix(label):
    """Return ``label`` without a leading 'B-' or 'I-' marker.

    Labels without such a marker (for example 'O' or '<PAD>') are returned
    unchanged instead of losing their first two characters.
    """
    return label[2:] if label[:2] in ('B-', 'I-') else label


# Entity-type level labels: predictions (p) and ground truth (t).
p = [strip_bio_prefix(label) for label in predicted_tags_list]
t = [strip_bio_prefix(label) for label in true_tags_list]

In [39]:
from sklearn.metrics import classification_report
# Scores per entity type, with B-/I- variants merged.
entity_report = classification_report(t, p)
print(entity_report)

                precision    recall  f1-score   support

         ASSET       0.53      0.36      0.43        45
   ATTACK_TYPE       0.68      0.57      0.62       127
      CAMPAIGN       0.86      0.88      0.87        42
     DATE_TIME       0.85      0.89      0.87       244
      IDENTITY       0.70      0.70      0.70      1256
     INDICATOR       0.54      0.23      0.32       127
INFRASTRUCTURE       0.56      0.47      0.51        77
      LOCATION       0.70      0.82      0.75       286
       MALWARE       0.80      0.84      0.82       210
             O       0.98      0.98      0.98     35809
  THREAT_ACTOR       0.60      0.74      0.66       104
          TOOL       0.58      0.58      0.58       187
 VULNERABILITY       1.00      0.42      0.59        26

      accuracy                           0.96     38540
     macro avg       0.72      0.65      0.67     38540
  weighted avg       0.96      0.96      0.96     38540



In [40]:
# Support-weighted averages, read from the dictionary form of the report.
report = classification_report(t, p, digits=4, output_dict=True)
weighted_avg = report['weighted avg']

f1_weighted = weighted_avg['f1-score']
recall_weighted = weighted_avg['recall']
precision_weighted = weighted_avg['precision']

print('Weighted F1 Score: ', f1_weighted)
print('Weighted Recall: ', recall_weighted)
print('Weighted Precision: ', precision_weighted)

Weighted F1 Score:  0.9608426430992113
Weighted Recall:  0.9615204981837052
Weighted Precision:  0.9609476419100766


In [41]:
report = classification_report(t, p, digits=4, output_dict=True)
# Macro (unweighted, per-class) averages. They are stored under their own names
# so they do not overwrite the weighted values computed in the previous cell.
macro_avg = report['macro avg']
f1_macro = macro_avg['f1-score']
recall_macro = macro_avg['recall']
precision_macro = macro_avg['precision']

# Print the results
print ('Macro F1 Score: ', f1_macro)
print ('Macro Recall: ', recall_macro)
print ('Macro Precision: ', precision_macro)

Macro F1 Score:  0.6704745239362088
Macro Recall:  0.6525347189744839
Macro Precision:  0.7221376189069535


# Case Study

In [None]:
import numpy as np

# Free-text example that is not part of the dataset.
test_sentence = "Google has agreed to pay $93 million to settle a lawsuit filed by the U.S. state of California over allegations that the company's location-privacy practices misled consumers and violated consumer protection laws."

# Whitespace tokenisation; punctuation stays attached to the neighbouring word.
tokens = test_sentence.split()

# Same preprocessing as during training, then add a batch axis for the model.
preprocessed_test_sentence = preprecess_tokens(tokens)
input_sequence = np.expand_dims(np.array(preprocessed_test_sentence), axis=0)

# Class probabilities per token -> most probable class id -> tag string.
predictions = model.predict(input_sequence)
predicted_tags = list(le2.inverse_transform(np.argmax(predictions, axis=-1)[0]))

# One "token<TAB>tag" line per token.
for token, label in zip(tokens, predicted_tags):
    print("{:20}\t{}".format(token, label))

Google              	O
has                 	O
agreed              	O
to                  	O
pay                 	O
$93                 	O
million             	O
to                  	O
settle              	O
a                   	O
lawsuit             	O
filed               	O
by                  	O
the                 	O
U.S.                	B-LOCATION
state               	O
of                  	O
California          	O
over                	O
allegations         	O
that                	O
the                 	O
company's           	O
location-privacy    	O
practices           	O
misled              	O
consumers           	O
and                 	O
violated            	O
consumer            	O
protection          	O
laws.               	B-IDENTITY
