In [1]:
import pandas as pd
import tensorflow as tf
import numpy as np

In [2]:
tf.config.list_physical_devices()

[PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU'),
 PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [3]:
tf.__version__

'2.10.0'

In [2]:
from datasets import load_dataset, load_from_disk
data = load_dataset('conll2003')

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
data['train'].features['ner_tags'].feature.names

['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']

In [168]:
oov_token = '[UNK]'
tokenizer = tf.keras.preprocessing.text.Tokenizer(oov_token=oov_token, lower=True)
tokenizer.fit_on_texts([' '.join(x) for x in data['train']['tokens']])
len(tokenizer.word_index)


17682

In [172]:
tokenizer.texts_to_sequences(["Russia's is the best".split(' ')])

[[1, 30, 2, 517]]

In [5]:
def tokenize_and_align_labels(examples, label_all_tokens=True):
  tokenized_inputs = [tokenizer.texts_to_sequences(token) for token in examples['tokens']]
  new_tokenized_inputs = []
  labels = []
  word_ids_list = []
  for i,tokenized_input in enumerate(tokenized_inputs):
    ner_tags = examples['ner_tags'][i]
    label_ids = []
    word_ids = []
    tokenized_sentence = []
    for j,tokenized_words in enumerate(tokenized_input):
      if tokenized_words:
        tokenized_sentence.extend(tokenized_words)
        word_ids.extend([j]*len(tokenized_words))
        label_ids.append(ner_tags[j])
        for k in range(len(tokenized_words)-1):
          label_ids.append(ner_tags[j] if label_all_tokens else -1)
    labels.append(label_ids)
    word_ids_list.append(word_ids)
    new_tokenized_inputs.append(tokenized_sentence)
  return {'input_ids': new_tokenized_inputs, 'word_ids': word_ids_list, 'labels': labels}


In [6]:
def input_and_label_pad_sequence(examples, maxlen=100):
  return { 'input_ids': tf.keras.preprocessing.sequence.pad_sequences(examples['input_ids'],maxlen = maxlen, padding='post', truncating='post'),
           'labels': tf.keras.preprocessing.sequence.pad_sequences(examples['labels'],maxlen = maxlen, padding='post', truncating='post',value=-1 )}

In [9]:
def create_padding_mask(examples):
    seq = 1 - (np.array(examples['input_ids'])==0)
    return {"attention_mask": seq}

In [10]:
maxlen = 100

data = data.map(tokenize_and_align_labels, batched=True).map(input_and_label_pad_sequence, batched=True).map(create_padding_mask, batched=True)

                                                                    

In [11]:
data

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags', 'input_ids', 'word_ids', 'labels', 'attention_mask'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags', 'input_ids', 'word_ids', 'labels', 'attention_mask'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags', 'input_ids', 'word_ids', 'labels', 'attention_mask'],
        num_rows: 3453
    })
})

In [86]:
train = tf.data.Dataset.from_tensor_slices((data['train']['input_ids'],data['train']['attention_mask'],data['train']['labels'])).map(lambda x,y,z: (tf.concat((x,y),axis=-1),z)).shuffle(10000).batch(64)
validation = tf.data.Dataset.from_tensor_slices((data['validation']['input_ids'],data['validation']['attention_mask'],data['validation']['labels'])).map(lambda x,y,z: (tf.concat((x,y),axis=-1),z)).batch(64)
test = tf.data.Dataset.from_tensor_slices((data['test']['input_ids'],data['test']['attention_mask'])).map(lambda x,y: tf.concat((x,y),axis=-1)).batch(64)

In [93]:
transformer.predict(test)



array([[[8.95207465e-01, 1.82132777e-02, 5.95899764e-03, ...,
         2.62658554e-03, 1.26864361e-02, 3.54189542e-03],
        [3.62532400e-02, 2.01158792e-01, 1.39169052e-01, ...,
         3.42726000e-02, 3.18115242e-02, 1.52590610e-02],
        [8.64837825e-01, 4.59051467e-02, 1.70826111e-02, ...,
         4.34076972e-03, 9.14051570e-03, 6.25726022e-03],
        ...,
        [9.43994641e-01, 1.05073247e-02, 1.17700137e-02, ...,
         2.51220190e-03, 2.92870705e-03, 6.11473108e-03],
        [9.30240035e-01, 1.40919713e-02, 1.41257774e-02, ...,
         2.88459356e-03, 3.63089913e-03, 8.22029542e-03],
        [9.00064468e-01, 2.46056076e-02, 2.10508071e-02, ...,
         3.69378296e-03, 5.13632968e-03, 1.21017955e-02]],

       [[6.47409707e-02, 1.32310107e-01, 6.47920966e-02, ...,
         4.96314056e-02, 3.96114588e-02, 3.51920649e-02],
        [6.40925243e-02, 1.87245995e-01, 8.35430771e-02, ...,
         4.44083698e-02, 4.11222540e-02, 4.45025414e-02],
        [9.89445001e-02, 

In [7]:
# @tf.keras.utils.register_keras_serializable()
def positional_encoding(positions, d_model):

    position = np.arange(positions)[:, np.newaxis]
    k = np.arange(d_model)[np.newaxis, :]
    i = k // 2

    # initialize a matrix angle_rads of all the angles
    angle_rates = 1 / np.power(10000, (2 * i) / np.float32(d_model))
    angle_rads = position * angle_rates

    # apply sin to even indices in the array; 2i
    angle_rads[:, 0::2] = np.sin(angle_rads[:, 0::2])

    # apply cos to odd indices in the array; 2i+1
    angle_rads[:, 1::2] = np.cos(angle_rads[:, 1::2])

    pos_encoding = angle_rads[np.newaxis, ...]

    return tf.cast(pos_encoding, dtype=tf.float32)

In [8]:
# @tf.keras.utils.register_keras_serializable()
def create_padding_mask(decoder_token_ids):

    seq = 1 - tf.cast(tf.math.equal(decoder_token_ids, 0), tf.float32)
    return seq[:, tf.newaxis, :]

In [9]:
# @tf.keras.utils.register_keras_serializable()
def FullyConnected(embedding_dim, fully_connected_dim):
  return tf.keras.Sequential([
      tf.keras.layers.Dense(fully_connected_dim, activation='relu'),
      tf.keras.layers.Dense(embedding_dim)
  ])

In [10]:
# @tf.keras.utils.register_keras_serializable()
class EncoderLayer(tf.keras.layers.Layer):

  def __init__(self, embedding_dim, num_heads, fully_connected_dim,
               dropout_rate=0.1, layernorm_eps=1e-6):

    super(EncoderLayer, self).__init__()

    self.mha = tf.keras.layers.MultiHeadAttention(
        num_heads=num_heads,
        key_dim=embedding_dim,
        dropout=dropout_rate
    )

    self.ffn = FullyConnected(
        embedding_dim=embedding_dim,
        fully_connected_dim=fully_connected_dim
    )

    self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=layernorm_eps)
    self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=layernorm_eps)

    self.dropout_ffn = tf.keras.layers.Dropout(dropout_rate)

  def call(self, x, training, mask):

    self_mha_output = self.mha(x,x,x,mask)
    skip_x_attention = self.layernorm1(x+self_mha_output)
    ffn_output = self.ffn(skip_x_attention)
    ffn_output = self.dropout_ffn(ffn_output, training=training)
    encoder_layer_out = self.layernorm2(skip_x_attention+ffn_output)

    return encoder_layer_out

In [11]:
# @tf.keras.utils.register_keras_serializable()
class Encoder(tf.keras.layers.Layer):

  def __init__(self, num_layers, embedding_dim, num_heads, fully_connected_dim, vocab_size,
               maximum_position_encoding, dropout_rate=0.1, layernorm_eps=1e-6):
    super(Encoder, self).__init__()

    self.embedding_dim = embedding_dim

    self.num_layers = num_layers

    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
    self.pos_encoding = positional_encoding(maximum_position_encoding,
                                            self.embedding_dim)

    self.enc_layers = [EncoderLayer(embedding_dim=embedding_dim,
                                    num_heads=num_heads,
                                    fully_connected_dim=fully_connected_dim,
                                    dropout_rate=dropout_rate,
                                    layernorm_eps=layernorm_eps
                                    )
                      for _ in range(num_layers)]

    self.dropout = tf.keras.layers.Dropout(dropout_rate)

  def call(self, x, training, mask):

     seq_len = tf.shape(x)[1]

     x = self.embedding(x)

     x*=tf.math.sqrt(tf.cast(self.embedding_dim, tf.float32))

     x+=self.pos_encoding[:,:seq_len,:]

     x = self.dropout(x, training=training)

     for i in range(self.num_layers):
      x = self.enc_layers[i](x, training, mask)

     return x

In [12]:
# @tf.keras.utils.register_keras_serializable()
class NERModel(tf.keras.Model):

  def __init__(self, num_tags, num_layers, embedding_dim, num_heads, fully_connected_dim,
               vocab_size, max_positional_encoding,dropout_rate=0.1,
               layernorm_eps=1e-6):

    super(NERModel, self).__init__()

    self.encoder = Encoder(     num_layers=num_layers,
                                embedding_dim=embedding_dim,
                                num_heads=num_heads,
                                fully_connected_dim=fully_connected_dim,
                                vocab_size=vocab_size,
                                maximum_position_encoding=max_positional_encoding,
                                dropout_rate=dropout_rate,
                                layernorm_eps=layernorm_eps)
    self.dropout1 = tf.keras.layers.Dropout(rate=dropout_rate)
    self.dropout2 = tf.keras.layers.Dropout(rate=dropout_rate)
    self.ffn = tf.keras.layers.Dense(fully_connected_dim, activation='relu')
    self.ffn_final = tf.keras.layers.Dense(num_tags,activation='softmax')

  def call(self, x, training):
    seq_length = tf.shape(x)[-1]//2
    mask = x[:,seq_length:]
    mask = mask[:,tf.newaxis,:]
    x = x[:,:seq_length]
    x = self.encoder(x,training, mask)
    x = self.dropout1(x, training = training)
    x = self.ffn(x)
    x = self.dropout2(x, training=training)
    x = self.ffn_final(x)
    return x




In [24]:
data['train'].features['ner_tags']

Sequence(feature=ClassLabel(names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC'], id=None), length=-1, id=None)

In [20]:
len(tokenizer.word_index)

17680

In [16]:
num_layers = 2
num_tags = 9
vocab_size = len(tokenizer.word_index)+1
embedding_dim = 100
fully_connected_dim = 100
num_heads = 3
positional_encoding_length = 150

# transformer = NERModel(
#     num_tags,
#     num_layers,
#     embedding_dim,
#     num_heads,
#     fully_connected_dim,
#     vocab_size,
#     positional_encoding_length,
# )

In [22]:
class MaskedLoss(tf.keras.losses.Loss):
    def __init__(self, name="custom_ner_loss",**kwargs):
        super().__init__(name=name,**kwargs)
        self.loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False, reduction='none')

    def call(self, y_true, y_pred):
        mask = tf.math.logical_not(tf.math.equal(y_true, -1))
        mask = tf.cast(mask, dtype=tf.int32)

        loss = self.loss_object(y_true*mask, y_pred)

        mask = tf.cast(mask, dtype=loss.dtype)
        loss *= mask

        return tf.reduce_sum(loss)/tf.reduce_sum(mask)




In [23]:
def masked_acc(y_true, y_pred):
    mask = tf.math.logical_not(tf.math.equal(y_true, -1))
    mask = tf.cast(mask,dtype=tf.float32)
    pred = tf.cast(tf.argmax(y_pred,axis=-1),dtype=tf.float32)
    correct = tf.cast(tf.equal(y_true,pred),dtype=tf.float32)
    correct*=mask
    return tf.reduce_sum(correct)/tf.reduce_sum(mask)


In [17]:
class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
    def __init__(self, d_model, warmup_steps=4000):
        super(CustomSchedule, self).__init__()
        self.d_model = tf.cast(d_model, dtype=tf.float32)
        self.warmup_steps = warmup_steps

    def __call__(self, step):
        step = tf.cast(step, dtype=tf.float32)
        arg1 = tf.math.rsqrt(step)
        arg2 = step * (self.warmup_steps ** -1.5)

        return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2)

learning_rate = CustomSchedule(embedding_dim)

optimizer = tf.keras.optimizers.Adam(learning_rate, beta_1=0.9, beta_2=0.98, epsilon=1e-9)

In [84]:
transformer.compile(optimizer=optimizer, loss=[MaskedLoss()], metrics=[masked_acc])

In [85]:
transformer.fit(train,
                validation_data=(validation),
                epochs = 10)

Epoch 1/10
Epoch 2/10

KeyboardInterrupt: 

In [98]:
transformer(test)

ValueError: Exception encountered when calling layer "ner_model_4" "                 f"(type NERModel).

Attempt to convert a value (<BatchDataset element_spec=TensorSpec(shape=(None, 200), dtype=tf.int32, name=None)>) with an unsupported type (<class 'tensorflow.python.data.ops.dataset_ops.BatchDataset'>) to a Tensor.

Call arguments received by layer "ner_model_4" "                 f"(type NERModel):
  • x=<BatchDataset element_spec=TensorSpec(shape=(None, 200), dtype=tf.int32, name=None)>
  • training=None

In [41]:
example = data['test'][22]
example['tokens'],example['ner_tags'],example['word_ids'],example['labels'][:len(example['word_ids'])]

(['RUGBY',
  'UNION',
  '-',
  'CUTTITTA',
  'BACK',
  'FOR',
  'ITALY',
  'AFTER',
  'A',
  'YEAR',
  '.'],
 [3, 4, 0, 1, 0, 0, 5, 0, 0, 0, 0],
 [0, 1, 3, 4, 5, 6, 7, 8, 9],
 [3, 4, 1, 0, 0, 5, 0, 0, 0])

In [42]:
[np.argmax(x) for x in transformer.predict(test)[22]]



[3,
 4,
 0,
 0,
 0,
 5,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 3,
 3,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 4,
 4,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 3,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 7,
 0,
 0,
 0,
 0,
 0,
 0,
 7,
 7,
 7,
 7,
 3,
 7,
 0,
 0,
 0,
 0,
 0,
 0]

In [34]:
label_list = data['train'].features['ner_tags'].feature.names

In [37]:
label_list

['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']

In [75]:
from sklearn.metrics import precision_score, recall_score, f1_score 

def compute_metrics(labels, pred_logits):

    pred_logits = np.argmax(pred_logits, axis=-1)

    predictions = []
    for prediction, label in zip(pred_logits, labels):
        for eval_pred, l in zip(prediction, label):
            if l!= -1:
                predictions.append(eval_pred)

    true_labels = []
    for prediction, label in zip(pred_logits, labels):
        for eval_pred, l in zip(prediction, label):
            if l!= -1:
                true_labels.append(l)

    precision = precision_score(true_labels, predictions, average='macro')
    recall = recall_score(true_labels, predictions, average='macro')
    f1 = f1_score(true_labels, predictions, average='macro')
    
    return {'precision': precision,
            'recall': recall,
            'f1_score': f1 }


In [74]:
labels = data['test']['labels']
pred_logits = transformer.predict(test)

compute_metrics((pred_logits,labels))



{'precision': 0.7354912726458616,
 'recall': 0.6560956424169582,
 'f1_score': 0.6889829013166076}

In [34]:
import pickle
from pathlib import Path

In [26]:
model = tf.keras.models.load_model('artifacts\model_trainer\model.keras', custom_objects={
            'NERModel':NERModel,
            'MaskedLoss':MaskedLoss,
            'masked_acc': masked_acc,
            'CustomSchedule': CustomSchedule
        })

In [51]:
from named_entity_recognition.config.configuration import ConfigurationManager

In [166]:
class PredictionPipeline:
    def __init__(self):
        manager = ConfigurationManager()
        self.config = manager.get_data_transformation_config()
        self.model_path = manager.get_model_evaluation_config().model_path
        self.tags = {1:"Person", 3:"Organization", 5: "Location", 7: "Miscellaneous"}
    def predict(self, sentence):
        model = self.get_model()
        input_ids, word_ids = self.get_input_ids_and_word_ids(sentence)
        input_ids = self.pad_and_mask([input_ids])
        labels = tf.argmax(model.predict(input_ids),axis=-1).numpy().tolist()[0]
        labels = self.align_labels(word_ids,labels)
        return self.get_entities(sentence,labels)

    def get_entities(self, sentence, labels):
        sentence = sentence.split(' ')
        entities = []
        current_span = ''
        prev_label = 0
        for word, label in zip(sentence, labels):
            if label == 0 and current_span:
                entities.append([current_span, self.tags[prev_label]])
                current_span = ''
                prev_label = 0
            elif label in {1,3,5,7}:
                if current_span:
                    entities.append([current_span, self.tags[prev_label]])
                current_span = word
                prev_label = label
            elif prev_label and label in {2,4,6,8}:
                current_span += ' ' + word
        if current_span:
            entities.append([current_span, self.tags[prev_label]])
        return entities

    def align_labels(self, word_ids, labels):
        aligned_labels = [0]*(max(word_ids)+1)
        for word_id, label in zip(word_ids, labels):
            if aligned_labels[word_id] == 0 and label:
                aligned_labels[word_id] = label
        return aligned_labels
    
    def pad_and_mask(self, input_ids):
        input_ids = tf.keras.preprocessing.sequence.pad_sequences(input_ids, maxlen=self.config.params_max_sequence_length, padding='post', truncating='post')
        mask = tf.cast(tf.math.not_equal(input_ids, 0), tf.int32)
        input = tf.concat((input_ids,mask),axis=-1)
        return input
        
    def get_input_ids_and_word_ids(self, sentence):
        tokenizer = self.get_tokenizer()
        tokens = tokenizer.texts_to_sequences(sentence.split(' '))
        print(tokens)
        input_ids = []
        word_ids = []
        
        for index, token in enumerate(tokens):
            for sub_token in token:
                if sub_token:
                    input_ids.append(sub_token)
                    word_ids.append(index)
        return input_ids, word_ids


    def get_tokenizer(self):
        with open(self.config.tokenizer_path,'rb') as f:
            tokenizer = pickle.load(f) 
        return tokenizer
    
    def get_model(self):
        model = tf.keras.models.load_model(self.model_path, custom_objects={
            'NERModel':NERModel,
            'MaskedLoss':MaskedLoss,
            'masked_acc': masked_acc,
            'CustomSchedule': CustomSchedule
        })
        return model



In [167]:
PredictionPipeline().predict("Ukraine launched drone attacks on Russia's Kushchevsk military airfield in the southern Krasnodar region, as well as two oil refineries, a source with knowledge of the operation told CNN.")

[2024-04-29 02:17:26,591: INFO: common: yaml file: config\config.yaml loaded successfully]
[2024-04-29 02:17:26,594: INFO: common: yaml file: params.yaml loaded successfully]
[2024-04-29 02:17:26,594: INFO: common: created directory at: artifacts]
[2024-04-29 02:17:26,597: INFO: common: created directory at: artifacts/data_transformation]
[2024-04-29 02:17:26,598: INFO: common: created directory at: artifacts/model_evaluation]
[[636], [1739], [1], [993], [9], [1], [1], [392], [1], [4], [2], [457], [1], [639], [31], [285], [31], [49], [502], [1], [6], [987], [22], [5788], [3], [2], [1301], [94], [7269]]


[['Ukraine', 'Location'], ['CNN.', 'Miscellaneous']]