In [1]:
import pandas as pd
import tensorflow as tf
import numpy as np

In [2]:
tf.config.list_physical_devices()

[PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU'),
 PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [3]:
tf.__version__

'2.9.1'

In [4]:
from datasets import load_dataset, load_from_disk
data = load_dataset('conll2003')

  from .autonotebook import tqdm as notebook_tqdm
Found cached dataset conll2003 (C:/Users/sajit/.cache/huggingface/datasets/conll2003/conll2003/1.0.0/9a4d16a94f8674ba3466315300359b0acd891b68b6c8743ddf60b9c702adce98)
100%|██████████| 3/3 [00:00<00:00, 125.00it/s]


In [5]:
data['train'].features['ner_tags'].feature.names

['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']

In [7]:
oov_token = '[UNK]'
tokenizer = tf.keras.preprocessing.text.Tokenizer(oov_token=oov_token, lower=True)
tokenizer.fit_on_texts([' '.join(x) for x in data['train']['tokens']])
len(tokenizer.word_index)


17680

In [8]:
def tokenize_and_align_labels(examples, label_all_tokens=True):
  tokenized_inputs = [tokenizer.texts_to_sequences(token) for token in examples['tokens']]
  new_tokenized_inputs = []
  labels = []
  word_ids_list = []
  for i,tokenized_input in enumerate(tokenized_inputs):
    ner_tags = examples['ner_tags'][i]
    label_ids = []
    word_ids = []
    tokenized_sentence = []
    for j,tokenized_words in enumerate(tokenized_input):
      if tokenized_words:
        tokenized_sentence.extend(tokenized_words)
        word_ids.extend([j]*len(tokenized_words))
        label_ids.append(ner_tags[j])
        for k in range(len(tokenized_words)-1):
          label_ids.append(ner_tags[j] if label_all_tokens else -1)
    labels.append(label_ids)
    word_ids_list.append(word_ids)
    new_tokenized_inputs.append(tokenized_sentence)
  return {'input_ids': new_tokenized_inputs, 'word_ids': word_ids_list, 'labels': labels}


In [9]:
def input_and_label_pad_sequence(examples, maxlen=100):
  return { 'input_ids': tf.keras.preprocessing.sequence.pad_sequences(examples['input_ids'],maxlen = maxlen, padding='post', truncating='post'),
           'labels': tf.keras.preprocessing.sequence.pad_sequences(examples['labels'],maxlen = maxlen, padding='post', truncating='post',value=-1 )}

In [10]:
def create_padding_mask(examples):
    seq = 1 - (np.array(examples['input_ids'])==0)
    return {"attention_mask": seq}

In [11]:
maxlen = 100

data = data.map(tokenize_and_align_labels, batched=True).map(input_and_label_pad_sequence, batched=True).map(create_padding_mask, batched=True)

                                                                    

In [92]:
data

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags', 'input_ids', 'word_ids', 'labels', 'attention_mask'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags', 'input_ids', 'word_ids', 'labels', 'attention_mask'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags', 'input_ids', 'word_ids', 'labels', 'attention_mask'],
        num_rows: 3453
    })
})

In [12]:
train = tf.data.Dataset.from_tensor_slices((data['train']['input_ids'],data['train']['attention_mask'],data['train']['labels'])).map(lambda x,y,z: ((x,y),z)).shuffle(10000).batch(64)
validation = tf.data.Dataset.from_tensor_slices((data['validation']['input_ids'],data['validation']['attention_mask'],data['validation']['labels'])).map(lambda x,y,z: ((x,y),z)).batch(64)
test = tf.data.Dataset.from_tensor_slices((data['test']['input_ids'],data['test']['attention_mask'],data['test']['labels'])).map(lambda x,y,z: ((x,y),z)).batch(64)


In [13]:
# @tf.keras.utils.register_keras_serializable()
def positional_encoding(positions, d_model):

    position = np.arange(positions)[:, np.newaxis]
    k = np.arange(d_model)[np.newaxis, :]
    i = k // 2

    # initialize a matrix angle_rads of all the angles
    angle_rates = 1 / np.power(10000, (2 * i) / np.float32(d_model))
    angle_rads = position * angle_rates

    # apply sin to even indices in the array; 2i
    angle_rads[:, 0::2] = np.sin(angle_rads[:, 0::2])

    # apply cos to odd indices in the array; 2i+1
    angle_rads[:, 1::2] = np.cos(angle_rads[:, 1::2])

    pos_encoding = angle_rads[np.newaxis, ...]

    return tf.cast(pos_encoding, dtype=tf.float32)

In [14]:
# @tf.keras.utils.register_keras_serializable()
def create_padding_mask(decoder_token_ids):

    seq = 1 - tf.cast(tf.math.equal(decoder_token_ids, 0), tf.float32)
    return seq[:, tf.newaxis, :]

In [15]:
# @tf.keras.utils.register_keras_serializable()
def FullyConnected(embedding_dim, fully_connected_dim):
  return tf.keras.Sequential([
      tf.keras.layers.Dense(fully_connected_dim, activation='relu'),
      tf.keras.layers.Dense(embedding_dim)
  ])

In [16]:
# @tf.keras.utils.register_keras_serializable()
class EncoderLayer(tf.keras.layers.Layer):

  def __init__(self, embedding_dim, num_heads, fully_connected_dim,
               dropout_rate=0.1, layernorm_eps=1e-6):

    super(EncoderLayer, self).__init__()

    self.mha = tf.keras.layers.MultiHeadAttention(
        num_heads=num_heads,
        key_dim=embedding_dim,
        dropout=dropout_rate
    )

    self.ffn = FullyConnected(
        embedding_dim=embedding_dim,
        fully_connected_dim=fully_connected_dim
    )

    self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=layernorm_eps)
    self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=layernorm_eps)

    self.dropout_ffn = tf.keras.layers.Dropout(dropout_rate)

  def call(self, x, training, mask):

    self_mha_output = self.mha(x,x,x,mask)
    skip_x_attention = self.layernorm1(x+self_mha_output)
    ffn_output = self.ffn(skip_x_attention)
    ffn_output = self.dropout_ffn(ffn_output, training=training)
    encoder_layer_out = self.layernorm2(skip_x_attention+ffn_output)

    return encoder_layer_out

In [17]:
# @tf.keras.utils.register_keras_serializable()
class Encoder(tf.keras.layers.Layer):

  def __init__(self, num_layers, embedding_dim, num_heads, fully_connected_dim, vocab_size,
               maximum_position_encoding, dropout_rate=0.1, layernorm_eps=1e-6):
    super(Encoder, self).__init__()

    self.embedding_dim = embedding_dim

    self.num_layers = num_layers

    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
    self.pos_encoding = positional_encoding(maximum_position_encoding,
                                            self.embedding_dim)

    self.enc_layers = [EncoderLayer(embedding_dim=embedding_dim,
                                    num_heads=num_heads,
                                    fully_connected_dim=fully_connected_dim,
                                    dropout_rate=dropout_rate,
                                    layernorm_eps=layernorm_eps
                                    )
                      for _ in range(num_layers)]

    self.dropout = tf.keras.layers.Dropout(dropout_rate)

  def call(self, x, training, mask):

     seq_len = tf.shape(x)[1]

     x = self.embedding(x)

     x*=tf.math.sqrt(tf.cast(self.embedding_dim, tf.float32))

     x+=self.pos_encoding[:,:seq_len,:]

     x = self.dropout(x, training=training)

     for i in range(self.num_layers):
      x = self.enc_layers[i](x, training, mask)

     return x

In [18]:
# @tf.keras.utils.register_keras_serializable()
class NERModel(tf.keras.Model):

  def __init__(self, num_tags, num_layers, embedding_dim, num_heads, fully_connected_dim,
               vocab_size, max_positional_encoding,dropout_rate=0.1,
               layernorm_eps=1e-6):

    super(NERModel, self).__init__()

    self.encoder = Encoder(     num_layers=num_layers,
                                embedding_dim=embedding_dim,
                                num_heads=num_heads,
                                fully_connected_dim=fully_connected_dim,
                                vocab_size=vocab_size,
                                maximum_position_encoding=max_positional_encoding,
                                dropout_rate=dropout_rate,
                                layernorm_eps=layernorm_eps)
    self.dropout1 = tf.keras.layers.Dropout(rate=dropout_rate)
    self.dropout2 = tf.keras.layers.Dropout(rate=dropout_rate)
    self.ffn = tf.keras.layers.Dense(fully_connected_dim, activation='relu')
    self.ffn_final = tf.keras.layers.Dense(num_tags,activation='softmax')

  def call(self, x, training):
    x, mask = x
    mask = mask[:,tf.newaxis,:]
    x = self.encoder(x,training, mask)
    x = self.dropout1(x, training = training)
    x = self.ffn(x)
    x = self.dropout2(x, training=training)
    x = self.ffn_final(x)
    return x




In [19]:
data['train'].features['ner_tags']

Sequence(feature=ClassLabel(names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC'], id=None), length=-1, id=None)

In [20]:
len(tokenizer.word_index)

17680

In [29]:
num_layers = 2
num_tags = 9
vocab_size = len(tokenizer.word_index)+1
embedding_dim = 100
fully_connected_dim = 100
num_heads = 3
positional_encoding_length = 150

transformer = NERModel(
    num_tags,
    num_layers,
    embedding_dim,
    num_heads,
    fully_connected_dim,
    vocab_size,
    positional_encoding_length,
)

In [22]:
class CustomNonPaddingTokenLoss(tf.keras.losses.Loss):
    def __init__(self, name="custom_ner_loss"):
        super().__init__(name=name)
        self.loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False, reduction='none')

    def call(self, y_true, y_pred):
        mask = tf.math.logical_not(tf.math.equal(y_true, -1))
        mask = tf.cast(mask, dtype=tf.int32)

        loss = self.loss_object(y_true*mask, y_pred)

        mask = tf.cast(mask, dtype=loss.dtype)
        loss *= mask

        return tf.reduce_sum(loss)/tf.reduce_sum(mask)




In [23]:
def acc(y_true, y_pred):
    mask = tf.math.logical_not(tf.math.equal(y_true, -1))
    mask = tf.cast(mask,dtype=tf.float32)
    pred = tf.cast(tf.argmax(y_pred,axis=-1),dtype=tf.float32)
    correct = tf.cast(tf.equal(y_true,pred),dtype=tf.float32)
    correct*=mask
    return tf.reduce_sum(correct)/tf.reduce_sum(mask)


In [24]:
class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
    def __init__(self, d_model, warmup_steps=4000):
        super(CustomSchedule, self).__init__()
        self.d_model = tf.cast(d_model, dtype=tf.float32)
        self.warmup_steps = warmup_steps

    def __call__(self, step):
        step = tf.cast(step, dtype=tf.float32)
        arg1 = tf.math.rsqrt(step)
        arg2 = step * (self.warmup_steps ** -1.5)

        return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2)

learning_rate = CustomSchedule(embedding_dim)

optimizer = tf.keras.optimizers.Adam(learning_rate, beta_1=0.9, beta_2=0.98, epsilon=1e-9)

In [30]:
transformer.compile(optimizer=optimizer, loss=[CustomNonPaddingTokenLoss()], metrics=[acc])

In [31]:
transformer.fit(train,
                validation_data=(validation),
                epochs = 5)

Epoch 1/5


Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x2299d3a5608>

In [32]:
transformer.evaluate(test)



[0.44282063841819763, 0.905369222164154]

In [41]:
example = data['test'][22]
example['tokens'],example['ner_tags'],example['word_ids'],example['labels'][:len(example['word_ids'])]

(['RUGBY',
  'UNION',
  '-',
  'CUTTITTA',
  'BACK',
  'FOR',
  'ITALY',
  'AFTER',
  'A',
  'YEAR',
  '.'],
 [3, 4, 0, 1, 0, 0, 5, 0, 0, 0, 0],
 [0, 1, 3, 4, 5, 6, 7, 8, 9],
 [3, 4, 1, 0, 0, 5, 0, 0, 0])

In [42]:
[np.argmax(x) for x in transformer.predict(test)[22]]



[3,
 4,
 0,
 0,
 0,
 5,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 3,
 3,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 4,
 4,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 3,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 7,
 0,
 0,
 0,
 0,
 0,
 0,
 7,
 7,
 7,
 7,
 3,
 7,
 0,
 0,
 0,
 0,
 0,
 0]

In [78]:
transformer.save_weights('model')

In [80]:
transformer.get_weights()

[array([[ 0.0136357 , -0.02332625,  0.0214308 , ...,  0.04617471,
         -0.02629803,  0.0284049 ],
        [-0.02982886,  0.0492092 ,  0.04061327, ...,  0.03996316,
          0.0274912 ,  0.03493016],
        [ 0.01809391, -0.02298428, -0.01594126, ..., -0.02994744,
          0.03714817,  0.02254997],
        ...,
        [-0.02571332, -0.00714546, -0.00433973, ..., -0.0237089 ,
          0.02439562, -0.00072109],
        [ 0.04601332,  0.00329397, -0.01782915, ...,  0.01124864,
          0.00573756,  0.0313151 ],
        [ 0.00023881, -0.03073721, -0.0072296 , ..., -0.00128248,
         -0.00013477,  0.01557428]], dtype=float32),
 array([[[-2.36248951e-02, -1.24558154e-02,  9.21329018e-03, ...,
           2.74227280e-03, -1.65734906e-02, -7.17458874e-03],
         [ 1.85870137e-02, -1.88012496e-02, -1.31710398e-03, ...,
           9.61593410e-04,  9.93849337e-03, -4.63680038e-03],
         [ 6.21933583e-03, -5.47892600e-03, -7.16434745e-03, ...,
           6.39675511e-03,  5.272646

In [40]:
dummy = NERModel(
    num_tags,
    num_layers,
    embedding_dim,
    num_heads,
    fully_connected_dim,
    vocab_size,
    positional_encoding_length,

)
dummy.load_weights('model')

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x2cc55bd8908>

In [28]:
tf.keras.models.load_model('model')





ValueError: Unable to create a Keras model from SavedModel at model. This SavedModel was exported with `tf.saved_model.save`, and lacks the Keras metadata file. Please save your Keras model by calling `model.save`or `tf.keras.models.save_model`. Note that you can still load this SavedModel with `tf.saved_model.load`.

In [41]:
[np.argmax(x) for x in dummy(np.array([example['input_ids']]))[0]][:len(example['word_ids'])]

[1, 1, 1, 1, 1, 1, 1, 6, 1, 1, 1]