In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from transformers import TFAutoModel, AutoTokenizer

In [None]:
class TransformerEncoder(tf.keras.layers.Layer):
    """Keras layer wrapping a Hugging Face TF transformer.

    The layer takes a batch of token ids and returns the transformer's
    ``last_hidden_state``. The attention mask is derived from the token ids
    so that padding positions are not attended to.

    Args:
        transformer: model object exposing ``config.hidden_size`` and callable
            with ``input_ids``, ``attention_mask`` and ``training`` keywords.
        **kwargs: forwarded to ``tf.keras.layers.Layer``.
    """

    def __init__(self, transformer, **kwargs):
        super().__init__(**kwargs)
        self.transformer = transformer
        self.hidden_size = self.transformer.config.hidden_size
        # None when the config declares no padding token; call() then falls
        # back to attending to every position.
        self.pad_token_id = getattr(self.transformer.config, 'pad_token_id', None)

    def call(self, input_ids, training=False):
        """Encode ``input_ids`` and return the per-token hidden states.

        Args:
            input_ids: integer tensor of token ids.
            training: forwarded to the wrapped transformer.

        Returns:
            The ``last_hidden_state`` attribute of the transformer output.
        """
        if self.pad_token_id is None:
            attention_mask = tf.ones_like(input_ids, dtype=tf.int32)
        else:
            # 1 for real tokens, 0 for padding, so padded positions are masked
            # out of self-attention instead of being treated as content.
            attention_mask = tf.cast(
                tf.not_equal(input_ids, self.pad_token_id), tf.int32)
        outputs = self.transformer(input_ids=input_ids,
                                   attention_mask=attention_mask,
                                   training=training)
        return outputs.last_hidden_state

    def compute_output_shape(self, input_shape):
        """Return the first two input dimensions followed by ``hidden_size``."""
        return (input_shape[0],
                input_shape[1],
                self.hidden_size)

In [None]:
# Sequence geometry: `max_length` is the padded token length passed to the
# tokeniser, `seq_length` the number of rows per rolling window.
max_length, seq_length = 10, 3

# Placeholders; not assigned anything else in the cells shown here.
encoder = trainer = None

# Pretrained checkpoint shared by the encoder and the tokeniser.
model_name = 'bert-base-cased'
transformer = TFAutoModel.from_pretrained(model_name, use_safetensors=True)
tokeniser = AutoTokenizer.from_pretrained(model_name)

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

In [None]:
# Toy data: class 0 lists names as (forename, surname), class 1 as
# (surname, forename); the tag lists mirror that ordering word by word.
class_1 = [
    ['Peter', 'Smith'],
    ['George', 'Peterson'],
    ['Arnold', 'Schwarzenegger'],
]
class_2 = [
    ['Carter', 'Pete'],
    ['Pitts', 'Jacob'],
    ['Windsor', 'Mary'],
]
tags_1 = [['B-pre', 'B-sur'] for _ in class_1]
tags_2 = [['B-sur', 'B-pre'] for _ in class_2]
class_id = [0] * len(class_1) + [1] * len(class_2)

# One row per name, with its word-level tags and the class it belongs to.
df = pd.DataFrame({
    'value': class_1 + class_2,
    'tag': tags_1 + tags_2,
    'class': class_id,
})
print(df.head())

                      value             tag  class
0            [Peter, Smith]  [B-pre, B-sur]      0
1        [George, Peterson]  [B-pre, B-sur]      0
2  [Arnold, Schwarzenegger]  [B-pre, B-sur]      0
3            [Carter, Pete]  [B-sur, B-pre]      1
4            [Pitts, Jacob]  [B-sur, B-pre]      1


In [None]:
# Tag vocabulary: every distinct tag found in the data, followed by 'O'.
# Sorting makes the tag -> index mapping identical on every run (iterating a
# set of strings has no stable order between kernels), and removing 'O' before
# appending it prevents a duplicate entry if the data already uses that tag.
tags = sorted({tag for row in df['tag'] for tag in row} - {'O'}) + ['O']
number_of_tags = len(tags)
print(tags)

['B-sur', 'B-pre', 'O']


In [None]:
def one_hotify(lst, tag_list=None):
  """Convert a sequence of tags into one-hot rows.

  Args:
    lst: iterable of tags to encode.
    tag_list: ordered tag vocabulary; the position of a tag in this sequence
      is the index set to 1. Defaults to the module-level ``tags`` list.

  Returns:
    A list with one row per element of ``lst``. Each row is a list of ints as
    long as the vocabulary, containing a single 1.

  Raises:
    ValueError: if an element of ``lst`` is not in the vocabulary.
  """
  vocabulary = tags if tag_list is None else tag_list
  width = len(vocabulary)
  # First occurrence wins, matching the behaviour of list.index().
  index_of = {}
  for position, tag in enumerate(vocabulary):
    index_of.setdefault(tag, position)
  labels = []
  for i in lst:
    if i not in index_of:
      raise ValueError(f'{i!r} is not a known tag')
    # Row width follows the vocabulary instead of a fixed three columns.
    row = [0] * width
    row[index_of[i]] = 1
    labels.append(row)
  return labels

In [None]:
# Tokenise the pre-split words of every row, padding/truncating to max_length.
dataset = tokeniser(
    df['value'].tolist(),
    is_split_into_words=True,
    max_length=max_length,
    truncation=True,
    padding='max_length',
)
df['input_ids'] = dataset['input_ids']
# For every token, the index of the word it came from (None where the
# tokeniser reports no source word).
df['word_ids'] = [dataset.word_ids(row) for row in range(len(df))]
# Token-level labels: each token inherits the tag of its source word; tokens
# without a source word are labelled 'O'.
df['labels'] = [
    [word_tags[word] if word is not None else 'O' for word in words]
    for words, word_tags in zip(df['word_ids'], df['tag'])
]
print(df.head())

                      value             tag  class  \
0            [Peter, Smith]  [B-pre, B-sur]      0   
1        [George, Peterson]  [B-pre, B-sur]      0   
2  [Arnold, Schwarzenegger]  [B-pre, B-sur]      0   
3            [Carter, Pete]  [B-sur, B-pre]      1   
4            [Pitts, Jacob]  [B-sur, B-pre]      1   

                                           input_ids  \
0           [101, 1943, 2159, 102, 0, 0, 0, 0, 0, 0]   
1          [101, 1667, 12092, 102, 0, 0, 0, 0, 0, 0]   
2  [101, 7296, 20452, 24156, 11819, 7582, 9146, 1...   
3           [101, 5007, 6377, 102, 0, 0, 0, 0, 0, 0]   
4       [101, 15877, 1116, 5549, 102, 0, 0, 0, 0, 0]   

                                            word_ids  \
0  [None, 0, 1, None, None, None, None, None, Non...   
1  [None, 0, 1, None, None, None, None, None, Non...   
2         [None, 0, 1, 1, 1, 1, 1, None, None, None]   
3  [None, 0, 1, None, None, None, None, None, Non...   
4  [None, 0, 0, 1, None, None, None, None, None, ...   

 

In [None]:
def rolling_windows(lst, n):
    """Return one circular window of length ``n`` starting at every element.

    The window starting at index ``i`` holds elements ``i, i + 1, ...`` and
    wraps around to the start of ``lst`` as often as needed, so every window
    has exactly ``n`` elements even when ``n`` exceeds ``len(lst)``.

    Args:
        lst: sequence of items to window.
        n: number of items per window.

    Returns:
        A list of ``len(lst)`` windows, each a list of ``n`` items. An empty
        ``lst`` yields an empty list.
    """
    items = list(lst)
    size = len(items)
    # Modular indexing wraps any number of times; slicing plus one wrapped
    # prefix can only wrap once and comes up short for large n.
    return [[items[(start + offset) % size] for offset in range(n)]
            for start in range(size)]

In [None]:
# Build rolling windows of token ids and labels, one class at a time, so a
# window never mixes rows from different classes.
values = []
labels = []
# Sorted unique values give a defined class order. The loop variable is named
# differently from the `class_id` list so that list is not overwritten.
for class_value in sorted(df['class'].unique()):
  subset = df[df['class'] == class_value]
  values += rolling_windows(list(subset['input_ids']), seq_length)
  labels += rolling_windows(list(subset['labels']), seq_length)
# One-hot encode the tag of every token in every row of every window.
labels = [[one_hotify(i) for i in j] for j in labels]
values, labels = np.array(values, dtype=int), np.array(labels, dtype=int)
# Keep only the labels of the last row in each window as the target.
labels = labels[:, -1, :, :]
print(values.shape, labels.shape)
print(values[0], labels[0])

(6, 3, 10) (6, 10, 3)
[[  101  1943  2159   102     0     0     0     0     0     0]
 [  101  1667 12092   102     0     0     0     0     0     0]
 [  101  7296 20452 24156 11819  7582  9146   102     0     0]] [[0 0 1]
 [0 1 0]
 [1 0 0]
 [1 0 0]
 [1 0 0]
 [1 0 0]
 [1 0 0]
 [0 0 1]
 [0 0 1]
 [0 0 1]]


In [None]:
# Input: a window of `seq_length` tokenised rows, each `max_length` ids long.
tokenised_input = tf.keras.layers.Input(shape=(seq_length, max_length), dtype=tf.int32, name='input_ids')
# Run the transformer on every row of the window independently.
timeseries = tf.keras.layers.TimeDistributed(TransformerEncoder(transformer), name='transformer')(tokenised_input)
# Layer sizes are derived from the configuration instead of being hard-coded,
# so changing seq_length, max_length or the checkpoint keeps shapes consistent.
hidden_size = transformer.config.hidden_size
# Flatten the token and hidden axes so each row becomes one LSTM time step.
reshaper = tf.keras.layers.Reshape((seq_length, max_length * hidden_size))(timeseries)
recurrent = tf.keras.layers.LSTM(hidden_size, name='recurrent')(reshaper)
# Unnormalised scores for every (token, tag) pair of the target row.
logits = tf.keras.layers.Dense(max_length * number_of_tags, name='dense')(recurrent)
shaped = tf.keras.layers.Reshape((max_length, number_of_tags), name='shaper')(logits)
# Targets are one-hot per token, so normalise over the tag axis to obtain one
# probability distribution per token.
predictor = tf.keras.layers.Softmax(axis=-1, name='tag_probabilities')(shaped)

In [None]:
# Functional model mapping the windowed token ids to per-token tag scores.
model = tf.keras.Model(tokenised_input, predictor)

In [None]:
# Each token carries exactly one one-hot tag, so this is a multi-class problem:
# categorical cross-entropy scores the whole tag distribution of a token, and
# it makes the 'accuracy' metric count a token as correct only when its
# highest-scoring tag matches the label.
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

In [None]:
# Train on the windowed ids/labels, holding out 10% of the samples for
# validation.
model.fit(
    values,
    labels,
    epochs=10,
    batch_size=32,
    validation_split=0.1,
)

Epoch 1/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 315ms/step - accuracy: 0.8400 - loss: 0.2266 - val_accuracy: 0.8000 - val_loss: 0.3372
Epoch 2/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 533ms/step - accuracy: 1.0000 - loss: 0.1374 - val_accuracy: 0.8000 - val_loss: 0.3523
Epoch 3/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 284ms/step - accuracy: 1.0000 - loss: 0.0964 - val_accuracy: 0.8000 - val_loss: 0.3662
Epoch 4/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 367ms/step - accuracy: 1.0000 - loss: 0.0745 - val_accuracy: 0.8000 - val_loss: 0.3838
Epoch 5/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 290ms/step - accuracy: 1.0000 - loss: 0.0574 - val_accuracy: 0.8000 - val_loss: 0.4092
Epoch 6/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 282ms/step - accuracy: 1.0000 - loss: 0.0490 - val_accuracy: 0.8000 - val_loss: 0.4369
Epoch 7/10
[1m1/1[0m [32m━━━━━━━━━━━━

<keras.src.callbacks.history.History at 0x7e1eb68d8710>