sources:
https://towardsdatascience.com/tensorflow-and-transformers-df6fceaf57cc

# Build Model

In [6]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'  # or any {'0', '1', '2'}
import tensorflow as tf
import pandas as pd
import numpy as np
import transformers
from transformers import TFAutoModel, AutoTokenizer
print(tf.__version__)
print(transformers.__version__)


tokenizer = AutoTokenizer.from_pretrained("dbmdz/bert-base-german-uncased")
bert = TFAutoModel.from_pretrained("dbmdz/bert-base-german-uncased")

2.12.1
4.31.0


Some layers from the model checkpoint at dbmdz/bert-base-german-uncased were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at dbmdz/bert-base-german-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [7]:
# input layers (has to be same structure as dataset)
SEQ_LEN = 50 
input_ids = tf.keras.layers.Input(shape=(SEQ_LEN,), name='input_ids', dtype='int32')
mask = tf.keras.layers.Input(shape=(SEQ_LEN,), name='attention_mask', dtype='int32')

# add layers
embeddings = bert(input_ids, attention_mask=mask)[0]  # we only keep tensor 0 (last_hidden_state) of BERT
X = tf.keras.layers.GlobalMaxPool1D()(embeddings)  # reduce tensor dimensionality
X = tf.keras.layers.BatchNormalization()(X)
X = tf.keras.layers.Dense(128, activation='relu')(X)
X = tf.keras.layers.Dropout(0.1)(X)
layers = tf.keras.layers.Dense(2, activation='softmax', name='outputs')(X)  # adjust based on number of classes

# Create model instance
model = tf.keras.Model(inputs=[input_ids, mask], outputs=layers)

#freeze BERT model
model.layers[2].trainable = False #BERT is already well trained and has a lot of Parameters

model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 50)]         0           []                               
                                                                                                  
 attention_mask (InputLayer)    [(None, 50)]         0           []                               
                                                                                                  
 tf_bert_model_1 (TFBertModel)  TFBaseModelOutputWi  109927680   ['input_ids[0][0]',              
                                thPoolingAndCrossAt               'attention_mask[0][0]']         
                                tentions(last_hidde                                               
                                n_state=(None, 50,                                            

In [8]:
from tensorflow import metrics

# compile model
optimizer = tf.keras.optimizers.Adam(0.01)
loss = tf.keras.losses.CategoricalCrossentropy()
acc = tf.keras.metrics.CategoricalAccuracy('accuracy')
auprc = tf.keras.metrics.AUC(
    num_thresholds=200,
    curve='PR',
    summation_method='interpolation',
    name=None,
    dtype=None,
    thresholds=None,
    multi_label=False,
    num_labels=None,
    label_weights=None,
    from_logits=False
)
roc = tf.keras.metrics.AUC(
    num_thresholds=200,
    curve='ROC',
    summation_method='interpolation',
    name=None,
    dtype=None,
    thresholds=None,
    multi_label=False,
    num_labels=None,
    label_weights=None,
    from_logits=False
)



model.compile(optimizer=optimizer, loss=loss, metrics=[acc, auprc, roc])

# Tokenize AnnotationData and Create Datasets

In [9]:
%load_ext autoreload
%autoreload 2
import pandas as pd
import numpy as np
import tensorflow as tf
from transformers import TFAutoModel, AutoTokenizer

# own modules:
from bertconfig import tokenize, encode_values
from my_utils import load_hasoc

# Preprocess
## load and encode label
df = load_hasoc("data/hasoc_2020_de_train_new_a.xlsx")
## drop duplicates
df.drop_duplicates(subset="text", keep="first", inplace=True)
print(df.value_counts("label"))

# encode values
arr = df['label'].values  # label column in df -> array
labels = encode_values(arr) #-> makes [0,1] or [1,0] from 0 or 1

# tokenize comments
# set max token length of comment
SEQ_LEN = 50

# initialize two arrays for input tensors and loop through data and tokenize everything
all_ids = np.zeros((len(df), SEQ_LEN))
all_mask = np.zeros((len(df), SEQ_LEN))
for i, sentence in enumerate(df['text']):
    tokens = tokenize(sentence, tokenizer, SEQ_LEN)
    # append ID of every token in sentence:
    # append Mask (1 if valid word, 0 if padding)
    all_ids[i, :] = tokens['input_ids']
    all_mask[i, :] = tokens['attention_mask']

print(df['text'].iloc[1])
print(all_ids[0])

# create tensorflow dataset object
dataset = tf.data.Dataset.from_tensor_slices((all_ids, all_mask, labels))

# restructure dataset format for BERT
def map_func(input_ids, masks, labels):
    return {'input_ids': input_ids, 'attention_mask': masks}, labels

dataset = dataset.map(map_func)  # apply the mapping function % CREATE DATASET
print("length of dataset = {}".format(dataset.cardinality().numpy()))
print(type(dataset))

label
0    1700
1     673
Name: count, dtype: int64
RT @NDRinfo: Die deutsche Klimaaktivistin Luisa Neubauer wirft Kanzlerin Merkel wegen ihrer fehlenden Unterstützung für den europäischen Kl…
[  102.  1939. 13523.  7774.  4855. 25358. 30949. 20789.  1939.  1648.
 25358. 30949.   552.  1939.  1054.  1493. 25358.   296. 25358. 30949.
  2719.   142.   468.   524.   552.  6222.   847.  1061.  1061.   160.
   552.   928.  1061.  1138.  7774. 30948.  2828. 30972. 30964. 17800.
   103.     0.     0.     0.     0.     0.     0.     0.     0.     0.]
length of dataset = 2373
<class 'tensorflow.python.data.ops.map_op._MapDataset'>


# Train Model

In [10]:
# shuffle and batch the dataset
dataset_batched = dataset.shuffle(10000).batch(32) ## created _BatchDataset
DS_LEN = dataset_batched.cardinality().numpy()  # get dataset length
print("number of Batches dataset = {}".format(DS_LEN))

train_size = round(0.7 * DS_LEN)
val_size = round(0.15 * DS_LEN)
test_size = round(0.15 * DS_LEN)
test_dataset = dataset_batched.skip(train_size)

train_dataset = dataset_batched.take(train_size)
val_dataset = test_dataset.skip(val_size)
test_dataset = test_dataset.take(test_size)

print("number of Batches train_dataset = {}".format(train_dataset.cardinality().numpy()))
print("number of Batches val_dataset = {}".format(val_dataset.cardinality().numpy()))
print("number of Batches test_dataset = {}".format(test_dataset.cardinality().numpy()))


number of Batches dataset = 75
number of Batches train_dataset = 52
number of Batches val_dataset = 12
number of Batches test_dataset = 11


In [11]:
%load_ext tensorboard
from datetime import datetime
from packaging import version
from tensorflow import keras
import tensorboard

# Define the Keras TensorBoard callback.
logdir = "logs/fit/" + datetime.now().strftime("%Y%m%d-%H%M%S")
tensorboard_callback = keras.callbacks.TensorBoard(log_dir=logdir)


# train model (metrics: acc, auprc, roc)
history = model.fit(dataset_batched,
                    epochs=1,
                    validation_data=val_dataset,
                    callbacks=[tensorboard_callback])


The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


In [43]:
print(model.history.history['val_auc_3'])

[0.43075522780418396]


In [17]:
!kill 1363

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [16]:
%tensorboard --logdir logs

Reusing TensorBoard on port 6006 (pid 1363), started 0:02:47 ago. (Use '!kill 1363' to kill it.)