In [1]:
import datetime
import json
import os
import sys

# workaround to import local modules from parent directory
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

import tensorflow as tf
from tensorflow.keras.layers import Dropout, Dense
from tensorflow.keras.models import Sequential
from transformers import BertTokenizer, TFBertForSequenceClassification, TFTrainer, TFTrainingArguments

from evaluation import mean_average_precision
from utils import batch_predict

# Print the TensorFlow version so the recorded outputs can be tied to a release.
print('Tensorflow Version: {}'.format(tf.__version__))
    
# JSON files read by read_data(); each record is accessed through the
# 'org', 'lbl' and 'article_id' keys in to_dataset().
DATA_PATH_FORMATED_TRAIN = '../data/GermanFakeNC_FORMATED_TRAIN.json'
DATA_PATH_FORMATED_TEST = '../data/GermanFakeNC_FORMATED_TEST.json'
# Directory holding the pretrained model/tokenizer passed to from_pretrained().
MODEL_PATH_BERT = '../models/bert-base-german-cased/'
# Target of the ModelCheckpoint callback and source for load_weights().
MODEL_PATH_BERT_TUNED = '../models/bert-base-german-cased-tuned/checkpoint.ckpt'
# Fraction of the training file used for fitting; the remaining records form
# the dev (validation) split.
DATASET_DEV_SPLIT = 0.8
# Batch size applied to both the train and the dev split.
BATCH_SIZE = 32
# Learning rate handed to the Adam optimizer.
LEARNING_RATE = 5e-5
# Decision threshold of the BinaryAccuracy metric.
BINACC_THRESHOLD = 0.5
# Number of passes over the training split in fit().
EPOCHS = 1

# Shared tokenizer instance used by encode().
tokenizer = BertTokenizer.from_pretrained(MODEL_PATH_BERT)

def load_bert_model():
    """Build the BERT sequence classifier from the pretrained base checkpoint.

    The classification head's activation is replaced by a sigmoid, so the
    values exposed as ``logits`` in the model output are sigmoid-activated
    rather than raw scores.

    Returns:
        The ``TFBertForSequenceClassification`` instance loaded from
        ``MODEL_PATH_BERT`` with the modified head.
    """
    # NOTE(review): the head size comes from the checkpoint's config and is not
    # set here, while labels are reshaped to [1] elsewhere -- confirm the number
    # of output units is the intended one.
    model = TFBertForSequenceClassification.from_pretrained(MODEL_PATH_BERT)
    model.classifier.activation = tf.keras.activations.sigmoid
    return model

Tensorflow Version: 2.4.1


In [14]:
def read_data(path):
    """Load and return the parsed content of a JSON file.

    The file is decoded explicitly as UTF-8 instead of relying on the
    platform's default encoding, so non-ASCII text (e.g. German umlauts) is
    read identically on every operating system.

    Args:
        path: Location of the JSON file.

    Returns:
        The deserialized JSON document (whatever ``json.load`` produces).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the content is not valid JSON.
    """
    with open(path, encoding='utf-8') as json_file:
        return json.load(json_file)
        
def encode(sentences, max_length=128):
    """Tokenize sentences with the notebook-level ``tokenizer``.

    Sequences longer than ``max_length`` are truncated, padding is enabled,
    and the result is returned as TensorFlow tensors.

    Args:
        sentences: The text(s) to tokenize.
        max_length: Maximum number of tokens kept per sequence. Defaults to
            128, the value that was previously hard-coded.

    Returns:
        The tokenizer output (a batch encoding holding TensorFlow tensors).
    """
    return tokenizer(sentences, max_length=max_length, truncation=True, padding=True, return_tensors='tf')
        
def to_dataset(data):
    """Turn a list of records into a zipped ``tf.data.Dataset``.

    Each record is read through its 'org' (text), 'lbl' (label) and
    'article_id' keys.

    Args:
        data: Iterable of dict-like records providing those three keys.

    Returns:
        A dataset yielding ``(article_id, encoding_dict, label)`` tuples, where
        the label has been reshaped to shape ``[1]``.
    """
    texts = [record['org'] for record in data]
    labels = [record['lbl'] for record in data]
    article_ids = [record['article_id'] for record in data]

    # Slice the batch encoding per example and rebuild every element as a
    # plain dictionary keyed by the encoding's field names.
    features_ds = tf.data.Dataset.from_tensor_slices(encode(texts)).map(
        lambda features: {key: features[key] for key in features}
    )
    # Give every scalar label an explicit length-1 axis.
    targets_ds = tf.data.Dataset.from_tensor_slices(labels).map(
        lambda label: tf.reshape(label, [1])
    )
    article_ids_ds = tf.data.Dataset.from_tensor_slices(article_ids)

    return tf.data.Dataset.zip((article_ids_ds, features_ds, targets_ds))
        
# Load the raw records for both splits.
train_data, test_data = read_data(DATA_PATH_FORMATED_TRAIN), read_data(DATA_PATH_FORMATED_TEST)

# Training only needs (inputs, label); the article id is dropped.
train_ds = to_dataset(train_data).map(lambda article_id, features, label: (features, label))
# Evaluation keeps the article id and unwraps the length-1 label to a scalar.
test_ds = to_dataset(test_data).map(lambda article_id, features, label: (article_id, features, label[0]))

# The first DATASET_DEV_SPLIT share of the training records is used for
# fitting, the remainder for validation.
# NOTE(review): the records are split and batched in file order without
# shuffling -- confirm the file is already shuffled.
num_train_examples = int(DATASET_DEV_SPLIT * len(train_data))
train_ds_split = train_ds.take(num_train_examples).batch(BATCH_SIZE)
dev_ds_split = train_ds.skip(num_train_examples).batch(BATCH_SIZE)

In [None]:
def get_checkpoint_callback(model_path, monitor_value):
    """Create a Keras ``ModelCheckpoint`` callback that keeps the best weights.

    Args:
        model_path: Destination path for the saved weights.
        monitor_value: Name of the logged metric to monitor; higher values are
            treated as better (``mode='max'``).

    Returns:
        A ``tf.keras.callbacks.ModelCheckpoint`` that saves weights only, and
        only when the monitored metric improves.
    """
    checkpoint_options = dict(
        monitor=monitor_value,
        mode='max',
        save_best_only=True,
        save_weights_only=True,
        verbose=1,
    )
    return tf.keras.callbacks.ModelCheckpoint(model_path, **checkpoint_options)

cbert_model = load_bert_model()
optimizer = tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE)
loss = tf.keras.losses.BinaryCrossentropy()
metrics = [tf.keras.metrics.BinaryAccuracy(threshold=BINACC_THRESHOLD)]
cbert_model.compile(optimizer=optimizer, loss=loss, metrics=metrics)

checkpoint_callback = get_checkpoint_callback(MODEL_PATH_BERT_TUNED, 'val_binary_accuracy')
log_dir = os.path.join("logs", datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir, histogram_freq=1)

%tensorboard --logdir logs
history = cbert_model.fit(train_ds_split,
                epochs=EPOCHS,
                validation_data=dev_ds_split,
                callbacks=[checkpoint_callback, tensorboard_callback])

In [9]:
# Rebuild the classifier from the base checkpoint so the fine-tuned weights can
# be restored without re-running the training cell.
cbert_model = load_bert_model()
# Restore the weights written by the checkpoint callback (weights-only save).
cbert_model.load_weights(MODEL_PATH_BERT_TUNED)

Some layers from the model checkpoint at ../models/bert-base-german-cased/ were not used when initializing TFBertForSequenceClassification: ['dropout_37']
- This IS expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertForSequenceClassification were initialized from the model checkpoint at ../models/bert-base-german-cased/.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForSequenceClassification for predictions without further training.


<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7f85b0f772b0>

In [15]:
def prediction_func(inps):
    """Score a batch of inputs with the notebook-level ``cbert_model``.

    Args:
        inps: Model inputs forwarded unchanged to ``cbert_model.predict``.

    Returns:
        A list holding the first output value of every example.
    """
    predictions = cbert_model.predict(inps)
    # Only the first component of each output row is used as the score.
    return [row[0] for row in predictions.logits]

# Score the test set with the fine-tuned model.
# NOTE(review): presumably batch_predict feeds test_ds to prediction_func in
# batches of 100 -- confirm against utils.batch_predict.
eval_data_bert = batch_predict(test_ds, 100, prediction_func)
# Report the mean average precision computed by the project's evaluation helper.
print('BERT/MAP: {}'.format(mean_average_precision(eval_data_bert)))

BERT/MAP: 0.22873718863167605


### Results
|     | BERT | |
|-----|------|---------|
| MAP |   0.22873718863167605   |        |
| P@1 |      |         |