In [1]:
import os
import sys

# workaround to import local modules from parent directory
module_path = os.path.abspath(os.path.join('../src'))
if module_path not in sys.path:
    sys.path.append(module_path)

import json
import datetime
from transformers import BertTokenizer, TFBertForSequenceClassification, TFTrainer, TFTrainingArguments
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dropout, Dense, Lambda
from tensorflow.keras import backend as K
import tensorflow as tf
from evaluation import mean_average_precision, precision_at_k
from utils import *
from model import *

print('Tensorflow Version: {}'.format(tf.__version__))
# Load the TensorBoard notebook extension
%load_ext tensorboard
    
DATA_PATH_FORMATED_TRAIN = '../data/GermanFakeNC_FORMATED_TRAIN.json'
DATA_PATH_FORMATED_TEST = '../data/GermanFakeNC_FORMATED_TEST.json'
DATA_PATH_PROCESSED = '../data/GermanFakeNC_PROCESSED'
MODEL_PATH_BERT = '../models/bert-base-german-cased/'
MODEL_PATH_BERT_TUNED = '../models/bert-base-german-cased-tuned/checkpoint.ckpt'
MODEL_PATH_BERT_TUNED_RANKING = '../models/bert-base-german-cased-tuned-ranking/checkpoint.ckpt'
MODEL_PATH_BERT_TUNED_TRUENEWS = '../models/bert-base-german-cased-tuned-truenews/checkpoint.ckpt'
DATASET_SIZE = 14765
DATASET_DEV_SPLIT = 0.8
NUM_SAMPLING_CANDIDATES = 5
BATCH_SIZE = 32
MAX_LEN = 134
LEARNING_RATE = 5e-5
BINACC_THRESHOLD = 0.1
PRECISION_RECALL_THRESHOLDS = [0.05, 0.1, 0.2, 0.5]
EPOCHS = 5

tokenizer = BertTokenizer.from_pretrained(MODEL_PATH_BERT)

Tensorflow Version: 2.4.1


In [2]:
bert_feature = tf.io.FixedLenFeature([MAX_LEN], dtype=tf.int64)

def input_parser_train(example):
    feature_description = {'input_ids': bert_feature,
                           'token_type_ids': bert_feature,
                           'attention_mask': bert_feature,
                           'y': tf.io.FixedLenFeature([1], dtype=tf.float32)}

    parsed = tf.io.parse_single_example(example, feature_description)
    inp = {'input_ids': parsed['input_ids'],
           'token_type_ids': parsed['token_type_ids'],
           'attention_mask': parsed['attention_mask'],}
    return (inp, parsed['y'])

def input_parser_test(example):
    feature_description = {'article_id': tf.io.FixedLenFeature([1], dtype=tf.int64),
                           'input_ids': bert_feature,
                           'token_type_ids': bert_feature,
                           'attention_mask': bert_feature,
                           'y': tf.io.FixedLenFeature([1], dtype=tf.float32)}

    parsed = tf.io.parse_single_example(example, feature_description)
    inp = {'input_ids': parsed['input_ids'],
           'token_type_ids': parsed['token_type_ids'],
           'attention_mask': parsed['attention_mask']}
    return (parsed['article_id'][0], inp, parsed['y'][0])

def input_parser_cs(example):
    feature_description = {'input_ids1': bert_feature,
                           'token_type_ids1': bert_feature,
                           'attention_mask1': bert_feature,
                           'input_ids2': bert_feature,
                           'token_type_ids2': bert_feature,
                           'attention_mask2': bert_feature,
                           'y': tf.io.FixedLenFeature([1], dtype=tf.float32)}

    parsed = tf.io.parse_single_example(example, feature_description)
    inp = {'input_ids1': parsed['input_ids1'],
           'token_type_ids1': parsed['token_type_ids1'],
           'attention_mask1': parsed['attention_mask1'],
           'input_ids2': parsed['input_ids2'],
           'token_type_ids2': parsed['token_type_ids2'],
           'attention_mask2': parsed['attention_mask2']}
    return (inp, parsed['y'])

def format_ranking_dataset(dataset):    
    train_dataset_size = int(DATASET_SIZE * NUM_SAMPLING_CANDIDATES * DATASET_DEV_SPLIT)
    train_dataset = train_sampling_dataset.map(lambda inp, y: (inp, {'out_s1': y,'out_diff': y}))
    # use half the batch size because of memory concerns
    train_dataset_split = train_sampling_dataset.take(train_dataset_size).batch(int(BATCH_SIZE / 2)).prefetch(1)
    dev_dataset = train_sampling_dataset.skip(train_dataset_size).batch(BATCH_SIZE)    
    return train_dataset_split, dev_dataset

train_dataset = read_tfrecords(DATA_PATH_PROCESSED, 'TRAIN_BERT_BASE', input_parser_train)
train_sampling_dataset = read_tfrecords(DATA_PATH_PROCESSED, 'TRAIN_BERT_SAMPLING', input_parser_cs)
train_truenews_dataset = read_tfrecords(DATA_PATH_PROCESSED, 'TRAIN_BERT_TRUENEWS', input_parser_cs)
test_dataset = read_tfrecords(DATA_PATH_PROCESSED, 'TEST_BERT_BASE', input_parser_test)

num_train_examples = int(DATASET_SIZE * DATASET_DEV_SPLIT)
train_ds_split = train_dataset.take(num_train_examples)
train_ds_split = train_ds_split.shuffle(100, reshuffle_each_iteration=True).batch(BATCH_SIZE)
dev_ds_split = train_dataset.skip(num_train_examples).batch(BATCH_SIZE)

train_sampling_dataset_split, dev_sampling_dataset = format_ranking_dataset(train_sampling_dataset)
train_truenews_dataset_split, dev_truenews_dataset = format_ranking_dataset(train_sampling_dataset)

### Load base model

In [None]:
cbert_model = load_bert_model(MODEL_PATH_BERT)

### Load ranking model

In [5]:
cbert_model = load_bert_model(MODEL_PATH_BERT)

def cbert_model_forward(inp):
    outputs = cbert_model(inp)
    return outputs.logits

shape=(MAX_LEN,)
input_type=tf.int32

input_ids1 = Input(shape=shape, name='input_ids1', dtype=input_type)
attention_mask1 = Input(shape=shape, name='attention_mask1', dtype=input_type)
token_type_ids1 = Input(shape=shape, name='token_type_ids1', dtype=input_type)

input_ids2 = Input(shape=shape, name='input_ids2', dtype=input_type)
attention_mask2 = Input(shape=shape, name='attention_mask2', dtype=input_type)
token_type_ids2 = Input(shape=shape, name='token_type_ids2', dtype=input_type) 

cbert_model_ranking = build_ranking_model(cbert_model_forward,
                                          [input_ids1, attention_mask1, token_type_ids1],
                                          [input_ids2, attention_mask2, token_type_ids2])

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

All the layers of TFBertForSequenceClassification were initialized from the model checkpoint at ../models/bert-base-german-cased/.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForSequenceClassification for predictions without further training.


Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method


### Training without ranking

In [None]:
optimizer = tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE)
loss = tf.keras.losses.BinaryCrossentropy()
precision = tf.keras.metrics.Precision(thresholds=PRECISION_RECALL_THRESHOLDS)
recall = tf.keras.metrics.Recall(thresholds=PRECISION_RECALL_THRESHOLDS)
binacc = tf.keras.metrics.BinaryAccuracy(threshold=BINACC_THRESHOLD)
metrics = [precision, recall, binacc]
cbert_model.compile(optimizer=optimizer, loss=loss, metrics=metrics)

checkpoint_callback = get_checkpoint_callback(MODEL_PATH_BERT_TUNED, 'val_binary_accuracy', weights_only=True)
tensorboard_callback = get_tensorboard_callback('logs')

%tensorboard --logdir logs --bind_all
history = cbert_model.fit(train_ds_split,
                epochs=EPOCHS,
                validation_data=dev_ds_split,
                callbacks=[checkpoint_callback, tensorboard_callback])

### Training with ranking

In [3]:
optimizer = tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE)
loss = tf.keras.losses.BinaryCrossentropy()
precision = tf.keras.metrics.Precision(thresholds=PRECISION_RECALL_THRESHOLDS)
recall = tf.keras.metrics.Recall(thresholds=PRECISION_RECALL_THRESHOLDS)
binacc = tf.keras.metrics.BinaryAccuracy(threshold=BINACC_THRESHOLD)
metrics = {'out_s1': [precision, recall, binacc]}

In [6]:
cbert_model_ranking.compile(optimizer=optimizer, loss=loss, metrics=metrics)
checkpoint_callback = get_checkpoint_callback(MODEL_PATH_BERT_TUNED_RANKING, 'val_binary_accuracy', weights_only=True)
tensorboard_callback = get_tensorboard_callback('logs')

%tensorboard --logdir logs --bind_all
history = cbert_model_ranking.fit(train_sampling_dataset_split,
                epochs=1,
                validation_data=dev_sampling_dataset,
                callbacks=[checkpoint_callback, tensorboard_callback])

Launching TensorBoard...

KeyboardInterrupt: 

In [8]:
cbert_model = cbert_model_ranking.get_layer(name='tf_bert_for_sequence_classification')
cbert_model.save_weights(MODEL_PATH_BERT_TUNED_RANKING)

In [7]:
cbert_model_truenews = cbert_model_ranking
cbert_model_truenews.compile(optimizer=optimizer, loss=loss, metrics=metrics)
checkpoint_callback = get_checkpoint_callback(MODEL_PATH_BERT_TUNED_TRUENEWS, 'val_binary_accuracy', weights_only=True)
tensorboard_callback = get_tensorboard_callback('logs')

%tensorboard --logdir logs --bind_all
history = cbert_model_truenews.fit(train_truenews_dataset_split,
                epochs=1,
                validation_data=dev_truenews_dataset,
                callbacks=[checkpoint_callback, tensorboard_callback])



In [8]:
cbert_model = cbert_model_truenews.get_layer(name='tf_bert_for_sequence_classification')
cbert_model.save_weights(MODEL_PATH_BERT_TUNED_TRUENEWS)

### Load fine-tuned BERT model

In [5]:
cbert_model = load_bert_model(MODEL_PATH_BERT)
cbert_model.load_weights(MODEL_PATH_BERT_TUNED)

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7f1cc83c32e0>

### Load fine-tuned BERT model + ranking

In [None]:
cbert_model = load_bert_model(MODEL_PATH_BERT)
cbert_model.load_weights(MODEL_PATH_BERT_TUNED_RANKING)

### Load fine-tuned BERT model + truenews

In [3]:
cbert_model = load_bert_model(MODEL_PATH_BERT)
cbert_model.load_weights(MODEL_PATH_BERT_TUNED_TRUENEWS)

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

All the layers of TFBertForSequenceClassification were initialized from the model checkpoint at ../models/bert-base-german-cased/.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForSequenceClassification for predictions without further training.


<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7f0f50780ac0>

In [4]:
def prediction_func(inps):
    outputs = cbert_model.predict(inps)
    return [l[0] for l in outputs.logits]

eval_data_bert = batch_predict(test_dataset, 100, prediction_func)
print('BERT/MAP: {}'.format(mean_average_precision(eval_data_bert)))
for k in [1, 5, 10]:

    print('BERT/P@{}: {}'.format(k, precision_at_k(eval_data_bert, k)))

Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method
BERT/MAP: 0.3324119192396223
BERT/P@1: 0.21649484536082475
BERT/P@5: 0.13608247422680408
BERT/P@10: 0.12072901325478631


### Hyperparamerters

|     | BERT BASE | BERT SAMPLING | BERT TRUENEWS |
|-----|---------|---|---|
| BATCH_SIZE |  32  | 16 | 16 |
| EPOCHS |     5    | 1 | 1 |

### Results
|     | BERT BASE | BERT SAMPLING | BERT TRUENEWS |
|-----|---------|---|---|
| MAP |  0.47965098263440786    | 0.46114418386081657 | 0.3324119192396223 |
| P@1 |     0.42857142857142855    | 0.4020618556701031 | 0.21649484536082475 |
| P@5 |     0.18571428571428555    | 0.21649484536082456| 0.13608247422680408 |
| P@10 |     0.14285714285714268    | 0.15474963181148732 | 0.12072901325478631 |

