In [1]:
import os
import sys

# Make the project's local modules importable: resolve the sibling `../src`
# directory to an absolute path and register it on sys.path exactly once.
module_path = os.path.abspath('../src')
if not (module_path in sys.path):
    sys.path.append(module_path)

import json
import datetime
from transformers import BertTokenizer, TFBertForSequenceClassification, TFTrainer, TFTrainingArguments
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dropout, Dense, Lambda
from tensorflow.keras import backend as K
import tensorflow as tf
from evaluation import mean_average_precision, precision_at_k
from utils import *
from model import *

# Record the TensorFlow version in the cell output so the run environment is visible.
print('Tensorflow Version: {}'.format(tf.__version__))
# Load the TensorBoard notebook extension
%load_ext tensorboard
    
# --- Paths -------------------------------------------------------------------
# Formatted JSON splits; these are read with read_data() in the JSON loading cell.
DATA_PATH_FORMATED_TRAIN = '../data/GermanFakeNC_FORMATED_TRAIN.json'
DATA_PATH_FORMATED_TEST = '../data/GermanFakeNC_FORMATED_TEST.json'
# Location handed to read_tfrecords() for the TRAIN_BERT / TRAIN_BERT_SAMPLING / TEST_BERT sets.
DATA_PATH_PROCESSED = '../data/GermanFakeNC_PROCESSED'
# Pretrained model directory, used for both the tokenizer and load_bert_model().
MODEL_PATH_BERT = '../models/bert-base-german-cased/'
# Checkpoint targets for the plain and the ranking training runs respectively.
MODEL_PATH_BERT_TUNED = '../models/bert-base-german-cased-tuned/checkpoint.ckpt'
MODEL_PATH_BERT_TUNED_RANKING = '../models/bert-base-german-cased-tuned-ranking/checkpoint.ckpt'
# --- Data split --------------------------------------------------------------
# Example count used to size the train/dev split of the TFRecord datasets.
# NOTE(review): hard-coded; presumably the number of records in TRAIN_BERT — confirm.
DATASET_SIZE = 14765
# Fraction of the examples taken for training; the remainder becomes the dev set.
DATASET_DEV_SPLIT = 0.8
# Multiplier applied to DATASET_SIZE when sizing the sampling (ranking) dataset split.
NUM_SAMPLING_CANDIDATES = 5
BATCH_SIZE = 32
# Tokenizer max_length and the fixed length of every parsed TFRecord feature.
MAX_LEN = 134
# --- Training ----------------------------------------------------------------
# Learning rate passed to the Adam optimizer.
LEARNING_RATE = 5e-5
# Threshold passed to the BinaryAccuracy metric.
BINACC_THRESHOLD = 0.1
# Thresholds passed to the Precision and Recall metrics.
PRECISION_RECALL_THRESHOLDS = [0.05, 0.1, 0.2, 0.5]
# Epoch count for the training run without ranking.
EPOCHS = 5

# Tokenizer loaded from the pretrained model directory; used by encode().
tokenizer = BertTokenizer.from_pretrained(MODEL_PATH_BERT)

Tensorflow Version: 2.4.1


In [15]:
def read_data(path):
    """Load and return the JSON document stored at ``path``.

    The file is decoded as UTF-8 explicitly instead of relying on the
    platform's locale encoding, so non-ASCII text (e.g. German umlauts)
    is read the same way on every machine.

    Args:
        path: Path of the JSON file to read.

    Returns:
        The deserialized content, exactly as produced by ``json.load``.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file content is not valid JSON.
    """
    with open(path, encoding='utf-8') as json_file:
        return json.load(json_file)
        
def encode(sentences):
    """Tokenize ``sentences`` with the notebook-level ``tokenizer``.

    Truncation and padding are enabled, the maximum length is ``MAX_LEN``,
    and the tokenizer is asked to return TensorFlow tensors.

    Args:
        sentences: The text input forwarded unchanged to the tokenizer.

    Returns:
        Whatever the tokenizer call returns for these options.
    """
    tokenizer_options = dict(max_length=MAX_LEN,
                             truncation=True,
                             padding=True,
                             return_tensors='tf')
    return tokenizer(sentences, **tokenizer_options)
        
def to_dataset(data):
    """Turn a list of records into a zipped ``tf.data.Dataset``.

    Each record is indexed with the keys ``'org'`` (text), ``'lbl'`` (label)
    and ``'article_id'``.

    Args:
        data: Sequence of records supporting lookup by those three keys.

    Returns:
        A dataset whose elements are ``(article_id, encoding_dict, label)``,
        where the label has been reshaped to shape ``[1]``.
    """
    encoded = encode([record['org'] for record in data])
    # Batch encoding to dictionary
    encodings_ds = tf.data.Dataset.from_tensor_slices(encoded).map(
        lambda ex: {key: ex[key] for key in ex})
    labels_ds = tf.data.Dataset.from_tensor_slices(
        [record['lbl'] for record in data]
    ).map(lambda lbl: tf.reshape(lbl, [1]))
    ids_ds = tf.data.Dataset.from_tensor_slices(
        [record['article_id'] for record in data])
    return tf.data.Dataset.zip((ids_ds, encodings_ds, labels_ds))
        
# Read the formatted JSON records for both splits.
train_data = read_data(DATA_PATH_FORMATED_TRAIN)
test_data = read_data(DATA_PATH_FORMATED_TEST)

# Training elements drop the article id; test elements keep it and unwrap the
# single-entry label.
train_ds = to_dataset(train_data).map(
    lambda _article_id, features, label: (features, label))
test_ds = to_dataset(test_data).map(
    lambda article_id, features, label: (article_id, features, label[0]))

# The leading DATASET_DEV_SPLIT share of the training records is shuffled and
# batched for training; everything after it is batched as the dev set.
num_train_examples = int(len(train_data) * DATASET_DEV_SPLIT)
train_ds_split = (train_ds
                  .take(num_train_examples)
                  .shuffle(100, reshuffle_each_iteration=True)
                  .batch(BATCH_SIZE))
dev_ds_split = train_ds.skip(num_train_examples).batch(BATCH_SIZE)

# Size and split the sampling (ranking) dataset into train and dev batches.
# NOTE(review): `train_dataset_size` is not defined anywhere in the visible notebook, and
# `train_sampling_dataset` is not created in this cell (it is only assigned in the TFRecord
# loading cell) — confirm this block still runs on a fresh kernel.
train_sampling_dataset_size = int(train_dataset_size * NUM_SAMPLING_CANDIDATES * DATASET_DEV_SPLIT)
# Re-key every (x, y, cs) element: inputs under 'in_s1'/'in_s2', and the same target y under
# both 'out_s1' and 'out_diff'.
# NOTE(review): the TFRecord cell maps the same dataset with a two-argument lambda — verify
# which element structure is expected here.
train_sampling_dataset = train_sampling_dataset.map(lambda x, y, cs: ({'in_s1': x, 'in_s2': cs}, {'out_s1': y,'out_diff': y}))
# First `train_sampling_dataset_size` elements are used for training, the rest for dev.
train_sampling_dataset_split = train_sampling_dataset.take(train_sampling_dataset_size).batch(BATCH_SIZE)
dev_sampling_dataset = train_sampling_dataset.skip(train_sampling_dataset_size).batch(BATCH_SIZE)

In [2]:
def input_parser_train(example):
    """Parse one serialized training example into ``(inputs, label)``.

    Args:
        example: A serialized example accepted by
            ``tf.io.parse_single_example``.

    Returns:
        A tuple ``(inp, y)`` where ``inp`` is a dict with the keys
        ``'input_ids'``, ``'token_type_ids'`` and ``'attention_mask'``
        (int64 features of length ``MAX_LEN``) and ``y`` is the float32
        feature of length 1.
    """
    input_keys = ('input_ids', 'token_type_ids', 'attention_mask')

    feature_description = {key: tf.io.FixedLenFeature([MAX_LEN], dtype=tf.int64)
                           for key in input_keys}
    feature_description['y'] = tf.io.FixedLenFeature([1], dtype=tf.float32)

    parsed = tf.io.parse_single_example(example, feature_description)
    return ({key: parsed[key] for key in input_keys}, parsed['y'])

def input_parser_test(example):
    """Parse one serialized test example into ``(article_id, inputs, label)``.

    Args:
        example: A serialized example accepted by
            ``tf.io.parse_single_example``.

    Returns:
        A tuple ``(article_id, inp, y)``: ``article_id`` is the int64 feature
        of length 1, ``inp`` is a dict with the keys ``'input_ids'``,
        ``'token_type_ids'`` and ``'attention_mask'`` (int64 features of
        length ``MAX_LEN``), and ``y`` is the float32 feature of length 1.
    """
    input_keys = ('input_ids', 'token_type_ids', 'attention_mask')

    feature_description = {'article_id': tf.io.FixedLenFeature([1], dtype=tf.int64)}
    for key in input_keys:
        feature_description[key] = tf.io.FixedLenFeature([MAX_LEN], dtype=tf.int64)
    feature_description['y'] = tf.io.FixedLenFeature([1], dtype=tf.float32)

    parsed = tf.io.parse_single_example(example, feature_description)
    model_inputs = {key: parsed[key] for key in input_keys}
    return (parsed['article_id'], model_inputs, parsed['y'])

def input_parser_cs(example):
    """Parse one serialized sentence-pair example into ``(inputs, label)``.

    Args:
        example: A serialized example accepted by
            ``tf.io.parse_single_example``.

    Returns:
        A tuple ``(inp, y)`` where ``inp`` is a dict holding the six int64
        features of length ``MAX_LEN`` (ids, token types and attention mask,
        each with suffix ``1`` and ``2``) and ``y`` is the float32 feature of
        length 1.
    """
    input_keys = ('input_ids1', 'token_type_ids1', 'attention_mask1',
                  'input_ids2', 'token_type_ids2', 'attention_mask2')

    feature_description = {key: tf.io.FixedLenFeature([MAX_LEN], dtype=tf.int64)
                           for key in input_keys}
    feature_description['y'] = tf.io.FixedLenFeature([1], dtype=tf.float32)

    parsed = tf.io.parse_single_example(example, feature_description)
    return ({key: parsed[key] for key in input_keys}, parsed['y'])

# Read the preprocessed TFRecord sets, each with its matching parser.
train_dataset = read_tfrecords(DATA_PATH_PROCESSED, 'TRAIN_BERT', input_parser_train)
train_sampling_dataset = read_tfrecords(DATA_PATH_PROCESSED, 'TRAIN_BERT_SAMPLING', input_parser_cs)
# Test elements carry length-1 id and label features; unwrap both to their single entry.
test_dataset = read_tfrecords(DATA_PATH_PROCESSED, 'TEST_BERT', input_parser_test).map(
    lambda article_id, features, label: (article_id[0], features, label[0]))

# Plain classification data: leading share is shuffled and batched for training,
# the remainder is batched as the dev set.
num_train_examples = int(DATASET_SIZE * DATASET_DEV_SPLIT)
train_ds_split = (train_dataset
                  .take(num_train_examples)
                  .shuffle(100, reshuffle_each_iteration=True)
                  .batch(BATCH_SIZE))
dev_ds_split = train_dataset.skip(num_train_examples).batch(BATCH_SIZE)

# Sampling (ranking) data: the same target feeds both model outputs.
train_sampling_dataset_size = int(DATASET_SIZE* NUM_SAMPLING_CANDIDATES * DATASET_DEV_SPLIT)
train_sampling_dataset = train_sampling_dataset.map(
    lambda features, target: (features, {'out_s1': target, 'out_diff': target}))
train_sampling_dataset_split = (train_sampling_dataset
                                .take(train_sampling_dataset_size)
                                .batch(BATCH_SIZE))
dev_sampling_dataset = (train_sampling_dataset
                        .skip(train_sampling_dataset_size)
                        .batch(BATCH_SIZE))

### Load initial pretrained model

In [18]:
# Build the classification model from the pretrained checkpoint directory
# (load_bert_model comes from the project's own modules).
cbert_model = load_bert_model(MODEL_PATH_BERT)

Some layers from the model checkpoint at ../models/bert-base-german-cased/ were not used when initializing TFBertForSequenceClassification: ['dropout_37']
- This IS expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertForSequenceClassification were initialized from the model checkpoint at ../models/bert-base-german-cased/.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForSequenceClassification for predictions without further training.


### Load fine-tuned weights

In [5]:
# Restore the fine-tuned weights from the checkpoint written by the training
# run without ranking; the returned load status is shown as the cell output.
cbert_model.load_weights(MODEL_PATH_BERT_TUNED)

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7f1cc83c32e0>

### Load ranking model

In [3]:
# Start the ranking setup from the pretrained checkpoint directory.
cbert_model = load_bert_model(MODEL_PATH_BERT)

def cbert_model_forward(inp):
    """Call ``cbert_model`` on ``inp`` and return the ``logits`` of its output.

    Args:
        inp: The model input, forwarded unchanged to ``cbert_model``.

    Returns:
        The ``logits`` attribute of the model output.
    """
    return cbert_model(inp).logits

# Every ranking-model input holds MAX_LEN int32 entries per example.
shape = (MAX_LEN,)
input_type = tf.int32

# NOTE(review): `Input` is not among the names imported explicitly at the top of
# the notebook; presumably it arrives through one of the star imports — verify.
# Inputs for the first sentence of each pair.
input_ids1, attention_mask1, token_type_ids1 = (
    Input(shape=shape, name=layer_name, dtype=input_type)
    for layer_name in ('input_ids1', 'attention_mask1', 'token_type_ids1'))

# Inputs for the second sentence of each pair.
input_ids2, attention_mask2, token_type_ids2 = (
    Input(shape=shape, name=layer_name, dtype=input_type)
    for layer_name in ('input_ids2', 'attention_mask2', 'token_type_ids2'))

# Both input groups are passed through the same forward function.
cbert_model_ranking = build_ranking_model(
    cbert_model_forward,
    [input_ids1, attention_mask1, token_type_ids1],
    [input_ids2, attention_mask2, token_type_ids2])

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

All the layers of TFBertForSequenceClassification were initialized from the model checkpoint at ../models/bert-base-german-cased/.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForSequenceClassification for predictions without further training.


Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method


In [5]:
# Print the layer-by-layer summary of the ranking model.
cbert_model_ranking.summary()

Model: "ranking"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_ids1 (InputLayer)         [(None, 134)]        0                                            
__________________________________________________________________________________________________
attention_mask1 (InputLayer)    [(None, 134)]        0                                            
__________________________________________________________________________________________________
token_type_ids1 (InputLayer)    [(None, 134)]        0                                            
__________________________________________________________________________________________________
tf_bert_for_sequence_classifica TFSequenceClassifier 109082113   input_ids1[0][0]                 
                                                                 attention_mask1[0][0]      

### Training without ranking

In [None]:
# Compile and fit the plain classification model (no ranking head).
optimizer = tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE)
# NOTE(review): the loss is built with default arguments, while predictions are read
# from `.logits` elsewhere in this notebook — confirm the model output matches what
# the loss expects.
loss = tf.keras.losses.BinaryCrossentropy()
# Precision/recall are evaluated at every threshold in PRECISION_RECALL_THRESHOLDS.
precision = tf.keras.metrics.Precision(thresholds=PRECISION_RECALL_THRESHOLDS)
recall = tf.keras.metrics.Recall(thresholds=PRECISION_RECALL_THRESHOLDS)
binacc = tf.keras.metrics.BinaryAccuracy(threshold=BINACC_THRESHOLD)
metrics = [precision, recall, binacc]
cbert_model.compile(optimizer=optimizer, loss=loss, metrics=metrics)

# Callbacks come from the project's helpers: checkpoints go to MODEL_PATH_BERT_TUNED
# (presumably keyed on the validation binary accuracy — verify against the helper),
# and TensorBoard logs go to the local 'logs' directory.
checkpoint_callback = get_checkpoint_callback(MODEL_PATH_BERT_TUNED, 'val_binary_accuracy', weights_only=True)
tensorboard_callback = get_tensorboard_callback('logs')

# Open TensorBoard on the same log directory before training starts.
%tensorboard --logdir logs --bind_all
history = cbert_model.fit(train_ds_split,
                epochs=EPOCHS,
                validation_data=dev_ds_split,
                callbacks=[checkpoint_callback, tensorboard_callback])

### Training with ranking

In [None]:
optimizer = tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE)
loss = tf.keras.losses.BinaryCrossentropy()
precision = tf.keras.metrics.Precision(thresholds=PRECISION_RECALL_THRESHOLDS)
recall = tf.keras.metrics.Recall(thresholds=PRECISION_RECALL_THRESHOLDS)
binacc = tf.keras.metrics.BinaryAccuracy(threshold=BINACC_THRESHOLD)
metrics = {'out_s1': [precision, recall, binacc]}
cbert_model_ranking.compile(optimizer=optimizer, loss=loss, metrics=metrics)

checkpoint_callback = get_checkpoint_callback(MODEL_PATH_BERT_TUNED_RANKING, 'val_binary_accuracy', weights_only=True)
tensorboard_callback = get_tensorboard_callback('logs')

%tensorboard --logdir logs --bind_all
history = cbert_model_ranking.fit(train_sampling_dataset_split,
                epochs=2,
                validation_data=dev_sampling_dataset,
                callbacks=[checkpoint_callback, tensorboard_callback])

Epoch 1/2
    242/Unknown - 27475s 113s/step - loss: 0.7863 - out_s1_loss: 0.2880 - out_diff_loss: 0.4983 - out_s1_precision_2: 0.1510 - out_s1_recall_2: 0.5038 - out_s1_binary_accuracy: 0.4008

In [6]:
def prediction_func(inps):
    """Predict with ``cbert_model`` and return the first logit of every row.

    Args:
        inps: The inputs forwarded unchanged to ``cbert_model.predict``.

    Returns:
        A list containing element ``0`` of each entry in the prediction's
        ``logits``.
    """
    logits = cbert_model.predict(inps).logits
    return [row[0] for row in logits]

# Score the test set through the project's batch_predict helper (called with 100,
# presumably the batch size — verify against the helper) and report ranking metrics.
eval_data_bert = batch_predict(test_ds, 100, prediction_func)
print('BERT/MAP: {}'.format(mean_average_precision(eval_data_bert)))
# NOTE(review): the captured output below this cell is labelled 'Ranking/P@…' while this
# code prints 'BERT/P@…' — the stored output looks stale; re-run to confirm.
for k in [1, 5, 10]:
    print('BERT/P@{}: {}'.format(k, precision_at_k(eval_data_bert, k)))

Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method
BERT/MAP: 0.47965098263440786
Ranking/P@1: 0.42857142857142855
Ranking/P@5: 0.18571428571428555
Ranking/P@10: 0.14285714285714268


### Results
|     | BERT 2 Epochs| BERT 5 Epochs |
|-----|------|---------|
| MAP |   0.45336887554833294   |    0.47965098263440786    |
| P@1 |      |     0.42857142857142855    |
| P@5 |      |     0.18571428571428555    |
| P@10 |      |     0.14285714285714268    |