In [3]:
import json
from transformers import BertTokenizer, TFBertForSequenceClassification, TFTrainer, TFTrainingArguments
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dropout, Dense
import tensorflow as tf

DATA_PATH_FORMATED_TRAIN = '../data/GermanFakeNC_FORMATED_TRAIN.json'
MODEL_PATH_BERT = '../models/bert-base-german-cased/'
MODEL_PATH_BERT_TUNED = '../models/bert-base-german-cased-tuned/'
DATASET_DEV_SPLIT = 0.8
BATCH_SIZE = 60

In [6]:
tokenizer = BertTokenizer.from_pretrained(MODEL_PATH_BERT)
cbert_model = TFBertForSequenceClassification.from_pretrained(MODEL_PATH_BERT)

Some layers from the model checkpoint at ../models/bert-base-german-cased/ were not used when initializing TFBertForSequenceClassification: ['dropout_37']
- This IS expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertForSequenceClassification were initialized from the model checkpoint at ../models/bert-base-german-cased/.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForSequenceClassification for predictions without further training.


In [7]:
def read_data(path):
    with open(path) as json_file:
            return json.load(json_file)
train_data = read_data(DATA_PATH_FORMATED_TRAIN)
        
text_data = [d['org'] for d in train_data]

train_encodings = tokenizer(text_data, max_length=128, add_special_tokens=True, truncation=True, padding=True, return_tensors='tf')
train_encodings['label'] = tf.convert_to_tensor([d['lbl'] for d in train_data])

dataset = tf.data.Dataset.from_tensor_slices(train_encodings)
dataset = dataset.map(lambda ex: ({i:ex[i] for i in ex if i!='label'}, tf.reshape(ex['label'], [1])))

num_train_examples = int(len(train_data) * DATASET_DEV_SPLIT)
train_ds = dataset.take(num_train_examples).batch(BATCH_SIZE)
dev_ds = dataset.skip(num_train_examples).batch(BATCH_SIZE)

In [8]:
def get_checkpoint_callback(model_path, monitor_value):
    return tf.keras.callbacks.ModelCheckpoint(model_path, 
                                              monitor=monitor_value, verbose=1, 
                                              save_best_only=True, mode='max')

optimizer = tf.keras.optimizers.Adam()
loss = tf.keras.losses.BinaryCrossentropy()
metrics = [tf.keras.metrics.BinaryAccuracy(threshold=0.5)]
cbert_model.compile(optimizer=optimizer, loss=loss, metrics=metrics)

checkpoint_callback = get_checkpoint_callback(MODEL_PATH_BERT_TUNED, 'val_binary_accuracy')
cbert_model.fit(train_ds,
                epochs=1,
                validation_data=dev_ds,
                callbacks=[checkpoint_callback])


























Epoch 00001: val_binary_accuracy improved from -inf to 0.93382, saving model to ../models/bert-base-german-cased-tuned/


<tensorflow.python.keras.callbacks.History at 0x7fad8906f700>

In [None]:
cbert_model.save(MODEL_PATH_BERT_TUNED)

In [9]:
cbert_model = tf.keras.models.load_model(MODEL_PATH_BERT_TUNED)



