**Preprocessing models**:
- spaCy model (`de_core_news_sm` 2.3.0): https://github.com/explosion/spacy-models/releases/tag/de_core_news_sm-2.3.0
- Word2Vec: can be trained with the **Word2Vec_10kGNAD** notebook

In [1]:
import os
import sys

# Make the project-local modules under ../src importable from this notebook.
# The directory is resolved to an absolute path and appended only once, so
# re-running the cell does not add duplicate entries to sys.path.
module_path = os.path.abspath('../src')
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

import datetime
import json
import itertools
import operator
from gensim.models import Word2Vec
import numpy as np
import spacy
from tensorflow.keras import Input
from tensorflow.keras import backend as K, initializers, regularizers, constraints
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Activation, Layer, Dropout, LSTM, Dense, InputLayer
from tensorflow.keras.losses import Loss
import tensorflow as tf
from sklearn.metrics.pairwise import cosine_similarity
from evaluation import mean_average_precision, precision_at_k
from utils import *
from model import *

print('Tensorflow Version: {}'.format(tf.__version__))

DATA_PATH_PROCESSED = '../data/GermanFakeNC_PROCESSED'
NUM_ARTICLES = 489
MODEL_NAME = "CLEF_2019_HANSEN"
MODEL_PATH_BASE = '../models/' + MODEL_NAME + '_BASE'
MODEL_PATH_RANKING = '../models/' + MODEL_NAME + '_RANKING'
SEED = 12345
NUM_SAMPLING_CANDIDATES = 5
LSTM_HIDDEN_UNITS = 100
EPOCHS = 10
CROSS_VALIDATION_K_FOLDS = 19
DATASET_SIZE = 14765
DATASET_TRAIN_SPLIT = 0.8
DATASET_DEV_SPLIT = 0.8
BATCH_SIZE = 120
DROPOUT = 0.3

# Load the TensorBoard notebook extension
%load_ext tensorboard

Tensorflow Version: 2.4.1


# Model Definition and Training

In [2]:
def input_parser(example):
    """Decode one serialized example into an ``(article_id, x, y)`` tuple.

    The record is expected to carry three fixed-length features:
    ``article_id`` (int64, shape [1]), ``x`` (float32, shape [135, 285])
    and ``y`` (float32, shape [1]).
    """
    schema = {
        'article_id': tf.io.FixedLenFeature([1], dtype=tf.int64),
        'x': tf.io.FixedLenFeature([135, 285], dtype=tf.float32),
        'y': tf.io.FixedLenFeature([1], dtype=tf.float32),
    }
    record = tf.io.parse_single_example(example, schema)
    return tuple(record[key] for key in ('article_id', 'x', 'y'))

def input_parser_cs(example):
    """Decode one serialized sampling example into ``(article_id, x, y, cs)``.

    Same features as the plain training records (``article_id`` int64 [1],
    ``x`` float32 [135, 285], ``y`` float32 [1]) plus ``cs``, a second
    float32 tensor of shape [135, 285].
    """
    schema = {
        'article_id': tf.io.FixedLenFeature([1], dtype=tf.int64),
        'x': tf.io.FixedLenFeature([135, 285], dtype=tf.float32),
        'y': tf.io.FixedLenFeature([1], dtype=tf.float32),
        'cs': tf.io.FixedLenFeature([135, 285], dtype=tf.float32),
    }
    record = tf.io.parse_single_example(example, schema)
    return tuple(record[key] for key in ('article_id', 'x', 'y', 'cs'))

# --- Training records ---------------------------------------------------------
# shuffle=False: list_files shuffles the file names non-deterministically by
# default. The take()/skip() split further down relies on a stable record order;
# with a changing order, records could move between the train and dev partitions.
# NOTE(review): the '_TRAIN_*' glob would also match '_TRAIN_SAMPLING_*' files if
# both live under the same prefix — confirm the file naming used in preprocessing.
train_data_files = tf.data.Dataset.list_files(DATA_PATH_PROCESSED + '_TRAIN_*.tfrecords', shuffle=False)
train_data_raw = tf.data.TFRecordDataset(train_data_files)
train_dataset = train_data_raw.map(input_parser)

# --- Training records with sampled candidates ('cs') --------------------------
train_sampling_data_files = tf.data.Dataset.list_files(DATA_PATH_PROCESSED + '_TRAIN_SAMPLING_*.tfrecords', shuffle=False)
train_sampling_data_raw = tf.data.TFRecordDataset(train_sampling_data_files)
train_sampling_dataset = train_sampling_data_raw.map(input_parser_cs)

# --- Test records --------------------------------------------------------------
test_data_files = tf.data.Dataset.list_files(DATA_PATH_PROCESSED + '_TEST_*.tfrecords')
test_data_raw = tf.data.TFRecordDataset(test_data_files)
test_dataset = test_data_raw.map(input_parser)
# Strip the length-1 axis from article_id and y so each element carries scalars.
test_dataset = test_dataset.map(lambda ida, x, y: (ida[0], x, y[0]))

# TODO: record-level shuffling is currently disabled because it produced an error.
# The earlier attempt mapped four fields (ida, x, y, topk) onto train_dataset, whose
# elements only have three (article_id, x, y) — possibly the cause of that error.

# There has already been a train/test data split in preprocessing, so the number
# of training records is the TRAIN fraction of the full dataset.
train_dataset_size = int(DATASET_SIZE * DATASET_TRAIN_SPLIT)

# Sampling dataset: first DEV_SPLIT share for fitting, remainder for validation.
train_sampling_dataset_size = int(train_dataset_size * NUM_SAMPLING_CANDIDATES * DATASET_DEV_SPLIT)
# Both outputs of the ranking model are trained against the same label y.
train_sampling_dataset = train_sampling_dataset.map(lambda ida, x, y, cs: ({'in_s1': x, 'in_s2': cs}, {'out_s1': y,'out_diff': y}))
train_sampling_dataset_split = train_sampling_dataset.take(train_sampling_dataset_size).batch(BATCH_SIZE)
dev_sampling_dataset = train_sampling_dataset.skip(train_sampling_dataset_size).batch(BATCH_SIZE)

# Base dataset: apply the dev split to the number of *training* records. Using
# DATASET_SIZE here would make take() cover the whole training set whenever
# the two split fractions are equal, leaving an empty dev set.
train_split_size = int(train_dataset_size * DATASET_DEV_SPLIT)
train_dataset = train_dataset.map(lambda ida, x, y: (x, y))
train_dataset_split = train_dataset.take(train_split_size).batch(BATCH_SIZE)
dev_dataset = train_dataset.skip(train_split_size).batch(BATCH_SIZE)

### Model training

In [3]:
# Build the base classifier for inputs of shape (135, 285) and print its layout.
model = build_base_model(
    input_shape=(135, 285),
    hidden_units=LSTM_HIDDEN_UNITS,
    dropout_prob=DROPOUT,
)
model.summary()

# Binary cross-entropy objective, tracked with accuracy, precision and recall.
base_metrics = [
    tf.keras.metrics.BinaryAccuracy(),
    tf.keras.metrics.Precision(),
    tf.keras.metrics.Recall(),
]
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=base_metrics)

# Callbacks come from the project helpers; the second checkpoint argument is
# presumably the validation metric that decides which weights are kept.
checkpoint_callback = get_checkpoint_callback(MODEL_PATH_BASE, 'val_binary_accuracy')
tensorboard_callback = get_tensorboard_callback('logs')
training_callbacks = [checkpoint_callback, tensorboard_callback]

# Fit on the training partition and validate on the dev partition each epoch.
history = model.fit(
    train_dataset_split,
    validation_data=dev_dataset,
    epochs=EPOCHS,
    callbacks=training_callbacks,
)

Model: "base"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm (LSTM)                  (None, 135, 100)          154400    
_________________________________________________________________
attention (Attention)        (None, 100)               235       
_________________________________________________________________
dropout (Dropout)            (None, 100)               0         
_________________________________________________________________
dense (Dense)                (None, 1)                 101       
Total params: 154,736
Trainable params: 154,736
Non-trainable params: 0
_________________________________________________________________
Epoch 1/10
     10/Unknown - 6s 432ms/step - loss: 0.6614 - binary_accuracy: 0.8409 - precision: 0.0579 - recall: 0.1716

KeyboardInterrupt: 

In [4]:
# A fresh base model is built here; it is shared by both inputs of the ranking model.
base_model = build_base_model(
    input_shape=(135, 285),
    hidden_units=LSTM_HIDDEN_UNITS,
    dropout_prob=DROPOUT,
)

# Two inputs whose names match the 'in_s1' / 'in_s2' keys of the sampling dataset.
in_s1 = Input(name='in_s1', shape=(None, None))
in_s2 = Input(name='in_s2', shape=(None, None))
model = build_ranking_model((lambda inp: base_model(inp)), in_s1, in_s2)

# Needs pydot and graphviz; without them Keras only reports that the import failed.
tf.keras.utils.plot_model(model, show_shapes=True)

# One loss per model output, weighted equally.
# NOTE(review): RankingError is given BATCH_SIZE, while the datasets are batched
# without drop_remainder — confirm it copes with a smaller final batch.
ranking_losses = [
    tf.keras.losses.BinaryCrossentropy(),
    RankingError(batch_size=BATCH_SIZE),
]
model.compile(
    loss=ranking_losses,
    loss_weights=[0.5, 0.5],
    optimizer='adam',
    metrics=[tf.keras.metrics.BinaryAccuracy()],
)

checkpoint_callback = get_checkpoint_callback(MODEL_PATH_RANKING, 'val_out_s1_binary_accuracy')
tensorboard_callback = get_tensorboard_callback('logs')
training_callbacks = [checkpoint_callback, tensorboard_callback]

# Trained for a fixed 2 epochs (not EPOCHS) on the sampling partitions.
history = model.fit(
    train_sampling_dataset_split,
    validation_data=dev_sampling_dataset,
    epochs=2,
    callbacks=training_callbacks,
)

('Failed to import pydot. You must `pip install pydot` and install graphviz (https://graphviz.gitlab.io/download/), ', 'for `pydotprint` to work.')
Epoch 1/2
      9/Unknown - 8s 562ms/step - loss: 0.8375 - out_s1_loss: 0.6722 - out_diff_loss: 1.0028 - out_s1_binary_accuracy: 0.6556 - out_diff_binary_accuracy: 0.9103

KeyboardInterrupt: 

### Evaluation

In [3]:
# Load the saved base model from MODEL_PATH_BASE for evaluation.
# Presumably this is the checkpoint written during base-model training, so that
# training cell must have been run at least once before this one — TODO confirm.
test_model_base = tf.keras.models.load_model(MODEL_PATH_BASE)

#### Sample prediction for base model

In [37]:
# Turn one example sentence into a model input: per-token word vectors and
# dependency features, joined along the feature axis.
# 135 matches the first dimension of the model input shape (135, 285).
false_statement = "Um die Ermordung unschuldiger Zivilisten in Russland zu üben, sucht die NATO für ihre Manöver russischsprachige Menschen."
tokens = spacy_model(false_statement)
deps = to_deps(tokens, 135)
lowercased_tokens = [token.text.lower() for token in tokens]
word_vecs = embed(lowercased_tokens, 135)
inp = np.concatenate([word_vecs, deps], axis=1)
# Width of the combined feature vector of the first row.
print(inp.shape[1])

285


In [38]:
# Score the single preprocessed sentence with the base model; a leading
# batch axis of size 1 is added because predict() expects a batch.
single_batch = np.expand_dims(inp, axis=0)
prediction = test_model_base.predict(single_batch)
print(prediction)

[[0.3697008]]


In [4]:
# Load the saved ranking model without compiling it, then keep only its
# 'base' layer: evaluation scores single sentences with that sub-model.
test_model_ranking = tf.keras.models.load_model(
    MODEL_PATH_RANKING, compile=False
).get_layer(name='base')

In [45]:
# Score the same preprocessed sentence with the base layer taken from the
# ranking model; a leading batch axis of size 1 is added for predict().
single_batch = np.expand_dims(inp, axis=0)
prediction = test_model_ranking.predict(single_batch)
print(prediction)

[[0.08147946]]


#### MAP

In [4]:
def prediction_func_base(inps):
    """Return the base model's predictions for ``inps`` as a flat list.

    Each prediction row contributes its first element, so the result has one
    score per input.
    """
    score_rows = test_model_base.predict(inps)
    return [row[0] for row in score_rows]

# Run the base model over the test set (the 100 is presumably the prediction
# batch size — see batch_predict) and report MAP and precision at 1, 5 and 10.
eval_data_base = batch_predict(test_dataset, 100, prediction_func_base)
base_map = mean_average_precision(eval_data_base)
print('Base/MAP: {}'.format(base_map))
for cutoff in (1, 5, 10):
    base_precision = precision_at_k(eval_data_base, cutoff)
    print('Base/P@{}: {}'.format(cutoff, base_precision))

Base/MAP: 0.3609851566446917
Base/P@1: 0.24489795918367346
Base/P@5: 0.1734693877551019
Base/P@10: 0.13265306122448964


In [5]:
def prediction_func_ranking(inps):
    """Return the ranking-trained model's predictions for ``inps`` as a flat list.

    Each prediction row contributes its first element, so the result has one
    score per input.
    """
    score_rows = test_model_ranking.predict(inps)
    return [row[0] for row in score_rows]

# Run the ranking-trained model over the test set (the 100 is presumably the
# prediction batch size — see batch_predict) and report MAP and precision at 1, 5 and 10.
eval_data_ranking = batch_predict(test_dataset, 100, prediction_func_ranking)
ranking_map = mean_average_precision(eval_data_ranking)
print('Ranking/MAP: {}'.format(ranking_map))
for cutoff in (1, 5, 10):
    ranking_precision = precision_at_k(eval_data_ranking, cutoff)
    print('Ranking/P@{}: {}'.format(cutoff, ranking_precision))

Ranking/MAP: 0.37321464176223473
Ranking/P@1: 0.2653061224489796
Ranking/P@5: 0.18775510204081625
Ranking/P@10: 0.14483317136378343


### Results
|     | Base | Ranking |
|-----|------|---------|
| MAP |   0.3609851566446917   |  0.37321464176223473      |
| P@1 |   0.24489795918367346   |    0.2653061224489796     |
| P@5 |   0.1734693877551019   |    0.18775510204081625     |
| P@10 |   0.13265306122448964   |    0.14483317136378343     |