**Preprocessing models**:
- Spacy model: https://github.com/explosion/spacy-models/releases/tag/de_core_news_sm-2.3.0
- Word2Vec: Can be trained with the **Word2Vec_10kGNAD** notebook

In [5]:
import os
import sys

# workaround to import local modules from parent directory
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

import datetime
import json
import itertools
import operator
from gensim.models import Word2Vec
import numpy as np
import spacy
from tensorflow.keras import Input
from tensorflow.keras import backend as K, initializers, regularizers, constraints
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Activation, Layer, Dropout, LSTM, Dense, InputLayer
from tensorflow.keras.losses import Loss
import tensorflow as tf
from sklearn.metrics.pairwise import cosine_similarity
from evaluation import mean_average_precision, precision_at_k
from utils import batch_predict

print('Tensorflow Version: {}'.format(tf.__version__))

DATA_PATH_PROCESSED = '../data/GermanFakeNC_PROCESSED'
NUM_ARTICLES = 489
MODEL_NAME = "CLEF_2019_HANSEN"
MODEL_PATH_BASE = '../models/' + MODEL_NAME + '_BASE'
MODEL_PATH_RANKING = '../models/' + MODEL_NAME + '_RANKING'
SEED = 12345
NUM_SAMPLING_CANDIDATES = 5
LSTM_HIDDEN_UNITS = 100
EPOCHS = 10
CROSS_VALIDATION_K_FOLDS = 19
DATASET_SIZE = 14765
DATASET_TRAIN_SPLIT = 0.8
DATASET_DEV_SPLIT = 0.8
BATCH_SIZE = 120
DROPOUT = 0.3

# Load the TensorBoard notebook extension
%load_ext tensorboard

Tensorflow Version: 2.4.1
The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


# Model Definition and Training

In [2]:
def input_parser(example):
    feature_description = {'article_id': tf.io.FixedLenFeature([1], dtype=tf.int64), 
                           'x': tf.io.FixedLenFeature([135, 285], dtype=tf.float32),
                           'y': tf.io.FixedLenFeature([1], dtype=tf.float32)}

    parsed = tf.io.parse_single_example(example, feature_description)
    return (parsed['article_id'],parsed['x'],parsed['y'])

def input_parser_cs(example):
    feature_description = {'article_id': tf.io.FixedLenFeature([1], dtype=tf.int64), 
                           'x': tf.io.FixedLenFeature([135, 285], dtype=tf.float32),
                           'y': tf.io.FixedLenFeature([1], dtype=tf.float32),
                           'cs': tf.io.FixedLenFeature([135, 285], dtype=tf.float32)}

    parsed = tf.io.parse_single_example(example, feature_description)
    return (parsed['article_id'],parsed['x'],parsed['y'],parsed['cs'])

train_data_files = tf.data.Dataset.list_files(DATA_PATH_PROCESSED + '_TRAIN_*.tfrecords')
train_data_raw = tf.data.TFRecordDataset(train_data_files)
train_dataset = train_data_raw.map(input_parser)

train_sampling_data_files = tf.data.Dataset.list_files(DATA_PATH_PROCESSED + '_TRAIN_SAMPLING_*.tfrecords')
train_sampling_data_raw = tf.data.TFRecordDataset(train_sampling_data_files)
train_sampling_dataset = train_sampling_data_raw.map(input_parser_cs)

test_data_files = tf.data.Dataset.list_files(DATA_PATH_PROCESSED + '_TEST_*.tfrecords')
test_data_raw = tf.data.TFRecordDataset(test_data_files)
test_dataset = test_data_raw.map(input_parser)
test_dataset = test_dataset.map(lambda ida, x, y: (ida[0], x, y[0]))

# shuffling seems to produce an error, maybe include later again
#train_dataset = train_dataset.map(lambda ida, x, y, topk: (x, y, topk)).shuffle(1000).batch(BATCH_SIZE)

# there has already been a train/test data split in preprocessing
train_dataset_size = int(DATASET_SIZE * DATASET_TRAIN_SPLIT)

train_sampling_dataset_size = int(train_dataset_size * NUM_SAMPLING_CANDIDATES * DATASET_DEV_SPLIT)
train_sampling_dataset = train_sampling_dataset.map(lambda ida, x, y, cs: ({'in_s1': x, 'in_s2': cs}, {'out_s1': y,'out_diff': y}))
train_sampling_dataset_split = train_sampling_dataset.take(train_sampling_dataset_size).batch(BATCH_SIZE)
dev_sampling_dataset = train_sampling_dataset.skip(train_sampling_dataset_size).batch(BATCH_SIZE)

train_dataset_size = int(DATASET_SIZE * DATASET_DEV_SPLIT)
train_dataset = train_dataset.map(lambda ida, x, y: (x, y))
train_dataset_split = train_dataset.take(train_dataset_size).batch(BATCH_SIZE)
dev_dataset = train_dataset.skip(train_dataset_size).batch(BATCH_SIZE)

### Model definitions

In [4]:
# SOURCE: https://gist.github.com/cbaziotis/6428df359af27d58078ca5ed9792bd6d

def dot_product(x, kernel):
    """
    Wrapper for dot product operation, in order to be compatible with both
    Theano and Tensorflow
    Args:
        x (): input
        kernel (): weights
    Returns:
    """
    if K.backend() == 'tensorflow':
        # todo: check that this is correct
        return K.squeeze(K.dot(x, K.expand_dims(kernel)), axis=-1)
    else:
        return K.dot(x, kernel)


class Attention(Layer):
    def __init__(self,
                 W_regularizer=None, b_regularizer=None,
                 W_constraint=None, b_constraint=None,
                 bias=True,
                 return_attention=False,
                 **kwargs):
        """
        Keras Layer that implements an Attention mechanism for temporal data.
        Supports Masking.
        Follows the work of Raffel et al. [https://arxiv.org/abs/1512.08756]
        # Input shape
            3D tensor with shape: `(samples, steps, features)`.
        # Output shape
            2D tensor with shape: `(samples, features)`.
        :param kwargs:
        Just put it on top of an RNN Layer (GRU/LSTM/SimpleRNN) with return_sequences=True.
        The dimensions are inferred based on the output shape of the RNN.
        Note: The layer has been tested with Keras 1.x
        Example:
        
            # 1
            model.add(LSTM(64, return_sequences=True))
            model.add(Attention())
            # next add a Dense layer (for classification/regression) or whatever...
            # 2 - Get the attention scores
            hidden = LSTM(64, return_sequences=True)(words)
            sentence, word_scores = Attention(return_attention=True)(hidden)
        """
        self.supports_masking = True
        self.return_attention = return_attention
        self.init = initializers.get('glorot_uniform')

        self.W_regularizer = regularizers.get(W_regularizer)
        self.b_regularizer = regularizers.get(b_regularizer)

        self.W_constraint = constraints.get(W_constraint)
        self.b_constraint = constraints.get(b_constraint)

        self.bias = bias
        super(Attention, self).__init__(**kwargs)
        
    def get_config(self):
        config = super().get_config().copy()
        config.update({
            'supports_masking': self.supports_masking,
            'return_attention': self.return_attention,
            'init': self.init,
            'W_regularizer': self.W_regularizer,
            'b_regularizer': self.b_regularizer,
            'W_constraint': self.W_constraint,
            'b_constraint': self.b_constraint,
            'bias': self.bias,
        })
        return config

    def build(self, input_shape):
        assert len(input_shape) == 3

        
        self.W = self.add_weight(shape=(input_shape[-1],),
                                 initializer=self.init,
                                 name='{}_W'.format(self.name),
                                 regularizer=self.W_regularizer,
                                 constraint=self.W_constraint)
        if self.bias:
            self.b = self.add_weight(shape=(input_shape[1],),
                                     initializer='zero',
                                     name='{}_b'.format(self.name),
                                     regularizer=self.b_regularizer,
                                     constraint=self.b_constraint)
        else:
            self.b = None

        self.built = True

    def compute_mask(self, input, input_mask=None):
        # do not pass the mask to the next layers
        return None

    def call(self, x, mask=None):
        eij = dot_product(x, self.W)

        if self.bias:
            eij += self.b

        eij = K.tanh(eij)

        a = K.exp(eij)

        # apply mask after the exp. will be re-normalized next
        if mask is not None:
            # Cast the mask to floatX to avoid float64 upcasting in theano
            a *= K.cast(mask, K.floatx())

        # in some cases especially in the early stages of training the sum may be almost zero
        # and this results in NaN's. A workaround is to add a very small positive number ε to the sum.
        # a /= K.cast(K.sum(a, axis=1, keepdims=True), K.floatx())
        a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx())

        weighted_input = x * K.expand_dims(a)

        result = K.sum(weighted_input, axis=1)

        if self.return_attention:
            return [result, a]
        return result

    def compute_output_shape(self, input_shape):
        if self.return_attention:
            return [(input_shape[0], input_shape[-1]),
                    (input_shape[0], input_shape[1])]
        else:
            return input_shape[0], input_shape[-1]

In [5]:
def build_base_model(model_name='base'):
    inp_shape = (135, 285)
    model = Sequential(name=model_name)
    model.add(LSTM(LSTM_HIDDEN_UNITS, input_shape=inp_shape, return_sequences = True, name='lstm'))
    model.add(Attention(name='attention'))
    model.add(Dropout(DROPOUT))
    model.add(Dense(1, activation='sigmoid', name='dense'))
    return model

In [9]:
def build_ranking_model():
    inp_shape = (135, 285)
    
    in_s1 = Input(inp_shape, name='in_s1')
    in_s2 = Input(inp_shape, name='in_s2')
    
    base_model = build_base_model()
    
    out_s1 = base_model(in_s1)
    out_s1 = Layer(name='out_s1')(tf.identity(out_s1))
    out_s2 = base_model(in_s2)
    out_diff = Layer(name='out_diff')(tf.math.subtract(out_s1, out_s2, name='out_diff'))
    
    model = tf.keras.Model(inputs=[in_s1, in_s2], outputs=[out_s1, out_diff], name='ranking')
    
    return model

### Model training

In [6]:
def get_checkpoint_callback(model_path, monitor_value):
    return tf.keras.callbacks.ModelCheckpoint(model_path, 
                                              monitor=monitor_value, verbose=1, 
                                              save_best_only=True, mode='max')

log_dir = os.path.join("logs", datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir, histogram_freq=1)

In [None]:
model = build_base_model()
model.summary()
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=[tf.keras.metrics.BinaryAccuracy(), tf.keras.metrics.Precision(), tf.keras.metrics.Recall()])

checkpoint_callback = get_checkpoint_callback(MODEL_PATH_BASE, 'val_binary_accuracy')

history = model.fit(train_dataset_split,
            epochs=EPOCHS,
            callbacks=[checkpoint_callback, tensorboard_callback],
            validation_data=dev_dataset)

In [None]:
class RankingError(Loss):    
    def call(self, y_true, y_diff):
        pos = tf.constant([1.0 for i in range(BATCH_SIZE)])
        neg = tf.constant([-1.0 for i in range(BATCH_SIZE)])
        sign = tf.where(tf.equal(y_true,1.0), pos, neg)

        return tf.math.maximum(0.0, 1.0 - sign * y_diff)
    
    
model = build_ranking_model()
tf.keras.utils.plot_model(model, show_shapes=True)

model.compile(
    optimizer='adam',
    loss=[
        tf.keras.losses.BinaryCrossentropy(),
        RankingError(),
    ],
    loss_weights=[0.5, 0.5],
    metrics=[tf.keras.metrics.BinaryAccuracy()]
)

checkpoint_callback = get_checkpoint_callback(MODEL_PATH_RANKING, 'val_out_s1_binary_accuracy')

history = model.fit(train_sampling_dataset_split,
            epochs=2,
            callbacks=[checkpoint_callback, tensorboard_callback],
            validation_data=dev_sampling_dataset)

### Evaluation

In [3]:
# load base model
test_model_base = tf.keras.models.load_model(MODEL_PATH_BASE)

#### Sample prediction for base model

In [37]:
# preprocess data
false_statement = "Um die Ermordung unschuldiger Zivilisten in Russland zu üben, sucht die NATO für ihre Manöver russischsprachige Menschen."
tokens = spacy_model(false_statement)
deps = to_deps(tokens, 135)
word_vecs = embed([t.text.lower() for t in tokens], 135)
inp = np.concatenate((word_vecs, deps), axis=1)
print(len(inp[0]))

285


In [38]:
prediction = test_model_base.predict(np.array( [inp,] ))
print(prediction)

[[0.3697008]]


In [7]:
# load ranking model
test_model_ranking = tf.keras.models.load_model(MODEL_PATH_RANKING, compile=False)
test_model_ranking = test_model_ranking.get_layer(name='base')

In [45]:
prediction = test_model_ranking.predict(np.array( [inp,] ))
print(prediction)

[[0.08147946]]


#### MAP

In [6]:
def prediction_func_base(inps):
    return [p[0] for p in test_model_base.predict(inps)]    

eval_data_base = batch_predict(test_dataset, 100, prediction_func_base)
print('Base/MAP: {}'.format(mean_average_precision(eval_data_base)))
for k in [1, 5, 10]:
    print('Base/P@{}: {}'.format(k, precision_at_k(eval_data_base, k)))

Base/MAP: 0.3609851566446917
Base/P@1: 0.24489795918367346
Base/P@5: 0.1734693877551019
Base/P@10: 0.13265306122448964


In [8]:
def prediction_func_ranking(inps):
    return [p[0] for p in test_model_ranking.predict(inps)]   

eval_data_ranking = batch_predict(test_dataset, 100, prediction_func_ranking)
print('Ranking/MAP: {}'.format(mean_average_precision(eval_data_ranking)))
for k in [1, 5, 10]:
    print('Ranking/P@{}: {}'.format(k, precision_at_k(eval_data_ranking, k)))

Ranking/MAP: 0.34886888634919144
Ranking/P@1: 0.22448979591836735
Ranking/P@5: 0.1795918367346937
Ranking/P@10: 0.1397959183673468


### Results
|     | Base | Ranking |
|-----|------|---------|
| MAP |   0.3609851566446917   |  0.34886888634919144      |
| P@1 |   0.24489795918367346   |    0.22448979591836735     |
| P@5 |   0.1734693877551019   |    0.1795918367346937     |
| P@10 |   0.13265306122448964   |    0.1397959183673468     |