**Preprocessing models**:
- spaCy model: https://github.com/explosion/spacy-models/releases/tag/de_core_news_sm-2.3.0
- Word2Vec: Can be trained with the **Word2Vec_10kGNAD** notebook

In [31]:
import os
import datetime
import json
import itertools
from gensim.models import Word2Vec
import numpy as np
import spacy
from tensorflow.keras import Input
from tensorflow.keras import backend as K, initializers, regularizers, constraints
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Layer, Dropout, LSTM, Dense, InputLayer
import tensorflow as tf
from sklearn.model_selection import KFold

# Print the TensorFlow version the notebook runs with.
print('Tensorflow Version: {}'.format(tf.__version__))

# Raw GermanFakeNC articles (JSON), read by read_data().
DATA_PATH = '../data/GermanFakeNC.json'
# Path prefix of the TFRecord shards ('<prefix>_<i>.tfrecords') written after preprocessing.
DATA_PATH_PROCESSED = '../data/GermanFakeNC_PROCESSED'
# Article count; 80% of it is used as the article-id threshold of the train/test split.
NUM_ARTICLES = 489
MODEL_NAME = "CLEF_2019_HANSEN"
# Location the ModelCheckpoint callbacks write to and the evaluation loads from.
MODEL_PATH_MAIN = '../models/' + MODEL_NAME
MODEL_PATH_W2V = '../models/w2v.model'
MODEL_PATH_SPACY = '../models/de_core_news_sm-2.3.0'
# NOTE(review): SEED is not referenced by any of the visible cells - confirm it is still needed.
SEED = 12345
# Number of units of the LSTM layer in build_model().
LSTM_HIDDEN_UNITS = 100
# Training epochs passed to model.fit().
EPOCHS = 10
# Number of folds handed to sklearn's KFold in the cross-validation cell.
CROSS_VALIDATION_K_FOLDS = 19
# NOTE(review): not referenced by the visible cells; presumably the total number of
# sentences produced by the preprocessing - confirm.
DATASET_SIZE = 14765
# Batch size of the train and test tf.data pipelines.
BATCH_SIZE = 120

# Load preprocessing models
w2v_model = Word2Vec.load(MODEL_PATH_W2V)
# NOTE(review): "vocab" does not look like a pipeline component name - confirm that
# disabling it has the intended effect.
spacy_model = spacy.load(MODEL_PATH_SPACY, disable=["vocab"])

# Load the TensorBoard notebook extension
%load_ext tensorboard

Tensorflow Version: 2.1.0
The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


## Data preprocessing

In [94]:
def read_data(path):
    """Load and return the JSON document stored at *path*.

    The file is decoded as UTF-8 explicitly so that non-ASCII text (e.g.
    German umlauts) is read identically on every platform instead of
    depending on the locale's default encoding.

    Args:
        path: Location of the JSON file.

    Returns:
        The deserialized JSON content.
    """
    with open(path, encoding='utf-8') as json_file:
        return json.load(json_file)

def count_matches(false_statement, sentence):
    """Count the tokens that *false_statement* and *sentence* have in common.

    Duplicates are respected: a token that occurs k times in the false
    statement contributes at most as many matches as it occurs in the
    sentence (i.e. the size of the multiset intersection). Neither argument
    is modified.

    Args:
        false_statement: Iterable of (hashable) tokens of the false statement.
        sentence: Iterable of (hashable) tokens of the candidate sentence.

    Returns:
        Number of matching tokens as an int.
    """
    from collections import Counter

    # Counter intersection keeps the minimum count per token, which is exactly
    # what the former "find and remove from a copy" loop computed, but in
    # linear instead of quadratic time.
    overlap = Counter(false_statement) & Counter(sentence)
    return sum(overlap.values())


# Build one record per sentence of every article and label the sentences that
# best match one of the article's false statements.
data = []
max_sent_len = 0
for article_id, article in enumerate(read_data(DATA_PATH)):
    # Concatenate article text
    # NOTE(review): the three fields are joined without a separator, so the last
    # word of one field runs into the first word of the next - confirm this is intended.
    text = article['Title'] + article['Teaser'] + article['Text']

    article_data = []
    for s in spacy_model(text).sents:
        # Track the longest sentence (in tokens); it defines the padding length later on.
        max_sent_len = max(max_sent_len, len(s))
        article_data.append({
            'article_id': article_id,
            'org': s.text,
            'lbl': True,
            'tokenized': [t.text for t in s],
            'tokenized_lower': [t.text.lower() for t in s]
        })

    # Label sentences
    # The sentences matching the most tokens with a false statement will be labeled as False
    false_statements = [article['False_Statement_1'], article['False_Statement_2'], article['False_Statement_3']]
    tokenized_sentences = [d['tokenized'] for d in article_data]
    for fs in false_statements:
        if fs == '':
            continue

        fs_words = [t.text for t in spacy_model(fs)]
        matches = [count_matches(fs_words, s) for s in tokenized_sentences]
        # default=0 covers an article without any sentence.
        m = max(matches, default=0)
        if m == 0:
            # No sentence shares a single token with the false statement. Without
            # this guard every sentence would tie at 0 and be labeled as False.
            continue
        max_indexes = [i for i, j in enumerate(matches) if j == m]

        # +++++++ DEBUG CODE - START +++++++++ #
        if article_id == 458:
            print("False Statement: {} \n\n".format(fs))
            for mi in max_indexes:
                print(article_data[mi]['org'])
        # +++++++ DEBUG CODE - END +++++++++ #

        for mi in max_indexes:
            article_data[mi]['lbl'] = False

    # Extend in place instead of rebuilding the whole list for every article.
    data.extend(article_data)

False Statement: So übel wird gegen europäische & deutsche Frauen gehetzt!
 


– So übel wird gegen europäische & deutsche Frauen gehetzt!
– So übel wird gegen europäische & deutsche Frauen gehetzt!
„Weißer, christlicher Abschaum!“ – So übel wird gegen europäische & deutsche Frauen gehetzt!
False Statement: TÖCHTER DER EUROPÄER SOLLEN IHRE „SEXUELLE PFLICHT“ MIT MIGRANTEN ERFÜLLEN+++DER „SEXUELLE DSCHIHAD“ WIRD PROPAGIERT+++VERUNGLIMPFUNG VON BEHINDERTEN+++
 


3)TÖCHTER DER EUROPÄER SOLLEN IHRE
„SEXUELLE PFLICHT“
„SEXUELLE DSCHIHAD“
Christinnen werden als „weißer, christlicher Abschaum“ oder als „christliche Schlampen“ bezeichnet.
Hier: Vor allem die „Töchter“ der (weißen) Europäer geraten  in den Fokus,  die ihre „sexuelle Pflicht“ mit den Migranten erfüllen sollen.
Hier: Denn die Männer (Migranten) aus der „Dritten Welt“ sollen nach Europa kommen, um europäische Frauen zu „beglücken“.
Hier: Die scheinbar „arische Rasse „soll nicht von Waffengeräuschen erobert werden, sondern durch d

### Labeling tests
#### Options to match fake statements to sentences
* Test whether the sentence is contained in the false statement: matched 53.7% of false statements
* Separate into word tokens and test whether some percentage of the words occurs in a false statement
* Label the sentence with the most matching words as the false statement

In [3]:
# Number of non-empty false statements annotated in the raw articles.
tf_stats = sum(
    1
    for a in read_data(DATA_PATH)
    for number in ('1', '2', '3')
    if a['False_Statement_' + number] != ''
)

# Number of sentences the matching step labeled as false.
cf_stats = sum(1 for d in data if not d['lbl'])

print("Number of all sentences {}".format(len(data)))
print("True number of false statements {}".format(tf_stats))
print("Classified number of false statements {} ({:.1f}%)".format(cf_stats, (cf_stats * 100) / tf_stats))

Number of all sentences 14765
True number of false statements 974
Classified number of false statements 1022 (104.9%)


### Dependency Parsing

In [3]:
def to_deps(doc, max_sent_len):
    """Encode the dependency heads of *doc* as one-hot rows of fixed size.

    Row ``i`` belongs to the i-th token and carries a single 1 in the column
    given by ``token.head.i``; rows beyond the last token stay all-zero
    (padding). Tokens beyond ``max_sent_len`` are dropped and heads that point
    outside the ``max_sent_len`` columns are left unset, so the result always
    has the same shape instead of raising ``IndexError`` or growing past the
    padding length.

    NOTE(review): ``token.head.i`` is used directly as the column index, which
    assumes *doc* is a standalone parse whose token indices start at 0 - confirm.

    Args:
        doc: Iterable of tokens exposing ``token.head.i``.
        max_sent_len: Number of rows and columns of the encoding.

    Returns:
        A list of ``max_sent_len`` one-dimensional arrays of length
        ``max_sent_len``.
    """
    matrix = np.zeros((max_sent_len, max_sent_len))
    # zip stops at the shorter input: extra tokens are truncated, missing
    # tokens leave the remaining rows as zero padding.
    for row, token in zip(matrix, doc):
        head_index = token.head.i
        if 0 <= head_index < max_sent_len:
            row[head_index] = 1
    return list(matrix)


In [4]:
# Re-parse every sentence on its own and store its dependency-head encoding.
for record in data:
    parsed_sentence = spacy_model(record['org'])
    record['processed'] = to_deps(parsed_sentence, max_sent_len)

### Word Embedding

In [5]:
def embed(sentence, max_sent_len):
    """Map the words of *sentence* to word2vec vectors, padded to a fixed length.

    Words unknown to ``w2v_model`` are represented by a zero vector. Sentences
    shorter than ``max_sent_len`` are padded with zero vectors; longer ones are
    truncated, so the result always holds exactly ``max_sent_len`` vectors
    (previously an over-long sentence produced more rows than the padding length).

    Args:
        sentence: Iterable of word strings.
        max_sent_len: Number of vectors to return.

    Returns:
        A list of ``max_sent_len`` vectors of size ``w2v_model.wv.vector_size``.
    """
    keyed_vectors = w2v_model.wv
    vector_dim = keyed_vectors.vector_size

    vectorized_sentence = [
        keyed_vectors[word] if word in keyed_vectors else np.zeros(vector_dim)
        for word in itertools.islice(sentence, max_sent_len)
    ]

    # padding with 0 vectors to max sentence length
    missing = max_sent_len - len(vectorized_sentence)
    vectorized_sentence.extend(np.zeros(vector_dim) for _ in range(missing))

    return vectorized_sentence


In [6]:
# Put the word embeddings in front of the dependency encoding, column-wise,
# so each sentence ends up as a single feature matrix.
for record in data:
    word_vectors = embed(record['tokenized_lower'], max_sent_len)
    record['processed'] = np.concatenate((word_vectors, record['processed']), axis=1)

In [26]:
# labels are 0 for true and 1 for false statements
# ('lbl' is a bool, so plain truthiness replaces the `== True` comparison)
y = [0.0 if d['lbl'] else 1.0 for d in data]

def chunks(lst, n):
    """Lazily split *lst* into consecutive slices holding at most *n* items each."""
    yield from (lst[start:start + n] for start in range(0, len(lst), n))

# Serialize the preprocessed sentences into TFRecord shards of `chunk_size` examples each.
chunk_size = 2000
id_chunks = chunks([d['article_id'] for d in data], chunk_size)
X_chunks = chunks([d['processed'] for d in data], chunk_size)
y_chunks = chunks(y, chunk_size)

for (i, (id_chunk, X_chunk, y_chunk)) in enumerate(zip(id_chunks, X_chunks, y_chunks)):
    shard_path = DATA_PATH_PROCESSED + '_{}'.format(i) + '.tfrecords'
    # The context manager closes the shard even when serialization fails half-way,
    # so no writer is left open.
    with tf.io.TFRecordWriter(shard_path) as writer:
        for (idc, xc, yc) in zip(id_chunk, X_chunk, y_chunk):
            # Convert to TFRecords and save to file
            feature = {
                'article_id': tf.train.Feature(int64_list=tf.train.Int64List(value=[idc])),
                # The feature matrix is stored flattened; the parser restores its shape.
                'x': tf.train.Feature(float_list=tf.train.FloatList(value=np.stack(xc).flatten())),
                'y': tf.train.Feature(float_list=tf.train.FloatList(value=[yc]))
            }
            example = tf.train.Example(features=tf.train.Features(feature=feature))
            writer.write(example.SerializeToString())

# Model Definition and Training

In [61]:
def input_parser(example):
    """Decode one serialized example into an ``(article_id, x, y)`` tuple.

    The schema mirrors the features written to the TFRecord shards: an int64
    article id, the float32 feature matrix reshaped to 135 x 285, and the
    float32 label.
    """
    schema = {
        'article_id': tf.io.FixedLenFeature([1], dtype=tf.int64),
        'x': tf.io.FixedLenFeature([135, 285], dtype=tf.float32),
        'y': tf.io.FixedLenFeature([1], dtype=tf.float32),
    }
    record = tf.io.parse_single_example(example, schema)

    article_id = record['article_id']
    features = record['x']
    label = record['y']
    return (article_id, features, label)

# Read all TFRecord shards back and decode them into (article_id, x, y) examples.
shard_pattern = DATA_PATH_PROCESSED + '_*.tfrecords'
data_files = tf.data.Dataset.list_files(shard_pattern)
raw_dataset = tf.data.TFRecordDataset(data_files)
dataset = raw_dataset.map(input_parser)

# Article-id threshold of the split: 80% of the articles.
num_train_articles = tf.constant(int(0.8 * NUM_ARTICLES), dtype=tf.int64)


def rem_article_id(ida, x, y):
    """Drop the article id and keep the (features, label) pair."""
    return (x, y)


def is_train_example(ida, x, y):
    """True for examples whose article id is at most the threshold."""
    # NOTE(review): the comparison includes the threshold id itself - confirm that
    # this matches the intended 80% split.
    return tf.squeeze(tf.math.less_equal(ida, num_train_articles))


def is_no_train_example(ida, x, y):
    """True for examples whose article id lies above the threshold."""
    return tf.squeeze(tf.math.greater(ida, num_train_articles))


# model examples do not contain article_id
train_dataset = (
    dataset.filter(is_train_example)
    .map(rem_article_id)
    .shuffle(1000)
    .batch(BATCH_SIZE)
)
test_dataset = (
    dataset.filter(is_no_train_example)
    .map(rem_article_id)
    .batch(BATCH_SIZE)
)

# the eval examples keep the article_id, which is needed to determine MAP per article
test_dataset_eval = dataset.filter(is_no_train_example)

Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module 'gast' has no attribute 'Num'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module 'gast' has no attribute 'Num'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module 'gast' has no attribute 'Str'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module 'gast' has no attribute 'Str'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module 'gast' has

### Model without ranking loss

In [22]:
# SOURCE: https://gist.github.com/cbaziotis/6428df359af27d58078ca5ed9792bd6d

def dot_product(x, kernel):
    """
    Backend-aware dot product of an input with a weight vector, usable with
    both Theano and Tensorflow.
    Args:
        x (): input
        kernel (): weights
    Returns:
        The result of the backend dot product; on Tensorflow the kernel is
        expanded by one trailing axis first and that axis is squeezed out of
        the result again.
    """
    if K.backend() != 'tensorflow':
        return K.dot(x, kernel)

    # todo: check that this is correct
    kernel_column = K.expand_dims(kernel)
    projected = K.dot(x, kernel_column)
    return K.squeeze(projected, axis=-1)


class Attention(Layer):
    """Attention pooling over the step axis of a 3D sequence tensor."""

    def __init__(self,
                 W_regularizer=None, b_regularizer=None,
                 W_constraint=None, b_constraint=None,
                 bias=True,
                 return_attention=False,
                 **kwargs):
        """
        Keras Layer that implements an Attention mechanism for temporal data.
        Supports Masking.
        Follows the work of Raffel et al. [https://arxiv.org/abs/1512.08756]
        # Input shape
            3D tensor with shape: `(samples, steps, features)`.
        # Output shape
            2D tensor with shape: `(samples, features)`.
        # Arguments
            W_regularizer: regularizer (or its identifier) for the weight vector.
            b_regularizer: regularizer (or its identifier) for the bias vector.
            W_constraint: constraint (or its identifier) for the weight vector.
            b_constraint: constraint (or its identifier) for the bias vector.
            bias: whether a per-step bias is added to the scores.
            return_attention: if True, `call` returns `[output, attention_weights]`.
            kwargs: forwarded to the base `Layer` constructor.
        Just put it on top of an RNN Layer (GRU/LSTM/SimpleRNN) with return_sequences=True.
        The dimensions are inferred based on the output shape of the RNN.
        Note: The layer has been tested with Keras 1.x
        Example:

            # 1
            model.add(LSTM(64, return_sequences=True))
            model.add(Attention())
            # next add a Dense layer (for classification/regression) or whatever...
            # 2 - Get the attention scores
            hidden = LSTM(64, return_sequences=True)(words)
            sentence, word_scores = Attention(return_attention=True)(hidden)
        """
        # Initialise the base layer first; attributes are assigned afterwards so
        # that the base constructor cannot overwrite them (notably
        # `supports_masking`, for which the base class may set its own default).
        super(Attention, self).__init__(**kwargs)

        self.supports_masking = True
        self.return_attention = return_attention
        self.init = initializers.get('glorot_uniform')

        self.W_regularizer = regularizers.get(W_regularizer)
        self.b_regularizer = regularizers.get(b_regularizer)

        self.W_constraint = constraints.get(W_constraint)
        self.b_constraint = constraints.get(b_constraint)

        self.bias = bias

    def get_config(self):
        """Return the constructor arguments in serializable form.

        Only keys accepted by `__init__` are emitted, and regularizers and
        constraints are serialized, so that `from_config` can rebuild the layer.
        """
        config = super().get_config().copy()
        config.update({
            'W_regularizer': regularizers.serialize(self.W_regularizer),
            'b_regularizer': regularizers.serialize(self.b_regularizer),
            'W_constraint': constraints.serialize(self.W_constraint),
            'b_constraint': constraints.serialize(self.b_constraint),
            'bias': self.bias,
            'return_attention': self.return_attention,
        })
        return config

    def build(self, input_shape):
        """Create the weight vector (one entry per feature) and the optional
        bias vector (one entry per step)."""
        assert len(input_shape) == 3

        self.W = self.add_weight(shape=(input_shape[-1],),
                                 initializer=self.init,
                                 name='{}_W'.format(self.name),
                                 regularizer=self.W_regularizer,
                                 constraint=self.W_constraint)
        if self.bias:
            self.b = self.add_weight(shape=(input_shape[1],),
                                     initializer='zero',
                                     name='{}_b'.format(self.name),
                                     regularizer=self.b_regularizer,
                                     constraint=self.b_constraint)
        else:
            self.b = None

        self.built = True

    def compute_mask(self, input, input_mask=None):
        # do not pass the mask to the next layers
        return None

    def call(self, x, mask=None):
        """Return the attention-weighted sum of `x` over the step axis
        (plus the attention weights when `return_attention` is set)."""
        # One unnormalized score per step.
        eij = dot_product(x, self.W)

        if self.bias:
            eij += self.b

        eij = K.tanh(eij)

        a = K.exp(eij)

        # apply mask after the exp. will be re-normalized next
        if mask is not None:
            # Cast the mask to floatX to avoid float64 upcasting in theano
            a *= K.cast(mask, K.floatx())

        # in some cases especially in the early stages of training the sum may be almost zero
        # and this results in NaN's. A workaround is to add a very small positive number ε to the sum.
        a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx())

        # Weight every step by its attention value and sum the steps up.
        weighted_input = x * K.expand_dims(a)

        result = K.sum(weighted_input, axis=1)

        if self.return_attention:
            return [result, a]
        return result

    def compute_output_shape(self, input_shape):
        """Shape of the pooled output, plus the shape of the attention
        weights when `return_attention` is set."""
        if self.return_attention:
            return [(input_shape[0], input_shape[-1]),
                    (input_shape[0], input_shape[1])]
        else:
            return input_shape[0], input_shape[-1]

In [23]:
def build_model(inp_shape=(135, 285)):
    """Build the (uncompiled) LSTM + attention sentence classifier.

    Args:
        inp_shape: Shape of one input example without the batch axis. The
            default matches the feature matrices stored in the TFRecord shards.

    Returns:
        A ``Sequential`` model: LSTM (returning sequences) -> Attention ->
        Dropout(0.3) -> Dense(1, sigmoid).
    """
    model = Sequential(name='simple')
    # return_sequences=True keeps one output per step, which the attention layer pools.
    model.add(LSTM(LSTM_HIDDEN_UNITS, input_shape=inp_shape, return_sequences=True, name='lstm'))
    model.add(Attention(name='attention'))
    model.add(Dropout(0.3))
    model.add(Dense(1, activation='sigmoid', name='dense'))
    return model

### Model training

In [None]:
# KFold needs indexable, in-memory arrays; a tf.data.Dataset cannot be split by index.
# The training split is therefore materialized once (unbatched) before the folds are built.
# NOTE(review): this holds the whole training split in memory - confirm this is acceptable.
train_examples = list(dataset.filter(is_train_example).map(rem_article_id).as_numpy_iterator())
X_train = np.array([x for x, _ in train_examples])
y_train = np.array([y for _, y in train_examples])
del train_examples  # release the duplicate list before training starts

for (fold, (train_index, val_index)) in enumerate(KFold(CROSS_VALIDATION_K_FOLDS).split(X_train)):
    X_train_fold, X_val = X_train[train_index], X_train[val_index]
    y_train_fold, y_val = y_train[train_index], y_train[val_index]

    model = build_model()
    model.summary()
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['binary_accuracy'])

    # The monitored name has to match the compiled metric ('binary_accuracy');
    # with 'val_accuracy' the callback never finds a value to compare.
    checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(MODEL_PATH_MAIN + "_FOLD_{}".format(fold),
                                                             monitor='val_binary_accuracy', verbose=1,
                                                             save_best_only=True, mode='max')

    logdir = os.path.join("logs", datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))
    tensorboard_callback = tf.keras.callbacks.TensorBoard(logdir, histogram_freq=1)

    history = model.fit(x=X_train_fold, y=y_train_fold,
                        epochs=EPOCHS,
                        callbacks=[checkpoint_callback, tensorboard_callback],
                        validation_data=(X_val, y_val))

    # Reset the Keras state so that every fold starts from a fresh graph.
    K.clear_session()

In [None]:
# Train one model on the article-level train split and validate on the held-out articles.
model = build_model()
model.summary()

training_metrics = ['accuracy', tf.keras.metrics.Precision(), tf.keras.metrics.Recall()]
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=training_metrics)

# Keep only the weights with the best validation accuracy.
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    MODEL_PATH_MAIN,
    monitor='val_accuracy',
    verbose=1,
    save_best_only=True,
    mode='max',
)

# One TensorBoard log directory per run, named after the start time.
run_stamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
log_dir = os.path.join("logs", run_stamp)
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir, histogram_freq=1)

training_callbacks = [checkpoint_callback, tensorboard_callback]
history = model.fit(
    train_dataset,
    epochs=EPOCHS,
    callbacks=training_callbacks,
    validation_data=test_dataset,
)

### Evaluation

In [99]:
# load model
# Restores the model stored at MODEL_PATH_MAIN, the path the ModelCheckpoint
# callback of the single-split training run writes to.
# NOTE(review): no custom_objects are passed for the custom Attention layer -
# confirm that the saved format can be restored without them.
test_model = tf.keras.models.load_model(MODEL_PATH_MAIN)

#### Sample

In [55]:
# preprocess data
# A single statement is turned into the same feature layout as the training
# examples: word embeddings followed by the dependency encoding, padded to 135 steps.
false_statement = "Um die Ermordung unschuldiger Zivilisten in Russland zu üben, sucht die NATO für ihre Manöver russischsprachige Menschen."
tokens = spacy_model(false_statement)

lowercased_words = [t.text.lower() for t in tokens]
word_vecs = embed(lowercased_words, 135)
deps = to_deps(tokens, 135)

inp = np.concatenate((word_vecs, deps), axis=1)

In [56]:
# The model expects a batch, so the single example gets a leading batch axis.
sample_batch = np.expand_dims(inp, axis=0)
prediction = test_model.predict(sample_batch)
print(prediction)

[[0.53837883]]


#### MAP

In [97]:
# Pull the held-out examples into memory and unwrap the length-1 id and label
# arrays into scalars.
eval_data = [
    (ida[0], x, y[0])
    for ida, x, y in test_dataset_eval.as_numpy_iterator()
]

In [132]:
# MAP metric is based on the official CLEF2019 implementation: 
# https://github.com/apepa/clef2019-factchecking-task1/blob/7d463336897ad1f870cb6a481953b94550c788a7/scorer/main.py#L52

def mean_average_precision(data, model=None):
    """Compute the mean average precision (MAP) over articles.

    For every article the examples are ranked by the model's predicted score
    (highest first). The average precision of an article is the mean of the
    precision values at the ranks of its positive examples (label == 1);
    articles without a positive example contribute 0. MAP is the mean of the
    per-article values.

    Args:
        data: Iterable of ``(article_id, x, y)`` tuples.
        model: Object with a ``predict`` method that returns one score row per
            example. Defaults to the module-level ``test_model``.

    Returns:
        The MAP as a float; 0.0 when *data* is empty.
    """
    if model is None:
        model = test_model

    # Group the examples per article in a single pass instead of re-scanning
    # the whole data set once for every article.
    examples_by_article = {}
    for ida, x, y in data:
        xs, ys = examples_by_article.setdefault(ida, ([], []))
        xs.append(x)
        ys.append(y)

    if not examples_by_article:
        return 0.0

    avg_precisions = []
    for xs, ys in examples_by_article.values():
        num_positive = sum(ys)

        predictions = [p[0] for p in model.predict(np.array(xs))]
        # Indices of the examples ordered by descending score; ties keep their
        # original order because the sort is stable.
        ranked_indices = sorted(range(len(predictions)),
                                key=predictions.__getitem__, reverse=True)

        precisions = []
        num_correct = 0
        for rank, example_index in enumerate(ranked_indices, start=1):
            if ys[example_index] == 1:
                num_correct += 1
                precisions.append(num_correct / rank)

        if precisions:
            avg_precisions.append(sum(precisions) / num_positive)
        else:
            avg_precisions.append(0)

    return sum(avg_precisions) / len(examples_by_article)
    
# Report the MAP of the loaded model on the held-out articles.
map_score = mean_average_precision(eval_data)
print('MAP: {}'.format(map_score))

MAP: 0.6367217658657178
