**Preprocessing models**:
- Spacy model: https://github.com/explosion/spacy-models/releases/tag/de_core_news_sm-2.3.0
- Word2Vec: Can be trained with the **Word2Vec_10kGNAD** notebook

In [1]:
import os
import sys

# workaround to import local modules from parent directory
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

import datetime
import json
import itertools
from gensim.models import Word2Vec
import numpy as np
import spacy
import tensorflow as tf
from sklearn.metrics.pairwise import cosine_similarity
from transformers import BertTokenizer
from utils import chunks

# --- File locations (relative to the notebook's working directory) ---
# JSON corpus that the preprocessing cells read.
DATA_PATH = '../data/GermanFakeNC.json'
# Targets of the JSON dump of the train / test split.
DATA_PATH_FORMATED_TRAIN = '../data/GermanFakeNC_FORMATED_TRAIN.json'
DATA_PATH_FORMATED_TEST = '../data/GermanFakeNC_FORMATED_TEST.json'
# Path prefix of the TFRecord files; a suffix, a running index and
# '.tfrecords' are appended by the serialization functions.
DATA_PATH_PROCESSED = '../data/GermanFakeNC_PROCESSED'
# Article count used together with DATASET_TRAIN_SPLIT to derive the split.
NUM_ARTICLES = 489
MODEL_PATH_W2V = '../models/w2v.model'
# NOTE(review): MODEL_PATH_SPACY and MODEL_PATH_BERT are not referenced in the
# visible cells - confirm whether they are still needed.
MODEL_PATH_SPACY = '../models/de_core_news_sm-2.3.0'
MODEL_PATH_BERT = '../models/bert-base-german-cased/'
# NOTE(review): SEED is not referenced in the visible cells.
SEED = 12345
# Number of contrastive candidates requested per training sentence.
NUM_SAMPLING_CANDIDATES = 5
# NOTE(review): DATASET_SIZE and DATASET_DEV_SPLIT are not referenced in the
# visible cells; a dev split of 0.8 looks unusual - confirm the intended value.
DATASET_SIZE = 14765
# Fraction of NUM_ARTICLES assigned to the training split.
DATASET_TRAIN_SPLIT = 0.8
DATASET_DEV_SPLIT = 0.8
# Presumably the number of records per output shard - confirm against the
# serialization cells.
CHUNK_SIZE = 2000

# Load preprocessing models
w2v_model = Word2Vec.load(MODEL_PATH_W2V)
# The spaCy pipeline is loaded by package name, not from MODEL_PATH_SPACY.
spacy_model = spacy.load("de_core_news_sm")

## Data preprocessing

In [3]:
def read_data(path):
    """Load and return the JSON document stored at ``path``.

    The file is decoded explicitly as UTF-8 instead of relying on the
    platform's default encoding, so non-ASCII characters (e.g. German
    umlauts) are read identically on every operating system.

    Args:
        path: Location of the JSON file.

    Returns:
        The deserialized JSON content.
    """
    with open(path, encoding='utf-8') as json_file:
        return json.load(json_file)

def count_matches(false_statement, sentence):
    """Count how many tokens of ``false_statement`` also occur in ``sentence``.

    Duplicates are respected: a token occurring twice in the false statement
    only counts twice if the sentence contains it at least twice as well
    (multiset intersection). Neither argument is modified.

    Args:
        false_statement: Sequence of (hashable) tokens of the false statement.
        sentence: Sequence of (hashable) tokens of the candidate sentence.

    Returns:
        The number of matched tokens as an int.
    """
    from collections import Counter

    # `&` keeps the minimum count per token, which is exactly what the
    # "match once, then consume" semantics require - without the quadratic
    # `in` / `remove` scans over a list copy.
    shared = Counter(false_statement) & Counter(sentence)
    return sum(shared.values())

def process_text(sentences, article_id, max_sent_len):
    """Filter sentences and convert the remaining ones into record dicts.

    A sentence is dropped when it has at most one token, when every token is
    punctuation, or when none of its tokens contains a letter.

    Args:
        sentences: Iterable of sentence objects; each must support ``len``,
            iteration over tokens (with ``text`` and ``is_punct``) and ``text``.
        article_id: Identifier stored in every produced record.
        max_sent_len: Longest sentence length (in tokens) seen so far.

    Returns:
        A tuple ``(records, max_sent_len)`` with the produced records and the
        updated maximum sentence length.
    """
    kept = []
    longest = max_sent_len
    for sent in sentences:
        # at most one token
        if len(sent) <= 1:
            continue
        # punctuation only
        if all(tok.is_punct for tok in sent):
            continue
        # no alphabetic character anywhere in the sentence
        if not any(ch.isalpha() for tok in sent for ch in tok.text):
            continue

        longest = max(longest, len(sent))
        words = [tok.text for tok in sent]
        kept.append({
            'article_id': article_id,
            'org': sent.text,
            'lbl': 0.0,
            'tokenized': words,
            'tokenized_lower': [word.lower() for word in words],
        })
    return kept, longest

# Build one record per kept sentence of every article and label the sentences
# that best match the article's annotated false statements.
data = []
max_sent_len = 0
for article_id, article in enumerate(read_data(DATA_PATH)):
    # Title, teaser and body are parsed separately and concatenated in that order.
    article_data = []
    for field in ('Title', 'Teaser', 'Text'):
        sentences = spacy_model(article[field]).sents
        processed, max_sent_len = process_text(sentences, article_id, max_sent_len)
        article_data += processed

    # Label sentences: for each false statement, the sentence(s) sharing the
    # most lower-cased tokens with it are marked as false (ties mark several).
    sentence_tokens = [d['tokenized_lower'] for d in article_data]
    false_statements = [article['False_Statement_1'], article['False_Statement_2'], article['False_Statement_3']]
    for fs in false_statements:
        # Skip missing statements; also skip articles without any kept
        # sentence, for which `max()` below would raise a ValueError.
        if not fs or not sentence_tokens:
            continue

        fs_tokens = [t.text.lower() for t in spacy_model(fs)]
        matches = [count_matches(fs_tokens, tokens) for tokens in sentence_tokens]
        m = max(matches)
        if m == 0:
            # No sentence shares a single token with the statement. Every
            # sentence would tie at zero and be labelled false, so label none.
            continue

        for mi, match_count in enumerate(matches):
            if match_count == m:
                article_data[mi]['lbl'] = 1.0

    # extend in place instead of rebuilding the full list for every article
    data.extend(article_data)

### Labeling tests
#### Options to match fake statements to sentences
* Test if sentence is in fake statement: matched 53.7% of false statements 
* Separate into word tokens and test if some percentage of words is in a false statement
* Label sentence with most matching words as false statement

In [6]:
# Number of non-empty false-statement annotations across all articles.
tf_stats = sum(
    1
    for a in read_data(DATA_PATH)
    for number in ('1', '2', '3')
    if a['False_Statement_' + number] != ''
)

# Number of sentences that received a truthy (non-zero) label.
cf_stats = sum(1 for d in data if d['lbl'])

print("Number of all sentences {}".format(len(data)))
print("True number of false statements {}".format(tf_stats))
# Ties in the labeling step can mark several sentences per statement, so the
# ratio may exceed 100%.
print("Classified number of false statements {} ({:.1f}%)".format(cf_stats, (cf_stats * 100) / tf_stats))

Number of all sentences 14062
True number of false statements 974
Classified number of false statements 1030 (105.7%)


### Dependency Parsing

In [22]:
def to_deps(doc, max_sent_len):
    """Encode the dependency head of every token as a one-hot vector.

    Args:
        doc: Iterable of tokens; each token exposes ``head.i``, the index of
            its syntactic head.
        max_sent_len: Length of each vector and minimum number of vectors
            returned.

    Returns:
        A list of 1-D arrays of length ``max_sent_len``: one per token (with a
        1 at the head's index), followed by all-zero arrays until the list
        holds at least ``max_sent_len`` entries.
    """
    rows = []
    for tok in doc:
        one_hot = np.zeros(max_sent_len)
        one_hot[tok.head.i] = 1
        rows.append(one_hot)

    # pad with zero vectors; a non-positive difference adds nothing
    missing = max_sent_len - len(rows)
    rows.extend(np.zeros(max_sent_len) for _ in range(missing))
    return rows


In [35]:
# Re-parse every sentence from its original text and store the one-hot
# dependency-head encoding as the record's 'processed' features.
for entry in data:
    entry['processed'] = to_deps(spacy_model(entry['org']), max_sent_len)

### Word Embedding

In [23]:
def embed(sentence, max_sent_len):
    """Map every word of ``sentence`` to its Word2Vec vector.

    Words missing from the Word2Vec vocabulary are represented by a zero
    vector. The result is padded with zero vectors until it holds at least
    ``max_sent_len`` entries; longer sentences are not truncated.

    Args:
        sentence: Iterable of words to look up.
        max_sent_len: Minimum number of vectors in the result.

    Returns:
        A list of 1-D vectors, one per word plus padding.
    """
    keyed_vectors = w2v_model.wv
    dim = keyed_vectors.vector_size

    vectors = [
        keyed_vectors[token] if token in keyed_vectors else np.zeros(dim)
        for token in sentence
    ]

    # pad with zero vectors up to the maximum sentence length
    missing = max_sent_len - len(vectors)
    if missing > 0:
        vectors.extend(np.zeros(dim) for _ in range(missing))

    return vectors


In [37]:
# Prepend the word embeddings to the features already stored under
# 'processed', joining both row by row (axis=1).
# NOTE: this overwrites 'processed' in place, so running the cell twice
# concatenates the embeddings a second time.
for entry in data:
    word_vectors = embed(entry['tokenized_lower'], max_sent_len)
    existing_features = entry['processed']
    entry['processed'] = np.concatenate((word_vectors, existing_features), axis=1)

### Separating data

In [4]:
# Data is separated by article (not by sentence) because of the MAP evaluation later.
# Article ids are 0-based, so the first `num_train_articles` articles are
# exactly those with an id strictly below `num_train_articles`; comparing with
# `<=` would put one article too many into the training split.
num_train_articles = int(DATASET_TRAIN_SPLIT * NUM_ARTICLES)
train_data = [d for d in data if d['article_id'] < num_train_articles]
test_data = [d for d in data if d['article_id'] >= num_train_articles]

### Serialization of formatted data 

In [5]:
# Write the train and the test split to their JSON files.
# NOTE(review): json.dump cannot serialize NumPy arrays, so this cell
# presumably has to run before the 'processed' features are attached to the
# records - confirm the intended execution order.
for out_path, split in (
    (DATA_PATH_FORMATED_TRAIN, train_data),
    (DATA_PATH_FORMATED_TEST, test_data),
):
    with open(out_path, 'w') as fout:
        json.dump(split, fout)

### Contrastive Sampling

In [42]:
def compute_sentence_embeddings(data):
    """Yield one sentence embedding per record in ``data``.

    For every record, the first ``vector_size`` entries (the Word2Vec vector
    size) of each row in ``record['processed']`` are taken and averaged over
    all rows, padding rows included.

    Args:
        data: Iterable of records with a 'processed' entry.

    Yields:
        The column-wise mean of the sliced rows of each record.
    """
    dim = w2v_model.wv.vector_size
    for entry in data:
        word_part = [row[:dim] for row in entry['processed']]
        yield np.mean(word_part, axis=0)
        
def retrieve_topk_ixs(entry_index, data, k, sims):
    """Return the indices of the ``k`` most similar entries with a different label.

    Candidates are all entries other than ``entry_index`` whose 'lbl' differs
    from the label of ``data[entry_index]``. They are ranked by similarity in
    descending order; ties keep the lower index first.

    The previous implementation sorted its (index, similarity) tuples by
    index, so it evicted the wrong element once the buffer was full, and its
    (0, 0) placeholder could leak index 0 into the result.

    Args:
        entry_index: Index of the entry for which candidates are selected.
        data: Sequence of records with an 'lbl' entry, aligned with ``sims``.
        k: Maximum number of indices to return.
        sims: Similarities between the entry and every record in ``data``.

    Returns:
        A list of at most ``k`` indices, most similar first. It is shorter
        than ``k`` when fewer candidates exist.
    """
    import heapq

    own_label = data[entry_index]['lbl']
    candidates = (
        (i, sim)
        for i, sim in enumerate(sims)
        if i != entry_index and data[i]['lbl'] != own_label
    )
    # Rank by similarity only; nlargest behaves like a stable descending sort.
    top_k = heapq.nlargest(k, candidates, key=lambda pair: pair[1])
    return [i for i, _ in top_k]

# Only the training data is used;
# no negative sampling is necessary for the test data.
sentence_embeddings = list(compute_sentence_embeddings(train_data))

# Pairwise cosine similarity between all training sentences.
similarities = cosine_similarity(sentence_embeddings, sentence_embeddings)

k = NUM_SAMPLING_CANDIDATES
processed_topk_candidates = []
for i, row_sims in enumerate(similarities):
    # The similarity rows were computed from `train_data`, so labels must be
    # looked up in `train_data` too (not in the full `data` list) to keep the
    # indices aligned.
    top_k_ixs = retrieve_topk_ixs(i, train_data, k, row_sims)
    processed_topk_candidates.append(
        [train_data[top_k_ix]['processed'] for top_k_ix in top_k_ixs]
    )
    

def assign_candidate(d, ptc):
    """Return a shallow copy of ``d`` with ``ptc`` stored under the key 'cs'.

    The input record itself is left unchanged.
    """
    return dict(d, cs=ptc)
    
def flatten(lst):
    """Concatenate the items of all sub-iterables of ``lst`` into one list."""
    return [item for sub in lst for item in sub]


# Expand every training record into one copy per contrastive candidate and
# flatten the result into a single list.
# NOTE: `train_data` is rebound here, so re-running this cell expands the
# already expanded list again.
train_data = flatten(
    [assign_candidate(entry, candidate) for candidate in candidates]
    for entry, candidates in zip(train_data, processed_topk_candidates)
)

## Serialization

### Serialization Hansen

In [43]:
def _float_feature(values):
    """Wrap an iterable of floats in a ``tf.train.Feature``."""
    return tf.train.Feature(float_list=tf.train.FloatList(value=values))


def _write_tfrecord_shards(sdata, chunk_size, file_suffix, include_candidates):
    """Write ``sdata`` as TFRecord shards; shared by both serialize functions.

    Shard ``i`` is written to
    ``DATA_PATH_PROCESSED + '_<file_suffix>_<i>.tfrecords'``.
    """
    for shard_ix, shard in enumerate(chunks(list(sdata), chunk_size)):
        shard_path = DATA_PATH_PROCESSED + '_{}_{}'.format(file_suffix, shard_ix) + '.tfrecords'
        # The context manager closes the writer even if a record fails to
        # serialize, so no file handle is leaked.
        with tf.io.TFRecordWriter(shard_path) as writer:
            for record in shard:
                feature = {
                    'article_id': tf.train.Feature(int64_list=tf.train.Int64List(value=[record['article_id']])),
                    'x': _float_feature(np.stack(record['processed']).flatten()),
                    'y': _float_feature([record['lbl']]),
                }
                if include_candidates:
                    feature['cs'] = _float_feature(np.stack(record['cs']).flatten())

                example = tf.train.Example(features=tf.train.Features(feature=feature))
                writer.write(example.SerializeToString())


def serialize_wsampling(sdata, chunk_size, file_suffix):
    """Serialize records including their contrastive sample to TFRecord files.

    Each example holds 'article_id' (int64), 'x' (flattened 'processed'
    features), 'y' (the label) and 'cs' (flattened contrastive sample).

    Args:
        sdata: Records with 'article_id', 'processed', 'lbl' and 'cs' entries.
        chunk_size: Chunk size handed to ``chunks``; one file is written per
            chunk.
        file_suffix: Name part inserted into every output file name.
    """
    _write_tfrecord_shards(sdata, chunk_size, file_suffix, include_candidates=True)


def serialize(sdata, chunk_size, file_suffix):
    """Serialize records to TFRecord files.

    Each example holds 'article_id' (int64), 'x' (flattened 'processed'
    features) and 'y' (the label).

    Args:
        sdata: Records with 'article_id', 'processed' and 'lbl' entries.
        chunk_size: Chunk size handed to ``chunks``; one file is written per
            chunk.
        file_suffix: Name part inserted into every output file name.
    """
    _write_tfrecord_shards(sdata, chunk_size, file_suffix, include_candidates=False)

#### Serialize Base Model Data

In [44]:
# Use the CHUNK_SIZE constant defined with the other settings; the lowercase
# `chunk_size` is not defined anywhere in this notebook.
serialize_wsampling(train_data, CHUNK_SIZE, 'TRAIN_SAMPLING')
serialize(test_data, CHUNK_SIZE, 'TEST')

#### Serialize Ranking Model Data

In [41]:
# Use the CHUNK_SIZE constant defined with the other settings; the lowercase
# `chunk_size` is not defined anywhere in this notebook.
# NOTE(review): if the contrastive-sampling cell ran before this one,
# `train_data` holds one record per (sentence, candidate) pair - confirm that
# this is the intended input for the ranking model.
serialize(train_data, CHUNK_SIZE, 'TRAIN')
serialize(test_data, CHUNK_SIZE, 'TEST')