**Preprocessing models**:
- Spacy model: https://github.com/explosion/spacy-models/releases/tag/de_core_news_sm-2.3.0
- Word2Vec: Can be trained with the **Word2Vec_10kGNAD** notebook

In [1]:
import os
import sys

# workaround to import local modules from parent directory
module_path = os.path.abspath(os.path.join('../src'))
if module_path not in sys.path:
    sys.path.append(module_path)

import datetime
import json
import itertools
from gensim.models import Word2Vec
import numpy as np
import spacy
import tensorflow as tf
from transformers import BertTokenizer
import utils
from utils import read_json_data, write_json_data, write_tfrecords
from preprocessing import *

# Raw dataset and the JSON files the formatted train/test splits are written to.
DATA_PATH = '../data/GermanFakeNC.json'
DATA_PATH_FORMATED_TRAIN = '../data/GermanFakeNC_FORMATED_TRAIN.json'
DATA_PATH_FORMATED_TEST = '../data/GermanFakeNC_FORMATED_TEST.json'
# Passed to every write_tfrecords call as the output location.
DATA_PATH_PROCESSED = '../data/GermanFakeNC_PROCESSED'
# Locations of the preprocessing models.
MODEL_PATH_W2V = '../models/w2v.model'
# NOTE(review): MODEL_PATH_SPACY is not referenced in the visible cells; the
# spaCy model below is loaded by package name instead — confirm which is intended.
MODEL_PATH_SPACY = '../models/de_core_news_sm-2.3.0'
MODEL_PATH_BERT = '../models/bert-base-german-cased/'
# NOTE(review): SEED, DATASET_SIZE and DATASET_DEV_SPLIT are not used in the
# visible cells — confirm whether they are still needed.
SEED = 12345
# Candidate count handed to contrastive_sampling.
NUM_SAMPLING_CANDIDATES = 5
DATASET_SIZE = 14765
# Split parameter handed to split_dataset.
DATASET_TRAIN_SPLIT = 0.8
DATASET_DEV_SPLIT = 0.8
# Chunk size handed to every write_tfrecords call.
CHUNK_SIZE = 2000

# Load preprocessing models
w2v_model = Word2Vec.load(MODEL_PATH_W2V)
# Loaded by installed package name, not from MODEL_PATH_SPACY.
spacy_model = spacy.load("de_core_news_sm")
bert_tokenizer = BertTokenizer.from_pretrained(MODEL_PATH_BERT)

## Data preprocessing

In [2]:
# Read the raw dataset and format it with the spaCy model. format_germanfc
# returns the formatted data together with max_sent_len, which the later
# processing cells reuse (presumably a maximum sentence length — TODO confirm
# the unit against preprocessing.format_germanfc).
raw_data = read_json_data(DATA_PATH)
data, max_sent_len = format_germanfc(raw_data, spacy_model)

### Labeling tests
#### Options to match fake statements to sentences
* Test if sentence is in fake statement: matched 53.7% of false statements 
* Separate into word tokens and test if some percentage of words is in a false statement
* Label sentence with most matching words as false statement

In [4]:
# Ground truth: every non-empty 'False_Statement_<n>' field of a raw article
# counts as one annotated false statement.
statement_fields = ['False_Statement_' + number for number in ['1', '2', '3']]
tf_stats = sum(
    1
    for article in raw_data
    for field in statement_fields
    if article[field] != ''
)

# Formatted entries whose 'lbl' value is truthy.
cf_stats = sum(1 for entry in data if entry['lbl'])

print("Number of all sentences {}".format(len(data)))
print("True number of false statements {}".format(tf_stats))
# Labelled entries relative to the annotated statements, as a percentage.
print("Classified number of false statements {} ({:.1f}%)".format(
    cf_stats, (cf_stats * 100) / tf_stats))

Number of all sentences 14062
True number of false statements 974
Classified number of false statements 1030 (105.7%)


## Separating data

In [3]:
# Split the formatted data into a train and a test part; DATASET_TRAIN_SPLIT
# (0.8) is the split parameter (presumably the train fraction — confirm
# against preprocessing.split_dataset).
train_data, test_data = split_dataset(data, DATASET_TRAIN_SPLIT)

### Serialization of formatted data 

In [None]:
# Write the formatted (not yet model-processed) splits to their JSON paths.
write_json_data(train_data, DATA_PATH_FORMATED_TRAIN)
write_json_data(test_data, DATA_PATH_FORMATED_TEST)

## Processing Data Hansen

In [4]:
# Run process_hansen on both splits with the same max_sent_len and the
# Word2Vec / spaCy models. train_data and test_data are rebound to the
# returned results, so later cells see the processed data under these names.
train_data = process_hansen(train_data, max_sent_len, w2v_model, spacy_model)
test_data = process_hansen(test_data, max_sent_len, w2v_model, spacy_model)

## Serialization

### Serialization Hansen

In [7]:
# Key lists handed to write_tfrecords for each split. The feature functions
# below unpack their example in this same order (presumably write_tfrecords
# pulls these keys from each entry — confirm against utils.write_tfrecords).
# The test split additionally carries 'article_id'.
data_keys_train = ['processed', 'lbl']
data_keys_test = ['article_id', 'processed', 'lbl']

def feature_func_train(ex):
    """Build the TFRecord feature dict for one Hansen training example.

    ``ex`` is unpacked into two values: the first is stacked with
    ``np.stack`` and flattened into the float feature ``'x'``; the second is
    stored as the single-element float feature ``'y'``.
    """
    def as_floats(values):
        return tf.train.Feature(float_list=tf.train.FloatList(value=values))

    sentence, label = ex
    return {'x': as_floats(np.stack(sentence).flatten()),
            'y': as_floats([label])}

def feature_func_test(ex):
    """Build the TFRecord feature dict for one Hansen test example.

    ``ex`` is unpacked into three values: the article id (stored as the
    single-element int64 feature ``'article_id'``), the sentence data
    (stacked with ``np.stack`` and flattened into the float feature ``'x'``)
    and the label (single-element float feature ``'y'``).
    """
    def as_floats(values):
        return tf.train.Feature(float_list=tf.train.FloatList(value=values))

    article_id, sentence, label = ex
    return {'article_id': tf.train.Feature(
                int64_list=tf.train.Int64List(value=[article_id])),
            'x': as_floats(np.stack(sentence).flatten()),
            'y': as_floats([label])}

# Serialize both Hansen-processed splits with their key list and feature
# function. 'TRAIN_BASE' / 'TEST_BASE' distinguish the two outputs
# (presumably used as a file-name tag — confirm against utils.write_tfrecords).
write_tfrecords(train_data,CHUNK_SIZE, DATA_PATH_PROCESSED, 'TRAIN_BASE', data_keys_train, feature_func_train)
write_tfrecords(test_data, CHUNK_SIZE, DATA_PATH_PROCESSED, 'TEST_BASE', data_keys_test, feature_func_test)

## Contrastive Sampling

In [6]:
# Derive the contrastive-sampling training set from train_data using the
# Word2Vec model and NUM_SAMPLING_CANDIDATES (5). Later cells read the 'cs'
# and 'cs_ix' fields from the returned entries.
train_data_sampling = contrastive_sampling(train_data, w2v_model, NUM_SAMPLING_CANDIDATES)

#### Serialize Ranking Model Data

In [10]:
# Key list handed to write_tfrecords for the ranking data; feature_func below
# unpacks its example in this same order.
data_keys = ['processed', 'lbl', 'cs']

def feature_func(ex):
    """Build the TFRecord feature dict for one ranking-model example.

    ``ex`` is unpacked into three values. The first and third are each
    stacked with ``np.stack`` and flattened into the float features ``'x'``
    and ``'cs'``; the second is stored as the single-element float feature
    ``'y'``.
    """
    def as_floats(values):
        return tf.train.Feature(float_list=tf.train.FloatList(value=values))

    sentence, label, contrast = ex
    return {'x': as_floats(np.stack(sentence).flatten()),
            'y': as_floats([label]),
            'cs': as_floats(np.stack(contrast).flatten())}

# Serialize the contrastive-sampling training data under the 'TRAIN_SAMPLING' tag.
write_tfrecords(train_data_sampling, CHUNK_SIZE, DATA_PATH_PROCESSED, 'TRAIN_SAMPLING', data_keys, feature_func)

## Processing data BERT

In [5]:
# Run process_bert on both splits with the BERT tokenizer. train_data and
# test_data are rebound to the results; later cells read 'input_ids',
# 'token_type_ids' and 'attention_mask' from these entries.
train_data = process_bert(train_data, max_sent_len, bert_tokenizer)
test_data = process_bert(test_data, max_sent_len, bert_tokenizer)

In [None]:
# Key lists for the BERT serialization. These rebind data_keys_train /
# data_keys_test from the Hansen serialization cell. The feature functions
# below unpack their example in this same order.
data_keys_train = ['input_ids', 'token_type_ids', 'attention_mask', 'lbl']
data_keys_test = ['article_id', 'input_ids', 'token_type_ids', 'attention_mask', 'lbl']

def feature_func_train(ex):
    """Build the TFRecord feature dict for one BERT training example.

    ``ex`` is unpacked into four values: input ids, token type ids and
    attention mask are each stored as an int64 list feature; the label is
    stored as the single-element float feature ``'y'``.
    """
    def as_int64(values):
        return tf.train.Feature(int64_list=tf.train.Int64List(value=values))

    inp_ids, token_ids, att_mask, label = ex
    return {'input_ids': as_int64(inp_ids),
            'token_type_ids': as_int64(token_ids),
            'attention_mask': as_int64(att_mask),
            'y': tf.train.Feature(float_list=tf.train.FloatList(value=[label]))}

def feature_func_test(ex):
    """Build the TFRecord feature dict for one BERT test example.

    ``ex`` is unpacked into five values: the article id (single-element
    int64 feature ``'article_id'``), then input ids, token type ids and
    attention mask (int64 list features), then the label (single-element
    float feature ``'y'``).
    """
    def as_int64(values):
        return tf.train.Feature(int64_list=tf.train.Int64List(value=values))

    article_id, inp_ids, token_ids, att_mask, label = ex
    return {'article_id': as_int64([article_id]),
            'input_ids': as_int64(inp_ids),
            'token_type_ids': as_int64(token_ids),
            'attention_mask': as_int64(att_mask),
            'y': tf.train.Feature(float_list=tf.train.FloatList(value=[label]))}

# Serialize the BERT-processed splits. The process_bert results are bound to
# train_data / test_data (no *_bert variables exist in this notebook), so
# those are the names that must be written here.
write_tfrecords(train_data, CHUNK_SIZE, DATA_PATH_PROCESSED, 'TRAIN_BERT', data_keys_train, feature_func_train)
write_tfrecords(test_data, CHUNK_SIZE, DATA_PATH_PROCESSED, 'TEST_BERT', data_keys_test, feature_func_test)

#### Serialize Ranking Model Data

Data has to undergo contrastive sampling after being processed for the Hansen et al. implementation.
Only then can this step be carried out, because a sentence embedding is used to measure similarity.

In [7]:
# For every sampled entry: move its own BERT fields to '<field>1', then copy
# the BERT fields of the entry at index 'cs_ix' in train_data to '<field>2'.
bert_fields = ('input_ids', 'token_type_ids', 'attention_mask')

for sample in train_data_sampling:
    cs_ix = sample['cs_ix']

    # rename the entry's own fields first ...
    for field in bert_fields:
        sample[field + '1'] = sample.pop(field)

    # ... then attach the fields of the contrastive partner
    partner = train_data[cs_ix]
    for field in bert_fields:
        sample[field + '2'] = partner[field]

In [8]:
# Key list for the BERT ranking data: the '1' fields, the '2' fields, then
# the label. feature_func_sampling below unpacks its example in this same order.
data_keys_sampling = ['input_ids1', 'token_type_ids1', 'attention_mask1',
                      'input_ids2', 'token_type_ids2', 'attention_mask2',
                      'lbl']

def feature_func_sampling(ex):
    """Build the TFRecord feature dict for one BERT ranking example.

    ``ex`` is unpacked into seven values: input ids, token type ids and
    attention mask for the first entry, the same three for the second entry,
    and the label. The six id/mask values become int64 list features; the
    label becomes the single-element float feature ``'y'``.
    """
    inp_ids1, token_ids1, att_mask1, inp_ids2, token_ids2, att_mask2, y = ex

    int_values = {
        'input_ids1': inp_ids1,
        'token_type_ids1': token_ids1,
        'attention_mask1': att_mask1,
        'input_ids2': inp_ids2,
        'token_type_ids2': token_ids2,
        'attention_mask2': att_mask2,
    }
    record = {
        name: tf.train.Feature(int64_list=tf.train.Int64List(value=values))
        for name, values in int_values.items()
    }
    record['y'] = tf.train.Feature(float_list=tf.train.FloatList(value=[y]))
    return record

# Serialize the BERT ranking data (entries carrying the '1' and '2' field
# sets) under the 'TRAIN_BERT_SAMPLING' tag.
write_tfrecords(train_data_sampling, CHUNK_SIZE, DATA_PATH_PROCESSED, 'TRAIN_BERT_SAMPLING', data_keys_sampling, feature_func_sampling)