**Preprocessing models**:
- Spacy model: https://github.com/explosion/spacy-models/releases/tag/de_core_news_sm-2.3.0
- Word2Vec: Can be trained with the **Word2Vec_10kGNAD** notebook

In [7]:
import os
import sys

# workaround to import local modules from parent directory
module_path = os.path.abspath(os.path.join('../src'))
if module_path not in sys.path:
    sys.path.append(module_path)

import datetime
import json
import itertools
from gensim.models import Word2Vec
import numpy as np
import spacy
import tensorflow as tf
import utils
from utils import read_json_data, write_json_data, write_tfrecords
from preprocessing import *

# Dataset locations (relative paths).
DATA_PATH = '../data/GermanFakeNC.json'  # raw input, loaded with read_json_data
DATA_PATH_FORMATED_TRAIN = '../data/GermanFakeNC_FORMATED_TRAIN.json'
DATA_PATH_FORMATED_TEST = '../data/GermanFakeNC_FORMATED_TEST.json'
DATA_PATH_PROCESSED = '../data/GermanFakeNC_PROCESSED'  # passed to write_tfrecords
# Model locations.
MODEL_PATH_W2V = '../models/w2v.model'
MODEL_PATH_SPACY = '../models/de_core_news_sm-2.3.0'
MODEL_PATH_BERT = '../models/bert-base-german-cased/'
# Run parameters.
SEED = 12345
NUM_SAMPLING_CANDIDATES = 5  # passed to contrastive_sampling
DATASET_SIZE = 14765
DATASET_TRAIN_SPLIT = 0.8  # passed to split_dataset
# NOTE(review): same value as the train split and not referenced in the
# cells shown here -- confirm the intended dev fraction.
DATASET_DEV_SPLIT = 0.8
CHUNK_SIZE = 2000  # passed to write_tfrecords

# Load preprocessing models
# NOTE(review): spaCy is loaded by package name; MODEL_PATH_SPACY is defined
# above but not used for this call -- confirm which one is intended.
w2v_model = Word2Vec.load(MODEL_PATH_W2V)
spacy_model = spacy.load("de_core_news_sm")

## Data preprocessing

In [2]:
# Read the raw dataset from DATA_PATH, then format it with the spaCy model.
# format_germanfc returns the formatted records together with max_sent_len
# (presumably the maximum sentence length -- confirm in preprocessing).
raw_data = read_json_data(DATA_PATH)
data, max_sent_len = format_germanfc(raw_data, spacy_model)

### Labeling tests
#### Options to match fake statements to sentences
* Test whether the sentence is contained in a fake statement: matched 53.7% of false statements
* Separate into word tokens and test whether some percentage of the words occurs in a false statement
* Label the sentence with the most matching words as the false statement

In [4]:
# Count the non-empty False_Statement_1..3 fields over all raw records.
statement_keys = ['False_Statement_' + number for number in ['1', '2', '3']]
tf_stats = sum(
    1
    for record in raw_data
    for key in statement_keys
    if record[key] != ''
)

# Count the formatted entries whose 'lbl' value is truthy.
cf_stats = sum(1 for entry in data if entry['lbl'])

print("Number of all sentences {}".format(len(data)))
print("True number of false statements {}".format(tf_stats))
# The percentage is relative to the true count, so it can exceed 100.
ratio = (cf_stats * 100) / tf_stats
print("Classified number of false statements {} ({:.1f}%)".format(cf_stats, ratio))

Number of all sentences 14062
True number of false statements 974
Classified number of false statements 1030 (105.7%)


## Separating data

In [3]:
# Split the formatted records into train and test sets. DATASET_TRAIN_SPLIT
# is presumably the training fraction -- confirm in split_dataset.
train_data, test_data = split_dataset(data, DATASET_TRAIN_SPLIT)

### Serialization of formatted data 

## Processing Data Hansen

In [4]:
# Run process_hansen on both splits with the same max_sent_len and models.
# NOTE(review): the 'processed' key serialized later is presumably added by
# this step -- confirm in preprocessing.
train_data = process_hansen(train_data, max_sent_len, w2v_model, spacy_model)
test_data = process_hansen(test_data, max_sent_len, w2v_model, spacy_model)

## Contrastive Sampling

In [11]:
# Contrastive sampling is applied to the training split only.
# NOTE(review): the 'cs' key serialized for the ranking model presumably
# comes from this step -- confirm in preprocessing.
train_data = contrastive_sampling(train_data, w2v_model, NUM_SAMPLING_CANDIDATES)

## Serialization

### Serialization Hansen

#### Serialize Base Model Data

In [None]:
# Record fields handed to write_tfrecords; feature_func unpacks an example
# in this same order.
data_keys = ['article_id', 'processed', 'lbl']


def feature_func(ex):
    """Build the tf.train feature dict for one base-model example.

    Args:
        ex: A 3-item sequence unpacked as (article id, input vectors, label).

    Returns:
        A dict with 'article_id' as an int64 feature, 'x' as the stacked and
        flattened input as a float feature, and 'y' as a single-value float
        feature.
    """
    article_id, vectors, label = ex
    features = {}
    features['article_id'] = tf.train.Feature(
        int64_list=tf.train.Int64List(value=[article_id]))
    features['x'] = tf.train.Feature(
        float_list=tf.train.FloatList(value=np.stack(vectors).flatten()))
    features['y'] = tf.train.Feature(
        float_list=tf.train.FloatList(value=[label]))
    return features


# Write the train split first, then the test split, under their own names.
for split_name, split_data in (('TRAIN', train_data), ('TEST', test_data)):
    write_tfrecords(split_data, CHUNK_SIZE, DATA_PATH_PROCESSED, split_name, data_keys, feature_func)

#### Serialize Ranking Model Data

In [None]:
# Record fields handed to write_tfrecords; feature_func unpacks an example
# in this same order.
data_keys = ['article_id', 'processed', 'lbl', 'cs']


def feature_func(ex):
    """Build the tf.train feature dict for one ranking-model example.

    Args:
        ex: A 4-item sequence unpacked as (article id, input vectors, label,
            contrastive samples).

    Returns:
        A dict with 'article_id' as an int64 feature and 'x', 'y' and 'cs' as
        float features; 'x' and 'cs' are stacked and flattened first.
    """
    article_id, vectors, label, samples = ex

    def float_feature(values):
        # Wrap a sequence of floats in a tf.train.Feature.
        return tf.train.Feature(float_list=tf.train.FloatList(value=values))

    return {
        'article_id': tf.train.Feature(int64_list=tf.train.Int64List(value=[article_id])),
        'x': float_feature(np.stack(vectors).flatten()),
        'y': float_feature([label]),
        'cs': float_feature(np.stack(samples).flatten()),
    }


write_tfrecords(
    train_data,
    CHUNK_SIZE,
    DATA_PATH_PROCESSED,
    'TRAIN_SAMPLING_tt',
    data_keys,
    feature_func,
)

### Serialization BERT