**Preprocessing models**:
- Spacy model: https://github.com/explosion/spacy-models/releases/tag/de_core_news_sm-2.3.0
- Word2Vec: Can be trained with the **Word2Vec_10kGNAD** notebook

In [1]:
import json
from gensim.models import Word2Vec
import numpy as np
import spacy
from keras import Input
from keras import backend as K, initializers, regularizers, constraints
from keras.models import Sequential
from keras.layers import Dropout, LSTM, Dense, InputLayer
from keras.engine.topology import Layer

DATA_PATH = '../data/GermanFakeNC.json'
MODEL_PATH_W2V = '../models/w2v.model'
MODEL_PATH_SPACY = '../models/de_core_news_sm-2.3.0'

# load preprocessing models
w2v_model = Word2Vec.load(MODEL_PATH_W2V)
spacy_model = spacy.load(MODEL_PATH_SPACY, disable=["vocab"])

unable to import 'smart_open.gcs', disabling that module
Using TensorFlow backend.


In [2]:
def read_data(path):
    with open(path) as json_file:
        return json.load(json_file)

def count_matches(false_statement, sentence):
    count = 0
    sent_copy = sentence[:]
    for w in false_statement:
        if w in sent_copy:
            count += 1
            sent_copy.remove(w)
    return count


data = []
max_sent_len = 0
for article in read_data(DATA_PATH):
    # Concatenate article text
    text = article['Title'] + article['Teaser'] + article['Text']
    
    sentences = spacy_model(text).sents
    article_data = []
    for s in sentences:
        if len(s) > max_sent_len:
            max_sent_len = len(s)
        article_data.append({
            'org': s.text,
            'lbl': True,
            'tokenized': [t.text for t in s],
            'tokenized_lower': [t.text.lower() for t in s]
        })
 
    # Label sentences
    # The sentences matching the most tokens with a false statement will be labeled as False
    false_statements = [article['False_Statement_1'], article['False_Statement_2'], article['False_Statement_3']]     
    for fs in false_statements:
        if fs != '':
            fs_words = [t.text for t in spacy_model(fs)]
            matches = [count_matches(fs_words, s) for s in [d['tokenized'] for d in article_data]]
            m = max(matches)
            max_indexes = [i for i, j in enumerate(matches) if j == m]
            for mi in max_indexes:
                article_data[mi]['lbl'] = False
            
    data = data + article_data

### Labeling tests
#### Options to match fake statements to sentences
* Test if sentence is in fake statement: matched 53.7% of false statements 
* Seperate into word tokens and test if some percetage of words is in a false statement
* Label sentence with most matching words as false statement

In [3]:
tf_stats = 0
for a in read_data(DATA_PATH):
    for number in ['1','2','3']:
        if a['False_Statement_' + number] != '':
            tf_stats += 1
            
cf_stats = len(list(filter(lambda d: not d['lbl'], data))) 
print("Number of all sentences {}".format(len(data)))
print("True number of false statements {}".format(tf_stats))
print("Classified number of false statements {} ({:.1f}%)".format(cf_stats, (cf_stats * 100) / tf_stats))

Number of all sentences 14765
True number of false statements 974
Classified number of false statements 1022 (104.9%)


### Dependency Parsing

In [4]:
def to_deps(doc):
    oh_vectors = []
    for token in doc:
        vec = np.zeros(max_sent_len)
        vec[token.head.i] = 1
        oh_vectors.append(vec)
        
    # padding with 0 vectors to max sentence length
    while len(oh_vectors) < max_sent_len:
        oh_vectors.append(np.zeros(max_sent_len))
    return oh_vectors

for d in data:
    doc = spacy_model(d['org'])
    d['deps'] = to_deps(doc)

### Dependency Parsing tests

In [5]:
doc = spacy_model(data[0]['org'])
print(doc)

for t in doc:
    print(t.text)
    print(t.head.i)

from spacy import displacy
displacy.render(doc, style='dep')

Prozess beginnt: Mord an Freiburger StudentinProzessbeginn gegen den mutmaßlichen Mörder Hussein Khavari am Dienstag, 05. September 2017STUDENTIN
Prozess
1
beginnt
1
:
1
Mord
1
an
3
Freiburger
6
StudentinProzessbeginn
4
gegen
6
den
10
mutmaßlichen
10
Mörder
7
Hussein
12
Khavari
10
am
3
Dienstag
13
,
3
05.
17
September
3
2017STUDENTIN
17


In [6]:
print(data[0]['dependencies'])

KeyError: 'dependencies'

### Word Embedding

In [7]:
word_vectors = w2v_model.wv

def embed(sentence):
    vectorized_sentence = []
    vector_dim = w2v_model.wv.vector_size
    for word in sentence:
        if word in w2v_model.wv:
            vectorized_sentence.append(w2v_model.wv[word])
        else:
            vectorized_sentence.append(np.zeros(vector_dim))
            
    # padding with 0 vectors to max sentence length
    while len(vectorized_sentence) < max_sent_len:
        vectorized_sentence.append(np.zeros(vector_dim))
        
    return vectorized_sentence

for d in data:
    d['vectors'] = embed(d['tokenized_lower'])

In [8]:
X = np.array([np.array([np.concatenate((word, deps), axis=0) for (word, deps) in zip(d['vectors'], d['deps'])]) for d in data])
Y = np.array([1 if d['lbl'] == True else 0 for d in data])

### Model without ranking loss

In [37]:
from keras import backend as K, initializers, regularizers, constraints
from keras.engine.topology import Layer


def dot_product(x, kernel):
    """
    Wrapper for dot product operation, in order to be compatible with both
    Theano and Tensorflow
    Args:
        x (): input
        kernel (): weights
    Returns:
    """
    if K.backend() == 'tensorflow':
        # todo: check that this is correct
        return K.squeeze(K.dot(x, K.expand_dims(kernel)), axis=-1)
    else:
        return K.dot(x, kernel)


class Attention(Layer):
    def __init__(self,
                 W_regularizer=None, b_regularizer=None,
                 W_constraint=None, b_constraint=None,
                 bias=True,
                 return_attention=False,
                 **kwargs):
        """
        Keras Layer that implements an Attention mechanism for temporal data.
        Supports Masking.
        Follows the work of Raffel et al. [https://arxiv.org/abs/1512.08756]
        # Input shape
            3D tensor with shape: `(samples, steps, features)`.
        # Output shape
            2D tensor with shape: `(samples, features)`.
        :param kwargs:
        Just put it on top of an RNN Layer (GRU/LSTM/SimpleRNN) with return_sequences=True.
        The dimensions are inferred based on the output shape of the RNN.
        Note: The layer has been tested with Keras 1.x
        Example:
        
            # 1
            model.add(LSTM(64, return_sequences=True))
            model.add(Attention())
            # next add a Dense layer (for classification/regression) or whatever...
            # 2 - Get the attention scores
            hidden = LSTM(64, return_sequences=True)(words)
            sentence, word_scores = Attention(return_attention=True)(hidden)
        """
        self.supports_masking = True
        self.return_attention = return_attention
        self.init = initializers.get('glorot_uniform')

        self.W_regularizer = regularizers.get(W_regularizer)
        self.b_regularizer = regularizers.get(b_regularizer)

        self.W_constraint = constraints.get(W_constraint)
        self.b_constraint = constraints.get(b_constraint)

        self.bias = bias
        super(Attention, self).__init__(**kwargs)

    def build(self, input_shape):
        assert len(input_shape) == 3

        
        self.W = self.add_weight(shape=(input_shape[-1],),
                                 initializer=self.init,
                                 name='{}_W'.format(self.name),
                                 regularizer=self.W_regularizer,
                                 constraint=self.W_constraint)
        if self.bias:
            self.b = self.add_weight(shape=(input_shape[1],),
                                     initializer='zero',
                                     name='{}_b'.format(self.name),
                                     regularizer=self.b_regularizer,
                                     constraint=self.b_constraint)
        else:
            self.b = None

        self.built = True

    def compute_mask(self, input, input_mask=None):
        # do not pass the mask to the next layers
        return None

    def call(self, x, mask=None):
        eij = dot_product(x, self.W)

        if self.bias:
            eij += self.b

        eij = K.tanh(eij)

        a = K.exp(eij)

        # apply mask after the exp. will be re-normalized next
        if mask is not None:
            # Cast the mask to floatX to avoid float64 upcasting in theano
            a *= K.cast(mask, K.floatx())

        # in some cases especially in the early stages of training the sum may be almost zero
        # and this results in NaN's. A workaround is to add a very small positive number ε to the sum.
        # a /= K.cast(K.sum(a, axis=1, keepdims=True), K.floatx())
        a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx())

        weighted_input = x * K.expand_dims(a)

        result = K.sum(weighted_input, axis=1)

        if self.return_attention:
            return [result, a]
        return result

    def compute_output_shape(self, input_shape):
        if self.return_attention:
            return [(input_shape[0], input_shape[-1]),
                    (input_shape[0], input_shape[1])]
        else:
            return input_shape[0], input_shape[-1]

In [None]:
word_len = X[0].size
hidden_units = 100

model = Sequential()
model.add(InputLayer(input_shape=(max_sent_len, word_len)))
model.add(LSTM(hidden_units, return_sequences = True))
model.add(Attention())
model.add(Dense(1, activation='sigmoid'))

model.summary()
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['binary_accuracy'])