**Preprocessing models**:
- Spacy model: https://github.com/explosion/spacy-models/releases/tag/de_core_news_sm-2.3.0
- Word2Vec: Can be trained with the **Word2Vec_10kGNAD** notebook

In [55]:
import json
import nltk as nk
from nltk import word_tokenize, sent_tokenize
from gensim.models import Word2Vec
import numpy as np
import spacy

nk.download('punkt')

DATA_PATH = '../data/GermanFakeNC.json'
MODEL_PATH_W2V = '../models/w2v.model'
MODEL_PATH_SPACY = '../models/de_core_news_sm-2.3.0'

# load preprocessing models
w2v_model = Word2Vec.load(MODEL_PATH_W2V)
spacy_model = spacy.load(MODEL_PATH_SPACY, disable=["vocab"])

[nltk_data] Downloading package punkt to /home/stefan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [89]:
def read_data(path):
    with open(path) as json_file:
        return json.load(json_file)

def count_matches(false_statement, sentence):
    count = 0
    sent_copy = sentence[:]
    for w in false_statement:
        if w in sent_copy:
            count += 1
            sent_copy.remove(w)
    return count


data = []
max_sent_len = 0
for article in read_data(DATA_PATH):
    # Concatenate article text
    text = article['Title'] + article['Teaser'] + article['Text']
    
    sentences = spacy_model(text).sents
    article_data = []
    for s in sentences:
        if len(s) > max_sent_len:
            max_sent_len = len(s)
        article_data.append({
            'org': s.text,
            'lbl': True,
            'tokenized': [t.text for t in s],
            'tokenized_lower': [t.text.lower() for t in s]
        })
 
    # Label sentences
    # The sentences matching the most tokens with a false statement will be labeled as False
    false_statements = [article['False_Statement_1'], article['False_Statement_2'], article['False_Statement_3']]     
    for fs in false_statements:
        if fs != '':
            fs_words = word_tokenize(fs, language='german')
            matches = [count_matches(fs_words, s) for s in [d['tokenized'] for d in article_data]]
            m = max(matches)
            max_indexes = [i for i, j in enumerate(matches) if j == m]
            for mi in max_indexes:
                article_data[mi]['lbl'] = False
            
    data = data + article_data

### Labeling tests
#### Options to match fake statements to sentences
* Test if sentence is in fake statement: matched 53.7% of false statements 
* Seperate into word tokens and test if some percetage of words is in a false statement
* Label sentence with most matching words as false statement

In [79]:
tf_stats = 0
for a in read_data(DATA_PATH):
    for number in ['1','2','3']:
        if a['False_Statement_' + number] != '':
            tf_stats += 1
            
cf_stats = len(list(filter(lambda d: not d['lbl'], data))) 
print("Number of all sentences {}".format(len(data)))
print("True number of false statements {}".format(tf_stats))
print("Classified number of false statements {} ({:.1f}%)".format(cf_stats, (cf_stats * 100) / tf_stats))

Number of all sentences 14765
True number of false statements 974
Classified number of false statements 1021 (104.8%)


### Dependency Parsing

In [90]:
def to_deps(doc):
    oh_vectors = []
    for token in doc:
        vec = np.zeros(max_sent_len)
        vec[token.head.i] = 1
        oh_vectors.append(vec)
    return oh_vectors

for d in data:
    doc = spacy_model(d['org'])
    d['dependencies'] = to_one_hot(doc)

### Dependency Parsing tests

In [83]:
doc = spacy_model(data[0]['org'])
print(doc)

for t in doc:
    print(t.text)
    print(t.head.i)

from spacy import displacy
displacy.render(doc, style='dep')

Prozess beginnt: Mord an Freiburger StudentinProzessbeginn gegen den mutmaßlichen Mörder Hussein Khavari am Dienstag, 05. September 2017STUDENTIN
Prozess
1
beginnt
1
:
1
Mord
1
an
3
Freiburger
6
StudentinProzessbeginn
4
gegen
6
den
10
mutmaßlichen
10
Mörder
7
Hussein
12
Khavari
10
am
3
Dienstag
13
,
3
05.
17
September
3
2017STUDENTIN
17


In [91]:
print(data[0]['dependencies'])

[array([0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]), array([0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0.,

### Word Embedding

In [46]:
word_vectors = w2v_model.wv

def embed(sentence):
    vectorized_sentence = []
    vector_dim = w2v_model.wv.vector_size
    for word in sentence:
        if word in w2v_model.wv:
            vectorized_sentence.append(w2v_model.wv[word])
        else:
            vectorized_sentence.append(np.zeros(vector_dim))
    return vectorized_sentence

for d in data:
    d['vectors'] = embed(d['tokenized_lower'])