## Imports:
Import the required packages and load the German spaCy model (`de_core_news_md`).

In [None]:
from collections import Counter

import numpy as np
import spacy
from spacy.lang.de.stop_words import STOP_WORDS
from matplotlib import pyplot as plt
import re

from sklearn.decomposition import NMF, LatentDirichletAllocation, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer

import pyLDAvis.sklearn

import string

from enum import Enum

nlp = spacy.load('de_core_news_md')


### Party enum

In [None]:
class Party(Enum):
    """Parties whose text files are analysed in this notebook.

    The member *name* (not its value) is used to build the input path
    ``../resources/<NAME>.txt``, so each name must match a file name.
    """

    # The integer values are arbitrary identifiers; only the names are used
    # by the code visible in this notebook.
    AFD = 0
    CDU = 1
    FDP = 2
    GRUENE = 3
    LINKE = 4
    SPD = 5


### Preprocessing and structure:

In [None]:
# Coarse-grained part-of-speech tags (spaCy ``Token.pos_``) that are dropped:
# adpositions, pronouns and auxiliaries.
_EXCLUDED_POS = frozenset({'ADP', 'PRON', 'AUX'})


def _is_content_token(token, party):
    """Return True if *token* should be counted for *party*'s statistics."""
    if (token.is_stop
            or token.is_punct
            or token.is_space
            or token.like_num
            or token.like_url
            or token.like_email
            or token.is_currency):
        return False
    # Drop the party's own name and a stray '>' character.
    if token.text.upper() == party.name or token.text == '>':
        return False
    # The previous `pos_ == 'VBZ'` test was removed: 'VBZ' is a Penn Treebank
    # fine-grained tag and is never a value of the coarse-grained `pos_`,
    # so that condition could not match.
    if token.pos_ in _EXCLUDED_POS:
        return False
    if party == Party.FDP and token.text in ('Freie', 'Demokraten'):
        return False
    # re.match only anchors at the start, so this drops tokens whose FIRST
    # character is non-ASCII; a token such as "Bürger" is still kept.
    # NOTE(review): the original comment said "remove non ascii" - confirm
    # whether every token containing a non-ASCII character should be dropped.
    return re.match('[\x00-\x7F]+', token.text) is not None


def all_statistics(party: Party):
    """Parse a party's text file and compute word statistics.

    Reads ``../resources/<party.name>.txt`` (undecodable bytes are ignored),
    joins its right-stripped lines with single spaces and runs the module-level
    spaCy pipeline ``nlp`` over the result.

    Returns a 5-tuple ``(text, words, word_freq, most_common, words_as_text)``:

    * ``text`` - the spaCy document for the whole file,
    * ``words`` - lemmas of all tokens that pass the content filter,
    * ``word_freq`` - ``Counter`` over ``words``,
    * ``most_common`` - the 50 most common ``(lemma, count)`` pairs
      (all kept word classes, not only nouns),
    * ``words_as_text`` - ``words`` joined by single spaces.
    """
    with open('../resources/' + party.name + '.txt', encoding='utf-8', errors='ignore') as txt:
        raw_text = " ".join(line.rstrip() for line in txt)

    text = nlp(raw_text)
    words = [token.lemma_ for token in text if _is_content_token(token, party)]
    word_freq = Counter(words)
    most_common = word_freq.most_common(50)
    words_as_text = " ".join(words)
    return text, words, word_freq, most_common, words_as_text

### Gather statistics for all parties

In [None]:
# Map every Party to the tuple returned by all_statistics():
# (text, words, word_freq, most_freq_nouns, words_as_text)
statistics = {}

# Each call reads and parses one file from ../resources/ with spaCy.
for party in Party:
    statistics[party] = all_statistics(party)

print('Loading statistics is finished!')


  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload


KeyboardInterrupt: 

### Plotting:

In [None]:
def plotting_png(most_freq_words, name, show_plot=True, save_plot=False):
    """Draw a bar chart of word frequencies.

    Parameters:
        most_freq_words: sequence of ``(word, count)`` pairs, drawn in the
            given order (one bar per pair).
        name: chart title; also used in the output file name.
        show_plot: if true, call ``plt.show()``.
        save_plot: if true, save the figure as ``graphs/graph_<name>.svg``
            (SVG despite the function name).

    Side effect: sets ``plt.rcParams['figure.autolayout']`` to True globally.
    """
    pairs = list(most_freq_words)
    labels = [word for word, _ in pairs]
    counts = [count for _, count in pairs]

    # One bar per entry. This was hard-coded to 50 positions, which made
    # ax.bar fail whenever the input did not contain exactly 50 pairs.
    positions = np.arange(len(pairs))

    plt.rcParams['figure.autolayout'] = True
    fig, ax = plt.subplots()
    ax.bar(positions, counts, .5, color='blue')
    ax.set_xticks(positions, labels=labels, rotation=45, ha='right')

    # White labels and title - presumably chosen for a dark notebook theme.
    ax.tick_params(labelcolor='white')

    # Double the default width so the rotated tick labels have room.
    fig.set_figwidth(fig.get_figwidth() * 2)
    ax.set_title(name, color='white')

    if save_plot:
        fig.savefig('graphs/graph_' + name + '.svg', format='svg')
    if show_plot:
        plt.show()

### Plot the most frequent words for all parties

In [None]:
# Index 3 of each statistics tuple holds the 50 most common (word, count) pairs.
for party in Party:
    plotting_png(statistics[party][3], party.name)


### Select topics with given model

In [None]:
def selected_topics(model, vectorizer, top_n=10):
    """Print the ``top_n`` highest-weighted terms of every topic.

    Parameters:
        model: fitted model exposing ``components_`` (one row per topic,
            one column per vocabulary term).
        vectorizer: fitted vectorizer providing the vocabulary through
            ``get_feature_names_out()`` or the older ``get_feature_names()``.
        top_n: number of terms printed per topic.

    For each topic a header line ``Topic <idx>:`` is printed, followed by a
    list of ``(term, weight)`` tuples sorted by descending weight.
    """
    # Prefer the current scikit-learn accessor and fall back to the old name
    # so that both old and new library versions work.
    get_names = getattr(vectorizer, 'get_feature_names_out', None)
    if get_names is None:
        get_names = vectorizer.get_feature_names

    # Fetch the vocabulary once instead of once per printed term.
    feature_names = [str(term) for term in get_names()]

    for idx, topic in enumerate(model.components_):
        print("Topic %d:" % (idx))
        # argsort is ascending; the reversed slice yields the top_n largest.
        top_indices = topic.argsort()[:-top_n - 1:-1]
        print([(feature_names[i], topic[i]) for i in top_indices])

### Tokenize and remove stopwords

In [None]:
# ASCII punctuation characters as ONE string (membership is a substring test).
punctuations = string.punctuation
# German stop words shipped with spaCy.
stopwords = list(STOP_WORDS)


def spacy_tokenizer(sentence):
    """Lemmatise *sentence* and return the kept lemmas joined by spaces.

    Every token is replaced by its lower-cased, stripped lemma. Tokens whose
    lemma is the placeholder ``"-PRON-"`` (presumably the pronoun lemma of
    older spaCy versions) use their lower-cased text instead. Entries that
    are stop words, or that occur as a substring of ``punctuations``, are
    dropped. The substring test also removes empty strings.
    """
    doc = nlp(sentence)

    # Build a set once per call: the stop-word check is then O(1) per token
    # instead of a linear scan of the list, and still reflects any changes
    # made to the module-level `stopwords` list.
    stop_set = set(stopwords)

    kept = []
    for token in doc:
        if token.lemma_ != "-PRON-":
            normalised = token.lemma_.lower().strip()
        else:
            normalised = token.lower_
        if normalised not in stop_set and normalised not in punctuations:
            kept.append(normalised)
    return " ".join(kept)

### Run model with party manifesto

In [None]:
def run_models(party: Party, MODEL_LDA=False, MODEL_NMF=False, MODEL_LSI=False):
    """Fit the requested topic models on a party's text file.

    The file ``../resources/<party.name>.txt`` is split into paragraphs at
    blank lines; every paragraph becomes one document of the corpus. Each
    document is normalised with ``spacy_tokenizer`` and vectorised with a
    ``CountVectorizer``.

    Parameters:
        party: the party whose file is analysed.
        MODEL_LDA: fit and print a LatentDirichletAllocation model.
        MODEL_NMF: fit and print an NMF model.
        MODEL_LSI: fit and print a TruncatedSVD (LSI) model.

    Returns:
        The pyLDAvis dashboard prepared for the last fitted model, or None
        when no model was requested.
    """
    with open('../resources/' + party.name + '.txt', encoding='utf-8', errors='ignore') as txt:
        raw_text = txt.read()

    # Split into paragraphs BEFORE collapsing line breaks. Previously all
    # lines were joined with spaces first, so the later split("\n\n") never
    # matched and the corpus always consisted of a single document.
    paragraphs = (" ".join(line.strip() for line in block.splitlines())
                  for block in re.split(r'\n\s*\n', raw_text))
    documents = [paragraph for paragraph in paragraphs if paragraph]

    # max_df is the float 1.0 (proportion: keep terms found in up to 100% of
    # the documents). The integer 1 is an absolute count and would discard
    # every term occurring in more than one document.
    # NOTE(review): the token pattern only accepts ASCII letters and '-', so
    # words containing umlauts are split - confirm this is intended.
    vectorizer = CountVectorizer(min_df=.01, max_df=1.0, lowercase=True,
                                 token_pattern=r'[a-zA-Z\-][a-zA-Z\-]{2,}')

    data_vectorizer = vectorizer.fit_transform(map(spacy_tokenizer, documents))

    num_topics = 20
    requested = []
    if MODEL_LDA:
        requested.append(("LDA Model:",
                          LatentDirichletAllocation(n_components=num_topics, max_iter=20,
                                                    learning_method='online', verbose=True)))
    if MODEL_NMF:
        requested.append(("NMF Model:", NMF(n_components=num_topics)))
    if MODEL_LSI:
        requested.append(("LSI Model:", TruncatedSVD(n_components=num_topics)))

    model = None
    for label, model in requested:
        model.fit(data_vectorizer)
        print(label)
        selected_topics(model, vectorizer)

    if model is None:
        return None

    # NOTE(review): the dashboard is built for the LAST fitted model only, and
    # pyLDAvis.sklearn.prepare presumably targets LDA models - verify that it
    # behaves sensibly for NMF / TruncatedSVD.
    pyLDAvis.enable_notebook()
    # Return the dashboard so a notebook cell ending in run_models(...) shows
    # it; as a bare expression inside the function it was discarded.
    return pyLDAvis.sklearn.prepare(model, data_vectorizer, vectorizer, mds='tsne')

In [None]:
# Fit only the LSI (TruncatedSVD) model on the FDP text file.
run_models(Party.FDP, MODEL_LSI=True)