# Topic modeling playground
In this notebook, different concepts for topic modeling will be tested and evaluated.

In [7]:
# import libraries used in this notebook

import re

import gensim
import gensim.corpora as corpora
from gensim.models import CoherenceModel

# preprocessing
import nltk

nltk.download('stopwords')
import spacy

# Plotting
import pyLDAvis.gensim_models

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\fabik\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Structure

In [8]:
from enum import Enum


class Party(Enum):
    AFD = 0
    CDU = 1
    FDP = 2
    GRUENE = 3
    LINKE = 4
    SPD = 5

## Preprocessing

In [9]:
nlp = spacy.load('de_core_news_md')
from nltk.corpus import stopwords
from spacy.lang.de.stop_words import STOP_WORDS

# stopwords
nltk_stopwords = stopwords.words('german')

# build stopwords list
all_stopwords = list(set(STOP_WORDS) | set(nltk_stopwords))
with open('custom_stopwords.txt', 'r', encoding='utf-8') as f:
    all_stopwords += [line.strip() for line in f.readlines()]

# Load files
party_text = {}
for party in Party:
    all_stopwords.extend(['{}'.format(party.name.lower())])
    with open('../resources/' + party.name + '.txt', encoding='utf-8', errors='ignore') as txt:
        file = " ".join(l for l in txt)
        # remove gender *
        file = re.sub(r'\*innen(\w*)\s', r'\1 ', file)
    party_text[party] = file



In [10]:
def prepare_data(parties: [Party]):
    # get sections
    sections = []
    for partie in parties:
        sections_of_partie = re.split(r'\n\s*\n', party_text[partie])
        sections.extend(sections_of_partie)

    partie_wordbags_mod = []

    for section in sections:
        partie_wordbag = gensim.utils.simple_preprocess(section)

        #filter stopwords
        partie_wordbag_spacy = [word for word in partie_wordbag if word not in all_stopwords]

        partie_wordbags_mod.append(partie_wordbag_spacy)

    # make bigrams / trigrams
    bigram = gensim.models.Phrases(partie_wordbags_mod, min_count=5, threshold=100)
    trigram = gensim.models.Phrases(bigram[partie_wordbags_mod], threshold=100)

    bigram_phraser = gensim.models.phrases.Phraser(bigram)
    trigram_phraser = gensim.models.phrases.Phraser(trigram)

    def make_bigrams(texts):
        return [bigram_phraser[doc] for doc in texts]

    def make_trigrams(texts):
        return [trigram_phraser[bigram_phraser[doc]] for doc in texts]

    def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
        texts_out = []
        for sent in texts:
            doc = nlp(" ".join(sent))
            texts_out.append(
                [token.lemma_ for token in doc if token.pos_ in allowed_postags and token.lemma_ not in all_stopwords])
        return texts_out

    words_trigrams = make_trigrams(partie_wordbags_mod)
    words_lematized = lemmatization(words_trigrams)

    id2word = corpora.Dictionary(words_lematized)
    texts = words_lematized
    corpus = [id2word.doc2bow(text) for text in texts]

    return corpus, id2word, words_lematized

In [11]:

corpus, id2word, words_lematized = prepare_data([Party.FDP])
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                            id2word=id2word,
                                            num_topics=10,
                                            iterations=125
                                            )

lda_model.print_topics(num_topics=10, num_words=10)

# Compute Perplexity
print('\nPerplexity: ', lda_model.log_perplexity(corpus))

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=words_lematized, dictionary=id2word, coherence='u_mass',
                                     corpus=corpus)
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Perplexity:  -8.728835422078108

Coherence Score:  -11.763577496472735

Perplexity:  -8.721225105782517

Coherence Score:  -11.79115897457628


## Visualization

In [12]:
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word)
vis

  default_term_info = default_term_info.sort_values(


  default_term_info = default_term_info.sort_values(


## Other models

In [13]:
lsi_model = gensim.models.lsimodel.LsiModel(corpus=corpus, id2word=id2word, num_topics=10, chunksize=100)

# Compute Coherence Score
coherence_model_lsi = CoherenceModel(model=lsi_model, texts=words_lematized, dictionary=id2word, coherence='u_mass',
                                     corpus=corpus)
coherence_lsi = coherence_model_lsi.get_coherence()
print('\nCoherence Score: ', coherence_lsi)

lsi_model.print_topics(num_topics=10)

  sparsetools.csc_matvecs(m, n, samples, corpus.indptr, corpus.indices,
  sparsetools.csc_matvecs(m, n, samples, corpus.indptr, corpus.indices,
  sparsetools.csc_matvecs(m, n, samples, corpus.indptr, corpus.indices,
  sparsetools.csc_matvecs(m, n, samples, corpus.indptr, corpus.indices,



Coherence Score:  -9.593086051851852


[(0,
  '0.368*"unternehmen" + 0.212*"freiheit" + 0.186*"entwicklung" + 0.179*"nachhaltig" + 0.174*"international" + 0.134*"global" + 0.132*"wettbewerb" + 0.127*"digital" + 0.126*"co" + 0.125*"menschenrechte"'),
 (1,
  '-0.701*"unternehmen" + 0.286*"freiheit" + 0.153*"land" + 0.150*"entwicklung" + 0.118*"global" + 0.112*"nachhaltig" + 0.101*"co" + -0.084*"wettbewerb" + -0.083*"mittler" + 0.076*"bürgerinnen"'),
 (2,
  '-0.343*"freiheit" + 0.294*"beruflich" + 0.270*"bildung" + 0.241*"schulen" + -0.190*"global" + -0.187*"unternehmen" + 0.157*"digital" + -0.152*"land" + -0.151*"entwicklung" + -0.130*"nachhaltig"'),
 (3,
  '-0.541*"co" + 0.188*"beruflich" + -0.187*"wasserstoff" + -0.186*"klimaschutz" + 0.177*"bildung" + 0.141*"schulen" + 0.130*"digital" + -0.126*"erreichen" + -0.120*"zertifikate" + -0.103*"direkt"'),
 (4,
  '0.308*"beruflich" + 0.281*"bildung" + -0.280*"international" + 0.217*"schulen" + -0.140*"antisemitismus" + 0.135*"freiheit" + 0.121*"co" + 0.117*"unternehmen" + -0.100*"

In [15]:
from pprint import pprint

hdp_model = gensim.models.hdpmodel.HdpModel(corpus=corpus, id2word=id2word, )
pprint(hdp_model.show_topics())

# Compute Coherence Score
coherence_model_hdp = CoherenceModel(model=hdp_model, texts=words_lematized, dictionary=id2word, coherence='u_mass',
                                     corpus=corpus)
coherence_hdp = coherence_model_hdp.get_coherence()
print('\nCoherence Score: ', coherence_hdp)

[(0,
  '0.002*inklusion + 0.002*jahrelang + 0.001*klassen + 0.001*antriebe + '
  '0.001*tausender + 0.001*praktiken + 0.001*abschiebung + 0.001*illegale + '
  '0.001*sport + 0.001*widerspruch + 0.001*föderation + 0.001*auszubildenden + '
  '0.001*zurückkehren + 0.001*ausrichten + 0.001*abzuziehenden + '
  '0.001*ausweitung + 0.001*beschäftigung + 0.001*zeitdauer + 0.001*begleiten '
  '+ 0.001*prävention'),
 (1,
  '0.002*erkenntnisse + 0.002*position + 0.002*kritisieren + 0.001*mechanismen '
  '+ 0.001*forschung + 0.001*bestandsflotten + 0.001*schnellstens + '
  '0.001*befugnissen + 0.001*ausstoßen + 0.001*verstetigt + 0.001*werkzeug + '
  '0.001*diensten + 0.001*priorität + 0.001*fischarten + 0.001*vorsorgeprinzip '
  '+ 0.001*fälle + 0.001*sozialpolitik + 0.001*schädliche + 0.001*bedingt + '
  '0.001*hass'),
 (2,
  '0.002*bagatell + 0.002*finanzbehörden + 0.002*auktionsdesign + '
  '0.001*waffenrechts + 0.001*wachsend + 0.001*talentpool + 0.001*asylbewerber '
  '+ 0.001*mehrstaatigkei

# Plot coherence score

In [9]:
from datetime import datetime
import matplotlib.pyplot as plt


def plot_coherence_lda(coherence, max_topics, max_iterations, parties, iteration_intervall=25, save=False):
    corpus, id2word, words_lematized = prepare_data(parties)

    data = {}
    for iterations in range(iteration_intervall, max_iterations, iteration_intervall):
        statistics = {}
        for topics in range(1, max_topics):
            lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                                        id2word=id2word,
                                                        num_topics=topics,
                                                        iterations=iterations
                                                        )
            coherence_model_lda = CoherenceModel(model=lda_model, texts=words_lematized, dictionary=id2word,
                                                 coherence=coherence, corpus=corpus)
            coherence_lda = coherence_model_lda.get_coherence()
            statistics[topics] = coherence_lda
        data[iterations] = statistics

    for key, value in data.items():
        plt.plot(list(value.keys()), list(value.values()), label=key)
    plt.legend()
    plt.xlabel('Number of topics')
    plt.ylabel('Coherence score')
    plt.title('Coherence score for different number of topics')

    if save:
        plt.savefig(
            f'../diagrams/coherence_score/lda/{datetime.now().strftime("%Y-%m-%d_%H-%M-%S")}_coherence_score_{coherence}_for_{parties[0]}_{len(parties)}.png')
  
    plt.show()

In [10]:
def plot_coherence_lsi(coherence, max_topics, parties, save=False):
    corpus, id2word, words_lematized = prepare_data(parties)

    statistics = {}
    for topics in range(1, max_topics):
        lsi_model = gensim.models.lsimodel.LsiModel(corpus=corpus,
                                                    id2word=id2word,
                                                    num_topics=topics,
                                                    )
        coherence_model_lsi = CoherenceModel(model=lsi_model, texts=words_lematized, dictionary=id2word,
                                             coherence=coherence, corpus=corpus)
        coherence_lsi = coherence_model_lsi.get_coherence()
        statistics[topics] = coherence_lsi

    plt.plot(list(statistics.keys()), list(statistics.values()))
    plt.xlabel('Number of topics')
    plt.ylabel('Coherence score')
    plt.title('Coherence score for different number of topics')

    if save:
        plt.savefig(
            f'../diagrams/coherence_score/lsi/{datetime.now().strftime("%Y-%m-%d_%H-%M-%S")}_coherence_score_{coherence}_for_{parties[0]}_{len(parties)}.png')
    
    plt.show()

### Plot coherence for all parties

In [None]:
all_parties = [p for p in Party]
cohernce_scores = ['u_mass', 'c_v', 'c_uci', 'c_npmi']
max_topics = 20
max_iterations = 225
for coherence in cohernce_scores:
    # for all parties
    plot_coherence_lda(coherence, max_iterations=max_iterations, max_topics=max_topics, parties=all_parties)
    plot_coherence_lsi(coherence, max_topics=max_topics, parties=all_parties)

    # for specific parties
    for party in all_parties:
        plot_coherence_lda(coherence, max_iterations=max_iterations, max_topics=max_topics, parties=[party])
        plot_coherence_lsi(coherence, max_topics=max_topics, parties=[party])
