# topic modeling playground
In this notebook, different concepts for topic modeling will be tested and evaluated.

In [1]:
import re

import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# preprocessing
import nltk
nltk.download('stopwords')
import spacy

# Plotting
import pyLDAvis
import pyLDAvis.gensim_models

import numpy as np

from matplotlib import pyplot

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/supelir/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
2022-02-01 15:06:22.048133: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-02-01 15:06:22.048160: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


## Structure

In [2]:
from enum import Enum


class Party(Enum):
    AFD = 0
    CDU = 1
    FDP = 2
    GRUENE = 3
    LINKE = 4
    SPD = 5



## Preprocessing

In [3]:
nlp = spacy.load('de_core_news_md')
from nltk.corpus import stopwords
from spacy.lang.de.stop_words import STOP_WORDS

# stopwords
nltk_stopwords = stopwords.words('german')

# build stopwords list
all_stopwords = list(set(STOP_WORDS) | set(nltk_stopwords))
with open('custom_stopwords.txt', 'r', encoding='utf-8') as f:
    all_stopwords += [line.strip() for line in f.readlines()]

# Load files
party_text = {}
for party in Party:
    all_stopwords.extend(['{}'.format(party.name.lower())])
    with open('resources/' + party.name + '.txt', encoding='utf-8', errors='ignore') as txt:
        file = " ".join(l for l in txt)
        # remove gender *
        file = re.sub(r'\*innen(\w*)\s', r'\1 ', file)
    party_text[party] = file



In [4]:

sections = re.split(r'\n\s*\n', party_text[Party.FDP])

partie_wordbags_mod = []


for section in sections:

    partie_wordbag = gensim.utils.simple_preprocess(section)

    #filter stopwords
    partie_wordbag_spacy = [word for word in partie_wordbag if word not in all_stopwords]

    partie_wordbags_mod.append(partie_wordbag_spacy)


# make bigrams
bigram = gensim.models.Phrases(partie_wordbags_mod, min_count=5, threshold=100)
# make trigrams
trigram = gensim.models.Phrases(bigram[partie_wordbags_mod], threshold=100)

bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent))
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags and token.lemma_ not in all_stopwords])
    return texts_out

words_trigrams = make_trigrams(partie_wordbags_mod)
words_lematized = lemmatization(words_trigrams)

id2word = corpora.Dictionary(words_lematized)
texts = words_lematized
corpus = [id2word.doc2bow(text) for text in texts]




In [5]:
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                             id2word=id2word,
                                             num_topics=10,
                                             )

lda_model.print_topics(num_topics=10, num_words=10)

# Compute Perplexity
print('\nPerplexity: ', lda_model.log_perplexity(corpus))
# a measure of how good the model is. lower the better.

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=words_lematized, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Perplexity:  -9.579763206344753

Coherence Score:  0.2819199277450025


## V

In [6]:
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word)
vis





  default_term_info = default_term_info.sort_values(
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
