# topic modeling playground
In this notebook, different concepts for topic modeling will be tested and evaluated.

In [1]:
import re

import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# preprocessing
import nltk
nltk.download('stopwords')
import spacy

import tomotopy

# Plotting
import pyLDAvis
import pyLDAvis.gensim_models

import numpy as np

from matplotlib import pyplot

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/supelir/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
2022-02-01 15:06:15.854162: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-02-01 15:06:15.854227: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


## Structure

In [2]:
from enum import Enum


class Party(Enum):
    AFD = 0
    CDU = 1
    FDP = 2
    GRUENE = 3
    LINKE = 4
    SPD = 5



## HDP

In [3]:
nlp = spacy.load('de_core_news_md')
from nltk.corpus import stopwords
from spacy.lang.de.stop_words import STOP_WORDS

# stopwords
nltk_stopwords = stopwords.words('german')

# build stopwords list
all_stopwords = list(set(STOP_WORDS) | set(nltk_stopwords))
with open('custom_stopwords.txt', 'r', encoding='utf-8') as f:
    all_stopwords += [line.strip() for line in f.readlines()]

# Load files
party_text = {}
for party in Party:
    all_stopwords.extend(['{}'.format(party.name.lower())])
    with open('resources/' + party.name + '.txt', encoding='utf-8', errors='ignore') as txt:
        file = " ".join(l for l in txt)
        # remove gender *
        file = re.sub(r'\*innen(\w*)\s', r'\1 ', file)
    party_text[party] = file


In [4]:
for party in Party:
    party_text[party] = [token.lemma_ for token in nlp(party_text[party]) if not token.is_stop and
               not token.is_punct and
               not token.is_space and
               token.pos_ != 'NUM' and
               not token.is_upper]

In [5]:
mdl = tomotopy.HDPModel(tw=tomotopy.TermWeight.ONE, min_cf=5, rm_top=10, alpha=0.1, gamma=1, initial_k=10, seed=99999)

mdl.add_doc(party_text[Party.FDP])

mdl.train(0)
mdl.burn_in = 500

print('Num docs:{}, Num Vocabs:{}, Total Words:{}'.format(
    len(mdl.docs), len(mdl.used_vocabs), mdl.num_words
))
print('Removed Top words: ', *mdl.removed_top_words)

Num docs:1, Num Vocabs:758, Total Words:8385
Removed Top words:  Freier Demokrat Deutschland fordern setzen stärken europäisch Mensch schaffen Land


In [6]:
for i in range(0, 5000, 50):
    print('Iteration: {:04}, LL per word: {:.4}'.format(i, mdl.ll_per_word))
    mdl.train(50)
print('Iteration: {:04}, LL per word: {:.4}'.format(mdl.global_step, mdl.ll_per_word))

mdl.summary()

live_topics = [k for k in range(mdl.k) if mdl.is_live_topic(k)]

topic_term_dists = np.stack([mdl.get_topic_word_dist(k) for k in range(mdl.k)])
topic_term_dists = topic_term_dists[live_topics]
topic_term_dists /= topic_term_dists.sum(axis=1, keepdims=True)

doc_topic_dists = np.stack([doc.get_topic_dist() for doc in mdl.docs])
doc_topic_dists = doc_topic_dists[:, live_topics]
doc_topic_dists /= doc_topic_dists.sum(axis=1, keepdims=True)

doc_lengths = np.array([len(doc.words) for doc in mdl.docs])
vocab = list(mdl.used_vocabs)
term_frequency = mdl.used_vocab_freq

prepared_data = pyLDAvis.prepare(
    topic_term_dists,
    doc_topic_dists,
    doc_lengths,
    vocab,
    term_frequency,
    start_index=0,
    sort_topics=False
)
pyLDAvis.save_html(prepared_data, 'ldavis.html')

Iteration: 0000, LL per word: -6.855
Iteration: 0050, LL per word: -6.877
Iteration: 0100, LL per word: -6.9
Iteration: 0150, LL per word: -6.906
Iteration: 0200, LL per word: -6.908
Iteration: 0250, LL per word: -6.904
Iteration: 0300, LL per word: -6.927
Iteration: 0350, LL per word: -6.942
Iteration: 0400, LL per word: -6.929
Iteration: 0450, LL per word: -6.932
Iteration: 0500, LL per word: -6.946
Iteration: 0550, LL per word: -6.918
Iteration: 0600, LL per word: -6.93
Iteration: 0650, LL per word: -6.922
Iteration: 0700, LL per word: -6.944
Iteration: 0750, LL per word: -6.942
Iteration: 0800, LL per word: -6.971
Iteration: 0850, LL per word: -6.995
Iteration: 0900, LL per word: -6.954
Iteration: 0950, LL per word: -6.989
Iteration: 1000, LL per word: -6.997
Iteration: 1050, LL per word: -6.987
Iteration: 1100, LL per word: -6.982
Iteration: 1150, LL per word: -6.995
Iteration: 1200, LL per word: -6.983
Iteration: 1250, LL per word: -6.984
Iteration: 1300, LL per word: -7.014
Iter

  default_term_info = default_term_info.sort_values(
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
