# topic modeling playground
In this notebook, different concepts for topic modeling will be tested and evaluated.

In [2]:
import re

import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# preprocessing
import nltk

nltk.download('stopwords')
import spacy

# Plotting
import pyLDAvis
import pyLDAvis.gensim_models

  and should_run_async(code)
[nltk_data] Downloading package stopwords to /home/delta/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Structure

In [3]:
from enum import Enum


class Party(Enum):
    AFD = 0
    CDU = 1
    FDP = 2
    GRUENE = 3
    LINKE = 4
    SPD = 5




  and should_run_async(code)


## Preprocessing

In [5]:
nlp = spacy.load('de_core_news_md')
from nltk.corpus import stopwords
from spacy.lang.de.stop_words import STOP_WORDS

# stopwords
nltk_stopwords = stopwords.words('german')

# build stopwords list
all_stopwords = list(set(STOP_WORDS) | set(nltk_stopwords))
with open('custom_stopwords.txt', 'r', encoding='utf-8') as f:
    all_stopwords += [line.strip() for line in f.readlines()]

# Load files
party_text = {}
for party in Party:
    all_stopwords.extend(['{}'.format(party.name.lower())])
    with open('../resources/' + party.name + '.txt', encoding='utf-8', errors='ignore') as txt:
        file = " ".join(l for l in txt)
        # remove gender *
        file = re.sub(r'\*innen(\w*)\s', r'\1 ', file)
    party_text[party] = file



  and should_run_async(code)


In [6]:
def prepare_data(parties: [Party]):
    # get sections
    sections = []
    for partie in parties:
        sections_of_partie = re.split(r'\n\s*\n', party_text[Party.FDP])
        sections.extend(sections_of_partie)

    partie_wordbags_mod = []

    for section in sections:
        partie_wordbag = gensim.utils.simple_preprocess(section)

        #filter stopwords
        partie_wordbag_spacy = [word for word in partie_wordbag if word not in all_stopwords]

        partie_wordbags_mod.append(partie_wordbag_spacy)

    # make bigrams
    bigram = gensim.models.Phrases(partie_wordbags_mod, min_count=5, threshold=100)
    # make trigrams
    trigram = gensim.models.Phrases(bigram[partie_wordbags_mod], threshold=100)

    bigram_mod = gensim.models.phrases.Phraser(bigram)
    trigram_mod = gensim.models.phrases.Phraser(trigram)

    def make_bigrams(texts):
        return [bigram_mod[doc] for doc in texts]

    def make_trigrams(texts):
        return [trigram_mod[bigram_mod[doc]] for doc in texts]

    def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
        texts_out = []
        for sent in texts:
            doc = nlp(" ".join(sent))
            texts_out.append(
                [token.lemma_ for token in doc if token.pos_ in allowed_postags and token.lemma_ not in all_stopwords])
        return texts_out

    words_trigrams = make_trigrams(partie_wordbags_mod)
    words_lematized = lemmatization(words_trigrams)

    id2word = corpora.Dictionary(words_lematized)
    texts = words_lematized
    corpus = [id2word.doc2bow(text) for text in texts]

    return corpus, id2word, words_lematized


  and should_run_async(code)


In [7]:

corpus, id2word, words_lematized = prepare_data([Party.FDP])
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                            id2word=id2word,
                                            num_topics=10,
                                            iterations=100
                                            )

lda_model.print_topics(num_topics=10, num_words=10)

# Compute Perplexity
print('\nPerplexity: ', lda_model.log_perplexity(corpus))
# a measure of how good the model is. lower the better.

# TODO change c_v https://www.baeldung.com/cs/topic-modeling-coherence-score
# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=words_lematized, dictionary=id2word, coherence='u_mass',
                                     corpus=corpus)
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)

  and should_run_async(code)



Perplexity:  -9.12974646112801

Coherence Score:  -4.595075607779679


##V

In [8]:
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word)
vis

  and should_run_async(code)
  by='saliency', ascending=False).head(R).drop('saliency', 1)


# other models

In [9]:
lsi_model = gensim.models.lsimodel.LsiModel(corpus=corpus, id2word=id2word, num_topics=10, chunksize=100)

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lsi_model, texts=words_lematized, dictionary=id2word, coherence='u_mass',
                                     corpus=corpus)
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)

lsi_model.print_topics(num_topics=10)

  and should_run_async(code)



Coherence Score:  -5.359567612732676


[(0,
  '0.288*"unternehmen" + 0.158*"digital" + 0.152*"chancen" + 0.148*"nachhaltig" + 0.142*"entwicklung" + 0.141*"freiheit" + 0.133*"wettbewerb" + 0.133*"digitale" + 0.131*"international" + 0.120*"bildung"'),
 (1,
  '0.357*"bildung" + -0.346*"unternehmen" + 0.296*"schulen" + 0.281*"beruflich" + 0.184*"beruflichen_bildung" + 0.147*"weiterbildung" + 0.140*"hochschulen" + 0.112*"leben" + 0.110*"finanziell" + 0.108*"digital"'),
 (2,
  '-0.437*"unternehmen" + 0.316*"freiheit" + 0.167*"land" + 0.165*"menschenrechte" + -0.164*"altersvorsorge" + -0.135*"gesetzlich" + -0.110*"arbeiten" + 0.108*"global" + 0.104*"nachhaltig" + 0.097*"richtung"'),
 (3,
  '0.234*"unternehmen" + -0.193*"schutz" + 0.153*"bildung" + 0.149*"freiheit" + -0.136*"gewalt" + -0.126*"altersvorsorge" + -0.123*"menschenrechte" + -0.119*"klaren" + -0.114*"polizei" + 0.113*"beruflich"'),
 (4,
  '-0.372*"co" + 0.241*"unternehmen" + 0.208*"freiheit" + -0.191*"klimaschutz" + -0.135*"kraftstoffe" + 0.121*"menschenrechte" + -0.119*

In [10]:
from pprint import pprint

hdp_model = gensim.models.hdpmodel.HdpModel(corpus=corpus, id2word=id2word,)
pprint(hdp_model.show_topics())

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=hdp_model, texts=words_lematized, dictionary=id2word, coherence='u_mass',
                                     corpus=corpus)
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)

  and should_run_async(code)


[(0,
  '0.002*unternehmen + 0.002*leben + 0.001*weltweit + 0.001*wiederum + '
  '0.001*cannabis + 0.001*reklamieren + 0.001*detaillierte + '
  '0.001*internationale + 0.001*lernen + 0.001*artikel + 0.001*aufenthaltes + '
  '0.001*wettbewerb + 0.001*langfristig + 0.001*lehrer + 0.001*schiffen + '
  '0.001*punktesystem + 0.001*betrauen + 0.001*arbeitsplatzes + '
  '0.001*aktienrente + 0.001*legitimieren'),
 (1,
  '0.002*themen + 0.002*ärmelkanals + 0.002*antisemitismus + 0.001*frei + '
  '0.001*tierärzte + 0.001*aufklären + 0.001*kommissarinnen + 0.001*bewältigen '
  '+ 0.001*grundgesetz + 0.001*schutze + 0.001*leistung + 0.001*abgestimmt + '
  '0.001*privat + 0.001*umweltwirkung + 0.001*grund + 0.001*form + '
  '0.001*globalisierung + 0.001*bedenken + 0.001*entsendegesetz + '
  '0.001*märkten'),
 (2,
  '0.002*zustimmung + 0.002*europaweit + 0.002*ansatz + 0.001*abwägungen + '
  '0.001*generationen + 0.001*zukünftig + 0.001*gesprächskanäle + '
  '0.001*erträgen + 0.001*entwicklung + 0.00

# Plot coherence score

In [11]:
from datetime import datetime
import matplotlib.pyplot as plt


def plot_coherence(coherence, max_topics, max_iterations, parties, iteration_intervall=25):
    corpus, id2word, words_lematized = prepare_data(parties)

    data = {}
    for iterations in range(iteration_intervall, max_iterations, iteration_intervall):
        statistics = {}
        for topics in range(1, max_topics):
            lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                                        id2word=id2word,
                                                        num_topics=topics,
                                                        iterations=iterations
                                                        )
            coherence_model_lda = CoherenceModel(model=lda_model, texts=words_lematized, dictionary=id2word,
                                                 coherence=coherence, corpus=corpus)
            coherence_lda = coherence_model_lda.get_coherence()
            statistics[topics] = coherence_lda
        data[iterations] = statistics

    for key, value in data.items():
        plt.plot(list(value.keys()), list(value.values()), label=key)
    plt.legend()
    plt.xlabel('Number of topics')
    plt.ylabel('Coherence score')
    plt.title('Coherence score for different number of topics')

    plt.savefig(f'{datetime.now().strftime("%Y-%m-%d_%H-%M-%S")}_coherence_score_{coherence}_for_{parties[0]}_{len(parties)}.png')

  and should_run_async(code)


In [None]:
all_parties = [p for p in Party]

plot_coherence('u_mass', max_iterations=225, max_topics=20, parties=all_parties)
plot_coherence('c_v', max_iterations=225, max_topics=20, parties=all_parties)
plot_coherence('c_uci', max_iterations=225, max_topics=20, parties=all_parties)
plot_coherence('c_npmi', max_iterations=225, max_topics=20, parties=all_parties)

plot_coherence('u_mass', max_iterations=225, max_topics=20, parties=[Party.SPD])
plot_coherence('u_mass', max_iterations=225, max_topics=20, parties=[Party.CDU])
plot_coherence('u_mass', max_iterations=225, max_topics=20, parties=[Party.FDP])
plot_coherence('u_mass', max_iterations=225, max_topics=20, parties=[Party.GRUENE])
plot_coherence('u_mass', max_iterations=225, max_topics=20, parties=[Party.LINKE])


KeyboardInterrupt: 