In [1]:
import pandas as pd
import numpy as np
from pathlib import Path

import pyLDAvis
pyLDAvis.enable_notebook()

In [2]:
import sqlite3

In [3]:
conn = sqlite3.connect('/var/www/universityJobDay/universityJobDay/database/answers.db')

In [4]:
df = pd.read_sql_query('SELECT * from answers', conn)
conn.close()
df.head()

Unnamed: 0,id,sessionID,f1,f2,f3,f4,f5,f6,date
0,1,2N01KFSNC2RQIZM,1,3,3~0~2,4,1,1,2020-05-20 05:40:56.670853
1,2,EO29HHPJT4CYAO2,0,1,3~1,3,Risiko Datenschutz + Privatsphäre,2,2020-05-20 05:46:35.477319
2,3,2EL8BUZB5J0O873,0,3,6~0,4,"Chancen: Sicherheit, Prävention\nRisiken: der ...",2,2020-05-20 05:53:47.294080
3,4,C1BRP0FGCG6YSHS,0,3,0~1,2,"Chancen: Minimierung von Fehlern\nRisiken: ""En...",2,2020-05-20 06:13:47.718848
4,5,CP5VTIEK8FNC22P,1,3,0~2~1,2,Analyse von vielen unstrukturierten und auf de...,0,2020-05-20 06:24:26.482387


In [5]:
text = df["f5"]

# Text Preprocessing

Document = ein Text <br>
Corpus = Kollektion von dokumenten <br>
Vector = Mathematische Repräsentation des Dokuments <br>
Model = Algorithmus um die Vektoren zu transformieren <br>
<br>
<br>
https://radimrehurek.com/gensim/auto_examples/core/run_core_concepts.html#sphx-glr-auto-examples-core-run-core-concepts-py

In [6]:
import spacy
import de_core_news_sm
from spacy.lang.de.stop_words import STOP_WORDS
from spacy.lemmatizer import Lemmatizer

In [9]:
# load pre learned model for german text
nlp = de_core_news_sm.load()

In [10]:
text[5] # ein Dokument

'KI könnte den Arbeitsalltag erleichtern, indem Vorgänge automatisiert werden. Aber gerade im Bereich Medizin kann es zu riskant sein, KI einzusetzen, da hier Fehler möglicherweise auf die Gesundheit eines Menschens Einfluss haben könnten.'

In [11]:
# feed the documents into the nlp objects
document_list = [nlp(answer) for answer in text]

In [12]:
# delete stop words
without_stop_words = []
for doc in document_list:
    without_stop_words.append([token for token in doc if token.is_stop != True])

In [13]:
# lemmatizing
lemma_list = []
for doc in without_stop_words:
    lemma_list.append([token.lemma_ for token in doc])

In [14]:
lemma_list[:3] # no lda possible without further cleaning

[['1'],
 ['Risiko', 'Datenschutz', '+', 'Privatsphäre'],
 ['Chance',
  ':',
  'Sicherheit',
  ',',
  'Prävention',
  '\n',
  'Risiko',
  ':',
  'Mensch',
  'Hirn',
  'benutzen',
  '-',
  'Autofahren',
  ':',
  'KI',
  'übernehmen',
  'Routenführung',
  ',',
  'Einparken',
  'etc.',
  'System',
  'ausfällen',
  'hinterher',
  'handlungsunfähig']]

In [15]:
# expressions to be cleaned
expressions = [":", "-", "(", ")", "\n", "\n\n", "?", ":","\'", '\"', ".", ",", "'s", "...", "&", "+", "1", "2", "3", "4", "5", "6", "7", "8", "9", ";-)", " ", ";", "/", "z.", "b."]

# acutal cleaning
cleaned_lemma = []
for doc in lemma_list:
    cleaned_lemma.append([token for token in doc if token not in expressions])

In [16]:
#cleaned_lemma[20:]

In [17]:
# convert text to lowercase
low = []
for doc in cleaned_lemma:
    low.append([token.lower() for token in doc])

In [18]:
# terminate empty cells or 1-word cells
final = []
text_list = []
for doc, t in zip(low, text):
    if len(doc)>1:
        final.append(doc)
        text_list.append(t)

In [19]:
# due to the fact that we asked about risks and chances, clean also these two words (else every topic will have these two words in it --> no information value)
final2 = []
for doc in final:
    final2.append([token for token in doc if token not in ["risiko", "chance", "ki"]])

In [20]:
# check length of all arrays:
print("original Text Length: ", text.shape)
print("final2 length: ", len(final2))
print("new original text (without empty and 1-words): ", len(text_list))

original Text Length:  (32,)
final2 length:  29
new original text (without empty and 1-words):  29


In [21]:
final2[:3]

[['datenschutz', 'privatsphäre'],
 ['sicherheit',
  'prävention',
  'mensch',
  'hirn',
  'benutzen',
  'autofahren',
  'übernehmen',
  'routenführung',
  'einparken',
  'etc.',
  'system',
  'ausfällen',
  'hinterher',
  'handlungsunfähig'],
 ['minimierung', 'fehler', 'entmenschlichung', 'prozeß', 'jobverluste']]

## Gensim approach

In [22]:
from gensim import corpora as corp
import gensim
import pyLDAvis.gensim

scipy.sparse.sparsetools is a private module for scipy.sparse, and should not be used.
  _deprecated()


In [23]:
# define ngrams
bigram = gensim.models.Phrases(final2, min_count = 5, threshold =10)
bigram_mod = gensim.models.phrases.Phraser(bigram)

# build ngrams
ngrams = [bigram_mod[doc] for doc in final2]

In [24]:
# check ngrams
#ngrams

In [25]:
# build dictionary
dictionary = corp.Dictionary(ngrams)
print(dictionary)

Dictionary(267 unique tokens: ['datenschutz', 'privatsphäre', 'ausfällen', 'autofahren', 'benutzen']...)


In [26]:
#[dictionary.token2id] # jedes unique wort bekommt eine nummer

In [27]:
# Vektorisieren --> bags of words
bow_corpus_gensim = [dictionary.doc2bow(doc) for doc in ngrams]

In [28]:
numberOfTopics = 3
lda_gensim = gensim.models.LdaModel(bow_corpus_gensim, random_state=1, id2word=dictionary, num_topics = numberOfTopics)

# pyLDAvis
vis_gensim = pyLDAvis.gensim.prepare(lda_gensim, corpus = bow_corpus_gensim, dictionary=dictionary)
vis_gensim

Anscheinend gibts aufgrund der geringen Datenmenge nur 3 Topics. und eines ist eh ein Rest-Topic

In [29]:
"""wordPerTopic = [p for p in lda.print_topics(num_words=3)]
wordPerTopic"""
# top words per topic for gensim
wordPerTopic_gensim = []

for number in range(numberOfTopics):
    temp = []
    for entries in lda_gensim.show_topic(number):
        temp.append(entries[0])
    wordPerTopic_gensim.append(temp)

In [30]:
# plots corpus content
# [lda[corp] for corp in bow_corpus]

topicPerDoc_gensim = []
for corp in bow_corpus_gensim:
    temp = []
    for x in lda_gensim[corp]:
        temp.append([x[0], x[1]])
    topicPerDoc_gensim.append(temp)

# calculate the most probable topic for document
topTopicPerDoc_gensim = []
for topic in topicPerDoc_gensim:
    tempDf = pd.DataFrame(topic)
    idx = tempDf.iloc[:, 1].idxmax()
    topTopicPerDoc_gensim.append(tempDf.iloc[idx, 0])

In [31]:
# show top words per topic incl. probability
pd.DataFrame([p for p in lda_gensim.top_topics(bow_corpus_gensim)])

Unnamed: 0,0,1
0,"[(0.016726749, persönlich), (0.016130833, sehe...",-10.564442
1,"[(0.013934194, arbeit), (0.013794816, mensch),...",-17.552312
2,"[(0.016793981, mensch), (0.012737708, prozeß),...",-17.870275


In [32]:
# produce pandas dataframe for visual inspection

# get probability of each word per topic
lda_gensim_topic2Word = pd.DataFrame(lda_gensim.get_topics(), index = ["topic" + str(i) for i in range(numberOfTopics)], columns = [id for id in dictionary.token2id])

# get the most x common words for one topic
top = 5
lda_gensim_words = pd.DataFrame([row.sort_values(ascending = False).index.tolist()[:top] for _, row in lda_gensim_topic2Word.iterrows()], index = ["topic" + str(i) for i in range(numberOfTopics)])

# get most probable topic for the document
gensim_lda_total = pd.DataFrame(["topic"+ str(word) for word in topTopicPerDoc_gensim])

# prepare for merging
lda_gensim_words_new = lda_gensim_words.reset_index()
lda_gensim_words_new.columns = ["topic"] + ["word"+str(i) for i in range(top)]
gensim_lda_total.columns = ["topic"]

# merge result parts
gensim_lda_total = gensim_lda_total.merge(lda_gensim_words_new, how="left", left_on="topic", right_on="topic")
gensim_lda_total["text"] = text_list

In [33]:
gensim_lda_total.head()

Unnamed: 0,topic,word0,word1,word2,word3,word4,text
0,topic0,arbeit,mensch,vermeidung,datenschutz,sehen,Risiko Datenschutz + Privatsphäre
1,topic1,mensch,prozeß,überwachung,maschine,automatisieren,"Chancen: Sicherheit, Prävention\nRisiken: der ..."
2,topic1,mensch,prozeß,überwachung,maschine,automatisieren,"Chancen: Minimierung von Fehlern\nRisiken: ""En..."
3,topic1,mensch,prozeß,überwachung,maschine,automatisieren,Analyse von vielen unstrukturierten und auf de...
4,topic0,arbeit,mensch,vermeidung,datenschutz,sehen,"KI könnte den Arbeitsalltag erleichtern, indem..."


## SKlearn approach

In [34]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import pyLDAvis.sklearn

In [35]:
# create dummy function to trick the countVectorizer
def dummy(doc):
    return doc

In [36]:
# call the Count Vectorizer
cV = CountVectorizer(tokenizer=dummy, preprocessor=dummy)

In [37]:
# fit countvectorizer and transform to bow_corpus
cV.fit(final2) # call ".get_feature_names()" to retrieve feature names (aka dictionary)
bow_corpus_sk = cV.fit_transform(final2)



In [38]:
#number of counts
dic = cV.get_feature_names()
total_counts=np.zeros(len(dic))
for t in bow_corpus_sk:
    total_counts+=t.toarray()[0]

countings = pd.Series(total_counts, index=dic)
countings.describe() # mean of just 1.something tells us, that the words are only countet once most of the time

count    267.000000
mean       1.280899
std        0.760719
min        1.000000
25%        1.000000
50%        1.000000
75%        1.000000
max        8.000000
dtype: float64

In [39]:
countings.sort_values(ascending=False)[:10] #most counted word is "mensch"

mensch            8.0
überwachung       4.0
arbeit            4.0
prozeß            4.0
sehen             4.0
maschine          4.0
fehler            4.0
automatisieren    4.0
persönlich        3.0
medizin           3.0
dtype: float64

In [40]:
numberOfTopics = 5
alpha = 0.5 # the higher the more topics in one document
beta = 0.1 # the higher the more words of the corpus are in the topic

#call the lda object
lda_sk = LatentDirichletAllocation(n_components=numberOfTopics, doc_topic_prior=beta, topic_word_prior=alpha, random_state=1)
#fitting
lda_sk.fit(bow_corpus_sk)

# pyLDAvis
vis_sk = pyLDAvis.sklearn.prepare(lda_sk, bow_corpus_sk, cV)
vis_sk


In [41]:
# prepare to populate a pandas dataframe

# probability of each word in a topic
wordPerTopic_sk = pd.DataFrame(lda_sk.components_, index=["topic"+str(num) for num in range(lda_sk.n_components)], columns = cV.get_feature_names())

# top words for each topic
top = 5
topWordPerTopic_sk = pd.DataFrame([[name, rows.sort_values(ascending = False).index.tolist()[:top]] for name, rows in wordPerTopic_sk.iterrows()])

# probability of each topic per document
topicPerDoc_sk = pd.DataFrame(lda_sk.transform(bow_corpus_sk), index = ["commentary" + str(i) for i in range(len(final2))], columns = ["topic" + str(i) for i in range(lda_sk.n_components)])

topTopicPerDoc_sk = topicPerDoc_sk.T.apply(lambda x: x.idxmax())

# merge different parts
merged = pd.DataFrame(topTopicPerDoc_sk).merge(topWordPerTopic_sk, how="left")
merged = pd.concat([merged, pd.Series(text_list)], axis = 1)

# rename
merged.columns = ["topic", "words", "text"]

#split words in seperate cols
merged[["word" + str(n) for n in range(top)]] = pd.DataFrame(merged["words"].tolist())
newSorting = ["topic"] + ["word"+str(i) for i in range(top)] + ["text"]
merged = merged.loc[:, newSorting]

In [42]:
lda_sk.transform(bow_corpus_sk)

array([[0.04000155, 0.83998788, 0.0400019 , 0.04000181, 0.04000686],
       [0.00689681, 0.00689701, 0.97241253, 0.00689681, 0.00689684],
       [0.01818495, 0.92726596, 0.01818314, 0.01818265, 0.01818331],
       [0.01052705, 0.95789247, 0.01052702, 0.01052679, 0.01052666],
       [0.00689724, 0.97241137, 0.00689719, 0.00689719, 0.00689701],
       [0.04000416, 0.04001162, 0.04001219, 0.04000185, 0.83997018],
       [0.00740784, 0.00740791, 0.00740782, 0.0074078 , 0.97036863],
       [0.00645222, 0.97419171, 0.00645211, 0.00645204, 0.00645192],
       [0.02222432, 0.022223  , 0.9111066 , 0.02222317, 0.0222229 ],
       [0.01818372, 0.01818289, 0.01818267, 0.01818263, 0.92726809],
       [0.00606113, 0.006061  , 0.00606111, 0.0060609 , 0.97575587],
       [0.97036898, 0.00740768, 0.00740776, 0.00740774, 0.00740784],
       [0.02222325, 0.911104  , 0.02222348, 0.02222342, 0.02222586],
       [0.00869592, 0.00869608, 0.00869598, 0.00869597, 0.96521605],
       [0.01052702, 0.95788997, 0.

In [157]:
merged.head()

Unnamed: 0,topic,word0,word1,word2,word3,word4,text
0,topic1,fehler,medizin,verselbstständigung,arbeitsplätzen,analyse,Risiko Datenschutz + Privatsphäre
1,topic2,mensch,übernehmen,bereich,system,maschine,"Chancen: Sicherheit, Prävention\nRisiken: der ..."
2,topic1,fehler,medizin,verselbstständigung,arbeitsplätzen,analyse,"Chancen: Minimierung von Fehlern\nRisiken: ""En..."
3,topic1,fehler,medizin,verselbstständigung,arbeitsplätzen,analyse,Analyse von vielen unstrukturierten und auf de...
4,topic1,fehler,medizin,verselbstständigung,arbeitsplätzen,analyse,"KI könnte den Arbeitsalltag erleichtern, indem..."


## sklearn NMF

In [144]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF

In [161]:
tfidf_vectorizer=TfidfVectorizer(preprocessor=dummy, tokenizer=dummy)

In [162]:
tfidf = tfidf_vectorizer.fit(final2)
tfidf_transformed = tfidf.transform(final2)
tfidf_names = tfidf.get_feature_names()

In [163]:
numberOfTopics = 3
# calculate nmf
nmf = NMF(n_components=numberOfTopics, random_state=1).fit(tfidf_transformed)

# vis
vis_nmf = pyLDAvis.sklearn.prepare(nmf, tfidf_transformed, tfidf)
vis_nmf

In [174]:
# prepare pd dataframe
# probability of each word in a topic
wordPerTopic_nmf = pd.DataFrame(nmf.components_, index = ["topic" + str(i) for i in range(numberOfTopics)], columns = tfidf.get_feature_names())

# top words for each topic
top = 5
topWordPerTopic_nmf = pd.DataFrame([[name, rows.sort_values(ascending = False).index.tolist()[:top]] for name, rows in wordPerTopic_nmf.iterrows()])

# probability of each topic per document
topicPerDoc_nmf = nmf.transform(tfidf_transformed)
nmf_result = pd.DataFrame(topicPerDoc_nmf)
topTopicPerDoc_nmf = nmf_result.T.apply(lambda x: x.idxmax())
topTopicPerDoc_nmf = pd.DataFrame(["topic"])

In [175]:
topTopicPerDoc_nmf

0     0
1     0
2     2
3     2
4     2
5     0
6     1
7     2
8     2
9     1
10    0
11    2
12    1
13    1
14    0
15    2
16    2
17    0
18    0
19    2
20    0
21    0
22    2
23    0
24    2
25    0
26    0
27    1
28    2
dtype: int64

In [170]:

# put parts together
nmf_merged = pd.DataFrame(topTopicPerDoc_nmf).merge(topWordPerTopic_nmf, how="left")
nmf_merged = pd.concat([nmf_merged, pd.Series(text_list)], axis = 1)

In [172]:
nmf_complete = nmf_result.T.apply(lambda x: x.idxmax())


NameError: name 'complete' is not defined

In [None]:
# prepare to populate a pandas dataframe

# probability of each word in a topic
wordPerTopic_sk = pd.DataFrame(lda_sk.components_, index=["topic"+str(num) for num in range(lda_sk.n_components)], columns = cV.get_feature_names())

# top words for each topic
top = 5
topWordPerTopic_sk = pd.DataFrame([[name, rows.sort_values(ascending = False).index.tolist()[:top]] for name, rows in wordPerTopic_sk.iterrows()])

# probability of each topic per document
topicPerDoc_sk = pd.DataFrame(lda_sk.transform(bow_corpus_sk), index = ["commentary" + str(i) for i in range(len(final2))], columns = ["topic" + str(i) for i in range(lda_sk.n_components)])

topTopicPerDoc_sk = topicPerDoc_sk.T.apply(lambda x: x.idxmax())

# merge different parts
merged = pd.DataFrame(topTopicPerDoc_sk).merge(topWordPerTopic_sk, how="left")
merged = pd.concat([merged, pd.Series(text_list)], axis = 1)

# rename
merged.columns = ["topic", "words", "text"]

#split words in seperate cols
merged[["word" + str(n) for n in range(top)]] = pd.DataFrame(merged["words"].tolist())
newSorting = ["topic"] + ["word"+str(i) for i in range(top)] + ["text"]
merged = merged.loc[:, newSorting]

In [84]:
nmf_merged.to_csv('/home/daniel/git/nmf_sklearn2.csv', sep=";") 

In [176]:
merged.to_csv('/home/daniel/git/lda_sklearn2.csv', sep =";")

In [177]:
gensim_lda_total.to_csv('/home/daniel/git/lda_gensim2.csv', sep = ";")

In [47]:
def text_analysis(numberOfTopics, numberOfTopWords, textData):
    """
    preprocesses text + calculates the LDA with specific number of topics.

    args-
    -numberOfTopics: the number of topics over all documents
    -numberOfTopWords: the number of top words to be displayed
    -textData: DataFrame of text documents

    returns [pyLDAvis.thml, pd.DataFrame all results, DataFrame for wordPerTopic]
    """

    # text preprocessing
    # initialize nlp
    nlp = de_core_news_sm.load()

    # feed the document into the object
    document_list = [nlp(answer) for answer in textData]

    # delete stop words
    without_stop_words = []
    for doc in document_list:
        without_stop_words.append([token for token in doc if token.is_stop != True])

    # lemmatize
    lemma_list = []
    for doc in without_stop_words:
        lemma_list.append([token.lemma_ for token in doc])

    # clean expressions
    expressions = [":", "-", "(", ")", "\n", "\n\n", "?", ":","\'", '\"', ".", ",", "'s", "...", "&", "+", "1", "2", "3", "4", "5", "6", "7", "8", "9", ";-)", " ", ";", "/", "z.", "b."]

    # acutal cleaning
    cleaned_lemma = []
    for doc in lemma_list:
        cleaned_lemma.append([token for token in doc if token not in expressions])

    # convert text to lowercase
    low = []
    for doc in cleaned_lemma:
        low.append([token.lower() for token in doc])

    # terminate empty cells or 1-word cells
    final = []
    text_list = []
    for doc, t in zip(low, textData):
        if len(doc)>1:
            final.append(doc)
            text_list.append(t)

    # cleane non informative words
    final2 = []
    for doc in final:
        final2.append([token for token in doc if token not in ["risiko", "chance", "ki"]])

    # text mining!
    # call vectorizer
    cV = CountVectorizer(tokenizer=dummy, preprocessor=dummy)

    # fit vecotrizer
    cV.fit(final2)

    # create bow corpus
    bow_corpus_sk = cV.transform(final2)

    # LDA
    alpha = 0.5 # the higher the more topics in one document
    beta = 0.1 # the higher the more words of the corpus are in the topic

    #call the lda object
    lda_sk = LatentDirichletAllocation(n_components=numberOfTopics, doc_topic_prior=beta, topic_word_prior=alpha, random_state=1)
    #fitting
    lda_sk.fit(bow_corpus_sk)

    # pyLDAvis
    vis_sk = pyLDAvis.sklearn.prepare(lda_sk, bow_corpus_sk, cV)
    vis_html = pyLDAvis.prepared_data_to_html(vis_sk)

    #prepare the pd.DataFrame!
    # probability of each word in a topic
    wordPerTopic_sk = pd.DataFrame(lda_sk.components_, index=["topic"+str(num) for num in range(lda_sk.n_components)], columns = cV.get_feature_names())

    # top words for each topic
    top = numberOfTopWords
    topWordPerTopic_sk = pd.DataFrame([[name, rows.sort_values(ascending = False).index.tolist()[:top]] for name, rows in wordPerTopic_sk.iterrows()])

    # probability of each topic per document
    topicPerDoc_sk = pd.DataFrame(lda_sk.transform(bow_corpus_sk), index = ["commentary" + str(i) for i in range(len(final2))], columns = ["topic" + str(i) for i in range(lda_sk.n_components)])

    topTopicPerDoc_sk = topicPerDoc_sk.T.apply(lambda x: x.idxmax())

    # merge different parts
    merged = pd.DataFrame(topTopicPerDoc_sk).merge(topWordPerTopic_sk, how="left")
    merged = pd.concat([merged, pd.Series(text_list)], axis = 1)

    # rename
    merged.columns = ["topic", "words", "text"]

    #split words in seperate cols
    merged[["word" + str(n) for n in range(top)]] = pd.DataFrame(merged["words"].tolist())
    newSorting = ["topic"] + ["word"+str(i) for i in range(top)] + ["text"]
    merged = merged.loc[:, newSorting]

    topWords = topWordPerTopic_sk.iloc[:, 0]
    topWords = pd.concat([topWords, pd.DataFrame(topWordPerTopic_sk.iloc[:,1].tolist(), columns = ["word " + str(n+1) for n in range(top)])], axis = 1)
    topWords.rename(columns={0:"Topic"}, inplace=True)

    return [vis_html, merged, topWords]

In [48]:
result = text_analysis(4, 5, text)

0                                                     1
1                     Risiko Datenschutz + Privatsphäre
2     Chancen: Sicherheit, Prävention\nRisiken: der ...
3     Chancen: Minimierung von Fehlern\nRisiken: "En...
4     Analyse von vielen unstrukturierten und auf de...
5     KI könnte den Arbeitsalltag erleichtern, indem...
6                                       gläserne Mensch
7     Ich bin kein Freund von KI und würde es auch n...
8                                                     1
9     Chancen: Schnelligkeit, Entlastung bei (lästig...
10    um ein vereinfachtes Leben mit einer Krankheit...
11    dazu habe ich leider keinerlei Meinung. Ich se...
12    Risiken: \nAbgabe von Kontrolle durch den Mens...
13    Chancen: \nGroße Fortschritte in Wissenschaft/...
14    Generell darin Abläufe zu vereinfachen und eff...
15    Für mich persönlich sehe ich keine Chancen von...
16    der gläserne Mensch, Privatsphäre, Abschaffung...
17    Chancen: Erleichterung der Arbeit durch sc

In [None]:
#https://medium.com/@yanlinc/how-to-build-a-lda-topic-model-using-from-text-601cdcbfd3a6

In [196]:
#https://medium.com/mlreview/topic-modeling-with-scikit-learn-e80d33668730

In [197]:
#https://towardsdatascience.com/building-a-topic-modeling-pipeline-with-spacy-and-gensim-c5dc03ffc619

In [None]:
#https://medium.com/mlreview/topic-modeling-with-scikit-learn-e80d33668730