In [1]:
from sklearn.datasets import fetch_20newsgroups

In [3]:
# laod 20 newsgroups data
def load_dataset(sset, cats):
    if cats==[]:
        newsgroups_dset = fetch_20newsgroups(subset=sset,
                                            remove=('headers', 'footers', 'quotes'),
                                            shuffel=True)
    else: 
        newsgroups_dset = fetch_20newsgroups(subset = sset,
                                             categories=cats,
                                             remove=('headers', 'footers', 'quotes'),
                                             shuffle=True)
    return newsgroups_dset

categories = ["comp.windows.x", "misc.forsale", "rec.autos"]
categories += ["rec.motorcycles", "rec.sport.baseball", "rec.sport.hockey"]
categories += ["sci.crypt", "sci.med", "sci.space"]
categories += ["talk.politics.mideast"]


newsgroups_all = load_dataset('all', categories)

print(len(newsgroups_all.data))

URLError: <urlopen error [SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: unable to get local issuer certificate (_ssl.c:1000)>

In [9]:
# preprocess - import libraries
import nltk
import gensim
from nltk.stem import SnowballStemmer
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS as stopwords

In [10]:
# preprocess - define preprocessing function
stemmer = SnowballStemmer('english')

def stem(text):
    return stemmer.stem(text)

def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text, min_len=4):
        if token not in stopwords:
            result.append(stem(token))
    return result

In [5]:
# preprocess - inspect results
doc_sample = newsgroups_all.data[0]
print('Original document: ')
print(doc_sample)

print('\n\nTokenized document: ')
words = []
for token in gensim.utils.tokenize(doc_sample):
    words.append(token)
print(words)

print('\n\nPreprocessed document: ')
print(preprocess(doc_sample))

NameError: name 'newsgroups_all' is not defined

In [6]:
# preprocess - inspect output of group of documents
for i in range(0, 10):
    print(str(i) + '\t' + ", ".join(preprocess(newsgroups_all.data[i])[:10])) 

0	xpert, cursor, keyboard, cursor, key, mous, avail, hint, welcom, thank
1	obtain, copi, open, look, widget, obtain, need, order, copi, thank
2	right, signal, strong, live, west, philadelphia, perfect, sport, fan, dream
3	canadian, thing, coach, boston, bruin, colorado, rocki, summari, post, gather
4	heck, feel, like, time, includ, cafeteria, work, half, time, headach
5	damn, right, late, climb, meet, morn, bother, right, foot, asleep
6	olympus, stylus, pocket, camera, smallest, class, includ, time, date, stamp
7	includ, follow, chmos, clock, generat, driver, processor, chmos, eras, prom
8	chang, intel, discov, xclient, xload, longer, work, bomb, messag, error
9	termin, like, power, server, run, window, manag, special, client, program


In [7]:
from tqdm import tqdm #damit kann man sich den Fortschritt des Loops anzeigen

In [1]:
# preprocess - process all documents
processed_docs = []
for i in tqdm(range(0, len(newsgroups_all.data))):
    processed_docs.append(preprocess(newsgroups_all.data[i]))
    
print(len(processed_docs))

print(processed_docs[0:2])



NameError: name 'tqdm' is not defined

In [9]:
# preprocess - convert word content into dictionary
dictionary = gensim.corpora.Dictionary(processed_docs)
print(len(dictionary))

index = 0
for key, value in dictionary.iteritems():
    print(key, value)
    index +=1
    if index > 9:
        break


39350
0 avail
1 cursor
2 hint
3 key
4 keyboard
5 mous
6 thank
7 welcom
8 xpert
9 copi


In [10]:
# preprocess - further dimensionality reduction
dictionary.filter_extremes(no_below=10, no_above=0.5, keep_n = 10000)
print(len(dictionary))

bow_corpus = [dictionary.doc2bow(doc) for doc in tqdm(processed_docs)]
print(bow_corpus[0])

5868


100%|██████████| 9850/9850 [00:00<00:00, 24855.67it/s]

[(0, 1), (1, 2), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1)]





In [11]:
# preprocess - check word stems behind IDs from dictionary

bow_doc = bow_corpus[0]

for i in tqdm(range(len(bow_doc))):
    print(f"Key {bow_doc[i][0]} = \"{dictionary[bow_doc[i][0]]}\": occurrences={bow_doc[i][1]}")

100%|██████████| 9/9 [00:00<00:00, 6316.72it/s]

Key 0 = "avail": occurrences=1
Key 1 = "cursor": occurrences=2
Key 2 = "hint": occurrences=1
Key 3 = "key": occurrences=1
Key 4 = "keyboard": occurrences=1
Key 5 = "mous": occurrences=1
Key 6 = "thank": occurrences=1
Key 7 = "welcom": occurrences=1
Key 8 = "xpert": occurrences=1





In [12]:
# run LDA
id2word = dictionary
corpus = bow_corpus
# jetzt werden Hyperparameter gesetzt
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                                id2word=id2word,
                                                num_topics=10,
                                                random_state=100,
                                                update_every=1,
                                                chunksize=1000,
                                                passes=10,
                                                alpha='symmetric',
                                                iterations=100,
                                                per_word_topics=True)

In [13]:
from gensim.models import CoherenceModel

id2word = dictionary
corpus = bow_corpus

topics = [5, 10, 20, 40]

for i in tqdm(topics): 

    lda_model_i = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                                    id2word=id2word,
                                                    num_topics=i,
                                                    random_state=100,
                                                    update_every=1,
                                                    chunksize=1000,
                                                    passes=10, #Anzahl der Durchgänge, 10 ist recht wenig, kann mehr
                                                    alpha='symmetric',#wie sich Wörter über die Themen verteilen (geht auch asymetric und auto), aber symetric ist neutraler
                                                    iterations=100,#Anzahl der Durchläufe
                                                    per_word_topics=True)
    u_mass_i = CoherenceModel(model=lda_model_i, corpus=corpus, dictionary=dictionary, coherence='u_mass')
    
    print(u_mass_i.get_coherence())

 25%|██▌       | 1/4 [00:40<02:01, 40.44s/it]

-1.9872220135095524


 50%|█████     | 2/4 [01:20<01:20, 40.32s/it]

-2.116066325049365


 75%|███████▌  | 3/4 [02:03<00:41, 41.40s/it]

-2.4870672483583807


100%|██████████| 4/4 [03:13<00:00, 48.42s/it]

-2.9346177059722134





In [14]:
from gensim.models import CoherenceModel
u_mass = CoherenceModel(model=lda_model, corpus=corpus, dictionary=dictionary, coherence='u_mass')
c_v = CoherenceModel(model=lda_model, texts=processed_docs, dictionary=dictionary, coherence='c_v')
print(u_mass.get_coherence())
print(c_v.get_coherence())

-2.116066325049365
0.6309025191282835


In [15]:
for index, topic in lda_model.print_topics(-1):
    print(f"Topic: {index} \nWords: {topic}")

Topic: 0 
Words: 0.021*"encrypt" + 0.018*"secur" + 0.018*"chip" + 0.016*"govern" + 0.013*"clipper" + 0.012*"public" + 0.010*"privaci" + 0.010*"key" + 0.010*"phone" + 0.009*"algorithm"
Topic: 1 
Words: 0.017*"appear" + 0.014*"copi" + 0.013*"cover" + 0.013*"star" + 0.013*"book" + 0.011*"penalti" + 0.010*"black" + 0.009*"comic" + 0.008*"blue" + 0.008*"green"
Topic: 2 
Words: 0.031*"window" + 0.015*"server" + 0.012*"program" + 0.012*"file" + 0.012*"applic" + 0.012*"display" + 0.011*"widget" + 0.010*"version" + 0.010*"motif" + 0.010*"support"
Topic: 3 
Words: 0.015*"space" + 0.007*"launch" + 0.007*"year" + 0.007*"medic" + 0.006*"patient" + 0.006*"orbit" + 0.006*"research" + 0.006*"diseas" + 0.005*"develop" + 0.005*"nasa"
Topic: 4 
Words: 0.018*"armenian" + 0.011*"peopl" + 0.008*"kill" + 0.008*"said" + 0.007*"turkish" + 0.006*"muslim" + 0.006*"jew" + 0.006*"govern" + 0.005*"state" + 0.005*"greek"
Topic: 5 
Words: 0.024*"price" + 0.021*"sale" + 0.020*"offer" + 0.017*"drive" + 0.017*"sell" + 0

In [16]:
# analyze topics - main topics for each document in the collection
def analyse_topics(ldamodel, corpus, text):
    main_topic = {}
    percentage = {}
    keywords = {}
    text_snippets = {}
    
    for i, topic_list in enumerate(ldamodel [corpus]):
        topic = topic_list[0]
        topic = sorted(topic, key = lambda x: (x[1]), reverse = True)
        
        for j, (topic_num, prop_topic) in enumerate(topic):
            if j == 0:
                wp = ldamodel.show_topic(topic_num)
                topic_keywords = ", ".join([word for word, prop in wp[:5]])
                main_topic[i] = int(topic_num)
                percentage[i] = round(prop_topic, 4)
                keywords[i] = topic_keywords
                text_snippets[i] = text[i][:8]
            else:
                break
    return main_topic, percentage, keywords, text_snippets
            

In [17]:
main_topic, percentage, keywords, text_snippets = analyse_topics(lda_model, bow_corpus, processed_docs)

In [18]:
# analyze topics - print out main topic for each document in the collection

indexes = []
rows = []
for i in range(0, 10):
    indexes.append(i)
rows.append(['ID', 'Main Topic', 'Contribution (%)', 'Keywords', 'Snippet' ])

for idx in indexes:
    rows.append([str(idx), f"{main_topic.get(idx)}",
                f"{percentage.get(idx):.4f}",
                f"{keywords.get(idx)}\n",
                f"{text_snippets.get(idx)}"])
columns = zip(*rows)
column_width = [max(len(item) for item in col) for col in columns]
for row in rows:
    print(''.join('{:{width}} '.format(row[i], width=column_width[i]) for i in range(0, len(row))))

ID Main Topic Contribution (%) Keywords                               Snippet                                                                           
0  2          0.8268           window, server, program, file, applic
 ['xpert', 'cursor', 'keyboard', 'cursor', 'key', 'mous', 'avail', 'hint']         
1  6          0.4741           mail, list, file, inform, send
        ['obtain', 'copi', 'open', 'look', 'widget', 'obtain', 'need', 'order']           
2  7          0.4230           like, know, time, look, think
         ['right', 'signal', 'strong', 'live', 'west', 'philadelphia', 'perfect', 'sport'] 
3  8          0.4159           game, team, play, year, player
        ['canadian', 'thing', 'coach', 'boston', 'bruin', 'colorado', 'rocki', 'summari'] 
4  9          0.9039           peopl, think, like, time, right
       ['heck', 'feel', 'like', 'time', 'includ', 'cafeteria', 'work', 'half']           
5  7          0.6291           like, know, time, look, think
         ['damn', 'ri

In [19]:
# analyze topics - visualize with pyLDAvis

import pyLDAvis.gensim_models
#pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, dictionary=lda_model.id2word)
#vis

pyLDAvis.save_html(vis, 'lda.html')

  default_term_info = default_term_info.sort_values(
