In [170]:
from nltk.tokenize import RegexpTokenizer
from gensim import corpora, models
from gensim.models import Phrases
from sklearn import decomposition
import pyLDAvis.gensim as gensimvis
import pyLDAvis

# Stop-word list used to filter lemmas. Entries are compared by exact,
# case-sensitive string match, so 'i' and 'I' are separate entries.
en_stop = [
    # --- English function words and contractions, alphabetical ---
    'a', 'about', 'above', 'after', 'again', 'against', 'all', 'am', 'an', 'and', 'any', 'are', "aren't", 'as', 'at',
    'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by',
    "can't", 'cannot', 'could', "couldn't",
    'did', "didn't", 'do', 'does', "doesn't", 'doing', "don't", 'down', 'during',
    'each', 'few', 'for', 'from', 'further',
    'had', "hadn't", 'has', "hasn't", 'have', "haven't", 'having', 'he', "he'd", "he'll", "he's",
    'her', 'here', "here's", 'hers', 'herself', 'him', 'himself', 'his', 'how', "how's",
    'i', "i'd", "i'll", "i'm", "i've", 'if', 'in', 'into', 'is', "isn't", 'it', "it's", 'its', 'itself',
    "let's", 'me', 'more', 'most', "mustn't", 'my', 'myself',
    'no', 'nor', 'not', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'ought',
    'our', 'ours', 'ourselves', 'out', 'over', 'own',
    'same', "shan't", 'she', "she'd", "she'll", "she's", 'should', "shouldn't", 'so', 'some', 'such',
    'than', 'that', "that's", 'the', 'their', 'theirs', 'them', 'themselves', 'then',
    'there', "there's", 'these', 'they', "they'd", "they'll", "they're", "they've",
    'this', 'those', 'through', 'to', 'too',
    'under', 'until', 'up', 'very',
    'was', "wasn't", 'we', "we'd", "we'll", "we're", "we've", 'were', "weren't",
    'what', "what's", 'when', "when's", 'where', "where's", 'which', 'while',
    'who', "who's", 'whom', 'why', "why's", 'with', "won't", 'would', "wouldn't",
    'you', "you'd", "you'll", "you're", "you've", 'your', 'yours', 'yourself', 'yourselves',
    # --- additional word-like tokens ---
    'apos', 's', 'I', 'will', 'go', 'get',
    # --- punctuation tokens ---
    '(', ')', '?', ':', ';', ',', '.', '!', '/', '"', "'", "...", "``",
    # --- '&apos' entity fragments ---
    "&apos", "&apos;s", "&apos;&apos;",
]

# Characters that mark a line as markup; the story reader skips any line containing one.
stop_chars = ['<', '>']

# Collect the lemmas found between each <story> ... </story> pair.
def read_stories(path, stop_words, markup_chars):
    """Read a file with one token per line and group the tokens into stories.

    A line equal to "<story>" starts a new story and a line equal to
    "</story>" closes it. Every other line is treated as a single token
    (after stripping trailing whitespace) and is kept unless it is empty,
    contains one of ``markup_chars``, or is listed in ``stop_words``.

    Parameters:
        path: path of the text file to read.
        stop_words: iterable of tokens to drop (exact, case-sensitive match).
        markup_chars: iterable of characters; a line containing any of them is skipped.

    Returns:
        A list with one list of token strings per closed story. Tokens that
        appear outside a <story>/</story> pair, and a story that is never
        closed, are not returned.
    """
    ignored = set(stop_words)  # set gives constant-time membership tests
    collected = []
    # Defined before the loop so that tokens preceding the first <story>
    # are discarded instead of raising NameError.
    current = []
    with open(path) as infile:
        for line in infile:
            token = line.rstrip()
            if token == "<story>":
                current = []
            elif token == "</story>":
                collected.append(current)
                current = []
            elif not token:
                # Blank line: would otherwise be stored as an empty-string lemma.
                continue
            elif any(ch in token for ch in markup_chars):
                continue
            elif token not in ignored:
                current.append(token)
    return collected


stories = read_stories('test11.txt', en_stop, stop_chars)



In [171]:
# Inspect the parsed input: `stories` is a list holding one list of kept
# lemma strings per story. This prints every story in full, so the output
# grows with the size of the input file.
print(stories)

[['rocket'], ['faster', 'gyro', 'copter', 'powerful', 'pizza', 'rat', 'hit', 'harder', 'Holly', 'Holm', 'year', 'countless', 'victim', 'politics', 'sport', 'pop', 'culture', 'music', 'movie', 'take', 'league', 'Super', 'hero', 'actress', 'Melissa', 'Joan', 'Hart', 'CNN', 'quest', 'mean', 'business', 'Richard', 'Quest', 'model', 'reality', 'tv', 'star', 'Carmen', 'Carrera', 'actor', 'singer', 'Titus', 'Burgess', 'morning', 'radio', 'show', 'Bethany', 'Watson', 'CNN', 'Anchor', 'John', 'Berman', 'fusion', 'shut', 'Paul', 'F.', 'Tomkins', 'hln', 'morning', 'express', 'Robin', 'Meade', 'best', 'worst', '2015'], ['welcome', 'Tom', 'foreman', 'use', 'superhero', 'help', 'challenge', 'year', 'exasperating', 'exhilarating', 'comical', 'next', 'hour', 'start', 'best', 'worst', 'world', 'big', 'news'], ['year', 'first', 'think', 'lot', 'people', 'predict', 'lot', 'thing', 'wrong', 'can', 'believe', 'end', 'year', 'already', 'weird', 'weird', 'stuff', 'happen', 'kind', 'horrifying', 'term', 'head

In [172]:
# Build the gensim dictionary from the stories, store it on disk,
# and report how many entries it holds.
dictionary = corpora.Dictionary(stories)
dictionary.save("wordcounts.dict")
print(len(dictionary))

# Turn every story into its bag-of-words vector and store the result
# in Matrix Market format; then report corpus size and story count.
corpus = list(map(dictionary.doc2bow, stories))
corpora.MmCorpus.serialize("corpus.mm", corpus)
print(len(corpus), len(stories), sep="\n")


1524
14
14


In [173]:
# Fit a tf-idf model on the bag-of-words corpus and save it to disk.
tfidf_model = models.TfidfModel(corpus)
tfidf_model.save("tfidf_model")

# Apply the model to the same corpus and store the re-weighted corpus.
# (A bare `tfidf_corpus` expression used to sit here; in the middle of a
# cell it displays nothing, so it was removed.)
tfidf_corpus = tfidf_model[corpus]
corpora.MmCorpus.serialize("tfidf_corpus.mm", tfidf_corpus)


In [193]:
# create topic models:
# LSI
%time lsi_model = models.LsiModel(tfidf_corpus, id2word=dictionary, num_topics=10) # initialize an LSI transformation
lsi_corpus = lsi_model[tfidf_corpus]
print("\nTopics by Latent Sementic Indexing model")
topics_found_lsi = lsi_model.print_topics(num_topics=10, num_words=10)
counter = 1
for t in topics_found_lsi:
    print("Topic #{} {}".format(counter, t))
    counter += 1


CPU times: user 88.6 ms, sys: 30.2 ms, total: 119 ms
Wall time: 124 ms

Topics by Latent Sementic Indexing model
Topic #1 (0, '0.107*"see" + 0.101*"like" + 0.092*"just" + 0.090*"know" + 0.090*"one" + 0.087*"never" + 0.086*"call" + 0.084*"people" + 0.084*"pope" + 0.082*"wish"')
Topic #2 (1, '-0.277*"flight" + -0.277*"Malaysia" + -0.277*"Air" + -0.268*"indian" + -0.268*"ocean" + -0.207*"discovery" + -0.174*"hell" + -0.174*"dark" + -0.157*"sad" + -0.156*"miss"')
Topic #3 (2, '-0.375*"weird" + -0.189*"lot" + -0.187*"horrifying" + -0.187*"term" + -0.165*"headline" + -0.165*"wrong" + -0.158*"stuff" + -0.152*"predict" + -0.136*"already" + 0.128*"exhilarating"')
Topic #4 (3, '0.160*"Humira" + -0.142*"CNN" + 0.133*"infection" + -0.128*"morning" + 0.109*"febreze" + 0.106*"symptom" + 0.106*"St." + 0.106*"Jude" + 0.106*"treatment" + 0.103*"exasperating"')
Topic #5 (4, '0.201*"foreman" + 0.198*"exasperating" + 0.198*"welcome" + 0.198*"exhilarating" + 0.198*"superhero" + 0.198*"comical" + 0.155*"Tom

In [194]:
# LDA
num_topics = 10

%time lda_model = models.ldamodel.LdaModel(corpus=tfidf_corpus, id2word=dictionary, num_topics=num_topics, update_every=0, chunksize=1000, passes=20)
lda_model.save("lda_model")
lda_corpus = lda_model[corpus]
corpora.MmCorpus.serialize("lda_corpus.mm", lda_corpus)

print("\nTopics by Latent Dirichlet Allocation model")
topics_found_lda = lda_model.print_topics(num_topics=10, num_words=10)
counter = 1
for t in topics_found_lda:
    print("Topic #{} {}".format(counter, t))
    counter += 1
    

# Average topic coherence is the sum of topic coherences of all topics, divided by the number of topics.

top_topics = lda_model.top_topics(corpus, num_words=20)
avg_topic_coherence = sum([t[1] for t in top_topics ]) / num_topics
print('Average topic coherence: %.4f.' % avg_topic_coherence)

data_vis_lda = gensimvis.prepare(lda_model, corpus, dictionary)
pyLDAvis.display(data_vis_lda)


CPU times: user 2.8 s, sys: 38.5 ms, total: 2.84 s
Wall time: 2.93 s

Topics by Latent Dirichlet Allocation model
Topic #1 (0, '0.003*"Malaysia" + 0.003*"Air" + 0.003*"flight" + 0.003*"indian" + 0.003*"ocean" + 0.002*"miss" + 0.002*"discovery" + 0.002*"sad" + 0.002*"part" + 0.001*"year"')
Topic #2 (1, '0.002*"pope" + 0.002*"advocate" + 0.002*"refugee" + 0.002*"policy" + 0.001*"see" + 0.001*"never" + 0.001*"car" + 0.001*"police" + 0.001*"snow" + 0.001*"credit"')
Topic #3 (2, '0.001*"guarantee" + 0.001*"credit" + 0.001*"free" + 0.001*"information" + 0.001*"love" + 0.001*"movie" + 0.001*"Star" + 0.001*"see" + 0.001*"make" + 0.001*"rate"')
Topic #4 (3, '0.002*"voter" + 0.002*"Congress" + 0.002*"Clinton" + 0.001*"republican" + 0.001*"outsider" + 0.001*"process" + 0.001*"challenger" + 0.001*"people" + 0.001*"trend" + 0.001*"democratic"')
Topic #5 (4, '0.003*"Humira" + 0.002*"infection" + 0.002*"hell" + 0.002*"dark" + 0.002*"symptom" + 0.002*"treatment" + 0.002*"St." + 0.002*"Jude" + 0.002*"O

In [195]:
# HDP
%time hdp_model = models.hdpmodel.HdpModel(corpus, dictionary, T=10)
hdp_model.save("hdp_model")

print("\nTopics by Hierarchical Dirichlet process model")
topics_found_hdp = hdp_model.print_topics(num_topics=10, num_words=10)
counter = 1
for t in topics_found_hdp:
    print("Topic #{} {}".format(counter, t))
    counter += 1
vis_hdp = gensimvis.prepare(hdp_model, corpus, dictionary)
pyLDAvis.display(vis_hdp)


CPU times: user 68.1 ms, sys: 2.94 ms, total: 71 ms
Wall time: 74.1 ms

Topics by Hierarchical Dirichlet process model
Topic #1 (0, '0.004*high + 0.004*publish + 0.004*legion + 0.004*Court + 0.004*dramas + 0.003*Du + 0.003*see + 0.003*xx + 0.003*offline + 0.003*say')
Topic #2 (1, '0.004*see + 0.003*situation + 0.003*roof + 0.003*people + 0.003*derby + 0.003*male + 0.003*planned + 0.003*attack + 0.003*team + 0.003*crash')
Topic #3 (2, '0.005*Pope + 0.004*Am + 0.004*keep + 0.004*predict + 0.004*forever + 0.004*claim + 0.004*escalate + 0.004*piece + 0.003*Pan + 0.003*social')
Topic #4 (3, '0.004*relatively + 0.004*office + 0.004*Elly + 0.004*see + 0.003*bum-bum-bum-bum + 0.003*febreze + 0.003*obligation + 0.003*publish + 0.003*earthshaking + 0.003*Barack')
Topic #5 (4, '0.005*sense + 0.005*common + 0.005*teach + 0.004*genius + 0.004*brilliant + 0.004*sure + 0.003*check + 0.003*bright + 0.003*b + 0.003*drought')
Topic #6 (5, '0.005*making + 0.004*community + 0.004*Kim + 0.004*deflategate +

_io.TextIOWrapper