In [119]:
from nltk.tokenize import RegexpTokenizer
from gensim import corpora, models
from gensim.models import Phrases
from sklearn import decomposition
import pyLDAvis.gensim as gensimvis
import pyLDAvis
import pandas as pd
import numpy as np
import re

# initialize tokenizer and stopwords
en_stop = ['a', 'about', 'above', 'after', 'again', 'against', 'all', 'am', 'an', 'and', 'any', 'are', "aren't", 'as',
           'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', "can't", 'cannot',
           'could', "couldn't", 'did', "didn't", 'do', 'does', "doesn't", 'doing', "don't", 'down', 'during', 'each',
           'few', 'for', 'from', 'further', 'had', "hadn't", 'has', "hasn't", 'have', "haven't", 'having', 'he', "he'd",
           "he'll", "he's", 'her', 'here', "here's", 'hers', 'herself', 'him', 'himself', 'his', 'how', "how's", 'i', "i'd",
           "i'll", "i'm", "i've", 'if', 'in', 'into', 'is', "isn't", 'it', "it's", 'its', 'itself', "let's", 'me', 'more',
           'most', "mustn't", 'my', 'myself', 'no', 'nor', 'not', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'ought',
           'our', 'ours', 'ourselves', 'out', 'over', 'own', 'same', "shan't", 'she', "she'd", "she'll", "she's", 'should',
           "shouldn't", 'so', 'some', 'such', 'than', 'that', "that's", 'the', 'their', 'theirs', 'them', 'themselves', 'then',
           'there', "there's", 'these', 'they', "they'd", "they'll", "they're", "they've", 'this', 'those', 'through', 'to',
           'too', 'under', 'until', 'up', 'very', 'was', "wasn't", 'we', "we'd", "we'll", "we're", "we've", 'were', "weren't",
           'what', "what's", 'when', "when's", 'where', "where's", 'which', 'while', 'who', "who's", 'whom', 'why', "why's",
           'with', "won't", 'would', "wouldn't", 'you', "you'd", "you'll", "you're", "you've", 'your', 'yours', 'yourself',
           'yourselves', 'apos', '&apos;m', '&apos;re', '&apos;s', 's', 'I', 'will', 'go', 'get', '(', ')', '?', ':', ';', ',', '.', '!',
           '/', '"', "'", "...","``", "&apos", "&apos;s", "&apos;&apos;", "-lsb-", "-rsb-", "-lcb-", "-rcb-", "-lrb-", "-rrb-",
           "O&apos;MALLEY", "--", " "]
en_stop_stories = ['(', ')', '?', ':', ';', ',', '.', '!', '/', '"', "'", "...","``", "-lsb-", "-rsb-", "-lcb-", "-rcb-", "-lrb-", "-rrb-", "--", " "]

stop_chars = ['<', '>']

# get all lemmas between a <story>-</story>-pair:
stories = []
lemma_stories = []

with open('3-4-5_story_test.txt') as infile:
    for line in infile:
        l = line.rstrip()
        if l == "<story>":
            story = []
        elif l == "</story>":
            stories.append(story)
        elif not any(stop_char in l for stop_char in stop_chars):
            if l not in en_stop:
                story.append(l)


with open('3-4-5_lemma_test.txt') as infile:
    for line in infile:
        l = line.rstrip()
        if l == "<story>":
            story = []
        elif l == "</story>":
            lemma_stories.append(story)
        elif not any(stop_char in l for stop_char in stop_chars):
            if l not in en_stop:
                story.append(l)

# create dictionary and wordcounts corpus:
dictionary = corpora.Dictionary(lemma_stories)
#print(dictionary.token2id)
#dictionary.save("wordcounts.dict")

print(len(dictionary))


# Bag-of-words representation of the stories.
corpus = [dictionary.doc2bow(story) for story in lemma_stories]
#print(corpus)
#corpora.MmCorpus.serialize("corpus.mm", corpus)
print(len(corpus))
print(len(stories))
print(len(lemma_stories))

# create tf.idf model:
tfidf_model = models.TfidfModel(corpus)
#tfidf_model.save("tfidf_model")
tfidf_corpus = tfidf_model[corpus]
#print(tfidf_corpus)
#print(tfidf_corpus.chunksize)
# corpora.MmCorpus.serialize("tfidf_corpus.mm", tfidf_corpus)
#
# # create topic models:
# LDA
num_topics = 100

lda_model = models.ldamodel.LdaModel(corpus=tfidf_corpus, id2word=dictionary, num_topics=num_topics, update_every=0, chunksize=5000, passes=20)
#lda_model.save("lda_model")
lda_corpus = lda_model[tfidf_corpus]
#corpora.MmCorpus.serialize("lda_corpus.mm", lda_corpus)

print("\nTopics by Latent Dirichlet Allocation model")
topics_found_lda = lda_model.print_topics(num_topics=5, num_words=10)
all_topics = lda_model.print_topics(num_topics=100, num_words=10)

# print(topics_found_lda)



counter = 1

for t in topics_found_lda:
    print("Topic #{} {}".format(counter, t))
    words = re.findall('"([^"]+)"', t[1])
    words = ' '.join(words)
    print(words)
    counter += 1

topics = []
for t in all_topics:
    words = re.findall('"([^"]+)"', t[1])
    words = ' '.join(words)
    topics.append(words)

print(topics)
topics.extend(("Stories with actual words", "Number of subelements in stories with actual words", "Stories with lemmas",
               "Number of subelements in stories with lemmas"))
len(topics)



28090
3798
3798
3798

Topics by Latent Dirichlet Allocation model
Topic #1 (66, '0.006*"book" + 0.005*"Rick" + 0.004*"human" + 0.004*"aleve" + 0.004*"choose" + 0.004*"heartburn" + 0.004*"choice" + 0.004*"crime" + 0.003*"eye" + 0.003*"Tanya"')
book Rick human aleve choose heartburn choice crime eye Tanya
Topic #2 (18, '0.008*"Paige" + 0.004*"Mario" + 0.004*"Fiorina" + 0.004*"Carly" + 0.004*"explode" + 0.003*"Charleston" + 0.003*"assault" + 0.003*"hotel" + 0.002*"Evan" + 0.002*"10:30"')
Paige Mario Fiorina Carly explode Charleston assault hotel Evan 10:30
Topic #3 (34, '0.015*"jackpot" + 0.014*"powerball" + 0.013*"million" + 0.013*"400" + 0.011*"drawing" + 0.009*"$" + 0.009*"prize" + 0.007*"Wednesday" + 0.007*"lottery" + 0.005*"Jackpot"')
jackpot powerball million 400 drawing $ prize Wednesday lottery Jackpot
Topic #4 (35, '0.012*"Avery" + 0.007*"documentary" + 0.007*"Netflix" + 0.007*"avery" + 0.006*"Steven" + 0.006*"murder" + 0.005*"viewer" + 0.005*"petition" + 0.005*"pardon" + 0.004*"

104

In [120]:
lda_df = pd.DataFrame(columns=range(100))

for i in range(len(lemma_stories)):
    doc = lda_corpus[i]
    for top, prob in doc:
        lda_df.set_value(i, top, prob)
print(doc)

with open("3-4-5_Output.txt", "w") as text_file:
    text_file.write("Doc: %s" % (doc))



[(16, 0.25743585570059729), (65, 0.56170294158152745)]


In [63]:
#lda_df.to_csv("1-2-3_df.csv")
#lda_df
# with open("1-2-3_lemma.txt", "w") as lemma_file:
#     for item in lemma_stories:
#         lemma_file.write("%s\n" % item)
# with open("1-2-3_story.txt", "w") as story_file:
#     for item in stories:
#         story_file.write("%s\n" % item)

In [128]:
lemma_stories2 = [x for x in lemma_stories if x]
stories2 = [x for x in stories if x]

with open("3-4-5_new_lemma.txt", "w") as lemma_file:
    for item in lemma_stories2:
        lemma_file.write("%s\n" % item)
with open("3-4-5_new_story.txt", "w") as story_file:
    for item in stories2:
        story_file.write("%s\n" % item)
print(len(lemma_stories2))
print(len(stories2))
print(len(lda_df))




3731
3734
3731


In [134]:
# print("stories:")
# for x in stories2:
#     if set(x).issubset(en_stop)== True:
#         print(x)
#     elif len(x) < 3:
#         print(x)

#del stories2[1585]
print(stories2[1000])


['Extreme', 'weather', 'Cleanup', 'efforts', 'way', 'Illinois', 'Missouri', 'floodwater', 'pumped', 'overflowing', 'roadways', 'In', 'Alexander', 'County', 'Illinois', '100', 'homes', 'flooded', 'levees', 'strain', 'keep', 'water', 'spilling', 'Then', 'St.', 'Louis', 'flooding', 'concerns', 'real', 'swollen', 'Mississippi', 'River', 'least', '31', 'people', 'died', 'state', 'due', 'historic', 'flooding', 'Those', 'headlines', 'See', 'You', 'shortly', 'Monday', 'morning', 'Water', 'water', 'everywhere', 'Thank', 'much']


In [111]:

lda_df[-1] = stories2
lda_df[-2] = lda_df[-1].apply(lambda x: len(x))
lda_df[-3] = lemma_stories2 # Not a good indicator
lda_df[-4] = lda_df[-3].apply(lambda x: len(x))

2961

In [118]:

lda_df.loc[-1] = topics  # adding a row

lda_df.index = lda_df.index + 1  # shifting index

lda_df = lda_df.sort_index() # moving up

lda_df.reset_index(drop=True, inplace=True)
lda_df.to_csv('3-4-5_test.csv')