In [45]:
from nltk.tokenize import RegexpTokenizer
from gensim import corpora, models
from gensim.models import Phrases
from sklearn import decomposition
import pyLDAvis.gensim as gensimvis
import pyLDAvis
import pandas as pd
import numpy as np
import re

# initialize tokenizer and stopwords
en_stop = ['a', 'about', 'above', 'after', 'again', 'against', 'all', 'am', 'an', 'and', 'any', 'are', "aren't", 'as',
           'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', "can't", 'cannot',
           'could', "couldn't", 'did', "didn't", 'do', 'does', "doesn't", 'doing', "don't", 'down', 'during', 'each',
           'few', 'for', 'from', 'further', 'had', "hadn't", 'has', "hasn't", 'have', "haven't", 'having', 'he', "he'd",
           "he'll", "he's", 'her', 'here', "here's", 'hers', 'herself', 'him', 'himself', 'his', 'how', "how's", 'i', "i'd",
           "i'll", "i'm", "i've", 'if', 'in', 'into', 'is', "isn't", 'it', "it's", 'its', 'itself', "let's", 'me', 'more',
           'most', "mustn't", 'my', 'myself', 'no', 'nor', 'not', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'ought',
           'our', 'ours', 'ourselves', 'out', 'over', 'own', 'same', "shan't", 'she', "she'd", "she'll", "she's", 'should',
           "shouldn't", 'so', 'some', 'such', 'than', 'that', "that's", 'the', 'their', 'theirs', 'them', 'themselves', 'then',
           'there', "there's", 'these', 'they', "they'd", "they'll", "they're", "they've", 'this', 'those', 'through', 'to',
           'too', 'under', 'until', 'up', 'very', 'was', "wasn't", 'we', "we'd", "we'll", "we're", "we've", 'were', "weren't",
           'what', "what's", 'when', "when's", 'where', "where's", 'which', 'while', 'who', "who's", 'whom', 'why', "why's",
           'with', "won't", 'would', "wouldn't", 'you', "you'd", "you'll", "you're", "you've", 'your', 'yours', 'yourself',
           'yourselves', 'apos', '&apos;m', '&apos;re', '&apos;s', 's', 'I', 'will', 'go', 'get', '(', ')', '?', ':', ';', ',', '.', '!',
           '/', '"', "'", "...","``", "&apos", "&apos;s", "&apos;&apos;", "-lsb-", "-rsb-", "-lcb-", "-rcb-", "-lrb-", "-rrb-",
           "O&apos;MALLEY", "--", " "]
en_stop_stories = ['(', ')', '?', ':', ';', ',', '.', '!', '/', '"', "'", "...","``", "-lsb-", "-rsb-", "-lcb-", "-rcb-", "-lrb-", "-rrb-", "--", " "]

stop_chars = ['<', '>']

# get all lemmas between a <story>-</story>-pair:
stories = []
lemma_stories = []

with open('1-2-3_story_test.txt') as infile:
    for line in infile:
        l = line.rstrip()
        if l == "<story>":
            story = []
        elif l == "</story>":
            stories.append(story)
        elif not any(stop_char in l for stop_char in stop_chars):
                story.append(l)


with open('1-2-3_lemma_test.txt') as infile:
    for line in infile:
        l = line.rstrip()
        if l == "<story>":
            story = []
        elif l == "</story>":
            lemma_stories.append(story)
        elif not any(stop_char in l for stop_char in stop_chars):
            if l not in en_stop:
                story.append(l)

# create dictionary and wordcounts corpus:
dictionary = corpora.Dictionary(lemma_stories)
#print(dictionary.token2id)
#dictionary.save("wordcounts.dict")

print(len(dictionary))


# Bag-of-words representation of the stories.
corpus = [dictionary.doc2bow(story) for story in lemma_stories]
#print(corpus)
#corpora.MmCorpus.serialize("corpus.mm", corpus)
print(len(corpus))
print(len(stories))
print(len(lemma_stories))

# create tf.idf model:
tfidf_model = models.TfidfModel(corpus)
#tfidf_model.save("tfidf_model")
tfidf_corpus = tfidf_model[corpus]
print(tfidf_corpus)
print(tfidf_corpus.chunksize)
# corpora.MmCorpus.serialize("tfidf_corpus.mm", tfidf_corpus)
#
# # create topic models:
# LDA
num_topics = 100

lda_model = models.ldamodel.LdaModel(corpus=tfidf_corpus, id2word=dictionary, num_topics=num_topics, update_every=0, chunksize=5000, passes=20)
#lda_model.save("lda_model")
lda_corpus = lda_model[tfidf_corpus]
#corpora.MmCorpus.serialize("lda_corpus.mm", lda_corpus)

print("\nTopics by Latent Dirichlet Allocation model")
topics_found_lda = lda_model.print_topics(num_topics=5, num_words=10)
all_topics = lda_model.print_topics(num_topics=100, num_words=10)

# print(topics_found_lda)



counter = 1

for t in topics_found_lda:
    print("Topic #{} {}".format(counter, t))
    words = re.findall('"([^"]+)"', t[1])
    words = ' '.join(words)
    print(words)
    counter += 1

topics = []
for t in all_topics:
    words = re.findall('"([^"]+)"', t[1])
    words = ' '.join(words)
    topics.append(words)

print(topics)
topics.extend(("Stories with actual words", "Number of subelements in stories with actual words", "Stories with lemmas",
               "Number of subelements in stories with lemmas"))
len(topics)



28918
3174
3174
3174
<gensim.interfaces.TransformedCorpus object at 0x11b7a7fd0>
None

Topics by Latent Dirichlet Allocation model
Topic #1 (11, '0.003*"newsroom" + 0.003*"PSH" + 0.002*"Dix" + 0.002*"margin" + 0.002*"clergyman" + 0.001*"rage" + 0.001*"justice" + 0.001*"malnutrition" + 0.001*"Ft." + 0.001*"wanders"')
newsroom PSH Dix margin clergyman rage justice malnutrition Ft. wanders
Topic #2 (32, '0.006*"pastor" + 0.004*"Klemack" + 0.003*"theraflu" + 0.003*"expressmax" + 0.002*"Rachel" + 0.002*"Cadiz" + 0.002*"inferno" + 0.002*"aski" + 0.002*"lion" + 0.002*"Elton"')
pastor Klemack theraflu expressmax Rachel Cadiz inferno aski lion Elton
Topic #3 (35, '0.007*"equal" + 0.006*"inferno" + 0.006*"stock" + 0.005*"bank" + 0.005*"wine" + 0.005*"growth" + 0.004*"ally" + 0.004*"8,000" + 0.003*"release" + 0.003*"S&amp;P"')
equal inferno stock bank wine growth ally 8,000 release S&amp;P
Topic #4 (50, '0.006*"Blue" + 0.004*"listeria" + 0.004*"Bell" + 0.004*"outbreak" + 0.004*"pope" + 0.003*"bro

104

In [46]:
lda_df = pd.DataFrame(columns=range(100))

for i in range(len(lemma_stories)):
    doc = lda_corpus[i]
    for top, prob in doc:
        lda_df.set_value(i, top, prob)
print(doc)

with open("1-2-3_Output.txt", "w") as text_file:
    text_file.write("Doc: %s" % (doc))



[(3, 0.15140280788508031), (18, 0.02110060898899134), (24, 0.2142175298386137), (29, 0.46049301790890557), (82, 0.043935332843486705), (92, 0.012203885130569782)]


In [47]:
#lda_df.to_csv("1-2-3_df.csv")
#lda_df
with open("1-2-3_story.txt", "w") as story_file:
    for item in stories:
        story_file.write("%s\n" % item)
with open("1-2-3_lemma.txt", "w") as lemma_file:
    for item in lemma_stories:
        lemma_file.write("%s\n" % item)

In [48]:
lemma_stories2 = [x for x in lemma_stories if x != []]
stories2 = [x for x in stories if x

with open("1-2-3_new_lemma.txt", "w") as lemma_file:
    for item in lemma_stories2:
        lemma_file.write("%s\n" % item)
with open("1-2-3_new_story.txt", "w") as story_file:
    for item in stories2:
        story_file.write("%s\n" % item)
print(len(lemma_stories2))
print(len(stories2))
print(len(lda_df))




SyntaxError: invalid syntax (<ipython-input-48-70456e868288>, line 4)

In [38]:
lda_df[-1] = lemma_stories2
# lda_df[-2] = lda_df[-1].apply(lambda x: len(x))
# lda_df[-3] = lemma_stories # Not a good indicator
# lda_df[-4] = lda_df[-3].apply(lambda x: len(x))

In [None]:

lda_df.loc[-1] = topics  # adding a row

lda_df.index = lda_df.index + 1  # shifting index

lda_df = lda_df.sort_index() # moving up

lda_df.to_csv('1-2-3_test.csv')