In [1]:
from nltk.tokenize import RegexpTokenizer
from gensim import corpora, models
from gensim.models import Phrases
from sklearn import decomposition
import pyLDAvis.gensim as gensimvis
import pyLDAvis
import pandas as pd
import numpy as np
import re

# initialize tokenizer and stopwords
en_stop = ['a', 'about', 'above', 'after', 'again', 'against', 'all', 'am', 'an', 'and', 'any', 'are', "aren't", 'as',
           'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', "can't", 'cannot',
           'could', "couldn't", 'did', "didn't", 'do', 'does', "doesn't", 'doing', "don't", 'down', 'during', 'each',
           'few', 'for', 'from', 'further', 'had', "hadn't", 'has', "hasn't", 'have', "haven't", 'having', 'he', "he'd",
           "he'll", "he's", 'her', 'here', "here's", 'hers', 'herself', 'him', 'himself', 'his', 'how', "how's", 'i', "i'd",
           "i'll", "i'm", "i've", 'if', 'in', 'into', 'is', "isn't", 'it', "it's", 'its', 'itself', "let's", 'me', 'more',
           'most', "mustn't", 'my', 'myself', 'no', 'nor', 'not', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'ought',
           'our', 'ours', 'ourselves', 'out', 'over', 'own', 'same', "shan't", 'she', "she'd", "she'll", "she's", 'should',
           "shouldn't", 'so', 'some', 'such', 'than', 'that', "that's", 'the', 'their', 'theirs', 'them', 'themselves', 'then',
           'there', "there's", 'these', 'they', "they'd", "they'll", "they're", "they've", 'this', 'those', 'through', 'to',
           'too', 'under', 'until', 'up', 'very', 'was', "wasn't", 'we', "we'd", "we'll", "we're", "we've", 'were', "weren't",
           'what', "what's", 'when', "when's", 'where', "where's", 'which', 'while', 'who', "who's", 'whom', 'why', "why's",
           'with', "won't", 'would', "wouldn't", 'you', "you'd", "you'll", "you're", "you've", 'your', 'yours', 'yourself',
           'yourselves', 'apos', '&apos;m', '&apos;re', '&apos;s', 's', 'I', 'will', 'go', 'get', '(', ')', '?', ':', ';', ',', '.', '!',
           '/', '"', "'", "...","``", "&apos", "&apos;s", "&apos;&apos;", "-lsb-", "-rsb-", "-lcb-", "-rcb-", "-lrb-", "-rrb-",
           "O&apos;MALLEY", "--"]

stop_chars = ['<', '>']

# get all lemmas between a <story>-</story>-pair:
stories = []
lemma_stories = []
with open('1-2-3_story_test.txt') as infile:
    for line in infile:
        l = line.rstrip()
        if l == "<story>":
            story = []
        elif l == "</story>":
            stories.append(story)
        elif not any(stop_char in l for stop_char in stop_chars):
                story.append(l)


with open('1-2-3_lemma_test.txt') as infile:
    for line in infile:
        l = line.rstrip()
        if l == "<story>":
            story = []
        elif l == "</story>":
            lemma_stories.append(story)
        elif not any(stop_char in l for stop_char in stop_chars):
            if l not in en_stop:
                story.append(l)

# create dictionary and wordcounts corpus:
dictionary = corpora.Dictionary(lemma_stories)
#print(dictionary.token2id)
#dictionary.save("wordcounts.dict")

print(len(dictionary))


# Bag-of-words representation of the stories.
corpus = [dictionary.doc2bow(story) for story in lemma_stories]
#print(corpus)
#corpora.MmCorpus.serialize("corpus.mm", corpus)
print(len(corpus))
print(len(stories))
print(len(lemma_stories))

# create tf.idf model:
tfidf_model = models.TfidfModel(corpus)
#tfidf_model.save("tfidf_model")
tfidf_corpus = tfidf_model[corpus]
print(tfidf_corpus)
print(tfidf_corpus.chunksize)
# corpora.MmCorpus.serialize("tfidf_corpus.mm", tfidf_corpus)
#
# # create topic models:
# LDA
num_topics = 100

lda_model = models.ldamodel.LdaModel(corpus=tfidf_corpus, id2word=dictionary, num_topics=num_topics, update_every=0, chunksize=5000, passes=20)
#lda_model.save("lda_model")
lda_corpus = lda_model[tfidf_corpus]
#corpora.MmCorpus.serialize("lda_corpus.mm", lda_corpus)

print("\nTopics by Latent Dirichlet Allocation model")
topics_found_lda = lda_model.print_topics(num_topics=5, num_words=10)
all_topics = lda_model.print_topics(num_topics=100, num_words=10)

# print(topics_found_lda)



counter = 1

for t in topics_found_lda:
    print("Topic #{} {}".format(counter, t))
    words = re.findall('"([^"]+)"', t[1])
    words = ' '.join(words)
    print(words)
    counter += 1

topics = []
for t in all_topics:
    words = re.findall('"([^"]+)"', t[1])
    words = ' '.join(words)
    topics.append(words)

print(topics)
topics.extend(("Stories with actual words", "Number of subelements in stories with actual words", "Stories with lemmas",
               "Number of subelements in stories with lemmas"))
len(topics)



28918
3174
3174
<gensim.interfaces.TransformedCorpus object at 0x112df66d8>
None

Topics by Latent Dirichlet Allocation model
Topic #1 (9, '0.019*"Parade" + 0.018*"parade" + 0.017*"Pasadena" + 0.017*"Rose" + 0.009*"Boulevard" + 0.008*"chilly" + 0.007*"route" + 0.006*"Colorado" + 0.006*"wind" + 0.006*"Santa"')
Parade parade Pasadena Rose Boulevard chilly route Colorado wind Santa
Topic #2 (93, '0.005*"kidney" + 0.004*"C" + 0.003*"65" + 0.003*"R" + 0.003*"Hepatitis" + 0.002*"Paraguay" + 0.002*"shark" + 0.002*"attraction" + 0.002*"metrohealth" + 0.002*"stabbing"')
kidney C 65 R Hepatitis Paraguay shark attraction metrohealth stabbing
Topic #3 (1, '0.004*"turmoil" + 0.004*"mosque" + 0.003*"France" + 0.003*"Alex" + 0.003*"eat" + 0.002*"Marquardt" + 0.002*"Henry" + 0.002*"yell" + 0.002*"breakfast" + 0.002*"Supreme"')
turmoil mosque France Alex eat Marquardt Henry yell breakfast Supreme
Topic #4 (17, '0.002*"Sports" + 0.002*"struggle" + 0.002*"Jamaicans" + 0.002*"best-known" + 0.002*"wake" + 

104

In [5]:
lda_df = pd.DataFrame(columns=range(100))

for i in range(len(stories)):
    doc = lda_corpus[i]
    for top, prob in doc:
        lda_df.set_value(i, top, prob)
print(doc)
with open("Output.txt", "w") as text_file:
    text_file.write("Doc: %s" % (doc))



[(25, 0.26425516979561792), (28, 0.076466293243102654), (47, 0.062660885554882445), (50, 0.023414826679211648), (67, 0.15269501257254828), (79, 0.016806167017510552), (86, 0.063618879898939579), (96, 0.23016389982209157)]


In [6]:
lda_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,0.0399763,,...,,,,,,,,,0.0362688,
6,,,,,,,,,,,...,,,,,,,,,,
7,,,,,,,,,,,...,,,,,,,,,,
8,,,,,,,,,0.0209594,,...,,,,,,,,,,
9,,,,,,,,,,,...,,,,,,,,,,
10,,,,,,,,,,,...,,,,,,,,,,


In [11]:
print(stories)
#len(lda_df)
#lda_df[-1] = stories 
# lda_df[-2] = lda_df[-1].apply(lambda x: len(x))
# lda_df[-3] = lemma_stories # Not a good indicator
# lda_df[-4] = lda_df[-3].apply(lambda x: len(x))



IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.


In [None]:

lda_df.loc[-1] = topics  # adding a row

lda_df.index = lda_df.index + 1  # shifting index

lda_df = lda_df.sort_index() # moving up

lda_df.to_csv('1-2-3_test.csv')