In [None]:
import numpy as np
import pickle as pkl
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.feature_extraction import text
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import NMF


from nltk import word_tokenize, pos_tag


from gensim import corpora, models, similarities, matutils


from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

In [None]:
# Load the pickled story collection from the working directory.
# NOTE: unpickling can execute arbitrary code - only load files you created or trust.
with open("ScaryStories", 'rb') as story_file:
    stories = pkl.load(story_file)

# LSA/NMF

In [None]:
# pretty-prints the strongest words of every topic in a fitted model
def display_topics(model, feature_names, no_top_words, topic_names=None):
    '''Print a heading and the top-weighted words for each row of model.components_.

    model          -- fitted object exposing a components_ array (one row per topic)
    feature_names  -- sequence indexed by column position, giving the word for each column
    no_top_words   -- how many of the highest-weighted words to list per topic
    topic_names    -- optional sequence of labels; a missing/empty label falls back to the topic index
    '''
    for ix, weights in enumerate(model.components_):
        if topic_names and topic_names[ix]:
            print("\nTopic: '",topic_names[ix],"'")
        else:
            print("\nTopic ", ix)
        # argsort is ascending, so reverse it and keep the first no_top_words positions
        top_positions = weights.argsort()[::-1][:no_top_words]
        top_words = [feature_names[i] for i in top_positions]
        print(", ".join(top_words))

In [None]:
# use this to add stop words to "english"
def addStopWords(words):
    '''Return scikit-learn's ENGLISH_STOP_WORDS extended with every entry of `words`.

    `words` may be any iterable of strings; the result is a new frozenset and the
    built-in stop-word set itself is left unchanged.
    '''
    extra_words = frozenset(words)
    return text.ENGLISH_STOP_WORDS | extra_words

# Extra stop words for this corpus, merged with the built-in English list.
# Kept alphabetical so additions are easy to spot; each word is listed once
# because the result is a set and repeats have no effect.
stop_words = addStopWords([
    'amp', 'aren', 'best', 'car', 'catalog', 'com',
    'connection', 'couldn', 'day', 'didn', 'don', 'door',
    'food', 'friday', 'got', 'home', 'house', 'https',
    'inbox', 'japan', 'just', 'know', 'like', 'privacy',
    'room', 'said', 'series', 'stories', 'terms', 'thought',
    'time', 'wasn', 'week', 'weekly', 'went', 'weren',
])

In [None]:
# reduces a document to its nouns and adjectives so only those words reach the vectorizers
def nouns_adj(text):
    '''Tokenize `text`, POS-tag it, and return only the nouns and adjectives joined by spaces.

    A token is kept when its tag starts with 'NN' or 'JJ'. Word order is preserved.
    '''
    # note: inside this function the parameter `text` hides the sklearn `text` module imported above
    tagged_tokens = pos_tag(word_tokenize(text))
    kept_words = []
    for word, tag in tagged_tokens:
        if tag.startswith(('NN', 'JJ')):
            kept_words.append(word)
    return ' '.join(kept_words)

# noun/adjective-only version of every story, in the same order as `stories`
filteredStories = [nouns_adj(story) for story in stories]

In [12]:
# displays topics and returns array of documents by topic weights with easy to edit parameters
# parameters to edit are range of topic words, number of topics, lsa or nmf, count vectorizer or tfidf,
# documents with all words or just nouns and adjectives, and min and max times a topic word is allowed to appear
def makeVecShowTopics(ngram_range, numTopics, model='lsa', vecType='count_vectorizer', stop_words=stop_words, min_df=1, max_df=1.0, stories=stories, no_top_words=8, random_state=None):
    '''Vectorize `stories`, fit a topic model, print its topics and return the document-topic matrix.

    ngram_range   -- (min_n, max_n) tuple forwarded to the vectorizer
    numTopics     -- number of components (topics) to extract
    model         -- 'nmf' selects NMF; any other value selects LSA (TruncatedSVD)
    vecType       -- 'tfidf' selects TfidfVectorizer; any other value selects CountVectorizer
    stop_words    -- words to ignore; defaults to the notebook-level `stop_words` as it was
                     when this function was defined
    min_df/max_df -- document-frequency cut-offs forwarded to the vectorizer
    stories       -- iterable of document strings; defaults to the notebook-level `stories`
                     as it was when this function was defined
    no_top_words  -- how many words to print per topic (default 8, the previous fixed value)
    random_state  -- optional seed forwarded to the topic model; None keeps the library default

    Returns the array produced by the topic model's fit_transform (one row per document,
    one column per topic).
    '''
    # Newer scikit-learn releases validate `stop_words` and accept a list but not a
    # set/frozenset (which is what addStopWords returns), so normalise to a sorted list.
    if isinstance(stop_words, (set, frozenset)):
        stop_words = sorted(stop_words)

    # Build only the vectorizer that was asked for.
    vectorizer_cls = TfidfVectorizer if vecType == 'tfidf' else CountVectorizer
    vectorizer = vectorizer_cls(ngram_range=ngram_range, stop_words=stop_words,
                                token_pattern="\\b[a-z][a-z]+\\b", min_df=min_df, max_df=max_df)
    dtm = vectorizer.fit_transform(stories)

    if model == 'nmf':
        topic_model = NMF(n_components=numTopics, random_state=random_state)
    else:
        topic_model = TruncatedSVD(n_components=numTopics, random_state=random_state)
    doc_topics = topic_model.fit_transform(dtm)

    # get_feature_names() was removed in scikit-learn 1.2; prefer the replacement and
    # fall back to the old name so older installations keep working.
    if hasattr(vectorizer, 'get_feature_names_out'):
        feature_names = vectorizer.get_feature_names_out()
    else:
        feature_names = vectorizer.get_feature_names()

    display_topics(topic_model, feature_names, no_top_words)
    return doc_topics


In [20]:
# edit parameters accordingly and find best topic models
# (integer min_df / max_df are absolute document counts, not proportions)
makeVecShowTopics(
    (1, 1),
    5,
    model='nmf',
    vecType='tfidf',
    min_df=6,
    max_df=10,
    stories=filteredStories,
)


Topic  0
driver, cars, shirt, bus, ear, neck, seat, super

Topic  1
table, water, strange, regular, sun, yard, moment, bedroom

Topic  2
apartment, ghost, girlfriend, word, air, father, chair, conversation

Topic  3
grandma, picture, downstairs, alive, father, closet, shut, able

Topic  4
office, mommy, plane, heart, class, fine, dogs, hallway


array([[5.25303857e-02, 1.23495477e-01, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00],
       [3.43243266e-02, 0.00000000e+00, 1.54401895e-02, 0.00000000e+00,
        1.01653514e-02],
       [0.00000000e+00, 1.10115006e-02, 5.07609679e-01, 0.00000000e+00,
        0.00000000e+00],
       ...,
       [5.75508714e-02, 1.01113496e-02, 8.06638083e-02, 6.57740415e-02,
        0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 1.52689116e-02, 8.77937905e-03,
        2.15195221e-01],
       [9.73002230e-03, 0.00000000e+00, 5.20710090e-05, 0.00000000e+00,
        2.80069065e-01]])

In [16]:
# best topic models (all NMF on the noun/adjective documents)
threeTopics = makeVecShowTopics(
    (1, 2), 3,
    model='nmf', vecType='tfidf',
    min_df=2, max_df=6,
    stories=filteredStories,
)
# no vecType given here, so this one uses the default count vectorizer rather than tf-idf
fourTopics = makeVecShowTopics(
    (1, 2), 4,
    model='nmf',
    min_df=2, max_df=6,
    stories=filteredStories,
)
sixTopics = makeVecShowTopics(
    (1, 1), 6,
    model='nmf', vecType='tfidf',
    min_df=3,
    stories=filteredStories,
)
fiveTopics = makeVecShowTopics(
    (1, 1), 5,
    model='nmf', vecType='tfidf',
    min_df=5, max_df=12,
    stories=filteredStories,
)


Topic  0
mommy, daddy, men, cousin, coworker, boy, park, little girl

Topic  1
plane, radio, wreck, roommate, today, north, market, state

Topic  2
serial, killer, serial killer, murders, payphone, scariest, calls, hospital

Topic  0
guys, construction, park, trip, fish, distance, grass, city

Topic  1
student, students, staff, afternoon, safety, deserted, answer, rooms

Topic  2
letters, sign, gun, sheriff, dumb, writer, department, box

Topic  3
arm, live, ambulance, porch, neighborhood, fence, scream, injury

Topic  0
guy, way, road, man, truck, night, window, work

Topic  1
board, ouija, friend, spirit, questions, friends, plane, experience

Topic  2
dream, light, friend, wall, sleep, white, thing, previous

Topic  3
old, brother, year, daughter, kitchen, daddy, open, friend

Topic  4
mom, grandma, picture, phone, sister, bathroom, dog, grandparents

Topic  5
girl, dad, little, school, bed, thing, son, people

Topic  0
driver, building, apartment, cars, super, glass, entire, store

In [21]:
# pickle arrays of topic weights, one file per model, each named after its variable
# (could have done this as list of lists)
topic_arrays = {
    'sixTopics': sixTopics,
    'threeTopics': threeTopics,
    'fourTopics': fourTopics,
    'fiveTopics': fiveTopics,
}

for file_name, doc_topic_array in topic_arrays.items():
    with open(file_name, 'wb') as out_file:
        pkl.dump(doc_topic_array, out_file)

# LDA

In [269]:
# Create a TfidfVectorizer for parsing/weighting words
# (note: despite its name, `counts` below holds tf-idf weights, not raw counts)
tfidf = TfidfVectorizer(
    ngram_range=(1, 1),
    stop_words='english',
    token_pattern="\\b[a-z][a-z]+\\b",
)

# learn the vocabulary and weight the noun/adjective documents in one step,
# then transpose the resulting matrix
counts = tfidf.fit_transform(filteredStories).transpose()

In [270]:
# Convert the transposed sparse matrix to a gensim corpus;
# documents_columns=True (gensim's default) treats each column as one document
corpus = matutils.Sparse2Corpus(counts, documents_columns=True)

In [271]:
# map number ID to word by inverting the vectorizer's word -> column-index vocabulary
id2word = {index: word for word, index in tfidf.vocabulary_.items()}

In [272]:
# lda experimenting - not going to work
# NOTE(review): `corpus` is built from tf-idf weights; LDA is usually fit on raw term counts -
# confirm this is intended before relying on these topics.
# random_state is fixed so that re-running the notebook prints the same topics.
lda = models.LdaModel(
    corpus=corpus,
    num_topics=3,
    id2word=id2word,
    passes=10,
    random_state=42,
)
lda.print_topics()

[(0,
  '0.003*"house" + 0.003*"room" + 0.002*"time" + 0.002*"night" + 0.002*"dream" + 0.002*"light" + 0.002*"years" + 0.002*"door" + 0.002*"tv" + 0.002*"cat"'),
 (1,
  '0.002*"board" + 0.001*"spirit" + 0.001*"car" + 0.001*"ground" + 0.001*"thought" + 0.001*"wife" + 0.001*"catalog" + 0.001*"closet" + 0.001*"chris" + 0.001*"cream"'),
 (2,
  '0.003*"car" + 0.003*"old" + 0.003*"door" + 0.003*"guy" + 0.003*"friend" + 0.003*"room" + 0.002*"day" + 0.002*"home" + 0.002*"bed" + 0.002*"year"')]