In [1]:
import os
import pickle

import pandas as pd

In [2]:
# Load the pickled `docs` object; it is later passed to create_corpus(),
# which iterates it as a collection of token lists.
# NOTE(review): pickle.load can execute arbitrary code while unpickling --
# only open this file if it was produced by a trusted step of this project.
with open('./data/tokenizedTextNoun','rb') as f:
    docs = pickle.load(f)

In [3]:
from gensim import corpora

def create_corpus(docs, no_below=50, no_above=0.75):
    """Build a gensim dictionary and a bag-of-words corpus from tokenized docs.

    Parameters
    ----------
    docs : iterable of iterable of str
        Tokenized documents. The collection is traversed twice (once to build
        the dictionary, once to vectorize), so it must be re-iterable -- a
        one-shot generator would yield an empty corpus.
    no_below : int, optional
        Forwarded to ``Dictionary.filter_extremes``: tokens appearing in fewer
        than this many documents are dropped. Defaults to 50.
    no_above : float, optional
        Forwarded to ``Dictionary.filter_extremes``: tokens appearing in more
        than this fraction of the documents are dropped. Defaults to 0.75.

    Returns
    -------
    corpus : list
        One ``dictionary.doc2bow(doc)`` result per input document, in input
        order.
    dictionary : gensim.corpora.Dictionary
        The filtered dictionary used to produce ``corpus``.
    """
    dictionary = corpora.Dictionary(docs)
    print('Number of unique words in initial documents:', len(dictionary))

    # Prune the vocabulary before vectorizing so the bag-of-words ids refer to
    # the filtered (and re-numbered) dictionary.
    dictionary.filter_extremes(no_below=no_below, no_above=no_above)
    print('Number of unique words after removing rare and common words:', len(dictionary))

    # Term-document frequency: each document becomes its bag-of-words vector.
    corpus = [dictionary.doc2bow(doc) for doc in docs]
    return corpus, dictionary

In [4]:
# Build the bag-of-words corpus and the filtered dictionary from `docs`;
# %time reports how long the vocabulary build + vectorization takes.
%time corpus,dictionary = create_corpus(docs)

Number of unique words in initital documents: 86955
Number of unique words after removing rare and common words: 10053
CPU times: user 22.1 s, sys: 309 ms, total: 22.4 s
Wall time: 22.5 s


In [6]:
# Make sure the output folder exists first: saving into a missing directory
# raises FileNotFoundError on a fresh checkout.
os.makedirs('./ModelNoun19', exist_ok=True)

# Persist the filtered dictionary so the id <-> token mapping can be reloaded
# together with the trained model.
dictionary.save('./ModelNoun19/dictionary.dic')

In [7]:
# Human-readable view of two bag-of-words rows: every (token id, frequency)
# pair is shown as (token string, frequency).
[
    [(dictionary[token_id], count) for token_id, count in bow]
    for bow in corpus[1000:1002]
]

[[('body', 3),
  ('friend', 1),
  ('less', 1),
  ('long', 1),
  ('time', 11),
  ('watch', 1),
  ('week', 2),
  ('work', 2),
  ('different', 1),
  ('finish', 1),
  ('amount', 2),
  ('check', 1),
  ('much', 3),
  ('change', 1),
  ('day', 1),
  ('diet', 1),
  ('easy', 1),
  ('exercise', 10),
  ('family', 1),
  ('favorite', 2),
  ('goal', 2),
  ('good', 2),
  ('help', 1),
  ('journal', 1),
  ('key', 2),
  ('lifestyle', 1),
  ('list', 3),
  ('minute', 1),
  ('something', 2),
  ('step', 1),
  ('way', 1),
  ('phone', 1),
  ('right', 1),
  ('building', 1),
  ('measure', 1),
  ('note', 1),
  ('size', 1),
  ('strength', 1),
  ('water', 5),
  ('avoid', 1),
  ('common', 1),
  ('lot', 1),
  ('arm', 1),
  ('inch', 1),
  ('tape', 1),
  ('content', 1),
  ('helpful', 1),
  ('soft', 1),
  ('start', 1),
  ('stick', 1),
  ('sugar', 1),
  ('computer', 1),
  ('ipod', 1),
  ('lose', 3),
  ('new', 1),
  ('activity', 2),
  ('enough', 2),
  ('member', 1),
  ('tennis', 1),
  ('whatev', 1),
  ('enjoyable', 1),
  

In [8]:
from gensim.models import LdaModel

# Set training parameters.
num_topics = 19
chunksize = 500 # size of the doc looked at every pass
passes = 100 # number of passes through documents
iterations = 100
eval_every = None  # Don't evaluate model perplexity, takes too much time.

# Make a index to word dictionary.
#temp = dictionary[0]  # This is only to "load" the dictionary.
id2word = dictionary.id2token

%time ldamodel = LdaModel(corpus=corpus, id2word=id2word, chunksize=chunksize, \
                       alpha='auto', eta='auto', \
                       iterations=iterations, num_topics=num_topics, \
                       passes=passes, eval_every=eval_every,per_word_topics=True,random_state=46)

CPU times: user 1h 53min 31s, sys: 39.5 s, total: 1h 54min 10s
Wall time: 1h 18min 34s


# save the model

In [9]:
# Persist the trained LDA model next to the saved dictionary so the long
# training run above does not have to be repeated.
ldamodel.save('./ModelNoun19/ldamodel.model')

In [10]:
# Print the keywords of the model's topics (the model above is trained with
# num_topics = 19). The call is currently disabled; uncomment it to see them.
from pprint import pprint
#pprint(ldamodel.print_topics())

In [11]:
import logging
import warnings

# Only surface ERROR-level (and above) log records, with a timestamped format.
logging.basicConfig(
    level=logging.ERROR,
    format='%(asctime)s : %(levelname)s : %(message)s',
)

# Silence DeprecationWarning noise in the cells that follow.
warnings.filterwarnings(action="ignore", category=DeprecationWarning)

In [12]:
import pyLDAvis

# pyLDAvis 3.x renamed its gensim adapter from `pyLDAvis.gensim` to
# `pyLDAvis.gensim_models`; try the new name first and fall back to the old
# one so the cell works on either release line.
try:
    import pyLDAvis.gensim_models as gensimvis
except ImportError:
    import pyLDAvis.gensim as gensimvis

pyLDAvis.enable_notebook()

# Last expression of the cell, so the prepared visualization is rendered inline.
gensimvis.prepare(ldamodel, corpus, dictionary)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


### Model Coherence

In [13]:
import time
from gensim.models import CoherenceModel

# Time the whole coherence computation with wall-clock timestamps.
start = time.time()

# Score the trained model with the 'c_v' coherence measure, using the
# tokenized texts and the dictionary built earlier.
coherence_model_lda = CoherenceModel(
    model=ldamodel,
    texts=docs,
    dictionary=dictionary,
    coherence='c_v',
)
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)

end = time.time()
print(end - start)  # elapsed seconds


Coherence Score:  0.6145618527009855
279.1745400428772


In [14]:
### Model Perplexity

In [15]:
%time print('\nPerplexity: ', ldamodel.log_perplexity(corpus))  
# a measure of how good the model is. lower the better.


Perplexity:  -8.468058716249068
CPU times: user 1min 33s, sys: 3.4 s, total: 1min 36s
Wall time: 1min 13s
