In [1]:
import os
import pickle

import pandas as pd

In [2]:
# Load the pickled documents from the working directory.
# presumably a list of token lists (the file is named 'tokenizedText') -- TODO confirm.
# NOTE(review): pickle.load can execute arbitrary code; only load files you trust.
with open('tokenizedText', 'rb') as pickle_file:
    docs = pickle.load(pickle_file)

In [5]:
from gensim import corpora

def create_corpus(docs, no_below=20, no_above=0.75):
    """Build a gensim dictionary and a bag-of-words corpus from tokenized documents.

    Parameters
    ----------
    docs : iterable of iterable of str
        Tokenized documents. The collection is traversed twice (once to build
        the dictionary, once to vectorize), so a one-shot iterator is
        materialized into a list first.
    no_below : int, optional
        Forwarded to ``Dictionary.filter_extremes``: tokens contained in fewer
        than this many documents are dropped. Defaults to 20.
    no_above : float, optional
        Forwarded to ``Dictionary.filter_extremes``: tokens contained in more
        than this fraction of the documents are dropped. Defaults to 0.75.

    Returns
    -------
    corpus : list
        One ``dictionary.doc2bow(doc)`` result per document, in input order.
    dictionary : gensim.corpora.Dictionary
        The dictionary after filtering.
    """
    # A generator/iterator would be exhausted by Dictionary() and leave the
    # doc2bow pass below with nothing to iterate, silently yielding [].
    if iter(docs) is docs:
        docs = list(docs)

    dictionary = corpora.Dictionary(docs)
    print('Number of unique words in initital documents:', len(dictionary))

    dictionary.filter_extremes(no_below=no_below, no_above=no_above)
    print('Number of unique words after removing rare and common words:', len(dictionary))

    ## term document frequency
    corpus = [dictionary.doc2bow(doc) for doc in docs]
    return corpus, dictionary

In [8]:
%time corpus,dictionary = create_corpus(docs)

Number of unique words in initital documents: 98129
Number of unique words after removing rare and common words: 17522
CPU times: user 45.6 s, sys: 684 ms, total: 46.3 s
Wall time: 47.8 s


In [9]:
# Persist the filtered dictionary to disk.
# The target directory is created first so the save does not fail on a fresh
# checkout where ./ModelApril3 does not exist yet.
dictionary_path = './ModelApril3/dictionary.dic'
os.makedirs(os.path.dirname(dictionary_path), exist_ok=True)
dictionary.save(dictionary_path)

In [13]:
# Human-readable view of two bag-of-words documents: each (token id, count)
# pair is shown as (token, count). Indexing the dictionary by id performs the lookup.
[
    [(dictionary[token_id], count) for token_id, count in bow]
    for bow in corpus[1000:1002]
]

[[('avoid', 2),
  ('body', 3),
  ('create', 2),
  ('friend', 1),
  ('less', 1),
  ('long', 1),
  ('make', 6),
  ('simply', 1),
  ('time', 11),
  ('watch', 3),
  ('week', 2),
  ('work', 5),
  ('answer', 1),
  ('base', 2),
  ('different', 1),
  ('finish', 1),
  ('may', 2),
  ('would', 2),
  ('write', 1),
  ('amount', 2),
  ('back', 1),
  ('check', 1),
  ('much', 4),
  ('must', 1),
  ('always', 1),
  ('challenge', 1),
  ('change', 1),
  ('could', 4),
  ('day', 1),
  ('diet', 1),
  ('easy', 1),
  ('eat', 1),
  ('effort', 1),
  ('exercise', 16),
  ('family', 1),
  ('favorite', 2),
  ('feel', 4),
  ('find', 1),
  ('goal', 2),
  ('good', 2),
  ('help', 3),
  ('instead', 3),
  ('journal', 1),
  ('key', 2),
  ('lifestyle', 1),
  ('list', 3),
  ('look', 1),
  ('minute', 1),
  ('often', 1),
  ('participate', 1),
  ('seem', 1),
  ('something', 2),
  ('start', 4),
  ('step', 1),
  ('try', 6),
  ('way', 1),
  ('phone', 1),
  ('right', 1),
  ('set', 4),
  ('add', 1),
  ('ask', 1),
  ('become', 1),
  

In [14]:
from gensim.models import LdaModel

# Set training parameters.
num_topics = 42
chunksize = 500 # size of the doc looked at every pass
passes = 100 # number of passes through documents
iterations = 100
eval_every = None  # Don't evaluate model perplexity, takes too much time.

# Make a index to word dictionary.
#temp = dictionary[0]  # This is only to "load" the dictionary.
id2word = dictionary.id2token

%time ldamodel = LdaModel(corpus=corpus, id2word=id2word, chunksize=chunksize, \
                       alpha='auto', eta='auto', \
                       iterations=iterations, num_topics=num_topics, \
                       passes=passes, eval_every=eval_every,per_word_topics=True,random_state=46)

CPU times: user 5h 52min 57s, sys: 3min 12s, total: 5h 56min 9s
Wall time: 3h 19min 22s


# save the model

In [15]:
# Persist the trained LDA model to disk.
# The target directory is created first so the save does not fail (and discard
# hours of training) when ./ModelApril3 does not exist yet.
ldamodel_path = './ModelApril3/ldamodel.model'
os.makedirs(os.path.dirname(ldamodel_path), exist_ok=True)
ldamodel.save(ldamodel_path)

In [16]:
import logging
import warnings

# Root logger: timestamped records, ERROR level and above only.
logging.basicConfig(
    format='%(asctime)s : %(levelname)s : %(message)s',
    level=logging.ERROR,
)

# Suppress DeprecationWarning messages for the rest of the session.
warnings.filterwarnings("ignore", category=DeprecationWarning)

In [17]:
import pyLDAvis.gensim
# Enable inline rendering, then prepare the visualization from the trained
# model, the bag-of-words corpus and the dictionary. The prepared object is
# the cell's last expression so the notebook displays it.
# NOTE(review): newer pyLDAvis releases expose this module as
# pyLDAvis.gensim_models -- confirm against the installed version.
pyLDAvis.enable_notebook()
lda_vis = pyLDAvis.gensim.prepare(ldamodel, corpus, dictionary)
lda_vis

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


### Model Coherence

In [20]:
import time
from gensim.models import CoherenceModel

# Compute the c_v coherence of the trained model against the tokenized
# documents, and report the wall-clock seconds the computation took.
start = time.time()

coherence_model_lda = CoherenceModel(
    model=ldamodel,
    texts=docs,
    dictionary=dictionary,
    coherence='c_v',
)
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)

end = time.time()
elapsed = end - start
print(elapsed)


Coherence Score:  0.6156779522369721
1571.254676103592


In [None]:
### Model Perplexity

In [21]:
%time print('\nPerplexity: ', ldamodel.log_perplexity(corpus))  
# a measure of how good the model is. lower the better.


Perplexity:  -9.683993532836034
CPU times: user 5min, sys: 21.4 s, total: 5min 21s
Wall time: 3min 10s
