# This notebook is an interface to conveniently create the run paramters file for either script (Dimensional Reduction or Modeling)

In [3]:
cd ../py_scripts

/Users/r0b00sj/Documents/Capstone/Capstone/py_scripts


In [4]:
import json
from utils_streamers import DirFileMgr

# Dimensional Reduction  

## Enter unique ID string HERE

In [5]:
dr_id_str = 'dr_500k'

In [14]:
dr_fps = DirFileMgr(dr_id_str)
dr_fps.create_all_dr_fps(new_setup='Y')

Which data set will be used? Enter either '5000' or 'full': gpu-full
corpus lst fp is assigned as  ../../outputs-git_ignored/dr_500k/dr_500k_lst.txt
dictionary fp is assigned as  ../../outputs-git_ignored/dr_500k/dr_500k.dict
counts dictionary fp is assigned as  ../../outputs-git_ignored/dr_500k/dr_500k_json.txt
dimensional reduction run parameters fp is assigned as  ../../outputs-git_ignored/dr_500k/dr_500k_dr_run_params.txt


## Set run parameters for the dimensional reduction stage

In [15]:
#Minimum number of documents that a token must appear in (otherwise it is filtered out)
min_freq = 5  #Default 5; set to None to skip frequency filtering

#Maximum percentage of the corpus in which a token can appear (otherwise it is filtered out)
max_freq = 0.5  #Default 0.5

#Number of words to keep in dictionary
keep_n = 500000  #Gensim default = 100000; I changed it here since our docs are so large (books)

#Dataset used
dataset = dr_fps.source_dir

In [16]:
dr_d = { 'min_freq' : min_freq, 'max_freq' : max_freq, 'keep_n' : keep_n, \
       'dataset' : dataset}

In [17]:
json.dump(dr_d, open(dr_fps.dr_run_params, 'w'))

# Modeling

## Enter unique ID string HERE

In [29]:
mod_id_str = 'gpu_mod_run_8'

In [30]:
mod_fps = DirFileMgr(mod_id_str)
mod_fps.create_all_modeling_fps(mod_id_str)

corpus lst fp is assigned as  ../../outputs-git_ignored/gpu_mod_run_8/gpu_mod_run_8_lst.txt
dictionary fp is assigned as  ../../outputs-git_ignored/gpu_mod_run_8/gpu_mod_run_8.dict
counts dictionary fp is assigned as  ../../outputs-git_ignored/gpu_mod_run_8/gpu_mod_run_8_json.txt
model fp is assigned as  ../../outputs-git_ignored/gpu_mod_run_8/gpu_mod_run_8.model
modeling run parameters fp is assigned as  ../../outputs-git_ignored/gpu_mod_run_8/gpu_mod_run_8_mod_run_params.txt


## Set run parameters for the modeling stage

For reference, gensim defaults:
corpus=None, num_topics=100, id2word=None, workers=None, chunksize=2000, passes=1, batch=False, alpha='symmetric', eta=None, decay=0.5, offset=1.0, eval_every=10, iterations=50, gamma_threshold=0.001, random_state=None, minimum_probability=0.01, minimum_phi_value=0.01, per_word_topics=False, dtype=type 'numpy.float32'

https://radimrehurek.com/gensim/models/ldamulticore.html

In [38]:
#3 * 8 * terms * topics

#Number of topics for LDA to train on
num_topics = 20  #Project default: 50 (for entire corpus)

#Number of cores on the machine running the modeling script MINUS 1
workers = 40 #Common values: 
            #recent macbook=2, 
            #recent macbook pro= 4 or 2
            #recent macbook air=2
            #t2.xlarge ec2=4
            #Spec too few cores and run is computationally sub-optimized
            #Spec too many and run may fail
#Workers will be equal to number of cores - 1

#Number of docs to process at a time
chunksize = 2000  #Default 2000

#Number of training passes for LDA algorithm
passes = 1  #Default 1
#10 might be high, esp for large corpus - I see 1-5 type values

#If batch is not set, perform online training by updating the model once 
#every workers * chunksize documents (online training). 
#Otherwise, run batch LDA, updating model only once at the end of each full corpus pass.
batch = False

#alpha and eta are hyperparameters that affect sparsity of the document-topic (theta) 
#and topic-word (lambda) distributions. Both default to a symmetric 
#1.0/num_topics prior.

#alpha can be set to an explicit array = prior of your choice. It also 
#support special values of ‘asymmetric’ and ‘auto’: the former uses a 
#fixed normalized asymmetric 1.0/topicno prior, the latter learns an 
#asymmetric prior directly from your data.
alpha = 'asymmetric'

#eta can be a scalar for a symmetric prior over topic/word distributions, 
#or a matrix of shape num_topics x num_words, which can be used to impose 
#asymmetric priors over the word distribution on a per-topic basis. This 
#may be useful if you want to seed certain topics with particular words by 
#boosting the priors for those words.
eta = None

#decay and offset parameters are the same as Kappa and Tau_0 in Hoffman 
#et al, respectively.
#http://papers.nips.cc/paper/3902-online-learning-for-latent-dirichlet-allocation.pdf
decay = 0.5
offset = 1.0

#Calculate and log perplexity estimate from the latest mini-batch once 
#every eval_every documents. Set to None to disable perplexity estimation 
#(faster), or to 0 to only evaluate perplexity once, at the end of each 
#corpus pass.
eval_every = 10

iterations = 50
#https://groups.google.com/forum/#!topic/gensim/aGXc0qiVBhU

#iterations continue until the difference between two consecutive topic (gamma) 
#estimates is less than `gamma_threshold`
gamma_threshold = 0.001
#https://groups.google.com/forum/#!topic/gensim/aGXc0qiVBhU

random_state = None

#controls filtering the topics returned for a document
minimum_probability = 0.01

minimum_phi_value = 0.01

per_word_topic = False

#-----------
#How often to do the maximization step as related to chunk size - for single LDA only
update_every = 0  #Default 0
#See this link for more detail: https://groups.google.com/forum/#!topic/gensim/ojySenxQHi4

In [39]:
mod_d = {'num_topics' : num_topics, \
         'workers' : cores, \
         'chunksize' : chunksize, \
         'passes' : passes, \
         'batch' : batch,\
         'alpha' : alpha, \
         'eta' : eta, \
         'decay' : decay, \
         'offset' : offset, \
         'eval_every' : eval_every, \
         'iterations' : iterations, \
         'gamma_threshold' : gamma_threshold, \
         'random_state' : random_state, \
         'minimum_probability' : minimum_probability, \
         'minimum_phi_value' : minimum_phi_value, \
         'per_word_topic' : minimum_phi_value, \
         'update_every' : update_every }
mod_d

{'alpha': 'asymmetric',
 'batch': False,
 'chunksize': 2000,
 'decay': 0.5,
 'eta': None,
 'eval_every': 10,
 'gamma_threshold': 0.001,
 'iterations': 50,
 'minimum_phi_value': 0.01,
 'minimum_probability': 0.01,
 'num_topics': 20,
 'offset': 1.0,
 'passes': 1,
 'per_word_topic': 0.01,
 'random_state': None,
 'update_every': 0,
 'workers': 41}

In [40]:
json.dump(mod_d, open(mod_fps.mod_run_params, 'w'))

In [35]:
#rule of thumb for memory reqts is 8 bytes per term per topic:
8 * keep_n * num_topics
#if this number is higher than your available memory, need to limit  
#terms and/or topics, or get more memory

NameError: name 'keep_n' is not defined