# Imports and Loading the Document-Term Matrices (DTMs)


In [None]:
import os

try:
  import tmtoolkit
except:
  !pip install tmtoolkit
  os.kill(os.getpid(), 9)

In [None]:
try:
  from tmtoolkit.topicmod.tm_lda import compute_models_parallel
except: 
  !pip install tmtoolkit['lda']
  from tmtoolkit.topicmod.tm_lda import compute_models_parallel

In [None]:
import pickle
import scipy.sparse
import logging
import warnings

In [None]:
try:
  from lda import LDA
except: 
  !pip install lda

Collecting lda
  Downloading lda-2.0.0-cp37-cp37m-manylinux1_x86_64.whl (351 kB)
[K     |████████████████████████████████| 351 kB 4.9 MB/s 
[?25hCollecting pbr<4,>=0.6
  Downloading pbr-3.1.1-py2.py3-none-any.whl (99 kB)
[K     |████████████████████████████████| 99 kB 7.2 MB/s 
Installing collected packages: pbr, lda
Successfully installed lda-2.0.0 pbr-3.1.1


In [None]:
# Folder holding the pickled document labels / vocabularies and the saved
# sparse document-term matrices (DTMs).
# NOTE(review): the path looks like a Colab Google Drive mount -- confirm the
# drive is mounted before running this cell.
working_directory = '/content/drive/MyDrive/MSDS_marketing_text_analytics/master_files/2_topic_modeling'

# Fail early with an actionable message instead of an opaque error from the
# first open() call.
if not os.path.isdir(working_directory):
  raise FileNotFoundError(
      'Working directory not found: %s. If running on Colab, mount Google '
      'Drive first and check the path.' % working_directory)


def _load_pickle(file_name):
  """Unpickle `file_name` from `working_directory`, closing the file afterwards.

  SECURITY: pickle.load can execute arbitrary code; only use it on files
  that you created yourself or otherwise trust.
  """
  with open(os.path.join(working_directory, file_name), 'rb') as handle:
    return pickle.load(handle)


doc_labels = _load_pickle('doc_labels.p')
dtm_sm = scipy.sparse.load_npz(os.path.join(working_directory, 'small_dtm.npz'))
dtm_bg = scipy.sparse.load_npz(os.path.join(working_directory, 'big_dtm.npz'))

vocab_bg = _load_pickle('big_vocab.p')
vocab_sm = _load_pickle('small_vocab.p')

FileNotFoundError: ignored

# Creating Models

In [None]:
# suppress the "INFO" messages and warnings from lda
logger = logging.getLogger('lda')
logger.addHandler(logging.NullHandler())
logger.propagate = False
warnings.filterwarnings('ignore')

In [None]:
# set data to use: one LDA model is fitted per labelled document-term matrix
dtms = {
    'bigger': dtm_bg,
    'smaller': dtm_sm
}

# Number of topics. alpha below is derived from it so the two values cannot
# drift apart when the topic count is changed.
n_topics = 16

# and fixed hyperparameters
# Alpha represents document-topic density: with a higher alpha, documents
# are made up of more topics, and with a lower alpha, documents contain fewer
# topics.
# Beta (named `eta` in the lda package) represents topic-word density: with a
# high beta, topics are made up of most of the words in the corpus, and with a
# low beta they consist of few words.
# https://www.thoughtvector.io/blog/lda-alpha-and-beta-parameters-the-intuition/
lda_params = {
    'n_topics': n_topics,
    'eta': .01,
    'n_iter': 500,
    'random_state': 20191122,  # to make results reproducible
    'alpha': 1 / n_topics
}

# Fit the models for every DTM in `dtms` with the same constant parameters.
models = compute_models_parallel(dtms, constant_parameters=lda_params)

In [None]:
from tmtoolkit.topicmod.model_io import print_ldamodel_topic_words

# Take the first result stored under 'smaller'; its second element is used as
# the fitted model (NOTE(review): presumably a (parameters, model) pair --
# confirm against the tmtoolkit docs). Then print the top 5 words per topic.
first_result_sm = models['smaller'][0]
model_sm = first_result_sm[1]
print_ldamodel_topic_words(
    model_sm.topic_word_,
    vocab_sm,
    top_n=5,
)

In [None]:
# Take the first result stored under 'bigger'; its second element is used as
# the fitted model (NOTE(review): presumably a (parameters, model) pair --
# confirm against the tmtoolkit docs). Then print the top 5 words per topic.
first_result_bg = models['bigger'][0]
model_bg = first_result_bg[1]
print_ldamodel_topic_words(
    model_bg.topic_word_,
    vocab_bg,
    top_n=5,
)