# <font color="#49699E" size=40>Variational Bayes and the Craft of Generative Topic Modelling</font>

# LEARNING OBJECTIVES
# LEARNING MATERIALS
# INTRODUCTION


# GENERATIVE TOPIC MODELS


## Latent Dirichlet Allocation (LDA)


## LDA as a Graphical Model


## The Dirichlet in Latent Dirichlet Allocation


### Understanding the $\alpha$ hyperparameter
### Understanding the $\eta$ hyperparameter


## Variational Inference


## Selecting the Number of Topics


# TOPIC MODELLING WITH GENSIM

In [ ]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
# Show DataFrames with their plain-text repr instead of rendered HTML tables.
pd.set_option("display.notebook_repr_html", False)
# custom_seaborn is a helper from the dcss package; presumably it applies the
# plot styling used for the figures -- TODO confirm against dcss.plotting.
from dcss.plotting import custom_seaborn
custom_seaborn()

from dcss.text import preprocess, bow_to_df
 
from gensim import corpora
from pprint import pprint
from gensim.models import LdaModel
from gensim.models.ldamulticore import LdaMulticore
from gensim.models.coherencemodel import CoherenceModel
import pickle

In [ ]:
# Load the Canadian Hansard speeches. low_memory=False makes pandas parse the
# file in one pass instead of in chunks, which avoids mixed-type inference
# within a column at the cost of higher peak memory.
df = pd.read_csv('../data/canadian_hansards/lipad/canadian_hansards.csv', low_memory=False)

In [ ]:
# Print a summary of the loaded frame (columns, dtypes, non-null counts).
df.info()

In [ ]:
# Extract the raw speech strings and run them through the dcss preprocessing
# helper. bigrams=False / detokenize=False presumably yield one list of
# unigram tokens per speech -- TODO confirm against dcss.text.preprocess.
# NOTE(review): n_process=32 is hard-coded; lower it on machines with fewer cores.
texts = df['speechtext'].tolist()
processed_text = preprocess(texts, bigrams=False, detokenize=False, n_process = 32)

In [ ]:
# Number of preprocessed documents returned by preprocess().
len(processed_text)

In [ ]:
# Cache the preprocessed speeches on disk so the preprocessing step does not
# have to be repeated in later sessions.
with open('../data/pickles/preprocessed_speeches_canadian_hansards_no_bigrams.pkl', 'wb') as handle:
    pickle.dump(processed_text, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [ ]:
# Reload the cached preprocessed speeches. The context manager closes the file
# handle as soon as unpickling finishes (a bare open() would leave it open).
# NOTE: pickle can execute arbitrary code on load; only read pickles that were
# written by this notebook.
with open('../data/pickles/preprocessed_speeches_canadian_hansards_no_bigrams.pkl', 'rb') as handle:
    processed_text = pickle.load(handle)

#### Creating a Bag-of-Words with Gensim

In [ ]:
# Build the gensim Dictionary (the id2word mapping) from the preprocessed
# documents and save it to disk before any filtering is applied.
vocab = corpora.Dictionary(processed_text) # id2word
vocab.save('../models/lda_vocab.dict')

In [ ]:
# Start from the unfiltered dictionary saved on disk, then prune it: tokens
# must appear in at least 20 documents and in no more than 95% of documents.
# NOTE(review): filter_extremes is called without keep_n, so gensim's default
# cap on vocabulary size also applies -- confirm that is intended.
vocab = corpora.Dictionary.load('../models/lda_vocab.dict')
vocab.filter_extremes(no_above=0.95, no_below=20)

# Convert every document to its bag-of-words form using the pruned dictionary.
corpus = list(map(vocab.doc2bow, processed_text))

In [ ]:
# Vocabulary size after filter_extremes.
len(vocab)

## Running the Topic Model

In [ ]:
# Seed Python's random module so the document sample drawn with random.sample
# is reproducible across runs.
import random
random.seed(100)

In [ ]:
# Draw a random subset of up to 100,000 documents, keeping each bag-of-words
# vector aligned with its token list.
#
# Document positions are sampled instead of a materialized list of
# (bow, tokens) pairs, which avoids building a corpus-sized temporary list.
# random.sample selects by position, so for the same seed and corpus size this
# picks the same documents in the same order as sampling the zipped pairs.
# The sample size is capped at the corpus size so the cell does not raise
# ValueError on corpora with fewer than 100,000 documents.
n_docs = min(len(corpus), len(processed_text))
sample_idx = random.sample(range(n_docs), min(100000, n_docs))
sample_corpus = tuple(corpus[i] for i in sample_idx)
sample_text = tuple(processed_text[i] for i in sample_idx)

In [ ]:
# Fit a 100-topic LDA model on the document sample, letting gensim learn both
# priors from the data (alpha='auto', eta='auto'). random_state fixes the
# model's own RNG so the fit is repeatable.
# NOTE(review): per the gensim docs, eval_every=1 estimates log perplexity at
# every update and slows training considerably; it is left as-is because
# changing it may alter the fitted model -- confirm before tuning.
ldamod_s = LdaModel(corpus=sample_corpus,
                      id2word=vocab,
                      num_topics=100,
                      random_state=100,
                      eval_every=1,
                      chunksize=2000,
                      alpha='auto',
                      eta='auto',
                      passes=2,
                      update_every=1,
                      iterations=400
                  )

In [ ]:
# Cache the fitted sample model so it can be reloaded without retraining.
with open('../data/pickles/lda_model_sample.pkl', 'wb') as handle:
    pickle.dump(ldamod_s, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [ ]:
# Reload the cached sample model. The context manager closes the file handle
# as soon as unpickling finishes (a bare open() would leave it open).
# NOTE: pickle can execute arbitrary code on load; only read pickles that were
# written by this notebook.
with open('../data/pickles/lda_model_sample.pkl', 'rb') as handle:
    ldamod_s = pickle.load(handle)

In [ ]:
# Look up the topics the sample model associates with the term 'freedom'.
ldamod_s.get_term_topics('freedom')

In [ ]:
# Show the highest-weighted words of topic 53.
# NOTE(review): the topic id is presumably taken from the 'freedom' lookup and
# is only valid for this particular fitted model -- confirm after retraining.
ldamod_s.show_topic(53)

In [ ]:
# Look up the topics the sample model associates with the term 'criminal'.
ldamod_s.get_term_topics('criminal')

In [ ]:
# Show the highest-weighted words of topic 20.
# NOTE(review): the topic id is presumably taken from the 'criminal' lookup and
# is only valid for this particular fitted model -- confirm after retraining.
ldamod_s.show_topic(20)

In [ ]:
# Look up the topics the sample model associates with the term 'marriage'.
ldamod_s.get_term_topics('marriage')

In [ ]:
# Show the 30 highest-weighted words of topic 28.
# NOTE(review): the topic id is presumably taken from the 'marriage' lookup and
# is only valid for this particular fitted model -- confirm after retraining.
ldamod_s.show_topic(28, topn=30)

### Evaluating the Quality of Topic Models by Measuring Semantic Coherence


In [ ]:
# Score the sample model with the c_v coherence measure, using the same
# sampled token lists the model was trained on and the filtered dictionary.
coherence_model_s = CoherenceModel(model=ldamod_s, 
                                     texts=sample_text, 
                                     dictionary=vocab, 
                                     coherence='c_v')

# Aggregate coherence across all topics of the sample model.
coherence_lda_s = coherence_model_s.get_coherence()
print('Coherence Score: ', coherence_lda_s)

In [ ]:
# Cache the sample coherence model so it can be reloaded in later sessions.
with open('../data/pickles/coherence_model_sample.pkl', 'wb') as handle:
    pickle.dump(coherence_model_s, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [ ]:
# Reload the cached sample coherence model. The context manager closes the
# file handle as soon as unpickling finishes (a bare open() would leave it open).
# NOTE: pickle can execute arbitrary code on load; only read pickles that were
# written by this notebook.
with open('../data/pickles/coherence_model_sample.pkl', 'rb') as handle:
    coherence_model_s = pickle.load(handle)

In [ ]:
# Per-topic coherence (with its standard deviation) for the sample model,
# ordered from most to least coherent; ties are broken by the smaller std.
# The frame's index still carries the position of each topic in the
# per-topic result list.
topic_coherence_s = coherence_model_s.get_coherence_per_topic(with_std=True)
topic_coherence_df = (
    pd.DataFrame(topic_coherence_s, columns=['coherence', 'std'])
    .sort_values(by=['coherence', 'std'], ascending=[False, True])
)

In [ ]:
# Column means over the 10 most coherent topics (the frame is sorted by
# coherence in descending order).
topic_coherence_df.head(10).mean()

In [ ]:
# Column means over the 10 least coherent topics (the frame is sorted by
# coherence in descending order).
topic_coherence_df.tail(10).mean()

### Going Further with Better Priors

In [ ]:
# Reference priors to compare against the priors learned by the sample model.
#
# Asymmetric alpha: gensim documents its alpha='asymmetric' option as a fixed
# *normalized* prior of 1.0 / (topic_index + sqrt(num_topics)), so the vector
# is rescaled to sum to 1 to match what gensim would actually use.
# float32 replaces float16: half precision keeps only about three significant
# digits, which distorts the variance and mean that are reported to four
# decimal places.
alpha_asym = np.fromiter(
    (1.0 / (i + np.sqrt(100)) for i in range(100)),
    dtype=np.float32, count=100,
)
alpha_asym /= alpha_asym.sum()

# Symmetric eta: 1 / num_topics, with num_topics = 100.
eta_sym = 1/100

In [ ]:
# Priors learned by the sample model (it was fitted with alpha='auto' and
# eta='auto').
alpha_t = ldamod_s.alpha
eta_t = ldamod_s.eta

In [ ]:
# Report spread and mean of the learned alpha next to the asymmetric reference.
print(f"Trained alpha variance: {np.round(np.var(alpha_t), 4)!s}")
print(f"Asymmetric alpha variance: {np.round(np.var(alpha_asym), 4)!s}")
print(f"Trained alpha avg: {np.round(alpha_t.sum()/len(alpha_t), 4)!s}")
print(f"Asymmetric alpha avg: {np.round(alpha_asym.sum()/len(alpha_asym), 4)!s}")

# Same comparison for eta; eta_sym is a scalar, so its variance is 0 and its
# "average" is the value itself.
print(f"Trained eta variance: {np.round(np.var(eta_t), 4)!s}")
print(f"Symmetric eta variance: {np.round(np.var(eta_sym), 4)!s}")
print(f"Trained eta avg: {np.round(eta_t.sum()/len(eta_t),4)!s}")
print(f"Symmetric eta avg: {np.round(eta_sym, 4)!s}")

In [ ]:
# Fit a 100-topic model on the full corpus with the parallel implementation,
# reusing the priors learned by the sample model (alpha_t, eta_t) as fixed
# priors.
# NOTE(review): the gensim docs state that LdaMulticore does not support
# alpha='auto', which is presumably why the learned arrays are passed in --
# confirm. workers=15 is hard-coded; adjust to the available cores.
ldamod_f = LdaMulticore(corpus=corpus,
                      id2word=vocab,
                      num_topics=100,
                      random_state=100,
                      chunksize=2000,
                      alpha=alpha_t,
                      eta=eta_t,
                      passes=1,
                      iterations=10,
                      workers=15,
                      per_word_topics=True)

In [ ]:
# Cache the fitted full-corpus model so it can be reloaded without retraining.
with open('../data/pickles/lda_model_full.pkl', 'wb') as handle:
    pickle.dump(ldamod_f, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [ ]:
# Reload the cached full-corpus model. The context manager closes the file
# handle as soon as unpickling finishes (a bare open() would leave it open).
# NOTE: pickle can execute arbitrary code on load; only read pickles that were
# written by this notebook.
with open('../data/pickles/lda_model_full.pkl', 'rb') as handle:
    ldamod_f = pickle.load(handle)

In [ ]:
# Score the full-corpus model with the c_v coherence measure over all
# preprocessed documents, then compute the aggregate coherence.
coherence_model_full = CoherenceModel(model=ldamod_f,
                                     texts=processed_text,
                                     dictionary=vocab,
                                     coherence='c_v')
coherence_full = coherence_model_full.get_coherence()

In [ ]:
# Cache the full-corpus coherence model so it can be reloaded in later sessions.
with open('../data/pickles/coherence_model_full.pkl', 'wb') as handle:
    pickle.dump(coherence_model_full, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [ ]:
# Reload the cached full-corpus coherence model. The context manager closes
# the file handle as soon as unpickling finishes (a bare open() would leave it
# open).
# NOTE: pickle can execute arbitrary code on load; only read pickles that were
# written by this notebook.
with open('../data/pickles/coherence_model_full.pkl', 'rb') as handle:
    coherence_model_full = pickle.load(handle)

In [ ]:
# Display the aggregate coherence of the full-corpus model.
# NOTE(review): coherence_full is only assigned in the cell that calls
# get_coherence(); if that cell was skipped in favour of loading the pickled
# coherence model, this name is undefined.
coherence_full

In [ ]:
# Per-topic coherence (with its standard deviation) for the full-corpus model,
# ordered from most to least coherent; ties are broken by the smaller std.
topic_coherence_f = coherence_model_full.get_coherence_per_topic(with_std=True)
topic_coherence_f_df = (
    pd.DataFrame(topic_coherence_f, columns=['coherence', 'std'])
    .sort_values(by=['coherence', 'std'], ascending=[False, True])
)

In [ ]:
# Both frames are sorted by descending coherence, so head(30) holds the 30 most
# coherent topics and tail(30) the 30 least coherent ones.
print(f"Full model average coherence top 30 topics: {topic_coherence_f_df['coherence'].head(30).mean()!s}")
print(f"Sample model average coherence top 30 topics: {topic_coherence_df['coherence'].head(30).mean()!s}")
print(f"Full model average coherence bottom 30 topics: {topic_coherence_f_df['coherence'].tail(30).mean()!s}")
print(f"Sample model average coherence bottom 30 topics: {topic_coherence_df['coherence'].tail(30).mean()!s}")

In [ ]:
# The 10 most coherent topics of the full-corpus model.
topic_coherence_f_df.head(10)

In [ ]:
# The 10 most coherent topics of the sample model, for comparison.
topic_coherence_df.head(10)

### Visualizing Topic Model Output with pyLDAvis


In [ ]:
import pyLDAvis.gensim_models as gensimvis
from pyLDAvis import save_html

In [ ]:
# Build the pyLDAvis data structure from the full-corpus model, the
# bag-of-words corpus and the dictionary.
vis = gensimvis.prepare(ldamod_f, corpus, vocab)

In [ ]:
# Write the prepared visualization to an HTML file.
save_html(vis, '../data/misc/ldavis_full_model.html')

# CONCLUSION
## Key Points 
