In [56]:
%load_ext autoreload
%autoreload 2
import sys

# Make the project's `src` package importable from this notebook.
# NOTE(review): this is a machine-specific absolute path; the notebook will not
# find `src` on another machine unless this constant is adjusted.
PROJECT_ROOT = '/Users/ericliu/Desktop/Latent-Dirichilet-Allocation'
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)
import torch as tr 
import numpy as np 
import pandas as pd 
from collections import defaultdict
from pprint import pprint


from sklearn.decomposition import LatentDirichletAllocation 
from src.lda_model import LDASmoothed 
from src.generator import doc_generator 
from src.utils import (
    get_vocab_from_docs, 
    get_np_wct, 
    data_loader,
    text_pipeline, 
    process_documents,
    compute_elbo,
) 
from src.text_pre_processor import (
    remove_accented_chars, 
    remove_special_characters, 
    remove_punctuation,
    remove_extra_whitespace_tabs,
    remove_stopwords,
)
from pprint import pprint 
import copy  

from src.cutils import (
    _dirichlet_expectation_1d, 
    _dirichlet_expectation_2d,
)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Document Generation

In [64]:
# Instantiate the synthetic-corpus generator with a flat (all-ones) topic prior
# over 5 topics, stored in double precision.
# NOTE(review): M is presumably the number of documents (5 are generated below);
# the exact meaning of L should be confirmed against src.generator.
gen = doc_generator(
    M=5,
    L=20,
    topic_prior=tr.ones(5, dtype=tr.double),
)

In [65]:
# Display the generator's beta tensor (presumably one row of word probabilities per topic — confirm in src.generator).
gen.beta

tensor([[0.1131, 0.0589, 0.0824, 0.1060, 0.0471, 0.0824, 0.0294, 0.1060, 0.0059,
         0.0047, 0.0353, 0.0059, 0.0118, 0.0035, 0.0012, 0.0012, 0.0118, 0.0118,
         0.0236, 0.0236, 0.0177, 0.0177, 0.0041, 0.0047, 0.0353, 0.0059, 0.0118,
         0.0353, 0.0177, 0.0118, 0.0029, 0.0035, 0.0029, 0.0012, 0.0353, 0.0029,
         0.0059, 0.0118, 0.0029, 0.0029],
        [0.0012, 0.0047, 0.0047, 0.0030, 0.0355, 0.0118, 0.0237, 0.0030, 0.0948,
         0.0592, 0.0592, 0.1066, 0.0474, 0.1066, 0.0948, 0.1126, 0.0237, 0.0059,
         0.0059, 0.0118, 0.0178, 0.0178, 0.0041, 0.0047, 0.0047, 0.0237, 0.0118,
         0.0296, 0.0059, 0.0237, 0.0030, 0.0036, 0.0030, 0.0012, 0.0036, 0.0030,
         0.0047, 0.0118, 0.0030, 0.0030],
        [0.0014, 0.0057, 0.0043, 0.0036, 0.0143, 0.0071, 0.0286, 0.0036, 0.0071,
         0.0043, 0.0214, 0.0014, 0.0428, 0.0043, 0.0143, 0.0014, 0.0500, 0.0999,
         0.0999, 0.0571, 0.0857, 0.0857, 0.1285, 0.1285, 0.0057, 0.0014, 0.0143,
         0.0071, 0.0071, 

In [66]:
# Display the generator's alpha attribute; the repr below shows a Dirichlet distribution with a 5-dimensional concentration.
gen.alpha

Dirichlet(concentration: torch.Size([5]))

In [67]:
# Shape of theta: 5 x 5 (presumably documents x topics — confirm in src.generator).
gen.theta.shape

torch.Size([5, 5])

In [68]:
# Display theta itself; each row printed below sums to ~1 (presumably per-document topic proportions — confirm).
gen.theta

tensor([[0.0446, 0.2003, 0.4756, 0.0900, 0.1896],
        [0.0206, 0.3647, 0.1089, 0.0235, 0.4823],
        [0.5233, 0.0665, 0.0771, 0.1263, 0.2068],
        [0.0402, 0.2183, 0.1981, 0.4762, 0.0672],
        [0.2672, 0.0198, 0.4494, 0.0683, 0.1953]], dtype=torch.float64)

In [69]:
# Sample the synthetic corpus. The call logs every (document, word position, topic, word)
# draw and returns the documents, which are inspected in the next cell.
docs = gen.generate_doc()

Document: 0 | word: 0 -> topic: art -> word: Technique
Document: 0 | word: 1 -> topic: art -> word: Symmetrical
Document: 0 | word: 2 -> topic: sport -> word: FIFA
Document: 0 | word: 3 -> topic: art -> word: Craftsmanship
Document: 0 | word: 4 -> topic: sport -> word: research
Document: 0 | word: 5 -> topic: art -> word: game
Document: 0 | word: 6 -> topic: sport -> word: recreation
Document: 0 | word: 7 -> topic: law -> word: attorney
Document: 0 | word: 8 -> topic: law -> word: accuse
Document: 0 | word: 9 -> topic: law -> word: contract
Document 0: Technique Symmetrical FIFA Craftsmanship research game recreation attorney accuse contract

Document: 1 | word: 0 -> topic: law -> word: divorce
Document: 1 | word: 1 -> topic: sport -> word: physical
Document: 1 | word: 2 -> topic: law -> word: court
Document: 1 | word: 3 -> topic: law -> word: accuse
Document: 1 | word: 4 -> topic: art -> word: concert
Document: 1 | word: 5 -> topic: law -> word: attorney
Document: 1 | word: 6 -> topic

In [70]:
# Show the generated corpus: a dict mapping document index -> space-separated document text.
docs

{0: 'Technique Symmetrical FIFA Craftsmanship research game recreation attorney accuse contract',
 1: 'divorce physical court accuse concert attorney football divorce contagious court',
 2: 'contagious scientst scientst divorce scientst concert contract electricity scientst court',
 3: 'attorney decongestant form athletics FIFA athletics decongestant game bruise injection',
 4: 'electricity concert Technique game Craftsmanship asymmetrical fever Craftsmanship asymmetrical contract'}

In [71]:
# Disabled alternative data source: loads the 'ap' dataset via data_loader instead of
# using the synthetic `docs` generated above. The cells below use `docs`.
#docs_raw_dict, raw_word_2_idx, raw_idx_2_word = data_loader('ap')

In [72]:
# Pre-process the synthetic corpus. The call prints corpus statistics and returns a dict
# holding the tokenised documents, vocabulary count structures and vocab <-> index maps
# (see the keys printed in the next cell).
# NOTE(review): the effect of sample=True is not visible from this notebook — confirm in src.utils.
result = process_documents(docs, sample=True)

There are 5 documents in the dataset after processing
On average estimated document length is 10.0 words per document after processing
There are 25 unique vocab in the corpus after processing


In [73]:
# List the entries returned by process_documents.
print(result.keys())

dict_keys(['documents', 'vocab_doc_count_dict', 'vocab_doc_count_array', 'vocab_to_idx', 'idx_to_vocab'])


In [74]:
# Tokenised corpus: one list of word tokens per document.
result['documents']

[['Technique',
  'Symmetrical',
  'FIFA',
  'Craftsmanship',
  'research',
  'game',
  'recreation',
  'attorney',
  'accuse',
  'contract'],
 ['divorce',
  'physical',
  'court',
  'accuse',
  'concert',
  'attorney',
  'football',
  'divorce',
  'contagious',
  'court'],
 ['contagious',
  'scientst',
  'scientst',
  'divorce',
  'scientst',
  'concert',
  'contract',
  'electricity',
  'scientst',
  'court'],
 ['attorney',
  'decongestant',
  'form',
  'athletics',
  'FIFA',
  'athletics',
  'decongestant',
  'game',
  'bruise',
  'injection'],
 ['electricity',
  'concert',
  'Technique',
  'game',
  'Craftsmanship',
  'asymmetrical',
  'fever',
  'Craftsmanship',
  'asymmetrical',
  'contract']]

In [75]:
# Vocabulary lookup: word -> integer index; used below as the column index of the count matrix.
result['vocab_to_idx']

{'Technique': 0,
 'Symmetrical': 1,
 'FIFA': 2,
 'Craftsmanship': 3,
 'research': 4,
 'game': 5,
 'recreation': 6,
 'attorney': 7,
 'accuse': 8,
 'contract': 9,
 'divorce': 10,
 'physical': 11,
 'court': 12,
 'concert': 13,
 'football': 14,
 'contagious': 15,
 'scientst': 16,
 'electricity': 17,
 'decongestant': 18,
 'form': 19,
 'athletics': 20,
 'bruise': 21,
 'injection': 22,
 'asymmetrical': 23,
 'fever': 24}

In [76]:
# Build the dense document-term count matrix that both LDA implementations consume:
#   rows    -> documents (in the order of result['documents'])
#   columns -> vocabulary entries (indexed via result['vocab_to_idx'])
#   values  -> number of occurrences of the word in the document (stored as float)
#
# The row count is taken from the processed documents that the loop iterates over,
# not from the raw `docs` input, so the matrix cannot fall out of step with the
# data if pre-processing ever drops a document.
documents = result['documents']
vocab_to_idx = result['vocab_to_idx']

doc_vocab_count = np.zeros(
    (len(documents), len(vocab_to_idx)),
    dtype=float,
)

for doc_idx, doc in enumerate(documents):
    for word in doc:
        doc_vocab_count[doc_idx, vocab_to_idx[word]] += 1

doc_vocab_count

array([[1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 1., 1., 0., 2., 1., 2., 1., 1., 1.,
        0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 0., 1., 1., 0., 1.,
        4., 1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0., 1., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 2., 1., 2., 1., 1., 0., 0.],
       [1., 0., 0., 2., 0., 1., 0., 0., 0., 1., 0., 0., 0., 1., 0., 0.,
        0., 1., 0., 0., 0., 0., 0., 2., 1.]])

In [77]:
# Wrap the count matrix in a DataFrame labelled by vocabulary word for easier reading.
# Column labels follow the iteration order of result['vocab_to_idx'].
vocab_labels = list(result['vocab_to_idx'])
doc_vocab_count_df = pd.DataFrame(doc_vocab_count, columns=vocab_labels)
doc_vocab_count_df

Unnamed: 0,Technique,Symmetrical,FIFA,Craftsmanship,research,game,recreation,attorney,accuse,contract,...,contagious,scientst,electricity,decongestant,form,athletics,bruise,injection,asymmetrical,fever
0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,1.0,4.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,2.0,1.0,2.0,1.0,1.0,0.0,0.0
4,1.0,0.0,0.0,2.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,2.0,1.0


# Our Model

In [260]:
# Instantiate our smoothed-LDA implementation on the processed synthetic corpus.
# The constructor prints its priors and the shapes of the variational parameters.
# NOTE(review): num_doc_population is 3 while the corpus holds 5 documents — confirm this is intended.
lda = LDASmoothed(
    docs=result['documents'],
    num_topics=5,
    num_doc_population=3,
    word_ct_dict=result['vocab_doc_count_dict'],
    word_ct_array=result['vocab_doc_count_array'],
)
print(f'Size of the vocab is {lda.V}')

Topic Dirichlet Prior, Alpha
1

Exchangeable Word Dirichlet Prior, Eta 
1

Var Inf - Word Dirichlet prior, Lambda
(5, 25)

Var Inf - Topic Dirichlet prior, Gamma
(5, 5)

Size of the vocab is 25


In [261]:
%%time
perplexity, suff_stats = \
    lda.approx_perplexity(
    doc_vocab_count,
    sampling=False,
)

print(perplexity)

-204.37721599278194
59.593369617582205
CPU times: user 954 µs, sys: 211 µs, total: 1.16 ms
Wall time: 988 µs


In [262]:
# Split the statistics returned by approx_perplexity; both are fed to e_step_batch below.
# Note the two names are spelled differently (expec_ vs expect_) and are used as such later.
expec_log_theta = suff_stats[0]
expect_log_beta = suff_stats[1]

- check hyperparameters 

In [263]:
# Priors of our model, one per line: alpha first, then eta.
print(lda._alpha_, lda._eta_, sep='\n')

1
1


- check var inf parameters 

In [264]:
# Variational parameter lambda of our model: label, shape, then the full matrix.
print('lambda', lda._lambda_.shape, lda._lambda_, sep='\n')

lambda
(5, 25)
[[1.04708219 0.98292693 0.97347267 0.97347428 1.16278368 1.07526208
  0.95052839 1.05181933 1.02101821 0.81760009 0.89893338 1.0283693
  1.15026429 0.97429619 0.94330106 1.00778149 0.9378975  0.96782864
  0.9953198  0.89475728 1.08105987 0.87968664 0.86986295 1.01644942
  1.07222286]
 [1.01387299 0.98516565 0.96690772 0.95138329 1.10597141 1.02937237
  0.95871485 0.93060329 1.05898791 1.1031785  1.09256024 0.91521059
  0.96611462 0.94959045 0.9782464  1.08000428 1.13828828 1.03320764
  0.93363961 0.99309432 1.16117251 1.00538214 0.96711283 0.97489658
  1.03274514]
 [0.91811143 0.94740917 1.09087585 1.02984852 1.00638968 1.09653088
  0.92820856 0.96431168 1.02652255 1.02295647 0.86193808 0.95525935
  0.98065155 1.03755099 1.02260049 0.98925295 0.81711981 0.99402204
  1.00269174 1.26335968 0.97758574 1.02707522 1.11516717 1.07363499
  1.14337395]
 [0.86316381 0.90101275 0.9411936  0.84973584 1.00352721 0.90766658
  1.15954736 0.92049911 0.96485874 1.0801087  0.87876646 0.8

In [265]:
# Variational parameter gamma of our model before the E-step.
print('gamma', lda._gamma_, sep='\n')

gamma
[[6. 6. 6. 6. 6.]
 [6. 6. 6. 6. 6.]
 [6. 6. 6. 6. 6.]
 [6. 6. 6. 6. 6.]
 [6. 6. 6. 6. 6.]]


In [266]:
# Re-evaluate the perplexity with the same inputs as the timed cell above;
# the printed values match that earlier run.
perplexity, suff_stats = lda.approx_perplexity(
    doc_vocab_count,
    sampling=False,
)

print(perplexity)

-204.37721599278194
59.593369617582205


In [267]:
%%time

gamma, suff_stats = \
    lda.e_step_batch(
    X = doc_vocab_count,
    expec_log_theta= expec_log_theta,
    expec_log_beta= expect_log_beta,
    verbose = True,
)

-204.37721599278194
Before Estep perplexity = 59.593369617582205
-202.17223961648543
After Estep perplexity = 57.02243531035659
CPU times: user 1.98 ms, sys: 410 µs, total: 2.39 ms
Wall time: 2.02 ms


# Target model

In [268]:
# Reference implementation: scikit-learn's LDA with the same number of topics and
# the same priors (alpha = eta = 1) as our model.
sklearn_lda = LatentDirichletAllocation(
    n_components=5,
    doc_topic_prior=1,
    topic_word_prior=1,
    random_state=42,
)
# Private sklearn API: initialises the latent variables (including `components_`)
# without fitting, so they can be compared against our model's lambda.
sklearn_lda._init_latent_vars(n_features=lda.V)

- check hyperparameters

In [269]:
# alpha 
print(f"alpha -> {sklearn_lda.doc_topic_prior}")
print(f"eta -> {sklearn_lda.topic_word_prior}")

alpha -> 1
eta -> 1


- var inf parameters 

In [270]:
# sklearn's `components_` plays the role of lambda in the reference model.
print('lambda', sklearn_lda.components_, sep='\n')

lambda
[[1.04708219 0.98292693 0.97347267 0.97347428 1.16278368 1.07526208
  0.95052839 1.05181933 1.02101821 0.81760009 0.89893338 1.0283693
  1.15026429 0.97429619 0.94330106 1.00778149 0.9378975  0.96782864
  0.9953198  0.89475728 1.08105987 0.87968664 0.86986295 1.01644942
  1.07222286]
 [1.01387299 0.98516565 0.96690772 0.95138329 1.10597141 1.02937237
  0.95871485 0.93060329 1.05898791 1.1031785  1.09256024 0.91521059
  0.96611462 0.94959045 0.9782464  1.08000428 1.13828828 1.03320764
  0.93363961 0.99309432 1.16117251 1.00538214 0.96711283 0.97489658
  1.03274514]
 [0.91811143 0.94740917 1.09087585 1.02984852 1.00638968 1.09653088
  0.92820856 0.96431168 1.02652255 1.02295647 0.86193808 0.95525935
  0.98065155 1.03755099 1.02260049 0.98925295 0.81711981 0.99402204
  1.00269174 1.26335968 0.97758574 1.02707522 1.11516717 1.07363499
  1.14337395]
 [0.86316381 0.90101275 0.9411936  0.84973584 1.00352721 0.90766658
  1.15954736 0.92049911 0.96485874 1.0801087  0.87876646 0.84464572


In [271]:
# Both models must start from exactly the same lambda for the bound / perplexity
# comparison below to be meaningful. assert_array_equal keeps the exact
# element-wise comparison but also verifies the shapes match and, on failure,
# reports which entries differ instead of raising a bare AssertionError.
np.testing.assert_array_equal(sklearn_lda.components_, lda._lambda_)

In [272]:
%%time
sklearn_lda._approx_bound(
    doc_vocab_count,
    doc_topic_distr = lda._gamma_,
    sub_sampling = False,
)

CPU times: user 295 µs, sys: 38 µs, total: 333 µs
Wall time: 301 µs


-202.17223961648543

In [273]:
# Perplexity of OUR model's gamma as computed by sklearn's private helper.
sklearn_lda._perplexity_precomp_distr(doc_vocab_count, doc_topic_distr=lda._gamma_, sub_sampling=False)

57.02243531035659

In [274]:
# Reference E-step using sklearn's private `_e_step`.
# - `random_init` is a boolean flag (randomly initialise the document-topic
#   distribution using the estimator's own random state), not a seed. The previous
#   value 42 only worked because it is truthy, so True is behaviourally identical
#   and no longer suggests that a seed is being passed.
# - With cal_sstats=False sklearn does not compute sufficient statistics, so
#   `suff_stats_ref` is None here.
# Note: this rebinds `gamma`, replacing the value returned by our own E-step above.
gamma, suff_stats_ref = sklearn_lda._e_step(
    doc_vocab_count,
    cal_sstats=False,
    random_init=True,
)

In [275]:
# Bound obtained with the gamma produced by sklearn's own E-step (reference value).
sklearn_lda._approx_bound(doc_vocab_count, doc_topic_distr=gamma, sub_sampling=False)

-201.9908836303667

In [276]:
# Perplexity obtained with the gamma produced by sklearn's own E-step (reference value).
sklearn_lda._perplexity_precomp_distr(doc_vocab_count, doc_topic_distr=gamma, sub_sampling=False)

56.81598275104014