In [14]:
%load_ext autoreload
%autoreload 2
import os
import sys

# Root of the local checkout that provides the `src` package imported below.
# Set the LDA_PROJECT_ROOT environment variable to run the notebook on another
# machine; the fallback is the original hard-coded location, so existing setups
# behave exactly as before.
PROJECT_ROOT = os.environ.get(
    'LDA_PROJECT_ROOT',
    '/Users/ericliu/Desktop/Latent-Dirichilet-Allocation',
)

# Guarded append keeps the cell idempotent: re-running it never adds duplicates.
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)
import torch as tr 
import numpy as np 
import pandas as pd 
from collections import defaultdict
from pprint import pprint


from sklearn.decomposition import LatentDirichletAllocation 
from src.lda_model import LDASmoothed 
from src.generator import doc_generator 
from src.utils import (
    get_vocab_from_docs, 
    get_np_wct, 
    data_loader,
    text_pipeline, 
    process_documents,
    compute_elbo,
) 
from src.text_pre_processor import (
    remove_accented_chars, 
    remove_special_characters, 
    remove_punctuation,
    remove_extra_whitespace_tabs,
    remove_stopwords,
)
from pprint import pprint 
import copy  

from src.cutils import (
    _dirichlet_expectation_1d, 
    _dirichlet_expectation_2d,
)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Document Generation

In [2]:
# Seed the global RNGs before sampling so the synthetic corpus (and every cell
# that depends on it) is reproducible under Restart & Run All.
# NOTE(review): assumes doc_generator draws from the torch and/or numpy global
# RNGs — confirm in src.generator; if it owns a private generator, seed that instead.
SEED = 42
tr.manual_seed(SEED)
np.random.seed(SEED)

# Synthetic-corpus generator configured with M = 3 and a symmetric 5-element
# topic prior (all ones, double precision).
# NOTE(review): the meaning of L = 20 is not visible in this notebook — confirm
# in src.generator.
gen = doc_generator(
    M = 3,
    L = 20,
    topic_prior = tr.tensor([1,1,1,1,1], dtype=tr.double)
)

In [3]:
# Display the generator's `beta` attribute. The recorded output is a 2-D tensor
# with 40 values per row (presumably per-topic word probabilities — confirm in
# src.generator).
gen.beta

tensor([[0.1131, 0.0589, 0.0824, 0.1060, 0.0471, 0.0824, 0.0294, 0.1060, 0.0059,
         0.0047, 0.0353, 0.0059, 0.0118, 0.0035, 0.0012, 0.0012, 0.0118, 0.0118,
         0.0236, 0.0236, 0.0177, 0.0177, 0.0041, 0.0047, 0.0353, 0.0059, 0.0118,
         0.0353, 0.0177, 0.0118, 0.0029, 0.0035, 0.0029, 0.0012, 0.0353, 0.0029,
         0.0059, 0.0118, 0.0029, 0.0029],
        [0.0012, 0.0047, 0.0047, 0.0030, 0.0355, 0.0118, 0.0237, 0.0030, 0.0948,
         0.0592, 0.0592, 0.1066, 0.0474, 0.1066, 0.0948, 0.1126, 0.0237, 0.0059,
         0.0059, 0.0118, 0.0178, 0.0178, 0.0041, 0.0047, 0.0047, 0.0237, 0.0118,
         0.0296, 0.0059, 0.0237, 0.0030, 0.0036, 0.0030, 0.0012, 0.0036, 0.0030,
         0.0047, 0.0118, 0.0030, 0.0030],
        [0.0014, 0.0057, 0.0043, 0.0036, 0.0143, 0.0071, 0.0286, 0.0036, 0.0071,
         0.0043, 0.0214, 0.0014, 0.0428, 0.0043, 0.0143, 0.0014, 0.0500, 0.0999,
         0.0999, 0.0571, 0.0857, 0.0857, 0.1285, 0.1285, 0.0057, 0.0014, 0.0143,
         0.0071, 0.0071, 

In [4]:
# Display the generator's `alpha` attribute. The recorded repr is a Dirichlet
# distribution object with a 5-element concentration.
gen.alpha

Dirichlet(concentration: torch.Size([5]))

In [5]:
# Shape check on `theta`: the recorded output is (3, 5), matching M = 3 and the
# 5-element topic prior passed to doc_generator.
gen.theta.shape

torch.Size([3, 5])

In [6]:
# Display `theta` itself. Each row of the recorded output sums to ~1
# (presumably the per-document topic proportions — confirm in src.generator).
gen.theta

tensor([[0.0015, 0.3438, 0.5711, 0.0634, 0.0203],
        [0.2904, 0.1069, 0.0383, 0.2283, 0.3362],
        [0.2239, 0.0208, 0.2862, 0.1523, 0.3168]], dtype=torch.float64)

In [7]:
# Sample the synthetic corpus. The call prints a per-word trace
# (document, word position, sampled topic, sampled word) and, per the display
# in the next cell, returns a dict mapping document index to a space-joined
# string of words.
docs = gen.generate_doc()

Document: 0 | word: 0 -> topic: sport -> word: exercise
Document: 0 | word: 1 -> topic: law -> word: contract
Document: 0 | word: 2 -> topic: art -> word: concert
Document: 0 | word: 3 -> topic: sport -> word: Olympic
Document: 0 | word: 4 -> topic: art -> word: form
Document: 0 | word: 5 -> topic: law -> word: divorce
Document: 0 | word: 6 -> topic: art -> word: Craftsmanship
Document: 0 | word: 7 -> topic: art -> word: concert
Document: 0 | word: 8 -> topic: art -> word: concert
Document: 0 | word: 9 -> topic: art -> word: electricity
Document 0: exercise contract concert Olympic form divorce Craftsmanship concert concert electricity

Document: 1 | word: 0 -> topic: law -> word: court
Document: 1 | word: 1 -> topic: law -> word: court
Document: 1 | word: 2 -> topic: health -> word: immunology
Document: 1 | word: 3 -> topic: science -> word: electricity
Document: 1 | word: 4 -> topic: law -> word: bankrupt
Document: 1 | word: 5 -> topic: law -> word: accuse
Document: 1 | word: 6 -> to

In [8]:
# Display the generated corpus (document index -> document text).
docs

{0: 'exercise contract concert Olympic form divorce Craftsmanship concert concert electricity',
 1: 'court court immunology electricity bankrupt accuse quantum electricity divorce decongestant',
 2: 'asymmetrical astrophysics immunology attorney picture attorney appetite court decongestant immunology'}

In [9]:
#docs_raw_dict, raw_word_2_idx, raw_idx_2_word = data_loader('ap')

In [10]:
# Run the project's preprocessing over the generated corpus. The call prints
# summary statistics (document count, average length, vocabulary size) and
# returns a dict whose keys are listed in the following cell.
# NOTE(review): the effect of sample=True is not visible from this notebook —
# confirm in src.utils.process_documents.
result = process_documents(docs, sample=True)

There are 3 documents in the dataset after processing
On average estimated document length is 10.0 words per document after processing
There are 19 unique vocab in the corpus after processing


In [17]:
# List the entries of the preprocessing result: tokenised documents, two word
# count structures (dict and array) and the vocab <-> index lookups.
print(result.keys())

dict_keys(['documents', 'vocab_doc_count_dict', 'vocab_doc_count_array', 'vocab_to_idx', 'idx_to_vocab'])


In [19]:
# Tokenised documents: per the recorded output, a list with one list of word
# strings per document.
result['documents']

[['exercise',
  'contract',
  'concert',
  'Olympic',
  'form',
  'divorce',
  'Craftsmanship',
  'concert',
  'concert',
  'electricity'],
 ['court',
  'court',
  'immunology',
  'electricity',
  'bankrupt',
  'accuse',
  'quantum',
  'electricity',
  'divorce',
  'decongestant'],
 ['asymmetrical',
  'astrophysics',
  'immunology',
  'attorney',
  'picture',
  'attorney',
  'appetite',
  'court',
  'decongestant',
  'immunology']]

In [18]:
# Word -> integer index lookup. In the recorded output the indices run 0..18 in
# order of first appearance across the documents.
result['vocab_to_idx']

{'exercise': 0,
 'contract': 1,
 'concert': 2,
 'Olympic': 3,
 'form': 4,
 'divorce': 5,
 'Craftsmanship': 6,
 'electricity': 7,
 'court': 8,
 'immunology': 9,
 'bankrupt': 10,
 'accuse': 11,
 'quantum': 12,
 'decongestant': 13,
 'asymmetrical': 14,
 'astrophysics': 15,
 'attorney': 16,
 'picture': 17,
 'appetite': 18}

In [20]:
# Build the dense document-term count matrix: one row per processed document,
# one column per vocabulary index, each cell holding how often that word
# occurs in that document.
processed_docs = result['documents']
vocab_to_idx = result['vocab_to_idx']

# Size the rows from the processed documents (the same list the loop below
# walks) instead of the raw `docs` dict, so the matrix cannot drift out of
# step with the data that fills it if preprocessing ever changes the number
# of documents.
doc_vocab_count = np.zeros(
    (len(processed_docs), len(vocab_to_idx)),
    dtype = float,  # float counts, as in the original; consumed by the ELBO routines later on
)

for doc_idx, doc in enumerate(processed_docs):
    for word in doc:
        # Every occurrence bumps the (document, word-index) cell by one.
        doc_vocab_count[doc_idx, vocab_to_idx[word]] += 1

doc_vocab_count

array([[1., 1., 3., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0.],
       [0., 0., 0., 0., 0., 1., 0., 2., 2., 1., 1., 1., 1., 1., 0., 0.,
        0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 1., 2., 0., 0., 0., 1., 1., 1.,
        2., 1., 1.]])

In [21]:
# Wrap the count matrix in a DataFrame labelled by vocabulary word for easier
# reading. The labels are ordered by their vocabulary index (not by dict
# iteration order) so that column j is always named after the word whose
# index is j, which is how the matrix columns were filled.
doc_vocab_count_df = pd.DataFrame(
    data = doc_vocab_count,
    columns = sorted(result['vocab_to_idx'], key=result['vocab_to_idx'].get),
)
doc_vocab_count_df

Unnamed: 0,exercise,contract,concert,Olympic,form,divorce,Craftsmanship,electricity,court,immunology,bankrupt,accuse,quantum,decongestant,asymmetrical,astrophysics,attorney,picture,appetite
0,1.0,1.0,3.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,1.0,0.0,2.0,2.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2.0,0.0,0.0,0.0,1.0,1.0,1.0,2.0,1.0,1.0


# Our Model

In [46]:
# Instantiate our smoothed-LDA implementation on the processed corpus with
# 5 topics. The constructor prints the priors and the shapes of the
# variational parameters it initialises.
lda = LDASmoothed(
    docs = result['documents'],
    num_topics = 5,
    word_ct_dict = result['vocab_doc_count_dict'],
    # Derive the population size from the generated corpus instead of repeating
    # the literal 3, so it follows the generator's M automatically.
    # NOTE(review): assumes num_doc_population means the total number of
    # documents in the corpus — confirm in src.lda_model.
    num_doc_population = len(docs),
    word_ct_array = result['vocab_doc_count_array'],
)
print(f'Size of the vocab is {lda.V}')

Topic Dirichlet Prior, Alpha
1

Exchangeable Word Dirichlet Prior, Eta 
1

Var Inf - Word Dirichlet prior, Lambda
(5, 19)

Var Inf - Topic Dirichlet prior, Gamma
(3, 5)

loop phi
looped
double
Var -Inf - Word wise Topic Multinomial/Categorical, Phi
(3, 19, 5)
Size of the vocab is 19


In [29]:
# Evidence lower bound of our model on the document-term counts, computed with
# `sampling=False`; the value is shown as the cell output.
lda.approx_elbo(doc_vocab_count, sampling=False)

-113.91027683348449

- check hyperparameters 

In [31]:
# Print our model's hyperparameters, one per line: alpha first, then eta.
for hyperparameter in (lda._alpha_, lda._eta_):
    print(hyperparameter)

1
1


- check variational inference parameters

In [47]:
# Label, shape and values of our model's lambda, each on its own line.
print('lambda', lda._lambda_.shape, lda._lambda_, sep='\n')

lambda
(5, 19)
[[1.04708219 0.98292693 0.97347267 0.97347428 1.16278368 1.07526208
  0.95052839 1.05181933 1.02101821 0.81760009 0.89893338 1.0283693
  1.15026429 0.97429619 0.94330106 1.00778149 0.9378975  0.96782864
  0.9953198 ]
 [0.89475728 1.08105987 0.87968664 0.86986295 1.01644942 1.07222286
  1.01387299 0.98516565 0.96690772 0.95138329 1.10597141 1.02937237
  0.95871485 0.93060329 1.05898791 1.1031785  1.09256024 0.91521059
  0.96611462]
 [0.94959045 0.9782464  1.08000428 1.13828828 1.03320764 0.93363961
  0.99309432 1.16117251 1.00538214 0.96711283 0.97489658 1.03274514
  0.91811143 0.94740917 1.09087585 1.02984852 1.00638968 1.09653088
  0.92820856]
 [0.96431168 1.02652255 1.02295647 0.86193808 0.95525935 0.98065155
  1.03755099 1.02260049 0.98925295 0.81711981 0.99402204 1.00269174
  1.26335968 0.97758574 1.02707522 1.11516717 1.07363499 1.14337395
  0.86316381]
 [0.90101275 0.9411936  0.84973584 1.00352721 0.90766658 1.15954736
  0.92049911 0.96485874 1.0801087  0.87876646 

In [44]:
# Label followed by our model's gamma values.
print('gamma', lda._gamma_, sep='\n')

gamma
[[4.8 4.8 4.8 4.8 4.8]
 [4.8 4.8 4.8 4.8 4.8]
 [4.8 4.8 4.8 4.8 4.8]]


In [55]:
# Label followed by our model's phi values.
print('phi', lda._phi_, sep='\n')

phi
[[[0.2 0.2 0.2 0.2 0.2]
  [0.2 0.2 0.2 0.2 0.2]
  [0.2 0.2 0.2 0.2 0.2]
  [0.2 0.2 0.2 0.2 0.2]
  [0.2 0.2 0.2 0.2 0.2]
  [0.2 0.2 0.2 0.2 0.2]
  [0.2 0.2 0.2 0.2 0.2]
  [0.2 0.2 0.2 0.2 0.2]
  [0.  0.  0.  0.  0. ]
  [0.  0.  0.  0.  0. ]
  [0.  0.  0.  0.  0. ]
  [0.  0.  0.  0.  0. ]
  [0.  0.  0.  0.  0. ]
  [0.  0.  0.  0.  0. ]
  [0.  0.  0.  0.  0. ]
  [0.  0.  0.  0.  0. ]
  [0.  0.  0.  0.  0. ]
  [0.  0.  0.  0.  0. ]
  [0.  0.  0.  0.  0. ]]

 [[0.  0.  0.  0.  0. ]
  [0.  0.  0.  0.  0. ]
  [0.  0.  0.  0.  0. ]
  [0.  0.  0.  0.  0. ]
  [0.  0.  0.  0.  0. ]
  [0.2 0.2 0.2 0.2 0.2]
  [0.  0.  0.  0.  0. ]
  [0.2 0.2 0.2 0.2 0.2]
  [0.2 0.2 0.2 0.2 0.2]
  [0.2 0.2 0.2 0.2 0.2]
  [0.2 0.2 0.2 0.2 0.2]
  [0.2 0.2 0.2 0.2 0.2]
  [0.2 0.2 0.2 0.2 0.2]
  [0.2 0.2 0.2 0.2 0.2]
  [0.  0.  0.  0.  0. ]
  [0.  0.  0.  0.  0. ]
  [0.  0.  0.  0.  0. ]
  [0.  0.  0.  0.  0. ]
  [0.  0.  0.  0.  0. ]]

 [[0.  0.  0.  0.  0. ]
  [0.  0.  0.  0.  0. ]
  [0.  0.  0.  0.  0. ]
  [0.  0

# Target model

In [48]:
# Reference model: scikit-learn's LDA configured with the same number of
# topics and the same unit priors as our model above.
sklearn_lda = LatentDirichletAllocation(
    n_components=5,
    doc_topic_prior=1,
    topic_word_prior=1,
    random_state=42,
)

# Initialise the latent variables directly, without calling fit, so the
# starting state can be compared with ours. `_init_latent_vars` is a private
# scikit-learn method and may change between releases.
sklearn_lda._init_latent_vars(n_features=lda.V)

- check hyperparameters

In [49]:
# alpha 
# Priors of the reference model: document-topic (alpha), then topic-word (eta).
print(
    f"alpha -> {sklearn_lda.doc_topic_prior}",
    f"eta -> {sklearn_lda.topic_word_prior}",
    sep="\n",
)

alpha -> 1
eta -> 1


- check variational inference parameters

In [50]:
# Label followed by the reference model's topic-word parameters (components_).
print('lambda', sklearn_lda.components_, sep='\n')

lambda
[[1.04708219 0.98292693 0.97347267 0.97347428 1.16278368 1.07526208
  0.95052839 1.05181933 1.02101821 0.81760009 0.89893338 1.0283693
  1.15026429 0.97429619 0.94330106 1.00778149 0.9378975  0.96782864
  0.9953198 ]
 [0.89475728 1.08105987 0.87968664 0.86986295 1.01644942 1.07222286
  1.01387299 0.98516565 0.96690772 0.95138329 1.10597141 1.02937237
  0.95871485 0.93060329 1.05898791 1.1031785  1.09256024 0.91521059
  0.96611462]
 [0.94959045 0.9782464  1.08000428 1.13828828 1.03320764 0.93363961
  0.99309432 1.16117251 1.00538214 0.96711283 0.97489658 1.03274514
  0.91811143 0.94740917 1.09087585 1.02984852 1.00638968 1.09653088
  0.92820856]
 [0.96431168 1.02652255 1.02295647 0.86193808 0.95525935 0.98065155
  1.03755099 1.02260049 0.98925295 0.81711981 0.99402204 1.00269174
  1.26335968 0.97758574 1.02707522 1.11516717 1.07363499 1.14337395
  0.86316381]
 [0.90101275 0.9411936  0.84973584 1.00352721 0.90766658 1.15954736
  0.92049911 0.96485874 1.0801087  0.87876646 0.844645

In [52]:
# Both models must start from exactly the same lambda. assert_array_equal
# also fails on a shape mismatch (where `==` could silently broadcast), it
# reports which entries differ, and it is not stripped when Python runs with -O.
np.testing.assert_array_equal(sklearn_lda.components_, lda._lambda_)

In [53]:
# Reference bound from scikit-learn, evaluated on the same counts and at our
# model's gamma, with sub-sampling switched off. `_approx_bound` is a private
# scikit-learn method.
sklearn_lda._approx_bound(doc_vocab_count, doc_topic_distr=lda._gamma_, sub_sampling=False)

-113.71880962020522