# Synopsis

Demonstrate the generative model of LDA using an already-inferred model.

# Configuration

In [79]:
# Path to the SQLite database that holds the inferred topic model.
# NOTE: machine-specific absolute path; adjust it for your environment.
db_file = '/Users/rca2t/CODE/polo2-test/PUB/lsi/lsi-mallet-trial1.db'
# Number of synthetic documents to generate.
corpus_size = 10
# Mean of the Poisson distribution from which each document's length is drawn.
doc_size = 50

# Libraries

In [88]:
import sqlite3
from contextlib import closing

import numpy as np
import pandas as pd

# Pragmas

In [81]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

# Process

## Import tables from db

In [82]:
# sqlite3's own context manager only commits or rolls back the transaction; it
# does not close the connection. closing() releases the handle once the tables
# have been read.
with closing(sqlite3.connect(db_file)) as db:
    # Topics and their alpha values, keyed by topic_id.
    t = pd.read_sql("select topic_id, topic_alpha from topic", db, index_col='topic_id')
    # Vocabulary: word strings keyed by word_id.
    v = pd.read_sql("select word_id, word_str from word", db, index_col='word_id')
    # Long-form document/topic weights, keyed by (doc_id, topic_id).
    dt = pd.read_sql("select doc_id, topic_id, topic_weight from doctopic", db, index_col=['doc_id','topic_id'])
    # Long-form word/topic counts, keyed by (word_id, topic_id).
    wt = pd.read_sql("select word_id, topic_id, word_count from topicword", db, index_col=['word_id','topic_id'])

## Create DocTopic matrix (THETA)

In [83]:
# Pivot the long (doc_id, topic_id) weights into a document-by-topic table.
# Unstacking the single value column directly yields plain topic_id column
# labels; missing (doc, topic) pairs are filled with 0.
DT = dt['topic_weight'].unstack(fill_value=0)

In [84]:
# Preview the first rows of the doc-topic matrix (rows: doc_id, columns: topic_id).
DT.head()

topic_id,0,1,2,3,4,5,6,7,8,9,...,30,31,32,33,34,35,36,37,38,39
doc_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.000495,0.000576,0.02054,0.000389,0.001083,0.000398,0.000327,0.000163,0.000703,0.000437,...,0.000381,0.00055,0.001205,0.257007,0.00128,0.001678,0.000372,0.00022,0.000305,0.001889
1,0.000476,0.010259,0.000331,0.000374,0.001041,0.000382,0.000314,0.000156,0.000676,0.00042,...,0.000366,0.000529,0.001158,0.24703,0.001231,0.040435,0.000358,0.000211,0.009998,0.07946
2,0.000816,0.000951,0.067197,0.050613,0.001786,0.000656,0.000539,0.000269,0.00116,0.000721,...,0.000628,0.000908,0.001988,0.057509,0.002112,0.002768,0.000614,0.000362,0.000503,0.086403
3,0.000415,0.000484,0.000289,0.000326,0.000908,0.000334,0.000274,0.000137,0.00059,0.000367,...,0.00032,0.000462,0.001011,0.156332,0.001074,0.018352,0.000312,0.000184,0.000256,0.128667
4,0.000662,0.000771,0.013968,0.014027,0.001448,0.000532,0.000437,0.000218,0.00094,0.000585,...,0.00051,0.000736,0.001612,0.087156,0.001713,0.002245,0.000498,0.000294,0.000408,0.070064


##  Create TopicWord Matrix (PHI)

In [85]:
# Pivot the long (word_id, topic_id) counts into a word-by-topic table,
# treating missing pairs as zero counts.
WT = wt['word_count'].unstack(fill_value=0)
# Scale every topic column by its own total so that each column sums to 1.
WT = WT / WT.sum()

In [86]:
# Preview the first rows of the normalized word-topic matrix (rows: word_id, columns: topic_id).
WT.head()

topic_id,0,1,2,3,4,5,6,7,8,9,...,30,31,32,33,34,35,36,37,38,39
word_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.030749,0.023496,0.005138,0.0,0.0,0.001119,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.001706,0.0,0.0,0.0,0.0,...,0.0,0.000451,0.0,0.001514,0.0,0.000273,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00711
3,0.0,0.0,0.0,0.0,0.0,0.015927,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000679,0.006257
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.002339,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.004693


## Run generative sequence

In [108]:
# Symmetric Dirichlet parameter: the same concentration value for every topic.
alpha = .05
alpha_array = [alpha] * len(t.index)

In [109]:
# Generate `corpus_size` synthetic documents by following LDA's generative story:
# draw a length, draw a topic mixture, then for every token draw a topic and a
# word from that topic.
for d in range(corpus_size):

    doc_text = []
    # Per-document tally of how many tokens each topic produced.
    doc_topics = t.copy()
    doc_topics['n'] = 0

    # Pick the size of the document
    N = np.random.poisson(doc_size)

    # Pick a Dirichlet distribution of topics
    theta = np.random.dirichlet(alpha_array)

    # Or: pick an existing distribution
#     theta = DT.sample().iloc[0]

    for _ in range(N):
        # Draw a topic for this token according to the document's mixture.
        z = t.sample(weights=theta).index[0]
        # Keep phi as a Series so that sample() aligns the weights with the
        # vocabulary by word_id. A bare array would be matched by row position,
        # which is only correct if `v` and `WT` list the same words in the
        # same order.
        phi = WT[z]
        w = v.sample(weights=phi).index[0]
        w_str = v.loc[w].word_str
        doc_text.append(w_str)
        doc_topics.loc[z, 'n'] += 1

    print('-' * 80)
    # Show only the topics that actually emitted tokens, most frequent first.
    print("DOC", d, doc_topics[doc_topics.n > 0].sort_values('n',  ascending=False))
    print(' '.join(doc_text))

--------------------------------------------------------------------------------
DOC 0           topic_alpha   n
topic_id                 
39            0.18712  27
13            0.04768  20
14            0.33207   7
22            0.05966   3
necessity qianlong villa currents hideyoshi duty yoshimitsu pioneer commissions painter paralleled ensh garden artist generous muromachi attitude construction preserve toward profound status past architectural replace arrivistes produced would branches mats peculiar palace industrial compositions confusion money ingrained handcut language japan involvement continuity suited sen garden heritage technological new nineteenth yuan practices societies doctrinal footsteps buddhism visualization mus
--------------------------------------------------------------------------------
DOC 1           topic_alpha   n
topic_id                 
21            0.07451  21
39            0.18712  14
5             0.03938   9
14            0.33207   8
midblock least c

## Generate using SQL

The join operator in SQL is like a multiplication operator in probability.

$p(w|\theta, \phi) = \sum_{z}p(w|z,\phi)p(z|\theta)$

$p(w|z,\phi)p(z|\theta) \equiv$

$p(w|t)p(t|d) \equiv$

```
SELECT dt.weight * wt.weight
FROM wt JOIN dt USING(topic_id)
WHERE doc_id = ?
```

or 

```
SELECT dt.weight * wt.weight
FROM wt, dt
WHERE wt.topic_id = dt.topic_id AND doc_id = ?
```

In [135]:
# Query the word distribution of a single document: for every topic, multiply the
# document's topic weight by the topic's word probability, then sum those
# products per word. Results are ordered from most to least probable.
# The `?` placeholder is bound to the doc_id when the query is executed.
# NOTE(review): each product is rounded to 8 decimals before summing, so very
# small per-topic contributions collapse to 0 — confirm this is intended.
# NOTE(review): `topicword_v` is not created in this notebook; presumably a view
# over topicword that exposes a normalized `word_p` column — verify in the db.
sql = """
select word_id, word_str, sum(p_wz)as p_w
from (
    select topic_id, word_id, round((topic_weight * word_p), 8) as p_wz
    from doctopic theta join topicword_v phi using(topic_id)
    where theta.doc_id = ?
)
join word using(word_id)
group by word_id
order by p_W  desc
"""

In [142]:
# Document whose word distribution is queried.
my_doc_id = 100
# closing() guarantees the connection is released; sqlite3's own context manager
# only commits or rolls back and leaves the connection open.
with closing(sqlite3.connect(db_file)) as db:
    # p(w | d) for the chosen document, indexed by word_id.
    p_wGd = pd.read_sql(sql, db, index_col='word_id', params=(my_doc_id,))

In [143]:
# Spot-check ten randomly chosen words and their p_w values for the selected document.
p_wGd.sample(10)

Unnamed: 0_level_0,word_str,p_w
word_id,Unnamed: 1_level_1,Unnamed: 2_level_1
20698,churchmen,3.1e-07
1,necessarily,0.00063399
14480,eleonora,1.8e-07
3114,nostalgic,8.703e-05
5388,interesting,0.00010281
19796,secretary,1.31e-06
2162,anshen,8e-08
11494,southeastnorthwest,2.8e-07
524,centrifugal,0.00038403
3892,disseminated,1.729e-05


In [144]:
# Generate `corpus_size` documents by sampling words straight from the
# per-document word distribution returned by the SQL query.
for i in range(corpus_size):
    print('DOC',  i)
    # Document length is Poisson-distributed around the configured `doc_size`.
    N = np.random.poisson(doc_size)
    # Draw all N tokens in one weighted sample with replacement. The weights are
    # aligned to the vocabulary by word_id, and this avoids re-normalising the
    # full weight vector once per token.
    words = v.sample(n=N, replace=True, weights=p_wGd.p_w).word_str
    # Every token is followed by a single space, then the line is terminated.
    print(''.join(word + ' ' for word in words))
    print('-' * 80)

DOC 0
usuallylocated became type culture integrated time generated thus measurement viewing capitalist uses public states japan also also specific view become suburbs along industrial mile urban throughout owner second focused strong seventeenth role lake design districts highway become three time planning grew metropolitan first need neighborhoods routes technology lowcost native true artists 
--------------------------------------------------------------------------------
DOC 1
new heaven founder place incity cities inlet another views old wide creating used lead photography southern messages inscriptions mountains becoming posits grade augusta riddles public new especially new one resulting view established design landscape cityin theory edge accurately natural growth edge rationality oncehandsome avenue market suburbs sockets like words new wealthiest 
--------------------------------------------------------------------------------
DOC 2
landscapes association institutions kind rai