# Topic modeling

In [6]:
# !pip install orjson sqlitedict tomotopy nltk

In [7]:
import os
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
from tqdm.auto import tqdm
import orjson
import zlib
import tomotopy as tp
from sqlitedict import SqliteDict
import topicwizard
import random
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s')

In [8]:
# Corpus locations: the metadata CSV and the page store both sit inside one
# directory under the user's home.
path_corpus = os.path.expanduser('~/ppa_data/solrcorpus2')
path_metadata, path_pages = (
    os.path.join(path_corpus, leaf)
    for leaf in ('metadata.csv', 'corpus.sqlitedict')
)

In [9]:
# Read metadata
# df_metadata = pd.read_csv(path_metadata).fillna('').set_index('work_id')
# df_metadata

In [10]:
def encode_cache(x):
    """Serialize ``x`` for storage: orjson-encode, zlib-compress, wrap as a SQLite blob.

    This is the counterpart of ``decode_cache``. It is passed to ``SqliteDict``
    as the ``encode`` hook.
    """
    # Imported locally because the notebook's import cell never brings
    # ``sqlite3`` into scope; without this the first call raises NameError.
    import sqlite3
    return sqlite3.Binary(zlib.compress(orjson.dumps(x)))
def decode_cache(x):
    """Turn a stored blob back into a Python object (zlib-decompress, then orjson-decode)."""
    raw_json = zlib.decompress(bytes(x))
    return orjson.loads(raw_json)
def get_pages_db():
    """Open the ``texts`` table of the corpus SqliteDict with ``flag='r'``."""
    options = dict(flag='r', tablename='texts', encode=encode_cache, decode=decode_cache)
    return SqliteDict(path_pages, **options)
def get_meta_db():
    """Open the ``metadata`` table of the corpus SqliteDict with ``flag='r'``."""
    options = dict(flag='r', tablename='metadata', encode=encode_cache, decode=decode_cache)
    return SqliteDict(path_pages, **options)

In [87]:
from collections import Counter
# Metadata field that groups works into a cluster; works without it form
# their own cluster (see the ``meta.get`` fallback in ``iter_pages``).
CLUSTER_KEY='cluster_id_s'

def iter_pages(lim=None,min_num_words=None,max_pages_per_doc=None,max_pages_per_cluster=None, collections=None):
    """Yield page dicts from the corpus one at a time, with optional filtering.

    Parameters
    ----------
    lim : int, optional
        Stop after this many pages have been yielded in total.
    min_num_words : int, optional
        Drop pages whose ``page_tokens`` list has fewer entries than this.
    max_pages_per_doc : int, optional
        Keep at most this many pages per work, picked by shuffling the work's
        pages with the global ``random`` state (unseeded unless the caller
        seeds it).
    max_pages_per_cluster : int, optional
        Yield at most this many pages per cluster. The cluster is the work's
        ``CLUSTER_KEY`` metadata value, or the work id when that is missing.
    collections : iterable of str, optional
        When non-empty, only works whose ``meta['collections']`` shares at
        least one entry with it are considered.

    Yields
    ------
    dict
        The stored page dict with an extra ``work_cluster`` key.
    """
    # Build the filter set once instead of once per work.
    wanted = set(collections) if collections else None
    clustercounts=Counter()
    num=0

    def cluster_full(cluster):
        # True once a cluster has used up its page allowance (never when uncapped).
        return bool(max_pages_per_cluster) and clustercounts[cluster]>=max_pages_per_cluster

    with get_pages_db() as db, get_meta_db() as mdb:
        for work_id in tqdm(list(db.keys()),desc='Iterating works',position=0):
            meta = mdb[work_id]
            if wanted and not wanted & set(meta['collections']):
                continue

            cluster = meta.get(CLUSTER_KEY,work_id)
            # Nothing from this work could be yielded, so skip loading and
            # decompressing its pages. The yielded pages are chosen by the same
            # rule as before; only the shuffle's draws from `random` differ.
            if cluster_full(cluster):
                continue

            pages = db[work_id]
            if min_num_words:
                pages = [d for d in pages if len(d['page_tokens'])>=min_num_words]

            if max_pages_per_doc:
                random.shuffle(pages)
                pages=pages[:max_pages_per_doc]

            # The page-level bar is disabled; flip `disable` to watch per-page
            # progress. The `with` block closes it even when the consumer stops
            # iterating early.
            with tqdm(pages,desc='Iterating pages',position=1,disable=True) as pbar2:
                for page in pbar2:
                    if cluster_full(cluster):
                        break
                    yield dict(
                        work_cluster = cluster,
                        **page
                    )
                    clustercounts[cluster]+=1
                    num+=1
                    if lim and num>=lim:
                        return

def iter_corpus(lim=None,max_pages_per_doc=25,**kwargs):
    """Yield pages of at least 25 tokens from works in the Literary or Linguistic collections.

    ``lim``, ``max_pages_per_doc`` and any extra keyword arguments are passed
    straight through to ``iter_pages``.
    """
    fixed_filters = dict(min_num_words=25, collections={'Literary','Linguistic'})
    yield from iter_pages(
        lim=lim,
        max_pages_per_doc=max_pages_per_doc,
        **fixed_filters,
        **kwargs
    )

def iter_sample(lim=None):
    """Yield the ``iter_corpus`` pages capped at 25 pages per work and 25 per cluster."""
    page_cap = 25
    yield from iter_corpus(
        lim=lim,
        max_pages_per_doc=page_cap,
        max_pages_per_cluster=page_cap,
    )

# next(iter_pages(collections=['Linguistic']))
# for x in iter_pages(max_pages_per_cluster=1): pass
# for i,x in enumerate(iter_corpus()): pass
# i
# next(iter_corpus())

In [88]:
import nltk
from nltk.corpus import stopwords as stops
# NLTK's English stopword list, held as a set for membership tests.
stopwords = set(stops.words('english'))
def clean_toks(toks):
    """Return the tokens that are longer than three characters and not in ``stopwords``.

    Order and duplicates are preserved. The comparison is exact, so no case
    folding is applied here.
    """
    kept = []
    for tok in toks:
        if len(tok) <= 3 or tok in stopwords:
            continue
        kept.append(tok)
    return kept

In [95]:
def topic_model(ntopic=50, force=False, niter=100):
    """Train an LDA model on the page sample, or load it from the on-disk cache.

    Parameters
    ----------
    ntopic : int
        Number of topics. It is also part of the cache file name.
    force : bool
        Retrain even when both cache files exist.
    niter : int
        Total number of training iterations, run in steps of up to 10. It is
        not part of the cache file name, so changing it without ``force=True``
        reuses the cached model.

    Returns
    -------
    (mdl, docd)
        The tomotopy model and a dict mapping ``page_id`` to the value
        ``mdl.add_doc`` returned for that page (its index in the model).
    """
    fn=f'data.tomotopy.model.ntopic={ntopic}.bin'
    fnindex=fn+'.index.json'
    if force or not os.path.exists(fn) or not os.path.exists(fnindex):
        # Use the requested topic count. It used to be hard-coded to 50 even
        # though the cache file name embeds ``ntopic``.
        mdl = tp.LDAModel(k=ntopic)
        docd={}
        for page in iter_sample():
            toks = clean_toks(page['page_tokens'])
            doc_idx = mdl.add_doc(toks)
            # NOTE(review): add_doc may return None for a page with no tokens
            # left after cleaning (confirm against the tomotopy docs). Such a
            # page has no document in the model, so keep it out of the index.
            if doc_idx is not None:
                docd[page['page_id']] = doc_idx

        def getdesc():
            return f'Training model (ndocs={len(docd)}, log-likelihood = {mdl.ll_per_word:.4})'

        step = 10
        pbar=tqdm(list(range(0, niter, step)),desc=getdesc(),position=0)
        for start in pbar:
            # The last step is shortened so exactly ``niter`` iterations run
            # even when ``niter`` is not a multiple of ``step``.
            mdl.train(min(step, niter - start))
            # Refresh after training so the bar shows the current likelihood.
            pbar.set_description(getdesc())
        mdl.save(fn)
        with open(fnindex,'wb') as of:
            of.write(orjson.dumps(docd))
    else:
        mdl = tp.LDAModel.load(fn)
        with open(fnindex,'rb') as f:
            docd=orjson.loads(f.read())

    mdl.summary()
    return mdl,docd

In [96]:
# Train the topic model, or load the cached model and page index when both
# cache files already exist (force=False).
mdl,docd = topic_model(force=False)

Iterating works: 100%|██████████| 6319/6319 [03:14<00:00, 32.46it/s]
Training model (ndocs=102248, log-likelihood = -10.4): 100%|██████████| 10/10 [02:29<00:00, 14.90s/it]


<Basic Info>
| LDAModel (current version: 0.12.5)
| 102993 docs, 14788350 words
| Total Vocabs: 1218037, Used Vocabs: 1218037
| Entropy of words: 10.02966
| Entropy of term-weighted words: 10.02966
| Removed Vocabs: <NA>
|
<Training Info>
| Iterations: 100, Burn-in steps: 0
| Optimization Interval: 10
| Log-likelihood per word: -10.38120
|
<Initial Parameters>
| tw: TermWeight.ONE
| min_cf: 0 (minimum collection frequency of words)
| min_df: 0 (minimum document frequency of words)
| rm_top: 0 (the number of top words to be removed)
| k: 50 (the number of topics between 1 ~ 32767)
| alpha: [0.1] (hyperparameter of Dirichlet distribution for document-topic, given as a single `float` in case of symmetric prior and as a list with length `k` of `float` in case of asymmetric prior.)
| eta: 0.01 (hyperparameter of Dirichlet distribution for topic-word)
| seed: 3097823306 (random seed)
| trained in version 0.12.5
|
<Parameters>
| alpha (Dirichlet prior on the per-document topic distributions)


In [97]:
# Cache of documents created with mdl.make_doc for pages that are not in
# `docd`, keyed by page_id; the topic-distribution loop fills it.
docd_test={}

In [69]:
# Count the pages iter_corpus() yields; used as the progress-bar total in the
# topic-distribution loop. This walks the whole corpus once.
num_pages = sum(1 for _ in iter_corpus())

Iterating works: 100%|██████████| 6319/6319 [03:09<00:00, 33.32it/s]


In [70]:
# Topic distribution for every corpus page. Pages that were part of the
# training run reuse their document inside the model; all other pages get a
# new document whose topics are inferred, cached in `docd_test`.
# NOTE(review): this now iterates iter_corpus() rather than iter_sample().
# `num_pages` (the progress total) is counted from iter_corpus() and the bar
# is labelled "all pages", so the sample iterator looked like a leftover.
# Confirm that this is the intended set of pages.
o=[]
for page in tqdm(iter_corpus(),position=2,desc='Iterating all pages', total=num_pages):
    pid = page['page_id']
    if pid in docd_test:
        doc = docd_test[pid]
    elif pid not in docd:
        doc = mdl.make_doc(clean_toks(page['page_tokens']))
        mdl.infer(doc)
        # Cache only after inference has finished. Otherwise an interrupted
        # run leaves an un-inferred document in the cache, and the next run
        # would reuse it as if it were ready.
        docd_test[pid] = doc
    else:
        doc = mdl.docs[docd[pid]]
    o.append(pd.Series(doc.get_topic_dist(), name=pid))
odf=pd.DataFrame(o).rename_axis('page_id')

Iterating pages:   0%|          | 128/152677 [01:12<24:02:43,  1.76it/s]
Iterating works:   0%|          | 6/6319 [01:12<21:11:44, 12.09s/it]


KeyboardInterrupt: 

In [35]:
# Display the page-by-topic matrix: one row per page_id, one column per topic index.
odf

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,40,41,42,43,44,45,46,47,48,49
page_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
uc1.$b305400_iii,0.000228,0.000153,0.006725,0.000124,0.282014,0.000486,0.001371,0.000698,0.000856,0.000194,...,0.001732,0.00015,0.004875,0.000163,0.000159,0.003068,0.000161,0.000534,0.000806,0.000478
uc1.$b305400_v,9.1e-05,6.1e-05,0.114077,4.9e-05,0.009701,0.000193,0.000543,0.018845,0.000339,7.7e-05,...,0.000687,5.9e-05,0.001933,6.5e-05,6.3e-05,0.001217,6.4e-05,0.000212,0.00032,0.00019
uc1.$b305400_vi,6.4e-05,4.3e-05,0.054625,3.5e-05,0.000296,0.000137,0.000386,0.000197,0.000241,5.5e-05,...,0.000488,4.2e-05,0.067287,4.6e-05,4.5e-05,0.000864,4.5e-05,0.00015,0.000227,0.000135
uc1.$b305400_vii,6.2e-05,4.2e-05,0.059264,3.4e-05,0.006668,0.000132,0.000374,0.00019,0.000233,5.3e-05,...,0.000472,4.1e-05,0.001329,4.4e-05,4.3e-05,0.000836,4.4e-05,0.000146,0.00022,0.00013
uc1.$b305400_viii,0.000105,7.1e-05,0.110961,5.7e-05,0.000484,0.000224,0.000631,0.000322,0.000394,8.9e-05,...,0.011584,6.9e-05,0.002246,7.5e-05,7.3e-05,0.001413,7.4e-05,0.000246,0.000371,0.00022


In [32]:
# Topic distribution of whichever document `doc` currently refers to
# (it is bound by the inference loop and by the `doc=mdl.docs[0]` cell).
doc.get_topic_dist()

array([0.00114124, 0.00199177, 0.00821502, 0.03095148, 0.01164062,
       0.00874124, 0.00112182, 0.01039761, 0.00265881, 0.00264824,
       0.00590874, 0.01194252, 0.00747224, 0.00148831, 0.0172281 ,
       0.00588976, 0.01296221, 0.00131755, 0.01865524, 0.00094265,
       0.00167942, 0.00830613, 0.00193604, 0.00270271, 0.0010566 ,
       0.00086996, 0.00100074, 0.00289788, 0.00132043, 0.00145429,
       0.00621457, 0.00326153, 0.00111377, 0.00592177, 0.00106186,
       0.73067397, 0.00122145, 0.01505502, 0.00167707, 0.00206258,
       0.00118021, 0.00091567, 0.00350079, 0.00093837, 0.00772377,
       0.01788062, 0.00345174, 0.00479487, 0.00398178, 0.00082921],
      dtype=float32)

In [None]:
# Pick the first document stored in the model for inspection.
doc=mdl.docs[0]

In [None]:
# List (topic index, probability) pairs for `doc`.
doc.get_topics()

[(33, 0.8941167593002319),
 (46, 0.03659585118293762),
 (47, 0.007205671165138483),
 (15, 0.005247470922768116),
 (13, 0.0050822049379348755),
 (28, 0.004184012766927481),
 (4, 0.0032977887894958258),
 (25, 0.0030990480445325375),
 (9, 0.002993296831846237),
 (2, 0.0025174636393785477)]