# Topic modeling

In [2]:
# !pip install orjson topic-wizard sqlitedict

In [3]:
import os
import pandas as pd
from tqdm.auto import tqdm
import orjson
import zlib
from sqlitedict import SqliteDict
import topicwizard
import random
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s')

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# corpus
path_corpus=os.path.expanduser('~/ppa_data/solrcorpus2')
path_metadata = os.path.join(path_corpus, 'metadata.csv')
path_pages = os.path.join(path_corpus, 'corpus.sqlitedict')

In [5]:
# Read metadata
# df_metadata = pd.read_csv(path_metadata).fillna('').set_index('work_id')
# df_metadata

In [8]:
def encode_cache(x): return sqlite3.Binary(zlib.compress(orjson.dumps(x)))
def decode_cache(x): return orjson.loads(zlib.decompress(bytes(x)))
def get_pages_db():
    return SqliteDict(path_pages, flag='r', tablename='texts', encode=encode_cache, decode=decode_cache)
def get_meta_db():
    return SqliteDict(path_pages, flag='r', tablename='metadata', encode=encode_cache, decode=decode_cache)

In [15]:
%%timeit
for x in get_meta_db().items(): pass

21.9 ms ± 1.41 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [17]:
# for x in get_pages_db().items(): pass
# x

In [37]:
from collections import Counter
CLUSTER_KEY='cluster_id_s'

def iter_pages(lim=None,min_num_words=None,max_pages_per_doc=None,max_pages_per_cluster=None):
    num=0
    clustercounts=Counter()
    breaknow=False
    with get_pages_db() as db, get_meta_db() as mdb:
        for work_id in tqdm(list(db.keys()),desc='Iterating files'):
            if breaknow: break

            meta = mdb[work_id]
            pages = db[work_id]
            cluster = meta.get(CLUSTER_KEY,work_id)

            if min_num_words:
                pages = [d for d in pages if len(d['page_tokens'])>=min_num_words]

            if max_pages_per_doc:
                random.shuffle(pages)
                pages=pages[:max_pages_per_doc]

            for page in pages:
                if not max_pages_per_cluster or clustercounts[cluster]<max_pages_per_cluster:
                    yield dict(
                        work_cluster = cluster,
                        **page
                    )
                    clustercounts[cluster]+=1
                    num+=1
                    if lim and num>=lim:
                        breaknow=True
                        break

def iter_corpus():
    yield from iter_pages(min_num_words=25, max_pages_per_cluster=25)

# next(iter_pages())
# for x in iter_pages(max_pages_per_cluster=1): pass
for i,x in enumerate(iter_corpus()): pass
i
# next(iter_corpus())

Iterating files: 100%|██████████| 3983/3983 [02:39<00:00, 24.94it/s]


72427

In [41]:
import pickle

def get_corpus(force=False):
    fn='data.topicwizard.corpus.pkl'
    if not force and os.path.exists(fn):
        with open(fn,'rb') as f: 
            return pickle.load(f)

    corpus = []
    docids = []
    clusterids = []
    for d in iter_corpus():
        if random.random() > .75:
            corpus.append(d['page_text_clean'])
            docids.append(d['page_id'])
            clusterids.append(d['work_cluster'])

    fn='data.topicwizard.corpus.pkl'
    pkg=(corpus,docids,clusterids)
    with open(fn,'wb') as of:
        pickle.dump(pkg, of)
    
    return pkg


In [42]:
corpus,docids,clusterids = get_corpus(force=True)
len(corpus)

Iterating files: 100%|██████████| 4729/4729 [03:03<00:00, 25.78it/s]


20642

In [43]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import NMF
from topicwizard.pipeline import make_topic_pipeline

vectorizer = CountVectorizer(min_df=5, max_df=0.8, stop_words="english")
model = NMF(n_components=50)
pipeline = make_topic_pipeline(vectorizer, model, pandas_out=True)

In [44]:
pipeline.fit(corpus)

In [45]:
res=topicwizard.visualize(
    corpus, 
    pipeline=pipeline,
    document_names=docids,
    group_labels=clusterids
)

Preprocessing



divide by zero encountered in log


divide by zero encountered in log


divide by zero encountered in log


divide by zero encountered in log


divide by zero encountered in log


divide by zero encountered in log


divide by zero encountered in log


divide by zero encountered in log



In [None]:
from topicwizard.figures import topic_barcharts,word_map

# topic_barcharts(corpus, pipeline=pipeline, top_n=5)
word_map(corpus, pipeline=pipeline)