# Topic modeling

In [None]:
# !pip install -U pip wheel
# !pip install -U topic-wizard tqdm pandas orjson

In [None]:
import os
import pandas as pd
from tqdm import tqdm
import orjson
import topicwizard
import logging
# Prefix every log record with a timestamp and its level name. No level is
# passed here, so the root logger keeps its default threshold.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s')

In [None]:
# Corpus locations: every input lives under one corpus directory in $HOME.
path_corpus = os.path.expanduser('~/ppa_data/solrcorpus')
path_metadata, path_minimal, path_texts = (
    os.path.join(path_corpus, entry)
    for entry in ('metadata.csv', 'minimal.jsonl', 'texts')
)
# Starts unset (None); the only references to it are in the commented-out
# jsonl reader, which appears to use it as a cached line count.
path_minimal_numlines = None

In [None]:
# Load the metadata CSV, replace missing values with empty strings and
# index the table by its `work_id` column.
df_metadata = (
    pd.read_csv(path_metadata)
    .fillna('')
    .set_index('work_id')
)
# df_metadata

In [None]:
# def iter_id_tokens(min_word_len=4, min_num_words=25):
#     global path_minimal_numlines
#     if path_minimal_numlines == None:
#         with open(path_minimal) as f:
#             path_minimal_numlines = sum(1 for line in tqdm(f,desc='Getting number of lines',position=0))

#     with open(path_minimal) as f:
#         for ln in tqdm(f,total=path_minimal_numlines,desc='Iterating over jsonl',position=0):
#             try:
#                 d=orjson.loads(ln)
#             except Exception:
#                 continue
#             toks = [
#                 x 
#                 for x in d['toks'] 
#                 if not min_word_len or len(x)>=min_word_len
#             ]
#             if not min_num_words or len(toks)>=min_num_words:
#                 yield (d['id'], toks)


# def iter_tokens(lim=None,**kwargs):
#     for i,(id,toks) in enumerate(iter_id_tokens(**kwargs)):
#         yield toks
#         if lim and i+1>=lim: break

# def iter_tokens_txt(**kwargs):
#     for x in iter_tokens(**kwargs):
#         yield ', '.join(x)

# # next(iter_id_tokens())
# # next(iter_tokens())
# # list(iter_tokens(1))
# next(iter_tokens_txt())

In [None]:
import gzip,random

def iter_pages(lim=None, min_num_words=25, max_pages_per_doc=1):
    """Yield page dicts sampled from the works listed in ``df_metadata``.

    For each ``work_id`` in ``df_metadata.index`` the gzipped JSON file
    ``<path_texts>/<work_id>.json.gz`` is loaded; works without such a file
    are skipped. Pages are filtered by token count, randomly sub-sampled
    per work, and yielded as the dicts found in the file.

    Args:
        lim: Maximum total number of pages to yield. A falsy value
            (``None`` or ``0``) means no limit.
        min_num_words: Minimum length of a page's ``page_tokens`` list for
            the page to be kept. A falsy value disables the filter.
        max_pages_per_doc: Maximum number of pages taken from a single
            work, chosen by shuffling. A falsy value keeps every page in
            file order.

    Yields:
        One page dict at a time.
    """
    num_yielded = 0
    for work_id in tqdm(df_metadata.index, desc='Iterating files'):
        fn = os.path.join(path_texts, work_id + '.json.gz')
        if not os.path.exists(fn):
            continue

        # Read raw bytes: orjson decodes UTF-8 itself, so parsing no longer
        # depends on the platform's default text encoding.
        with gzip.open(fn, 'rb') as f:
            data = orjson.loads(f.read())

        pages = [
            page
            for page in data
            if not min_num_words
            or len(page.get('page_tokens', [])) >= min_num_words
        ]

        if max_pages_per_doc:
            random.shuffle(pages)
            pages = pages[:max_pages_per_doc]

        if lim:
            # Trim the last batch so the total never exceeds ``lim``; the
            # limit used to be checked only after a whole work was yielded.
            pages = pages[:max(lim - num_yielded, 0)]

        yield from pages

        num_yielded += len(pages)
        if lim and num_yielded >= lim:
            break

def iter_pages_text(**kwargs):
    """Yield the cleaned text of each page produced by ``iter_pages``.

    All keyword arguments are forwarded unchanged to ``iter_pages``. A page
    with no ``page_text_clean`` entry contributes an empty string.
    """
    for page in iter_pages(**kwargs):
        yield page.get('page_text_clean', '')

In [None]:
import pickle

def get_corpus(force=False):
    """Return ``(corpus, docids, clusterids)``, using an on-disk cache.

    The three parallel lists hold, for every page yielded by
    ``iter_pages()``, its ``page_text_clean``, ``page_id`` and
    ``work_cluster`` values. The tuple is cached as a pickle named
    ``data.topicwizard.corpus.pkl`` in the current working directory.

    Args:
        force: If true, ignore any existing cache file and rebuild it.

    Returns:
        The ``(corpus, docids, clusterids)`` tuple; on a cache hit, whatever
        object the cache file contains.

    Raises:
        KeyError: If a page dict lacks one of the three keys read here.
    """
    fn = 'data.topicwizard.corpus.pkl'
    if not force and os.path.exists(fn):
        # NOTE: unpickling can execute arbitrary code. Only load cache files
        # that this notebook wrote itself.
        with open(fn, 'rb') as f:
            return pickle.load(f)

    corpus = []
    docids = []
    clusterids = []
    for d in iter_pages():
        corpus.append(d['page_text_clean'])
        docids.append(d['page_id'])
        clusterids.append(d['work_cluster'])

    pkg = (corpus, docids, clusterids)

    # Write to a temporary file and swap it into place, so an interrupted
    # run cannot leave a truncated cache for the next call to load.
    tmp_fn = fn + '.tmp'
    with open(tmp_fn, 'wb') as of:
        pickle.dump(pkg, of)
    os.replace(tmp_fn, fn)

    return pkg


In [None]:
# force=True skips any cached pickle, so this cell re-reads the page files
# and rewrites the cache on every run.
corpus,docids,clusterids = get_corpus(force=True)
# Number of page texts collected.
len(corpus)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import NMF
from topicwizard.pipeline import make_topic_pipeline

# Bag-of-words counts with English stop words removed; min_df / max_df drop
# terms found in fewer than 5 documents or in more than 80% of them.
vectorizer = CountVectorizer(
    stop_words="english",
    min_df=5,
    max_df=0.8,
)
# 50-component NMF decomposition used as the topic model.
model = NMF(n_components=50)
# Combine both steps into a single topicwizard pipeline.
pipeline = make_topic_pipeline(
    vectorizer,
    model,
    pandas_out=True,
)

In [None]:
# Fit the vectorizer and the NMF model on the collected page texts.
pipeline.fit(corpus)

In [None]:
# Run topicwizard's visualisation on the fitted pipeline, passing the page
# ids as document names and the work cluster ids as group labels.
res=topicwizard.visualize(
    corpus, 
    pipeline=pipeline,
    document_names=docids,
    group_labels=clusterids
)


divide by zero encountered in log


divide by zero encountered in log


divide by zero encountered in log


divide by zero encountered in log


divide by zero encountered in log


divide by zero encountered in log


divide by zero encountered in log


divide by zero encountered in log


divide by zero encountered in log


divide by zero encountered in log


divide by zero encountered in log


divide by zero encountered in log



: 

In [None]:
from topicwizard.figures import topic_barcharts,word_map

# Standalone figure built from the same corpus and fitted pipeline; the
# bar-chart variant is left disabled.
# topic_barcharts(corpus, pipeline=pipeline, top_n=5)
word_map(corpus, pipeline=pipeline)