# Topic modeling

In [None]:
# !pip install -U orjson sqlitedict tomotopy nltk pyLDAvis altair ipywidgets

In [None]:
import os
import random
import sqlite3
import warnings
import zlib
from collections import Counter

warnings.filterwarnings('ignore')

import nltk
import numpy as np
import orjson
import pandas as pd
import pyLDAvis
import tomotopy as tp
import topicwizard
from nltk.corpus import stopwords as stops
from sqlitedict import SqliteDict
from tqdm.auto import tqdm

In [None]:
# Corpus locations: the corpus directory under the user's home, its metadata
# CSV, and the SqliteDict file opened by get_pages_db() / get_meta_db().
# path_metadata is only referenced by the commented-out metadata cell.
path_corpus=os.path.expanduser('~/ppa_data/solrcorpus2')
path_metadata = os.path.join(path_corpus, 'metadata.csv')
path_pages = os.path.join(path_corpus, 'corpus.sqlitedict')

In [None]:
# Read metadata
# df_metadata = pd.read_csv(path_metadata).fillna('').set_index('work_id')
# df_metadata

In [None]:
def encode_cache(x):
    """Serialize `x` for storage in the corpus SqliteDict.

    The value is dumped to JSON bytes with orjson, compressed with zlib and
    wrapped in `sqlite3.Binary`.  `decode_cache` reverses these steps.

    Relies on the module-level `import sqlite3`.
    """
    return sqlite3.Binary(zlib.compress(orjson.dumps(x)))
def decode_cache(x):
    """Turn a stored blob back into a Python object (inverse of `encode_cache`)."""
    compressed = bytes(x)
    payload = zlib.decompress(compressed)
    return orjson.loads(payload)
def get_pages_db():
    """Open the `texts` table of the corpus SqliteDict in read-only mode."""
    options = dict(
        flag='r',
        tablename='texts',
        encode=encode_cache,
        decode=decode_cache,
    )
    return SqliteDict(path_pages, **options)
def get_meta_db():
    """Open the `metadata` table of the corpus SqliteDict in read-only mode.

    Note that this reads the same file as `get_pages_db` (`path_pages`), only
    a different table.
    """
    options = dict(
        flag='r',
        tablename='metadata',
        encode=encode_cache,
        decode=decode_cache,
    )
    return SqliteDict(path_pages, **options)

In [None]:
# Metadata field holding a work's cluster id; iter_pages() falls back to the
# work id itself when a metadata record lacks this key.
CLUSTER_KEY='cluster_id_s'

def iter_pages(lim=None,min_num_words=None,max_pages_per_doc=None,max_pages_per_cluster=None, collections=None):
    """Yield page dicts from the corpus, optionally filtered and capped.

    Every yielded dict is the stored page record plus a ``work_cluster`` key
    (the work's ``CLUSTER_KEY`` metadata value, or the work id when absent).

    Args:
        lim: stop after this many pages in total (falsy: no limit).
        min_num_words: drop pages whose ``page_tokens`` has fewer entries
            (falsy: keep all pages).
        max_pages_per_doc: keep at most this many randomly chosen pages per
            work (falsy: keep all pages).
        max_pages_per_cluster: yield at most this many pages per cluster
            (falsy: no cap).
        collections: iterable of collection names; works whose
            ``collections`` metadata shares none of them are skipped
            (falsy: no collection filter).
    """
    # Build the filter set once instead of once per work.
    wanted = set(collections) if collections else set()
    num = 0
    clustercounts = Counter()
    with get_pages_db() as db, get_meta_db() as mdb:
        for work_id in tqdm(list(db.keys()),desc='Iterating works',position=0):
            meta = mdb[work_id]
            if wanted and not wanted & set(meta['collections']):
                continue

            cluster = meta.get(CLUSTER_KEY,work_id)
            # A full cluster can never yield again, so skip the (compressed)
            # page lookup for the work entirely.
            if max_pages_per_cluster and clustercounts[cluster]>=max_pages_per_cluster:
                continue

            pages = db[work_id]
            if min_num_words:
                pages = [d for d in pages if len(d['page_tokens'])>=min_num_words]

            if max_pages_per_doc:
                # Random sample: shuffle, then keep the first N.
                random.shuffle(pages)
                pages = pages[:max_pages_per_doc]

            for page in pages:
                if max_pages_per_cluster and clustercounts[cluster]>=max_pages_per_cluster:
                    break
                yield dict(
                    work_cluster = cluster,
                    **page
                )
                clustercounts[cluster] += 1
                num += 1
                if lim and num>=lim:
                    # Leaving the generator also exits the `with` block,
                    # closing both databases.
                    return

def iter_corpus(lim=None,max_pages_per_doc=25,**kwargs):
    """Yield pages of at least 25 tokens from the Literary/Linguistic collections.

    Thin wrapper around `iter_pages`; extra keyword arguments are forwarded.
    """
    fixed = dict(
        lim=lim,
        min_num_words=25,
        collections={'Literary','Linguistic'},
        max_pages_per_doc=max_pages_per_doc,
    )
    yield from iter_pages(**fixed, **kwargs)

def iter_sample(lim=None):
    """Yield the modelling sample: `iter_corpus` capped at 25 pages per work and per cluster."""
    sample = iter_corpus(lim=lim, max_pages_per_doc=25, max_pages_per_cluster=25)
    yield from sample

# next(iter_pages(collections=['Linguistic']))
# for x in iter_pages(max_pages_per_cluster=1): pass
# for i,x in enumerate(iter_corpus()): pass
# i
# next(iter_corpus())

In [None]:

# NLTK's English stopword list, held as a set for fast membership tests.
stopwords = set(stops.words('english'))


def clean_toks(toks):
    """Return the tokens that are longer than three characters and not stopwords."""
    kept = []
    for tok in toks:
        if len(tok) <= 3 or tok in stopwords:
            continue
        kept.append(tok)
    return kept

In [None]:
def topic_model(ntopic=50, force=False, niter=100):
    """Train an LDA model on the page sample, or load it from disk.

    The model is cached in ``data.tomotopy.model.ntopic=<ntopic>.bin`` with a
    companion ``.index.json`` file mapping page ids to the value returned by
    ``mdl.add_doc``.  Training runs when `force` is true or when either file
    is missing; otherwise both files are loaded.

    Args:
        ntopic: number of topics; also part of the cache filename.
        force: retrain even when cached files exist.
        niter: total number of training iterations.

    Returns:
        ``(mdl, docd)``: the tomotopy model and the page-id index.
    """
    fn=f'data.tomotopy.model.ntopic={ntopic}.bin'
    fnindex=fn+'.index.json'
    if force or not os.path.exists(fn) or not os.path.exists(fnindex):
        # Use the requested topic count so the model matches its filename.
        mdl = tp.LDAModel(k=ntopic)
        docd={}
        for page in iter_sample():
            toks = clean_toks(page['page_tokens'])
            docd[page['page_id']] = mdl.add_doc(toks)

        def getdesc():
            return f'Training model (ndocs={len(docd)}, log-likelihood = {mdl.ll_per_word:.4})'

        step = 10
        pbar=tqdm(range(0, niter, step),desc=getdesc(),position=0)
        for start in pbar:
            # The last chunk is shortened so exactly `niter` iterations run.
            mdl.train(min(step, niter - start))
            # Refresh after training so the bar shows the current likelihood.
            pbar.set_description(getdesc())
        mdl.save(fn)
        with open(fnindex,'wb') as of:
            of.write(orjson.dumps(docd))
    else:
        print(f'Loading model: {fn}')
        mdl = tp.LDAModel.load(fn)
        print(f'Loading model index: {fnindex}')
        with open(fnindex,'rb') as f:
            docd=orjson.loads(f.read())

    mdl.summary()
    return mdl,docd

In [None]:
# Train the default model or load it from disk; the functions in the cells
# below read `mdl` and `docd` as globals.
mdl,docd = topic_model(force=False)

In [None]:
def save_pyldavis():
    """Build a pyLDAvis visualisation of the global `mdl` and write it to ldavis.html."""
    print('Calculating topic_term_dists')
    per_topic = [mdl.get_topic_word_dist(topic_id) for topic_id in range(mdl.k)]
    topic_term = np.stack(per_topic)

    print('Calculating doc_topic_dists')
    doc_topic = np.stack([document.get_topic_dist() for document in mdl.docs])
    # Normalise each document's row by its own sum.
    doc_topic /= doc_topic.sum(axis=1, keepdims=True)

    print('Calculating doc_lengths')
    lengths = np.array([len(document.words) for document in mdl.docs])

    print('Calculating vocab')
    vocabulary = list(mdl.used_vocabs)
    frequencies = mdl.used_vocab_freq

    print('preparing data')
    prepare_options = dict(
        start_index=0,      # tomotopy numbers topics from 0, pyLDAvis from 1
        sort_topics=False,  # keep pyLDAvis topic ids aligned with tomotopy's
    )
    vis = pyLDAvis.prepare(
        topic_term,
        doc_topic,
        lengths,
        vocabulary,
        frequencies,
        **prepare_options,
    )

    print('saving html')
    pyLDAvis.save_html(vis, 'ldavis.html')

In [None]:
# IPython shell escape: open ldavis.html (the file save_pyldavis() writes) with
# the system `open` command.
# NOTE(review): no visible cell calls save_pyldavis(), so the file must already
# exist for this to work.
!open ldavis.html

In [None]:
# NOTE(review): `doc_topic_dists` is only ever assigned as a local variable
# inside save_pyldavis() and get_dfclust(); unless it was defined interactively
# this cell raises NameError.
doc_topic_dists

In [None]:
def get_dfclust():
    """Return one row per cluster: its metadata joined with mean topic weights.

    Reads the globals `mdl` and `docd`.  Per-document topic distributions are
    labelled with their page id via `docd`, mapped to a work and then to a
    cluster, and averaged per cluster.  The result is indexed by
    ``CLUSTER_KEY``; topic columns are named by topic number.

    Works without a cluster id are treated as their own cluster (keyed by
    work id), the same fallback `iter_pages` applies.

    Raises:
        ValueError: if no modelled document can be matched to a page id.
    """
    id2doc={v:k for k,v in docd.items()}
    doc_topic_dists = np.stack([doc.get_topic_dist() for doc in mdl.docs])
    doc_topic_dists /= doc_topic_dists.sum(axis=1, keepdims=True)

    rows = [(id2doc[i],x) for i,x in enumerate(doc_topic_dists) if i in id2doc]
    if not rows:
        raise ValueError('No modelled document could be matched to a page id in docd')
    index,values = zip(*rows)
    dftopicdist = pd.DataFrame(values, index=index)

    with get_meta_db() as mdb:
        dfmeta = pd.DataFrame({'work_id':wid, **mdb[wid]} for wid in tqdm(mdb, total=len(mdb), position=0, desc='Gathering metadata')).set_index('work_id')

    # Records lacking CLUSTER_KEY show up as NaN (or the column is missing
    # altogether); fall back to the work id so those works are neither dropped
    # by the groupby nor collapsed into a single row by drop_duplicates.
    if CLUSTER_KEY in dfmeta.columns:
        dfmeta[CLUSTER_KEY] = dfmeta[CLUSTER_KEY].fillna(dfmeta.index.to_series())
    else:
        dfmeta[CLUSTER_KEY] = dfmeta.index

    w2c = dict(zip(dfmeta.index, dfmeta[CLUSTER_KEY]))
    # presumably page ids look like "<work_id>_<page>" — TODO confirm that work
    # ids never contain an underscore themselves.
    dftopicdist['work_id']=[i.split('_')[0] for i in dftopicdist.index]
    dftopicdist['cluster']=[w2c.get(work_id,work_id) for work_id in dftopicdist.work_id]
    dfclust_avgs=dftopicdist.groupby('cluster').mean(numeric_only=True)
    # The first work of each cluster supplies the cluster's metadata.
    dfclust_meta = dfmeta.drop_duplicates(CLUSTER_KEY).set_index(CLUSTER_KEY)
    return dfclust_meta.join(dfclust_avgs)

In [None]:
# Cluster-level metadata + mean topic weights; read as a global by get_figdf()
# and get_topic_info_df().
dfclust = get_dfclust()

In [None]:
import altair as alt
from ipywidgets import interactive, interact, interact_manual, HBox
from functools import cache

# Topic numbers, each topic's top words joined by ", ", and display labels of
# the form "<number>: <top words>".
tnums = list(range(mdl.k))
topicwords_d = {
    topic_id: ', '.join(word for word, _weight in mdl.get_topic_words(topic_id))
    for topic_id in tnums
}
topicnames = [f'{topic_id}: {words}' for topic_id, words in topicwords_d.items()]

def get_topic_name(tnum):
    """Return the display label ("<number>: <top words>") of topic `tnum`."""
    label = topicnames[tnum]
    return label

def get_wordcloud(tnum):
    """Return a word cloud built from the 100 highest-weighted words of topic `tnum`."""
    # NOTE(review): `WordCloud` is not imported anywhere in the visible file;
    # it has to be brought into scope (presumably
    # `from wordcloud import WordCloud`) or this raises NameError.
    cloud = WordCloud(background_color='white', width=800, height=400)
    weights = dict(mdl.get_topic_words(tnum, top_n=100))
    return cloud.generate_from_frequencies(weights)

@cache
def get_figdf(tnum):
    """Return the plotting table for topic `tnum`, one row per cluster.

    Columns: cluster, title, author, date, publisher, pubplace, source, genre,
    topic.  `genre` is 'Linguistic' when the cluster is in that collection,
    otherwise its first collection among Linguistic/Literary; clusters in
    neither are dropped, as are rows dated before 1700.

    The result is cached per topic, so callers share one DataFrame object.
    """
    wanted = {'Linguistic','Literary'}

    def genre_of(colls):
        kept = [name for name in colls if name in wanted]
        if 'Linguistic' in set(kept):
            return 'Linguistic'
        # An empty list is falsy, which the filter below relies on.
        return kept[0] if kept else kept

    source_cols = [CLUSTER_KEY,'title','author','pub_date','publisher','pub_place','source_url', 'collections', tnum]
    figdf = dfclust.reset_index()[source_cols]
    figdf['collections'] = figdf['collections'].apply(genre_of)

    has_genre = figdf['collections'].apply(bool)
    figdf = figdf[has_genre]
    figdf.columns = ['cluster', 'title', 'author', 'date', 'publisher', 'pubplace', 'source', 'genre', 'topic']

    return figdf[figdf['date'] >= 1700]

In [None]:
# Display the label of the first topic.
get_topic_name(0)

In [None]:
# %%timeit
# get_figdf(0)

In [None]:
# @interact(tname=topicnames)
def plot_topic(tname):
    """Draw the word cloud of a topic and return its Altair scatter chart.

    `tname` is a topic label whose text before the first ':' is the topic
    number.  The chart plots topic weight against date, one circle per
    cluster, coloured by genre.
    """
    tnum = int(tname.split(':')[0])
    figdf = get_figdf(tnum)
    topicwords = topicwords_d.get(tnum)

    # NOTE(review): `plt` is not imported anywhere in the visible file; it has
    # to be brought into scope (presumably `import matplotlib.pyplot as plt`).
    plt.imshow(get_wordcloud(tnum), interpolation='bilinear')
    plt.axis("off")

    x_axis = alt.X('date', scale=alt.Scale(domain=[1700, 1920]))
    points = alt.Chart(figdf).mark_circle(size=60)
    chart = points.encode(
        x=x_axis,
        y='topic',
        color='genre',
        tooltip=figdf.columns.tolist(),
    )
    chart = chart.interactive()
    chart = chart.properties(width=800, height=400)
    # (A variant faceted by genre, two columns with independent y scales, was
    # sketched at this point and left disabled.)
    return chart.properties(title=f'Topic {tnum}: {topicwords}')

In [None]:
# !pip install plotnine
import plotnine as p9
# Default figure size (width, height) for the plotnine plots created below.
p9.options.figure_size=(8,4)

def plot_topic_min(tnum):
    """Plot and save the smoothed prevalence of topic `tnum` over time, per genre.

    Topic weights are reduced to the median per (genre, date), smoothed with a
    10-row rolling mean within each genre, and drawn as points plus a loess
    curve.  The figure is written to ``timeplots/fig.timeplot.tnum=<tnum>.png``
    and returned.
    """
    # get_figdf() is @cache'd, so its result is shared between callers.
    # .assign() returns a new frame instead of adding `period` to the cached
    # one, which would otherwise leak into every later get_figdf(tnum) call.
    figdf = get_figdf(tnum).assign(period=lambda df: df['date'].apply(str))
    figdf=figdf.groupby(['genre','period']).median(numeric_only=True).reset_index()
    figdf=pd.concat(
        gdf.assign(topic=gdf.topic.rolling(10).mean())
        for _genre,gdf in figdf.groupby('genre')
    )
    fig=p9.ggplot(figdf, p9.aes(x='date',y='topic',color='genre'))
    fig+=p9.geom_point()
    fig+=p9.geom_smooth(method='loess')
    fig+=p9.labs(
        title=get_topic_name(tnum),
        x='Date of publication',
        y='Prevalence of Topic'
    )
    fig+=p9.theme_classic()
    odir='timeplots'
    os.makedirs(odir,exist_ok=True)
    fig.save(f'{odir}/fig.timeplot.tnum={tnum}.png')
    return fig

In [None]:
# !pip install scikit-misc

In [None]:
# plot_topic_min(0)

In [None]:
# for tnum in range(mdl.k): plot_topic_min(tnum)

In [None]:
from string import punctuation

def get_cluster_name(clustid):
    """Return a short label for a cluster: "<title> (<year>) [<source url>]".

    The title is stripped of surrounding punctuation and cut to 50 characters;
    the year is the first four characters of the publication date.

    Metadata is read from the global `dfclust`, whose rows carry the cluster
    metadata columns.  The previous lookup used `dfclust_meta`, a name that
    only exists as a local variable inside `get_dfclust`.
    """
    meta = dfclust.loc[clustid]
    title = meta["title"].strip(punctuation)[:50]
    year = str(meta["pub_date"])[:4]
    return f'{title} ({year}) [{meta["source_url"]}]'

In [None]:
# Display the label of one example cluster.
get_cluster_name('mdp.39015050663247')

In [None]:
def save_wordclouds(odir='wordclouds'):
    """Render one word-cloud PNG per topic into `odir`.

    Files are named ``fig.wordcloud.tnum=<tnum>.png``; figures are 800x400
    pixels at 75 dpi.
    """
    # NOTE(review): `plt` is not imported anywhere in the visible file; it has
    # to be brought into scope (presumably `import matplotlib.pyplot as plt`).
    my_dpi=75
    os.makedirs(odir, exist_ok=True)
    for tnum in tqdm(list(range(mdl.k)), desc='Saving wordclouds'):
        wordcloud=get_wordcloud(tnum)
        # Create the figure first so the styling calls act on it rather than
        # on whatever figure happened to be current.
        fig = plt.figure(figsize=(800/my_dpi, 400/my_dpi), dpi=my_dpi)
        try:
            plt.box(False)
            plt.imshow(wordcloud, interpolation='bilinear')
            plt.axis("off")
            plt.savefig(f'{odir}/fig.wordcloud.tnum={tnum}.png')
        finally:
            # Without this, one figure per topic stays open in memory.
            plt.close(fig)

In [None]:
# save_wordclouds()

In [None]:
def geturl(tnum):
    """Return the raw GitHub URL of the word-cloud image for topic `tnum`."""
    folder = 'https://raw.githubusercontent.com/Princeton-CDH/ppa-nlp/develop/notebooks/wordclouds'
    # "%3D" is the URL-encoded "=" of the on-disk file name.
    filename = 'fig.wordcloud.tnum%3D{}.png'.format(tnum)
    return folder + '/' + filename

def geturl2(tnum):
    """Return the raw GitHub URL of the time-plot image for topic `tnum`."""
    folder = 'https://raw.githubusercontent.com/Princeton-CDH/ppa-nlp/develop/notebooks/timeplots'
    # "%3D" is the URL-encoded "=" of the on-disk file name.
    filename = 'fig.timeplot.tnum%3D{}.png'.format(tnum)
    return folder + '/' + filename


def get_topic_info_df():
    """Return a per-topic summary table indexed by topic number.

    Columns: an empty 'Topic Name' to fill in by hand, the top 50 words, a
    bulleted list of the five clusters with the highest weight for the topic,
    and spreadsheet ``=IMAGE(...)`` formulas pointing at the word-cloud and
    time-plot images.
    """
    def describe(topic_id):
        top_words = [word for word, _weight in mdl.get_topic_words(topic_id, top_n=50)]
        ranked = dfclust.sort_values(topic_id, ascending=False)
        top_docs = [get_cluster_name(cluster_id) for cluster_id in ranked.index[:5]]
        return {
            'Topic': topic_id,
            'Topic Name': '',
            'Top Words': ', '.join(top_words),
            'Top Documents': '* ' + '\n* '.join(top_docs),
            'Word Cloud': f'=IMAGE("{geturl(topic_id)}")',
            'Historical Plot': f'=IMAGE("{geturl2(topic_id)}")',
        }

    topic_ids = tqdm(list(range(mdl.k)), desc='Gathering info on topics')
    records = [describe(topic_id) for topic_id in topic_ids]
    return pd.DataFrame(records).set_index('Topic')

In [None]:
# Build the per-topic summary table (exported to Excel in the next cell).
tdf=get_topic_info_df()

In [None]:
# !pip install openpyxl
# Write the topic summary table to an Excel workbook in the working directory.
tdf.to_excel('data.topic_info.xlsx')