# Topic modeling hexameter corpus


In [8]:
import sys;sys.path.append('..')
from ppanlp import *
ppa = PPA()

In [24]:
url_hexameter_texts = 'https://docs.google.com/spreadsheets/d/e/' \
'2PACX-1vSMEiSg0lWf_r2HrNQtKHSe76Sbk-DSpb_93FDwSOmlP5sjwwsreWOXJWPJ6ruzJQ/pub?gid=906568184&single=true&output=csv'

In [26]:
from intspan import intspan

In [28]:
df_texts=pd.read_csv(url_hexameter_texts)
df_texts.columns = ['title', 'work_id', 'pages_orig', 'pages_digital','author']
df_texts['pages_digital']=df_texts['pages_digital'].apply(lambda x: '' if not x.strip()[0].isdigit() else x.strip())
df_texts

Unnamed: 0,title,work_id,pages_orig,pages_digital,author
0,Lectures on Rhetoric and Belles Lettres,hvd.hxjgck,79-106,87-113,"Blair, Hugh"
1,"The poetical Decameron, or, Ten conversations ...",nyp.33433074834932,77-145,131-199,"Collier, John Payne"
2,‘Review of Preface to A Vision of Judgement’,njp.32101076378536-p436,422-36,436-450,Anon
3,Historical and critical remarks upon the moder...,hvd.hnjimv,Whole text.,,"Tillbrook, Samuel"
4,"The Principles of Rhythm, both in Speech and M...",nyp.33433056659844,126-163,158-195,"Roe, Richard"
5,"The history of English poetry, from the close ...",hvd.hwdqtx,47-450,565-968,"Warton, Thomas"
6,"Prosodia graeca, or, An exposition of the Gree...",hvd.32044081367153,Whole text.,,"Dunbar, George"
7,"The beauties of modern literature, in verse an...",nyp.33433076078660,299-313,409- 423,"MacDermot, Martin"
8,A general critical grammar of the Inglish lang...,nnc1.cu58512900,225-247,261-283,"Oliver, Samuel"
9,The philosophy of the human voice,nyp.33433084113012,149-179,165-195,"Rush, James"


In [56]:
def iter_pages(self, df):
    id2pages = {
        id:set(intspan(prange)) if prange else prange
        for id,prange in zip(df.work_id, df.pages_digital)
    }
    for page in self.iter_pages():
        if not page.text.id in id2pages: continue
        pagerange = id2pages[page.text.id]
        pagenum = page.meta['page_num']
        if not pagerange or pagenum in pagerange:
            yield page

In [59]:
pages = list(iter_pages(ppa,df_texts))

[32m2023-12-08 08:02:32,027[0m [34m[1m| iterating pages by corpus jsonl file[0m
Iterating over pages.jsonl.gz: 100%|█████████████████████████████████████████████████████████| 2160441/2160441 [00:22<00:00, 94996.22it/s]
[32m2023-12-08 08:02:54,773[0m [34m[1m| 22.75 seconds[0m


In [60]:
len(pages)

2038

In [73]:
def topic_model(pages, ntopics=25, niter=100):
    mdl = tp.LDAModel(k=ntopics)
    docd={}
    for page in pages:
        docd[page.id] = mdl.add_doc(page.content_words)
    def getdesc():
        return f'Training model (ndocs={len(docd)}, log-likelihood = {mdl.ll_per_word:.4})'
    
    pbar=tqdm(list(range(0, niter, 1)),desc=getdesc(),position=0)
    for i in pbar:
        pbar.set_description(getdesc())
        mdl.train(1)
    return mdl,docd

In [74]:
mdl,docd = topic_model(pages)

Training model (ndocs=2038, log-likelihood = -10.03): 100%|█████████████████████████████████████████████| 100/100 [00:01<00:00, 60.05it/s]


In [75]:
mdl.summary(topic_word_top_n=10)

<Basic Info>
| LDAModel (current version: 0.12.5)
| 1987 docs, 321827 words
| Total Vocabs: 64104, Used Vocabs: 64104
| Entropy of words: 9.40720
| Entropy of term-weighted words: 9.40720
| Removed Vocabs: <NA>
|
<Training Info>
| Iterations: 100, Burn-in steps: 0
| Optimization Interval: 10
| Log-likelihood per word: -10.03290
|
<Initial Parameters>
| tw: TermWeight.ONE
| min_cf: 0 (minimum collection frequency of words)
| min_df: 0 (minimum document frequency of words)
| rm_top: 0 (the number of top words to be removed)
| k: 25 (the number of topics between 1 ~ 32767)
| alpha: [0.1] (hyperparameter of Dirichlet distribution for document-topic, given as a single `float` in case of symmetric prior and as a list with length `k` of `float` in case of asymmetric prior.)
| eta: 0.01 (hyperparameter of Dirichlet distribution for topic-word)
| seed: 2741382038 (random seed)
| trained in version 0.12.5
|
<Parameters>
| alpha (Dirichlet prior on the per-document topic distributions)
|  [0.0831

In [77]:
import pyLDAvis
topic_term_dists = np.stack([mdl.get_topic_word_dist(k) for k in range(mdl.k)])
doc_topic_dists = np.stack([doc.get_topic_dist() for doc in mdl.docs])
doc_topic_dists /= doc_topic_dists.sum(axis=1, keepdims=True)
doc_lengths = np.array([len(doc.words) for doc in mdl.docs])
vocab = list(mdl.used_vocabs)
term_frequency = mdl.used_vocab_freq

prepared_data = pyLDAvis.prepare(
    topic_term_dists, 
    doc_topic_dists, 
    doc_lengths, 
    vocab, 
    term_frequency,
    start_index=0, # tomotopy starts topic ids with 0, pyLDAvis with 1
    sort_topics=False # IMPORTANT: otherwise the topic_ids between pyLDAvis and tomotopy are not matching!
)

pyLDAvis.save_html(prepared_data, 'ldavis.html')

See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information.  (Deprecated NumPy 1.25)
  return np.find_common_type(types, [])


In [78]:
!open ./ldavis.html