In [3]:
# Locations relative to the notebook's working directory:
# data_dir is used below to locate the comparison corpus,
# model_dir is where the trained LDA models are saved at the end.
data_dir = "../../data/trec_covid_topic_modelling"
model_dir = "../../models/trec_covid_topic_modelling"

# LDA Model
First, read the tokenized documents.

In [4]:
import csv
import os
import sys
import gensim
import numpy as np
csv.field_size_limit(sys.maxsize)

# Tokenized TREC-COVID documents. The default is the original absolute path;
# set the TREC_COVID_TOKENIZED_PATH environment variable to point at a local
# copy when running the notebook on another machine.
f_tokenized_path = os.environ.get(
    "TREC_COVID_TOKENIZED_PATH",
    "/home/tfink/data/kodicare/trec-covid/TREC-COVID_sample_content.csv.tokenized.txt",
)
# Tokenized comparison documents from the news domain.
f_comp_tokenized_path = os.path.join(data_dir, "abcnews-date-text.csv.tokenized.txt")

# LDA hyperparameters passed to every LdaMulticore model trained below.
num_topics = 30
passes = 40
iterations = 5000
minimum_probability = 0.05


def read_tokenized(path, batch_size = None):
    """Lazily read pre-tokenized documents from a two-column CSV file.

    Every non-empty row must consist of exactly two fields,
    ``(cord_uid, doc_text_tokenized)``, where the second field holds the
    document's tokens separated by single spaces.

    Args:
        path: Path of the CSV file to read.
        batch_size: If falsy (the default), yield one token list per
            document. Otherwise yield lists of ``batch_size`` token lists;
            the final batch may be shorter.

    Yields:
        A list of tokens per document, or a list of such lists per batch.

    Raises:
        ValueError: If a non-empty row does not have exactly two fields.
    """
    # newline="" lets the csv module do its own newline handling (needed for
    # quoted fields that contain line breaks); the explicit encoding keeps
    # non-ASCII tokens readable independent of the platform's locale.
    with open(path, "r", newline="", encoding="utf-8") as fp:
        reader = csv.reader(fp, delimiter=",", quotechar='"')
        batch = []
        for row in reader:
            if not row:
                # csv.reader returns [] for blank lines; skip them instead of
                # failing in the unpacking below.
                continue
            _cord_uid, doc_text_tokenized = row
            doc_tokens = doc_text_tokenized.split(" ")
            if batch_size:
                batch.append(doc_tokens)
                if len(batch) == batch_size:
                    yield batch
                    batch = []
            else:
                yield doc_tokens

        # Flush the last, possibly incomplete batch.
        if batch:
            yield batch

In [5]:
# Materialise the TREC-COVID documents: one token list per document
# (batch_size stays at its default of None, so nothing is batched).
processed_docs = [tokens for tokens in read_tokenized(f_tokenized_path)]
print(processed_docs[0][0:10])

# Same for the comparison documents from the news domain.
processed_docs_comp = [tokens for tokens in read_tokenized(f_comp_tokenized_path)]
print(processed_docs_comp[0][0:10])

['clinical', 'feature', 'culture', 'prove', 'mycoplasma', 'pneumoniae', 'infection', 'king', 'abdulaziz', 'university']
['decide', 'community', 'broadcasting', 'licence']


In [6]:
# Build a single vocabulary over both corpora (presumably so that token ids
# are shared between the TREC-COVID and the comparison models).
dictionary = gensim.corpora.Dictionary(processed_docs+processed_docs_comp)
# Prune the vocabulary in place. Per the gensim documentation: no_below drops
# tokens occurring in fewer than 15 documents, no_above drops tokens occurring
# in more than 50% of the documents, keep_n caps the vocabulary at 100000.
dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)
print(len(dictionary))
# Last expression of the cell: display the first ten (id, token) pairs.
list(dictionary.items())[:10]

21272


[(0, '<NUM>'),
 (1, 'abscess'),
 (2, 'absence'),
 (3, 'access'),
 (4, 'accord'),
 (5, 'account'),
 (6, 'accounting'),
 (7, 'acquire'),
 (8, 'acute'),
 (9, 'additionally')]

In [7]:
from gensim import models

def create_corpus(docs, verbose=True):
    """Convert tokenized documents into a TF-IDF weighted bag-of-words corpus.

    Tokens are mapped to ids with the notebook-level ``dictionary``. A new
    ``TfidfModel`` is fitted on the resulting bag-of-words corpus, so every
    call uses statistics of its own ``docs`` only.

    Args:
        docs: Iterable of token lists.
        verbose: When true (the default, matching the previous behaviour),
            print the term counts of the first document as a sanity check.

    Returns:
        The bag-of-words corpus wrapped by the fitted model, i.e.
        ``tfidf[bow_corpus]``.
    """
    bow_corpus = [dictionary.doc2bow(doc) for doc in docs]

    # Preview the first document; skipped for an empty corpus, where indexing
    # bow_corpus[0] would raise IndexError.
    if verbose and bow_corpus:
        for term_id, count in bow_corpus[0]:
            print("Word {} (\"{}\") appears {} time.".format(term_id,
                                                    dictionary[term_id],
                                                    count))

    tfidf = models.TfidfModel(bow_corpus)
    corpus_tfidf = tfidf[bow_corpus]
    return corpus_tfidf

# TF-IDF corpora for the TREC-COVID documents and for the comparison
# documents; each call fits its own TfidfModel and prints the term counts of
# the corpus' first document.
corpus = create_corpus(processed_docs)
corpus_comp = create_corpus(processed_docs_comp)

Word 0 ("<NUM>") appears 134 time.
Word 1 ("abscess") appears 2 time.
Word 2 ("absence") appears 2 time.
Word 3 ("access") appears 2 time.
Word 4 ("accord") appears 4 time.
Word 5 ("account") appears 6 time.
Word 6 ("accounting") appears 2 time.
Word 7 ("acquire") appears 12 time.
Word 8 ("acute") appears 8 time.
Word 9 ("additionally") appears 2 time.
Word 10 ("adenovirus") appears 2 time.
Word 11 ("admission") appears 10 time.
Word 12 ("admit") appears 4 time.
Word 13 ("adult") appears 14 time.
Word 14 ("affect") appears 4 time.
Word 15 ("agar") appears 8 time.
Word 16 ("age") appears 8 time.
Word 17 ("agent") appears 2 time.
Word 18 ("airway") appears 4 time.
Word 19 ("amplification") appears 2 time.
Word 20 ("analysis") appears 2 time.
Word 21 ("and/or") appears 2 time.
Word 22 ("anemia") appears 4 time.
Word 23 ("annual") appears 2 time.
Word 24 ("antibiotic") appears 2 time.
Word 25 ("approach") appears 2 time.
Word 26 ("arabia") appears 11 time.
Word 27 ("area") appears 4 time.


In [8]:
# First LDA model on the TF-IDF weighted TREC-COVID corpus. Hyperparameters
# come from the configuration cell; only the seed and worker count are local.
lda_model_tfidf = gensim.models.LdaMulticore(
    corpus=corpus,
    id2word=dictionary,
    num_topics=num_topics,
    passes=passes,
    iterations=iterations,
    minimum_probability=minimum_probability,
    workers=4,
    random_state=2018,
)

In [9]:
# List every topic of the first model (-1 requests all topics).
for topic_id, topic_repr in lda_model_tfidf.print_topics(num_topics=-1):
    print('Topic: {} Word: {}'.format(topic_id, topic_repr))

Topic: 0 Word: 0.002*"tregs" + 0.001*"heater" + 0.001*"nina" + 0.001*"lamp" + 0.001*"isothermal" + 0.001*"prize" + 0.001*"citation" + 0.001*"padre" + 0.001*"foreigner" + 0.001*"heating"
Topic: 1 Word: 0.004*"cat" + 0.001*"feline" + 0.001*"reassortant" + 0.001*"sofa" + 0.001*"gastroenteritis" + 0.001*"thai" + 0.001*"trout" + 0.001*"peru" + 0.001*"myocarditis" + 0.001*"seroprevalence"
Topic: 2 Word: 0.003*"ifnα" + 0.001*"cd14" + 0.001*"peptidase" + 0.001*"amazon" + 0.001*"pneumonitis" + 0.001*"poliovirus" + 0.001*"emerging" + 0.001*"weather" + 0.001*"gibbs" + 0.001*"il-4"
Topic: 3 Word: 0.003*"rabie" + 0.003*"tlr2" + 0.002*"tlr9" + 0.002*"myd88" + 0.002*"tlr" + 0.001*"ctl" + 0.001*"tlr7" + 0.001*"organizational" + 0.001*"adenosine" + 0.001*"ancestry"
Topic: 4 Word: 0.001*"zanamivir" + 0.001*"traveller" + 0.001*"sensor" + 0.001*"electrode" + 0.001*"il-1" + 0.001*"chat" + 0.001*"games" + 0.001*"tomato" + 0.000*"placebo" + 0.000*"destination"
Topic: 5 Word: 0.002*"ssdna" + 0.001*"integrin" 

In [10]:
# Topic-term matrix of the first model; the recorded output shows its shape
# as (num_topics, vocabulary size).
topics = lda_model_tfidf.get_topics()
print(topics.shape)
# Sanity checks on the row of topic 1: the largest entry should match the
# weight of the top word printed for topic 1 above, and the row should sum
# to (approximately) 1.
print(topics[1,:])
print(max(topics[1,:]), "<-- should be the first word of topic 1 from above")
print(min(topics[1,:]))
print(sum(topics[1,:]))
print()
# Five most probable terms of topic 1; topic_terms is reused by a later cell
# that compares these terms against the second model.
topic_terms = lda_model_tfidf.get_topic_terms(1, topn=5)
for term_id, prob in topic_terms:
    print(f"Term: {dictionary[term_id]}, Term Id: {term_id}, Prob: {prob:.2%}")

(30, 21272)
[4.6077959e-05 4.6077272e-05 4.6080637e-05 ... 4.6077272e-05 4.6077272e-05
 4.6077272e-05]
0.0035455448 <-- should be the first word of topic 1 from above
4.607727e-05
1.0000000487998477

Term: cat, Term Id: 7346, Prob: 0.35%
Term: feline, Term Id: 9064, Prob: 0.12%
Term: reassortant, Term Id: 8897, Prob: 0.11%
Term: sofa, Term Id: 9311, Prob: 0.10%
Term: gastroenteritis, Term Id: 6515, Prob: 0.09%


In [11]:
# Second LDA model on the same TREC-COVID corpus; identical settings to the
# first model except for the random seed.
lda_model_tfidf_2 = gensim.models.LdaMulticore(
    corpus=corpus,
    id2word=dictionary,
    num_topics=num_topics,
    passes=passes,
    iterations=iterations,
    minimum_probability=minimum_probability,
    workers=4,
    random_state=2023,
)

In [None]:
# List every topic of the second model (-1 requests all topics).
for topic_id, topic_repr in lda_model_tfidf_2.print_topics(num_topics=-1):
    print('Topic: {} Word: {}'.format(topic_id, topic_repr))

In [None]:
# For each top term of topic 1 in the first model (topic_terms comes from an
# earlier cell), report the highest probability that any topic of the second
# model assigns to the same term.
topics = lda_model_tfidf_2.get_topics()
for term_id, prob in topic_terms:
    # Column term_id holds this term's probability in every topic.
    max_other = topics[:, term_id].max()
    print(f"Term: {dictionary[term_id]}, Prob: {prob:.2%}, Other Prob: {max_other:.2%}")

In [None]:
# LDA model on the news-domain comparison corpus, using the same shared
# dictionary and hyperparameters as the TREC-COVID models.
lda_model_tfidf_comp = gensim.models.LdaMulticore(
    corpus=corpus_comp,
    id2word=dictionary,
    num_topics=num_topics,
    passes=passes,
    iterations=iterations,
    minimum_probability=minimum_probability,
    workers=4,
    random_state=2023,
)

In [None]:
# List every topic of the comparison model trained in the preceding cell
# (this cell previously printed the second TREC-COVID model again).
for idx, topic in lda_model_tfidf_comp.print_topics(-1):
    print('Topic: {} Word: {}'.format(idx, topic))

In [None]:
# Persist all three trained models below model_dir. Create the directory
# first so that saving does not fail on a fresh checkout where it is missing.
os.makedirs(model_dir, exist_ok=True)

lda_1_path = os.path.join(model_dir, "lda_1")
lda_2_path = os.path.join(model_dir, "lda_2")
lda_comp_path = os.path.join(model_dir, "lda_comp")

lda_model_tfidf.save(lda_1_path)
lda_model_tfidf_2.save(lda_2_path)
lda_model_tfidf_comp.save(lda_comp_path)