In [18]:
import numpy as np
from utils import load_dataset_df, tokenize
import logging
# Configure the root logger to emit timestamped records at INFO level and above;
# the progress lines shown in later cell outputs use this format.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [3]:
# Name of the dataset to analyse; handed to the project loader below.
dataset_name = "MIND15"
# load_dataset_df is a helper from the local utils module. Later cells read the
# "data" column of data_df. NOTE(review): the structure of label_mapping is not
# visible from this notebook -- confirm against utils.load_dataset_df.
data_df, label_mapping = load_dataset_df(dataset_name)

In [7]:
# Tokenize every entry of the "data" column with the project tokenizer in its
# "aggressive" mode (NOTE(review): the mode's exact behaviour lives in utils.tokenize).
docs_token = [tokenize(text, "aggressive") for text in data_df["data"].values]

In [10]:
def print_avg_length(docs):
    """Print the mean number of tokens per document.

    Args:
        docs: Iterable of tokenized documents; each item must support ``len``.
    """
    token_counts = list(map(len, docs))
    average = np.mean(token_counts)
    print(f"Average tokens number is {average}")

In [11]:
# Report the mean document length (in tokens) of the tokenized corpus.
print_avg_length(docs_token)

Average tokens number is 292.6440769116856


In [12]:
from nltk.stem.wordnet import WordNetLemmatizer

def lemmatize(docs):
    """Lemmatize every token of every document with NLTK's WordNet lemmatizer.

    Args:
        docs: Iterable of tokenized documents (each an iterable of tokens).

    Returns:
        A new list of lists holding the lemmatized tokens, in the same order
        as the input; ``docs`` itself is not modified.
    """
    wordnet = WordNetLemmatizer()
    lemmatized_docs = []
    for tokens in docs:
        lemmatized_docs.append(list(map(wordnet.lemmatize, tokens)))
    return lemmatized_docs

In [14]:
# Compute bigrams.
from gensim.models import Phrases

def add_bigram(docs, min_count=200):
    """Append detected phrase tokens to each document, in place.

    A gensim ``Phrases`` model is fitted on ``docs`` and then applied to every
    document; each resulting token that contains an underscore is appended to
    the end of that document. Nothing is returned -- ``docs`` is mutated, so
    running this twice on the same list appends the phrase tokens again.

    Args:
        docs: List of tokenized documents (lists of string tokens).
        min_count: Passed to ``Phrases`` as ``min_count`` (default 200).

    NOTE(review): the ``'_'`` test is a heuristic -- any token that already
    contains an underscore before phrase detection is also re-appended.
    """
    phrase_model = Phrases(docs, min_count=min_count)
    for doc in docs:
        # The transformed token list is fully built before the document is
        # extended, so growing ``doc`` here cannot affect the detection.
        merged_tokens = [tok for tok in phrase_model[doc] if '_' in tok]
        doc.extend(merged_tokens)

In [16]:
# Remove rare and common tokens.
from gensim.corpora import Dictionary

def filter_tokens(docs, no_below=20, no_above=0.5):
    """Build a gensim ``Dictionary`` from ``docs`` and prune rare/common tokens.

    Args:
        docs: Tokenized documents (iterable of lists of string tokens).
        no_below: Forwarded to ``Dictionary.filter_extremes``; tokens that
            occur in fewer than this many documents are removed (default 20).
        no_above: Forwarded to ``Dictionary.filter_extremes``; tokens that
            occur in more than this fraction of the documents are removed
            (default 0.5, i.e. 50%).

    Returns:
        The pruned ``Dictionary``.
    """
    # Create a dictionary representation of the documents.
    dictionary = Dictionary(docs)

    # Drop tokens that are too rare or too common to be informative. Other
    # filter_extremes options are left at gensim's defaults.
    dictionary.filter_extremes(no_below=no_below, no_above=no_above)
    return dictionary

In [17]:
# Build the pruned vocabulary (a gensim Dictionary) from the tokenized documents.
filter_dict = filter_tokens(docs_token)

In [19]:
def get_bow_corpus(docs, dictionary):
    """Convert tokenized documents into bag-of-words vectors.

    Args:
        docs: Iterable of tokenized documents.
        dictionary: Object exposing ``doc2bow(document)`` (a gensim
            ``Dictionary`` in this notebook).

    Returns:
        A list with one ``dictionary.doc2bow(document)`` result per document,
        in input order.
    """
    return list(map(dictionary.doc2bow, docs))

In [20]:
# Encode every tokenized document as a bag-of-words vector over the pruned vocabulary.
corpus_ori_filter = get_bow_corpus(docs_token, filter_dict)

In [22]:
# Sanity check: vocabulary size after filtering and number of encoded documents.
print('Number of unique tokens: %d' % len(filter_dict))
print('Number of documents: %d' % len(corpus_ori_filter))

Number of unique tokens: 40732
Number of documents: 128303


In [25]:
from gensim.models import LdaModel

def lda_model(dictionary, corpus, **kwargs):
    """Train a gensim ``LdaModel`` using this notebook's default settings.

    Args:
        dictionary: gensim ``Dictionary`` for ``corpus``. It must contain
            token id 0, because the dictionary is indexed once below.
        corpus: Bag-of-words corpus to train on.
        **kwargs: Keyword arguments for ``LdaModel``. Values given here
            override the defaults ``chunksize=2000``, ``alpha="auto"``,
            ``eta="auto"``, ``iterations=400``, ``num_topics=50``,
            ``passes=10`` and ``eval_every=None``. Any other option
            (for example ``random_state``) is forwarded unchanged instead of
            being silently dropped.

    Returns:
        The trained ``LdaModel``.
    """
    # Indexing the dictionary once "loads" it so that ``dictionary.id2token``
    # (the id -> word mapping passed as ``id2word``) is populated; the looked-up
    # value itself is not needed.
    _ = dictionary[0]

    settings = {
        "chunksize": 2000,
        "alpha": "auto",
        "eta": "auto",
        "iterations": 400,
        "num_topics": 50,
        "passes": 10,
        "eval_every": None,
    }
    # Caller-supplied options win over the defaults above.
    settings.update(kwargs)
    # The corpus and vocabulary always come from the positional arguments.
    settings["corpus"] = corpus
    settings["id2word"] = dictionary.id2token
    return LdaModel(**settings)

In [27]:
# Train LDA on the filtered corpus; passes=1 overrides lda_model's default of 10 passes.
model = lda_model(filter_dict, corpus_ori_filter, passes=1)

2022-01-10 19:39:01,421 : INFO : using autotuned alpha, starting with [0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02]
2022-01-10 19:39:01,427 : INFO : using serial LDA version on this node
2022-01-10 19:39:01,579 : INFO : running online (single-pass) LDA training, 50 topics, 1 passes over the supplied corpus of 128303 documents, updating model once every 2000 documents, evaluating perplexity every 0 documents, iterating 400x with a convergence threshold of 0.001000
2022-01-10 19:39:01,581 : INFO : PROGRESS: pass 0, at document #2000/128303
2022-01-10 19:39:49,384 : INFO : optimized alpha [0.01917195, 0.018086268, 0.018078016, 0.018714827, 0.019861076, 0.018927671, 0.017932687, 0.018770434, 0.01849621, 0.018287048, 0.018142182, 0.01784346

In [28]:
def evaluate_topics(model, corpus, docs, dictionary, num_topics=None, c_method="c_npmi"):
    """Print the average topic coherence of ``model`` followed by its topics.

    Args:
        model: Trained topic model exposing ``top_topics`` (a gensim
            ``LdaModel`` in this notebook).
        corpus: Bag-of-words corpus, passed to ``model.top_topics``.
        docs: Tokenized documents, passed as ``texts``.
        dictionary: Vocabulary, passed as ``dictionary``.
        num_topics: Divisor used for the average. When ``None`` (default) the
            number of topics actually returned by ``model.top_topics`` is
            used, so the average is correct for models of any size. An
            explicit value is still honoured for backward compatibility.
        c_method: Coherence measure name, passed as ``coherence``.
    """
    from pprint import pprint

    top_topics = model.top_topics(corpus, texts=docs, dictionary=dictionary, coherence=c_method)

    # Average topic coherence is the sum of the per-topic coherences divided by
    # the number of topics. Each entry of top_topics is (topic_terms, coherence).
    divisor = len(top_topics) if num_topics is None else num_topics
    total_coherence = sum(topic[1] for topic in top_topics)
    # Guard against a zero divisor (no topics returned, or num_topics=0).
    avg_topic_coherence = total_coherence / divisor if divisor else float("nan")
    print(f'Average topic coherence({c_method}): %.4f.' % avg_topic_coherence)

    pprint(top_topics)

In [29]:
# Score the trained topics with the "c_npmi" coherence measure and print them.
evaluate_topics(model, corpus_ori_filter, docs_token, filter_dict, c_method="c_npmi")

2022-01-10 19:49:06,256 : INFO : using ParallelWordOccurrenceAccumulator(processes=15, batch_size=64) to estimate probabilities from sliding windows
2022-01-10 19:49:18,588 : INFO : 1 batches submitted to accumulate stats from 64 documents (15060 virtual)
2022-01-10 19:49:18,593 : INFO : 2 batches submitted to accumulate stats from 128 documents (32877 virtual)
2022-01-10 19:49:18,601 : INFO : 3 batches submitted to accumulate stats from 192 documents (54317 virtual)
2022-01-10 19:49:18,607 : INFO : 4 batches submitted to accumulate stats from 256 documents (75315 virtual)
2022-01-10 19:49:18,613 : INFO : 5 batches submitted to accumulate stats from 320 documents (92705 virtual)
2022-01-10 19:49:18,618 : INFO : 6 batches submitted to accumulate stats from 384 documents (109733 virtual)
2022-01-10 19:49:18,624 : INFO : 7 batches submitted to accumulate stats from 448 documents (127344 virtual)
2022-01-10 19:49:18,632 : INFO : 8 batches submitted to accumulate stats from 512 documents (1

Average topic coherence(c_npmi): 0.0706.
[([(0.02254841, 'apartment'),
   (0.020544501, 'month'),
   (0.018669572, 'friendly'),
   (0.01791396, 'bedroom'),
   (0.017545605, 'dog'),
   (0.017285727, 'walk'),
   (0.016871285, 'square'),
   (0.016503913, 'pet'),
   (0.016172804, 'listing'),
   (0.015375353, 'bathroom'),
   (0.014019134, 'rental'),
   (0.013972232, 'rent'),
   (0.013307036, 'score'),
   (0.012793661, 'cat'),
   (0.012414647, 'building'),
   (0.012378498, 'two'),
   (0.012146839, 'listed'),
   (0.01181364, 'unit'),
   (0.011607622, 'foot'),
   (0.010799116, 'parking')],
  0.23033649446009072),
 ([(0.062069245, 'trump'),
   (0.036770403, 'president'),
   (0.02478208, 'house'),
   (0.021723244, 'impeachment'),
   (0.01662033, 'mr'),
   (0.015011989, 'ukraine'),
   (0.012154309, 'white'),
   (0.009534378, 'inquiry'),
   (0.008845726, 'former'),
   (0.008242067, 'donald'),
   (0.0073130983, 'u'),
   (0.006612912, 'call'),
   (0.0063002394, 'biden'),
   (0.006203639, 'democrats'