In [1]:
from sklearn.datasets import fetch_20newsgroups

# Load the 20 Newsgroups posts with headers, footers and quoted replies
# stripped, then keep only the raw text of each post.
data = fetch_20newsgroups(remove=("headers", "footers", "quotes"))
documents = data.data


In [2]:
import re
import nltk
from nltk.corpus import stopwords

# Make sure the NLTK stop-word corpus is available locally, then build a set
# of English stop words for fast membership tests in clean_text().
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

def clean_text(text):
    """Normalise a document into a space-separated string of content tokens.

    The text is lower-cased, every run of non-word characters is collapsed to
    a single space, and the result is split on whitespace. Tokens that are
    English stop words (module-level ``stop_words``) or that have two or fewer
    characters are dropped.

    Args:
        text: Raw document string.

    Returns:
        The surviving tokens joined by single spaces (possibly an empty string).
    """
    normalized = re.sub(r'\W+', ' ', text.lower())
    kept = []
    for token in normalized.split():
        if len(token) > 2 and token not in stop_words:
            kept.append(token)
    return " ".join(kept)

# Apply the cleaning step to every document, preserving document order.
clean_docs = list(map(clean_text, documents))


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ashis\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Raw term counts (input for LDA). Terms must appear in at least 10 documents
# and in at most 95% of them.
count_vectorizer = CountVectorizer(min_df=10, max_df=0.95)
X_count = count_vectorizer.fit_transform(clean_docs)

# TF-IDF weights (input for NMF), built with the same document-frequency cut-offs.
tfidf_vectorizer = TfidfVectorizer(min_df=10, max_df=0.95)
X_tfidf = tfidf_vectorizer.fit_transform(clean_docs)


In [4]:
from sklearn.decomposition import LatentDirichletAllocation

# 10-topic LDA fitted on the raw count matrix with the batch learning method
# and a fixed seed.
lda = LatentDirichletAllocation(n_components=10, learning_method='batch', random_state=42)

# Fit the model and keep the document-topic matrix it returns.
lda_topics = lda.fit_transform(X_count)


In [5]:
from sklearn.decomposition import NMF

# 10-component NMF fitted on the TF-IDF matrix, initialised with NNDSVD.
nmf = NMF(n_components=10, init='nndsvd', random_state=42)

# Fit the model and keep the document-component matrix it returns.
nmf_topics = nmf.fit_transform(X_tfidf)


In [6]:
import pyLDAvis
import numpy as np

# Switch pyLDAvis to notebook rendering before the visualisation is built.
pyLDAvis.enable_notebook()

# Topic-word distribution: normalise each row of components_ so it sums to 1.
topic_term_dists = lda.components_
topic_term_dists = topic_term_dists / topic_term_dists.sum(axis=1)[:, None]

# Document-topic distribution
doc_topic_dists = lda.transform(X_count)

# Document lengths: total token count per document (row sums of the count matrix)
doc_lengths = np.asarray(X_count.sum(axis=1)).ravel()

# Vocabulary, in the column order of X_count
vocab = count_vectorizer.get_feature_names_out()

# Term frequencies: total count per term (column sums of the count matrix)
term_frequency = np.asarray(X_count.sum(axis=0)).ravel()

lda_vis = pyLDAvis.prepare(
    topic_term_dists=topic_term_dists,
    doc_topic_dists=doc_topic_dists,
    doc_lengths=doc_lengths,
    vocab=vocab,
    term_frequency=term_frequency
)

# Write the standalone HTML first: a notebook only renders the value of the
# cell's *last* expression, so `lda_vis` has to come at the very end.
pyLDAvis.save_html(lda_vis, "lda.html")

lda_vis


In [7]:
import numpy as np


def extract_top_words(model, feature_names, top_n=10):
    """Return the ``top_n`` highest-weighted words of every topic.

    Args:
        model: Fitted topic model exposing ``components_``, an array with one
            row of per-term weights for each topic.
        feature_names: Sequence mapping a column index of ``components_`` to
            its term.
        top_n: Number of words to keep per topic. ``0`` yields empty lists.

    Returns:
        A list with one entry per topic; each entry is a list of words ordered
        from the highest weight to the lowest.
    """
    topics = []
    for topic in model.components_:
        # Reverse the ascending argsort first and slice afterwards: the most
        # important word comes first, and top_n=0 gives an empty list
        # (a plain `[-top_n:]` slice would return the whole vocabulary).
        top_ids = topic.argsort()[::-1][:top_n]
        topics.append([feature_names[i] for i in top_ids])
    return topics


def pmi_coherence(topics, X, vocab):
    """Average pairwise PMI coherence of a set of topics.

    Probabilities are document frequencies: ``p(w)`` is the fraction of
    documents containing ``w`` and ``p(wi, wj)`` the fraction containing both.
    For each topic the score is the mean of ``log(p(wi, wj) / (p(wi) * p(wj)))``
    over its word pairs.

    Word pairs that never co-occur are skipped rather than penalised, and a
    topic without any co-occurring pair does not contribute to the average.

    Args:
        topics: Iterable of topics, each a sequence of words. Words missing
            from ``vocab`` are ignored.
        X: Sparse document-term matrix (documents in rows, terms in columns).
        vocab: Sequence of terms in the column order of ``X``.

    Returns:
        The mean topic score as a float, or ``nan`` when no topic has a
        co-occurring word pair (including an empty ``topics``).
    """
    word2id = {w: i for i, w in enumerate(vocab)}
    # Binary presence matrix, kept sparse. Converted to CSC because the loop
    # below only slices whole columns.
    X_bin = (X > 0).astype(np.int8).tocsc()

    topic_scores = []

    for topic in topics:
        ids = [word2id[w] for w in topic if w in word2id]

        # Densify each word's presence column once per topic instead of once
        # per pair.
        columns = [X_bin[:, wi].toarray().ravel() for wi in ids]
        doc_freqs = [col.mean() for col in columns]

        score = 0
        pairs = 0

        for i in range(len(ids)):
            for j in range(i + 1, len(ids)):
                p_ij = (columns[i] & columns[j]).mean()

                # log(0) is undefined, so pairs that never co-occur are skipped.
                if p_ij > 0:
                    score += np.log(p_ij / (doc_freqs[i] * doc_freqs[j]))
                    pairs += 1

        if pairs > 0:
            topic_scores.append(score / pairs)

    if not topic_scores:
        # Nothing to average: return NaN explicitly instead of letting
        # np.mean([]) emit a RuntimeWarning.
        return float("nan")

    return float(np.mean(topic_scores))


def topic_diversity(topics):
    """Fraction of distinct words among all topic words.

    Args:
        topics: Iterable of topics, each a sequence of words.

    Returns:
        ``len(unique words) / len(all words)`` as a float: 1.0 when no word is
        shared between topics, lower when topics overlap. Returns 0.0 when
        there are no words at all (no topics, or only empty topics) instead of
        raising ``ZeroDivisionError``.
    """
    all_words = [word for topic in topics for word in topic]
    if not all_words:
        return 0.0
    return len(set(all_words)) / len(all_words)



# --- Top-10 words per topic, each model read through its own vectorizer's vocabulary ---
lda_topics_words = extract_top_words(lda, count_vectorizer.get_feature_names_out(), top_n=10)
nmf_topics_words = extract_top_words(nmf, tfidf_vectorizer.get_feature_names_out(), top_n=10)

# --- LDA metrics ---
lda_pmi = pmi_coherence(lda_topics_words, X_count, count_vectorizer.get_feature_names_out())
lda_diversity = topic_diversity(lda_topics_words)
lda_perplexity = lda.perplexity(X_count)

# --- NMF metrics (coherence is measured on the same count matrix as for LDA) ---
nmf_pmi = pmi_coherence(nmf_topics_words, X_count, count_vectorizer.get_feature_names_out())
nmf_diversity = topic_diversity(nmf_topics_words)

# --- Report ---
print("ðŸ“Š TOPIC MODEL EVALUATION (NO gensim)\n")

print("LDA:")
print(f"  PMI Coherence   : {lda_pmi:.4f}")
print(f"  Topic Diversity : {lda_diversity:.4f}")
print(f"  Perplexity      : {lda_perplexity:.2f}\n")

print("NMF:")
print(f"  PMI Coherence   : {nmf_pmi:.4f}")
print(f"  Topic Diversity : {nmf_diversity:.4f}")


ðŸ“Š TOPIC MODEL EVALUATION (NO gensim)

LDA:
  PMI Coherence   : 1.6853
  Topic Diversity : 0.7700
  Perplexity      : 2546.13

NMF:
  PMI Coherence   : 1.9921
  Topic Diversity : 0.9500


In [8]:
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer

# Use a light, fast transformer
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# Initialize BERTopic: uni- and bi-gram topic terms, at least 20 documents per topic.
# NOTE(review): no seed is set here, so topic ids and sizes may differ between
# runs — presumably a seeded `umap_model` would have to be passed in to pin them;
# confirm against the BERTopic documentation.
topic_model = BERTopic(
    embedding_model=embedding_model,
    n_gram_range=(1, 2),
    min_topic_size=20,
    verbose=True
)

# Fit model
# NOTE(review): the model is fitted on the stop-word-stripped `clean_docs`;
# sentence embeddings may work better on the raw `documents` — TODO confirm.
topics, probs = topic_model.fit_transform(clean_docs)

# Show topic info. Left as the cell's last expression so the notebook renders
# the DataFrame as a table instead of the wrapped, truncated text print() gives.
topic_info = topic_model.get_topic_info()
topic_info.head(10)


  from .autonotebook import tqdm as notebook_tqdm
Loading weights: 100%|██████████| 103/103 [00:00<00:00, 240.14it/s, Materializing param=pooler.dense.weight]
BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m
2026-02-18 11:59:15,939 - BERTopic - Embedding - Transforming documents to embeddings.
Batches: 100%|██████████| 354/354 [06:27<00:00,  1.09s/it]
2026-02-18 12:05:44,814 - BERTopic - Embedding - Completed ✓
2026-02-18 12:05:44,817 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2026-02-18 12:06:33,474 - BERTopic - Dimensionality - Completed ✓
2026-02-18 12:06:33,479 - BERTopic - Cluster - Start clus

   Topic  Count                           Name  \
0     -1   3400          -1_would_one_edu_like   
1      0   1074        0_team_game_season_year   
2      1    940            1_car_bike_cars_dod   
3      2    842          2_god_jesus_one_bible   
4      3    518  3_health_medical_patients_msg   
5      4    421      4_space_nasa_launch_orbit   
6      5    405  5_key_encryption_chip_clipper   
7      6    265     6_israel_israeli_jews_arab   
8      7    264       7_card_monitor_video_vga   
9      8    184       8_scsi_drive_disk_drives   

                                      Representation  \
0  [would, one, edu, like, know, get, use, people...   
1  [team, game, season, year, play, games, hockey...   
2  [car, bike, cars, dod, engine, like, one, get,...   
3  [god, jesus, one, bible, believe, people, fait...   
4  [health, medical, patients, msg, disease, food...   
5  [space, nasa, launch, orbit, lunar, earth, sat...   
6  [key, encryption, chip, clipper, privacy, entr...   
7

In [9]:
# Inspect topic 1: displays its top terms as (word, weight) pairs.
topic_model.get_topic(1)


[('car', 0.01700741107839629),
 ('bike', 0.013116913636620707),
 ('cars', 0.008516138913604806),
 ('dod', 0.007549254834004425),
 ('engine', 0.007322223222280939),
 ('like', 0.006502844766136577),
 ('one', 0.006335674991584787),
 ('get', 0.005832513503028948),
 ('would', 0.005747821907652425),
 ('good', 0.005715622324327552)]

In [10]:
# topic_model.get_document_info(clean_docs[:5])


In [11]:
# Number of discovered topics, not counting the -1 outlier bucket. The value is
# assigned and printed: a bare expression that is not the cell's last statement
# is evaluated and then silently discarded.
n_bertopic_topics = len(set(topics)) - (1 if -1 in topics else 0)
print("BERTopic topics:", n_bertopic_topics)

# Top words of every real topic (outlier topic -1 excluded), scores dropped.
bertopic_topics = [
    [word for word, _ in topic_model.get_topic(t)]
    for t in topic_model.get_topics()
    if t != -1
]

bertopic_diversity = topic_diversity(bertopic_topics)
print("BERTopic Diversity:", bertopic_diversity)


BERTopic Diversity: 0.7918032786885246


In [12]:
# Build the BERTopic topic-overview figure and export it as a standalone HTML file.
fig = topic_model.visualize_topics()
fig.write_html("bertopic_topics.html")


In [13]:
# Build the BERTopic topic-hierarchy figure and export it as a standalone HTML file.
# Note: `fig` is reused, so it now refers to this figure.
fig = topic_model.visualize_hierarchy()
fig.write_html("bertopic_hierarchy.html")


In [14]:
# Build the BERTopic per-topic bar-chart figure and export it as a standalone HTML file.
# Note: `fig` is reused, so it now refers to this figure.
fig = topic_model.visualize_barchart()
fig.write_html("bertopic_barchart.html")


In [None]:
# doc_info = topic_model.get_document_info(clean_docs)
# doc_info.to_csv("bertopic_document_topics.csv", index=False)
