In [1]:
!pip install sklearn




In [17]:
# IMPORTS

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation, NMF
from bertopic import BERTopic
import plotly.io as pio


import sys
!{sys.executable} -m pip install pyLDAvis==3.4.1

from gensim.models import CoherenceModel
from gensim.corpora import Dictionary

from bertopic import BERTopic





In [3]:
# LOAD DATA

dataset = fetch_20newsgroups(remove=('headers', 'footers', 'quotes'))
documents = dataset.data


In [4]:
# VECTORIZATION
count_vectorizer = CountVectorizer(stop_words='english', max_df=0.9, min_df=5)
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_df=0.9, min_df=5)

count_data = count_vectorizer.fit_transform(documents)
tfidf_data = tfidf_vectorizer.fit_transform(documents)


In [None]:
# LDA MODEL

lda = LatentDirichletAllocation(n_components=10, random_state=42)
lda.fit(count_data)

lda_perplexity = lda.perplexity(count_data)
print("LDA Perplexity:", lda_perplexity)



In [None]:
# NMF MODEL

nmf = NMF(n_components=10, random_state=42)
nmf.fit(tfidf_data)

In [6]:
# DISPLAY TOP WORDS FUNCTION

def display_topics(model, feature_names, no_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print(f"\nTopic {topic_idx}:")
        print(" ".join([feature_names[i]
                        for i in topic.argsort()[:-no_top_words - 1:-1]]))

print("LDA Topics:")
display_topics(lda, count_vectorizer.get_feature_names_out(), 10)

print("\nNMF Topics:")
display_topics(nmf, tfidf_vectorizer.get_feature_names_out(), 10)


LDA Topics:

Topic 0:
cx w7 ah c_ mv uw t7 hz lk chz

Topic 1:
00 db 25 10 55 20 15 16 14 12

Topic 2:
key use drive like chip just bit card know scsi

Topic 3:
people god don just think like know say said time

Topic 4:
game team year games play season hockey players league 10

Topic 5:
edu com available information data use mail ftp list pub

Topic 6:
ax max g9v b8f a86 pl 145 1d9 0t 1t

Topic 7:
just don think know people like time right going mr

Topic 8:
file windows program use edu window dos files thanks using

Topic 9:
space new 10 1993 launch nasa 20 april sale research

NMF Topics:

Topic 0:
just don like think know good ve time car really

Topic 1:
windows file dos window files program use using version problem

Topic 2:
geb dsl chastity n3jxp cadre pitt shameful intellect skepticism surrender

Topic 3:
god jesus bible believe faith christ christian christians does say

Topic 4:
key chip encryption clipper keys escrow government algorithm use public

Topic 5:
drive scsi driv

In [7]:
# COHERENCE SCORE (Gensim)

texts = [doc.split() for doc in documents]
dictionary = Dictionary(texts)

coherence_model_lda = CoherenceModel(
    topics=[[count_vectorizer.get_feature_names_out()[i] 
             for i in topic.argsort()[:-11:-1]] 
            for topic in lda.components_],
    texts=texts,
    dictionary=dictionary,
    coherence='c_v'
)

lda_coherence = coherence_model_lda.get_coherence()

print("\nLDA Coherence:", lda_coherence)



LDA Coherence: 0.42408818054650244


In [10]:

# pyLDAvis Visualization


panel = pyLDAvis.lda_model.prepare(
    lda, 
    count_data, 
    count_vectorizer
)
panel



PreparedData(topic_coordinates=              x         y  topics  cluster       Freq
topic                                                
3     -0.148567 -0.058469       1        1  26.286316
7     -0.162439 -0.065219       2        1  12.510721
2     -0.151637 -0.098160       3        1  12.218471
8     -0.121235 -0.094309       4        1  10.088902
5     -0.115910 -0.045381       5        1   9.133362
6      0.436384 -0.220462       6        1   8.993394
4     -0.035466  0.149397       7        1   7.069097
9     -0.093287  0.050342       8        1   6.833751
1      0.064314  0.265549       9        1   3.728206
0      0.327843  0.116712      10        1   3.137781, topic_info=        Term          Freq         Total Category  logprob  loglift
2589      ax  61693.000000  61693.000000  Default  30.0000  30.0000
10300    max   4534.000000   4534.000000  Default  29.0000  29.0000
6911    file   1691.000000   1691.000000  Default  28.0000  28.0000
7348     g9v   1153.000000   1153.000

In [16]:
# Limit dataset for speed
documents = documents[:1500]

# Set plotly renderer
pio.renderers.default = "notebook"

# Create model
topic_model = BERTopic(
    embedding_model="all-MiniLM-L6-v2",
    verbose=True
)


In [14]:


# Train model
topics, probs = topic_model.fit_transform(documents)

# Show topic summary
topic_info = topic_model.get_topic_info()
display(topic_info.head())



2026-02-18 17:10:20,950 - BERTopic - Embedding - Transforming documents to embeddings.


Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

[1mBertModel LOAD REPORT[0m from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


Batches:   0%|          | 0/47 [00:00<?, ?it/s]

2026-02-18 17:11:58,940 - BERTopic - Embedding - Completed ✓
2026-02-18 17:11:58,942 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2026-02-18 17:12:09,502 - BERTopic - Dimensionality - Completed ✓
2026-02-18 17:12:09,505 - BERTopic - Cluster - Start clustering the reduced embeddings
2026-02-18 17:12:09,629 - BERTopic - Cluster - Completed ✓
2026-02-18 17:12:09,640 - BERTopic - Representation - Fine-tuning topics using representation models.
2026-02-18 17:12:10,481 - BERTopic - Representation - Completed ✓


Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,385,-1_ax_the_to_max,"[ax, the, to, max, of, and, it, in, is, that]",[[This is a co-authored report from two of us ...
1,0,208,0_the_for_to_scsi,"[the, for, to, scsi, with, it, is, have, and, of]",[\n`My Western Digital also has three sets of ...
2,1,128,1_the_he_team_in,"[the, he, team, in, to, and, game, play, for, ...",[How long can the Leafs play short-handed and ...
3,2,126,2_the_of_is_that,"[the, of, is, that, to, not, and, it, in, jesus]",[[deletia- sig]\n\n[deletia- formalities]\n\nI...
4,3,73,3_the_car_to_it,"[the, car, to, it, in, and, is, for, you, my]",[an excellent automatic can be found in the su...


In [12]:
# Visualization
fig = topic_model.visualize_topics()
fig
