In [1]:
!pip install scikit-learn
!pip install nltk
!pip install gensim
!pip install pyLDAvis
!pip install bertopic
!pip install umap-learn


Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable


In [2]:
import re
import nltk
import pickle
import numpy as np
import pyLDAvis
pyLDAvis.enable_notebook()

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation, NMF
from gensim.models import CoherenceModel
import gensim.corpora as corpora
from bertopic import BERTopic


In [3]:
# Fetch the NLTK resources used by the preprocessing step. Newer NLTK releases
# load the Punkt tokenizer data from the 'punkt_tab' package rather than
# 'punkt', so both are requested to keep word_tokenize working across versions.
for resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(resource)

# English stop words as a set for fast membership checks during filtering.
stop_words = set(stopwords.words('english'))


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ADMIN\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ADMIN\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
# Load 20 Newsgroups with headers, footers and quoted replies removed so the
# topic models see only the body text of each post.
dataset = fetch_20newsgroups(
    remove=('headers', 'footers', 'quotes'),
)
documents = dataset.data
print(f"Total documents: {len(documents)}")


Total documents: 11314


In [5]:
def preprocess(text):
    """Normalize one raw document into a space-separated string of tokens.

    The text is lower-cased, every run of non-word characters is collapsed
    into a single space, and the result is tokenized with NLTK's
    ``word_tokenize``. Tokens that are in ``stop_words`` or that have three
    characters or fewer are discarded.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    str
        The surviving tokens joined by single spaces (empty if none survive).
    """
    normalized = re.sub(r'\W+', ' ', text.lower())
    kept = []
    for token in word_tokenize(normalized):
        # Keep only tokens longer than three characters that are not stop words.
        if len(token) > 3 and token not in stop_words:
            kept.append(token)
    return " ".join(kept)

# Run the preprocessing over the whole corpus, then preview the first
# 200 characters of the first cleaned document as a sanity check.
cleaned_docs = list(map(preprocess, documents))
print("Sample preprocessed doc:\n", cleaned_docs[0][:200])


Sample preprocessed doc:
 wondering anyone could enlighten door sports looked late early called bricklin doors really small addition front bumper separate rest body know anyone tellme model name engine specs years production m


In [6]:
# Both vectorizers share the same document-frequency cut-offs.
vectorizer_options = {"max_df": 0.95, "min_df": 2}

# Raw term counts: the input for LDA.
count_vectorizer = CountVectorizer(**vectorizer_options)
dtm = count_vectorizer.fit_transform(cleaned_docs)

# TF-IDF weights: the input for NMF.
tfidf_vectorizer = TfidfVectorizer(**vectorizer_options)
tfidf = tfidf_vectorizer.fit_transform(cleaned_docs)


In [7]:
# Number of topics shared by the LDA and NMF models.
n_topics = 10

# Fit LDA on the term-count matrix with a fixed seed, then report its
# perplexity on the same matrix it was fitted on.
lda = LatentDirichletAllocation(
    n_components=n_topics,
    random_state=42,
)
lda.fit(dtm)
lda_perplexity = lda.perplexity(dtm)
print("LDA Perplexity:", lda_perplexity)

# Per-document topic distribution, consumed later by pyLDAvis.
doc_topic_dists = lda.transform(dtm)


LDA Perplexity: 5336.914279211712


In [8]:
# Fit NMF on the TF-IDF matrix with the same number of topics as LDA and a
# fixed seed.
nmf = NMF(n_components=n_topics, random_state=42)
# Trailing expression: the fitted estimator is returned, so the notebook
# displays its parameter summary as the cell output.
nmf.fit(tfidf)


0,1,2
,n_components,10
,init,
,solver,'cd'
,beta_loss,'frobenius'
,tol,0.0001
,max_iter,200
,random_state,42
,alpha_W,0.0
,alpha_H,'same'
,l1_ratio,0.0


In [9]:
# Tokenized corpus plus gensim dictionary / bag-of-words representation,
# as required by gensim's CoherenceModel.
texts = [doc.split() for doc in cleaned_docs]
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

# Extract top words per topic. The vocabulary array is fetched once rather
# than being rebuilt for every word of every topic.
feature_names = count_vectorizer.get_feature_names_out()
top_n_words = 10
lda_topics = [
    # argsort is ascending, so reverse it and keep the highest-weighted terms.
    [feature_names[i] for i in topic.argsort()[::-1][:top_n_words]]
    for topic in lda.components_
]

# c_v coherence of the LDA topics, measured against the tokenized corpus.
coherence_model_lda = CoherenceModel(
    topics=lda_topics,
    texts=texts,
    dictionary=dictionary,
    coherence='c_v',
)
print("LDA Coherence Score:", coherence_model_lda.get_coherence())


LDA Coherence Score: 0.5757592723222666


In [10]:
# scikit-learn's `lda.components_` holds unnormalized topic-word pseudo-counts,
# while pyLDAvis expects every topic row to be a probability distribution over
# the vocabulary. Normalize each row to sum to 1 before handing it over, so
# topic distances and term relevance are computed on real distributions.
topic_term_dists = lda.components_ / lda.components_.sum(axis=1, keepdims=True)

lda_vis = pyLDAvis.prepare(
    topic_term_dists=topic_term_dists,
    doc_topic_dists=doc_topic_dists,
    vocab=count_vectorizer.get_feature_names_out(),
    doc_lengths=dtm.sum(axis=1).A1,      # total term count per document
    term_frequency=dtm.sum(axis=0).A1,   # total count per vocabulary term
)
# Trailing expression so the notebook renders the interactive dashboard.
lda_vis


In [11]:
# Persist both fitted topic models as pickle files in the working directory.
for filename, model in (("lda_model.pkl", lda), ("nmf_model.pkl", nmf)):
    with open(filename, "wb") as f:
        pickle.dump(model, f)

# Export the interactive LDA dashboard as a standalone HTML page.
pyLDAvis.save_html(lda_vis, "lda_dashboard.html")
print("Models and visualization saved!")


Models and visualization saved!


In [12]:
# Fit BERTopic with its default configuration on the same preprocessed
# documents used for LDA and NMF.
# NOTE(review): no seed is passed here, unlike the LDA/NMF cells
# (random_state=42), so topic assignments may differ between runs — confirm
# whether reproducibility is required.
topic_model = BERTopic()
topics, probs = topic_model.fit_transform(cleaned_docs)
# Trailing expression so the notebook renders the topic visualization.
topic_model.visualize_topics()


Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

[1mBertModel LOAD REPORT[0m from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m
