**1. Topic Modeling Research Dashboard:**

• Take a dataset (20 Newsgroups or research articles).

• Train both LDA and NMF models.

• Visualize topics using pyLDAvis in a dashboard.

• Compare models with coherence & perplexity scores.


In [None]:
from sklearn.datasets import fetch_20newsgroups

# Load the 20 Newsgroups corpus, asking scikit-learn to strip headers,
# footers and quoted replies so the models see only the message bodies.
# No `subset` argument is passed, so the loader's default split is used.
data = fetch_20newsgroups(remove=('headers','footers','quotes'))
# Raw post texts; the preprocessing cell below iterates over these.
documents = data.data

In [None]:
import nltk
from nltk.corpus import stopwords
import re

# Fetch NLTK's stop-word corpus so stopwords.words() can read it below.
nltk.download('stopwords')
# Keep the English stop words in a set so preprocess() can test membership
# without scanning a list for every token.
stop_words = set(stopwords.words('english'))

def preprocess(text, *, stopword_set=None, min_length=4):
    """Normalise one raw document into a space-separated string of tokens.

    Every character outside ``a-z``/``A-Z`` is replaced by a space, the text
    is lower-cased and split on whitespace, and tokens that are stop words or
    shorter than ``min_length`` characters are dropped.

    Args:
        text: Raw document text.
        stopword_set: Lower-case words to discard. When omitted, the
            notebook-level ``stop_words`` set is used.
        min_length: Minimum number of characters a token needs to be kept.
            The default of 4 is equivalent to the earlier ``len(w) > 3`` rule.

    Returns:
        The surviving tokens joined by single spaces; an empty string when
        no token survives.
    """
    if stopword_set is None:
        # Looked up at call time, so re-running the stop-word cell is picked
        # up without redefining this function.
        stopword_set = stop_words
    letters_only = re.sub(r'[^a-zA-Z]', ' ', text)
    tokens = letters_only.lower().split()
    return " ".join(
        w for w in tokens if len(w) >= min_length and w not in stopword_set
    )

# Run every raw document through preprocess(), keeping the original order
# and one output string per input document.
clean_docs = list(map(preprocess, documents))

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words counts. max_df=0.95 ignores terms found in more than 95% of
# the documents; min_df=2 ignores terms found in fewer than two documents.
vectorizer = CountVectorizer(max_df=0.95, min_df=2)
# Learn the vocabulary and build the document-term matrix that both the LDA
# and the NMF cells fit on.
X = vectorizer.fit_transform(clean_docs)

In [None]:
from sklearn.decomposition import LatentDirichletAllocation

# LDA with 10 topics; random_state is pinned so repeated runs use the same seed.
lda = LatentDirichletAllocation(n_components=10, random_state=42)
# Fit on the document-term count matrix produced by the vectorizer cell.
lda.fit(X)

In [None]:
from sklearn.decomposition import NMF

# NMF with the same number of components and the same seed as the LDA model,
# so the two factorizations can be compared.
nmf = NMF(n_components=10, random_state=42)
# NOTE(review): this fits on the same raw count matrix as LDA; TF-IDF weighting
# is a common alternative input for NMF - confirm raw counts are intended.
nmf.fit(X)

In [None]:
def display_topics(model, feature_names, no_top_words):
    """Print the highest-weighted terms of every topic in a fitted model.

    Args:
        model: Object exposing ``components_``, iterated one row per topic.
        feature_names: Sequence mapping a column index to its term.
        no_top_words: How many top terms to print for each topic.
    """
    for number, weights in enumerate(model.components_):
        print("Topic %d:" % number)
        # argsort is ascending, so flip it and keep the leading entries to
        # get the largest weights first.
        descending = weights.argsort()[::-1]
        print([feature_names[col] for col in descending[:no_top_words]])

# Both models were fitted on the same document-term matrix, so one vocabulary
# lookup serves both. Showing NMF next to LDA lets the topics be compared,
# instead of leaving the trained NMF model uninspected.
feature_names = vectorizer.get_feature_names_out()

print("LDA topics")
display_topics(lda, feature_names, 10)

print("\nNMF topics")
display_topics(nmf, feature_names, 10)

In [None]:
import pyLDAvis
import pyLDAvis.lda_model

# Switch pyLDAvis to inline notebook rendering.
pyLDAvis.enable_notebook()
# Use lda_model.prepare for newer versions of pyLDAvis.
# It is given the fitted LDA model, the document-term matrix and the
# vectorizer that produced that matrix.
panel = pyLDAvis.lda_model.prepare(lda, X, vectorizer)
# Bare expression at the end of the cell so the notebook displays the panel.
panel