In [23]:
# Topic Modeling Research Dashboard


In [38]:
#  Install Required Libraries (run once)
!pip install numpy pandas scikit-learn matplotlib seaborn pyLDAvis gensim nltk spacy umap-learn bertopic



In [25]:
#  Imports

import numpy as np
import pandas as pd
import re
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation, NMF

import gensim
from gensim.corpora import Dictionary
from gensim.models import CoherenceModel

import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
pyLDAvis.enable_notebook()

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

from bertopic import BERTopic


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [26]:
# Load Dataset
data = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))
documents = data.data
print(f"Total documents: {len(documents)}")


Total documents: 18846


In [27]:
#  Preprocessing

stop_words = stopwords.words('english')

def preprocess(text):
    text = str(text).lower()
    text = re.sub(r'\W+', ' ', text)
    return text

documents_clean = [preprocess(doc) for doc in documents]

In [28]:
# Tokenize for gensim coherence later
tokenized_docs = [doc.split() for doc in documents_clean]

In [29]:
# Create Dictionary and Corpus for Gensim

dictionary = Dictionary(tokenized_docs)
corpus = [dictionary.doc2bow(text) for text in tokenized_docs]


In [30]:
#  LDA using Gensim (recommended for pyLDAvis)

num_topics = 10
lda_gensim = gensim.models.LdaModel(corpus=corpus,
                                   id2word=dictionary,
                                   num_topics=num_topics,
                                   random_state=42,
                                   passes=10,
                                   alpha='auto',
                                   per_word_topics=True)

In [31]:

#  Evaluate LDA

coherence_lda = CoherenceModel(model=lda_gensim, texts=tokenized_docs, dictionary=dictionary, coherence='c_v').get_coherence()
print(f"LDA Coherence Score: {coherence_lda:.4f}")


LDA Coherence Score: 0.5626


In [32]:
# Visualize LDA with pyLDAvis

panel = gensimvis.prepare(lda_gensim, corpus, dictionary)
panel  # In Jupyter, this is interactive

In [33]:
#  NMF using TF-IDF

tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, stop_words=stop_words)
tfidf_matrix = tfidf_vectorizer.fit_transform(documents_clean)

nmf_model = NMF(n_components=num_topics, random_state=42)
nmf_model.fit(tfidf_matrix)

# Display top words per topic for NMF
def display_topics(model, feature_names, num_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print(f"Topic {topic_idx+1}: ", " | ".join([feature_names[i] for i in topic.argsort()[:-num_top_words - 1:-1]]))

num_top_words = 10
display_topics(nmf_model, tfidf_vectorizer.get_feature_names_out(), num_top_words)


Topic 1:  would | one | like | think | get | good | time | much | well | know
Topic 2:  thanks | please | anyone | mail | know | advance | hi | looking | email | info
Topic 3:  god | jesus | bible | believe | christ | faith | christian | christians | sin | us
Topic 4:  drive | scsi | ide | card | disk | controller | hard | drives | bus | floppy
Topic 5:  game | games | team | year | hockey | baseball | last | season | play | players
Topic 6:  windows | dos | file | program | files | window | use | using | version | run
Topic 7:  00 | 10 | new | sale | price | 50 | 20 | shipping | 15 | offer
Topic 8:  key | chip | encryption | clipper | keys | government | escrow | system | algorithm | use
Topic 9:  edu | geb | dsl | cadre | n3jxp | chastity | pitt | skepticism | intellect | shameful
Topic 10:  people | government | israel | armenian | jews | armenians | gun | state | rights | children


In [34]:
#  BERTopic (Transformer-based)

topic_model = BERTopic()
topics, probs = topic_model.fit_transform(documents_clean)

# Visualize BERTopic topics
topic_model.visualize_topics()



Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

[1mBertModel LOAD REPORT[0m from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m
