In [1]:
import numpy as np
import pandas as pd
import nltk
import pyLDAvis
import matplotlib.pyplot as plt

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation, NMF
from gensim.models.coherencemodel import CoherenceModel
from gensim.corpora import Dictionary

nltk.download('stopwords')


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [5]:
dataset = fetch_20newsgroups(remove=('headers','footers','quotes'))
documents = dataset.data[:1000]   

print("Total documents:", len(documents))


Total documents: 1000


In [6]:
count_vectorizer = CountVectorizer(stop_words='english', max_df=0.9, min_df=5)
count_data = count_vectorizer.fit_transform(documents)


In [7]:
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_df=0.9, min_df=5)
tfidf_data = tfidf_vectorizer.fit_transform(documents)


In [8]:
lda_model = LatentDirichletAllocation(n_components=10, random_state=42)
lda_model.fit(count_data)

nmf_model = NMF(n_components=10, random_state=42)
nmf_model.fit(tfidf_data)


0,1,2
,n_components,10
,init,
,solver,'cd'
,beta_loss,'frobenius'
,tol,0.0001
,max_iter,200
,random_state,42
,alpha_W,0.0
,alpha_H,'same'
,l1_ratio,0.0


In [9]:
def display_topics(model, feature_names, no_top_words=10):
    topics_list = []
    for topic_idx, topic in enumerate(model.components_):
        topic_words = [feature_names[i] 
                       for i in topic.argsort()[:-no_top_words - 1:-1]]
        topics_list.append(topic_words)
        print(f"\nTopic {topic_idx+1}:")
        print(" ".join(topic_words))
    return topics_list

print("===== LDA Topics =====")
lda_topics = display_topics(lda_model, count_vectorizer.get_feature_names_out())

print("\n===== NMF Topics =====")
nmf_topics = display_topics(nmf_model, tfidf_vectorizer.get_feature_names_out())



===== LDA Topics =====

Topic 1:
max 145 04 tm p2 34 th ma 45 dos

Topic 2:
time just new car year think like greek good know

Topic 3:
jesus god people does know argument believe think true say

Topic 4:
armenian turkish armenians space nasa people genocide p2 government 000

Topic 5:
know don think moral help people use thanks morality problem

Topic 6:
don problem just know use card thanks does think team

Topic 7:
people like don just said right know gun left look

Topic 8:
windows like files program file pc data code microsoft using

Topic 9:
00 good 50 10 15 period 20 pp 11 excellent

Topic 10:
health use 1993 edu com information father son medical users

===== NMF Topics =====

Topic 1:
don think know just people like really way want good

Topic 2:
windows program files file ftp image run code pc window

Topic 3:
armenian armenians turkish genocide soviet people government armenia turks russian

Topic 4:
god truth believe christian moral absolute faith bible know belief

Topic 5

In [10]:
print("LDA Perplexity:", lda_model.perplexity(count_data))


LDA Perplexity: 1750.8273336863276


In [11]:
texts = [doc.split() for doc in documents]
dictionary = Dictionary(texts)

coherence_lda = CoherenceModel(
    topics=lda_topics,
    texts=texts,
    dictionary=dictionary,
    coherence='c_v'
)

print("LDA Coherence Score:", coherence_lda.get_coherence())

coherence_nmf = CoherenceModel(
    topics=nmf_topics,
    texts=texts,
    dictionary=dictionary,
    coherence='c_v'
)

print("NMF Coherence Score:", coherence_nmf.get_coherence())


LDA Coherence Score: 0.3731797603822794
NMF Coherence Score: 0.4443292572695654


In [13]:
import pyLDAvis

pyLDAvis.enable_notebook()

# Topic-term distribution
topic_term_dists = lda_model.components_ / lda_model.components_.sum(axis=1)[:, None]

# Document-topic distribution
doc_topic_dists = lda_model.transform(count_data)

# Document lengths
doc_lengths = count_data.sum(axis=1).A1

# Vocabulary
vocab = count_vectorizer.get_feature_names_out()

# Term frequencies
term_frequency = count_data.sum(axis=0).A1

lda_vis = pyLDAvis.prepare(
    topic_term_dists,
    doc_topic_dists,
    doc_lengths,
    vocab,
    term_frequency
)

lda_vis
