In [11]:
from sklearn.datasets import fetch_20newsgroups

# Load the 20 Newsgroups posts with headers, footers and quoted replies
# stripped, so only the body text of each post is kept.
stripped_sections = ('headers', 'footers', 'quotes')
data = fetch_20newsgroups(remove=stripped_sections)

# Raw post bodies, one string per document.
documents = data.data

In [12]:
import re

def clean(text):
    """Normalise one raw document for tokenisation.

    Every non-word character (anything the regex class ``\\W`` matches) is
    replaced by a single space, then the result is lower-cased. Runs of
    punctuation therefore become runs of spaces; they are not collapsed.

    Args:
        text: the raw document string.

    Returns:
        The cleaned, lower-cased string (same length as the input before
        lower-casing).
    """
    # Substitute first, lower-case second: keeps the original order of operations.
    spaced = re.sub(r'\W', ' ', text)
    return spaced.lower()

# Clean every document; `documents` now refers to the cleaned text.
documents = list(map(clean, documents))
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Both vectorizers use identical vocabulary settings: a document-frequency
# ceiling of 0.95, a floor of 2, and the built-in English stop-word list.
vocab_filters = {'max_df': 0.95, 'min_df': 2, 'stop_words': 'english'}

# Raw term counts -> input matrix for LDA
count_vectorizer = CountVectorizer(**vocab_filters)
X_count = count_vectorizer.fit_transform(documents)

# TF-IDF weights -> input matrix for NMF
tfidf_vectorizer = TfidfVectorizer(**vocab_filters)
X_tfidf = tfidf_vectorizer.fit_transform(documents)

**Train LDA**

In [13]:
from sklearn.decomposition import LatentDirichletAllocation, NMF

# LDA
# Fit a 10-topic LDA model on the raw term-count matrix; the fixed
# random_state keeps the fitted topics reproducible across runs.
# NMF is fitted in its own cell under "Train NMF", so it is not fitted here
# as well (doing both in both cells trained each model twice).
lda = LatentDirichletAllocation(n_components=10, random_state=42)
lda.fit(X_count)

In [4]:
# Perplexity of the fitted LDA model, evaluated on the same count matrix
# it was trained on.
lda_perplexity = lda.perplexity(X_count)
print("LDA Perplexity:", lda_perplexity)

LDA Perplexity: 3587.5007877099692


**Train NMF**

In [14]:
from sklearn.decomposition import LatentDirichletAllocation, NMF

# NMF
# Fit a 10-component NMF model on the TF-IDF matrix; the fixed random_state
# keeps the factorisation reproducible across runs.
# LDA is already fitted in the "Train LDA" cell above, so it is not refitted
# here (refitting it only repeated an expensive, identical computation).
nmf = NMF(n_components=10, random_state=42)
nmf.fit(X_tfidf)

In [15]:
def get_topics(model, feature_names, n_top_words=10):
    """Return the highest-weighted words of every topic in a fitted model.

    Args:
        model: object exposing ``components_``, an iterable of 1-D weight
            arrays (one per topic) that support ``argsort()``.
        feature_names: sequence mapping a column index to its word.
        n_top_words: how many words to keep per topic.

    Returns:
        A list with one entry per topic; each entry is a list of words
        ordered from highest to lowest weight.
    """
    return [
        # argsort is ascending, so reverse it and keep the leading indices.
        [feature_names[idx] for idx in weights.argsort()[::-1][:n_top_words]]
        for weights in model.components_
    ]

**Coherence Score**

In [16]:
# Each model's top words are looked up in the vocabulary of the vectorizer
# whose matrix that model was fitted on.
count_vocab = count_vectorizer.get_feature_names_out()
tfidf_vocab = tfidf_vectorizer.get_feature_names_out()

lda_topics = get_topics(lda, count_vocab)  # LDA <- count vocabulary
nmf_topics = get_topics(nmf, tfidf_vocab)  # NMF <- TF-IDF vocabulary

In [17]:
!pip install gensim



In [18]:
from gensim.models.coherencemodel import CoherenceModel
from gensim.corpora.dictionary import Dictionary

# Whitespace-tokenise the cleaned documents and build the gensim dictionary
# that the coherence computation uses as its reference corpus.
texts = [doc.split() for doc in documents]
dictionary = Dictionary(texts)


def topic_coherence(topics):
    """Return the 'c_v' coherence of `topics` measured against `texts`.

    Args:
        topics: list of topics, each a list of top words.

    Returns:
        The value of ``CoherenceModel(...).get_coherence()`` for these topics.
    """
    scorer = CoherenceModel(
        topics=topics,
        texts=texts,
        dictionary=dictionary,
        coherence='c_v'
    )
    return scorer.get_coherence()


# Score both models with the same metric and the same reference texts.
lda_coherence = topic_coherence(lda_topics)
nmf_coherence = topic_coherence(nmf_topics)

print("LDA Coherence:", lda_coherence)
print("NMF Coherence:", nmf_coherence)

LDA Coherence: 0.6672437334892531
NMF Coherence: 0.753497578363007
