In [None]:
from sklearn.datasets import fetch_20newsgroups

# Load the 20 Newsgroups corpus through scikit-learn's dataset helper.
news_bundle = fetch_20newsgroups(
    subset='train',   # only the training split
    # Strip newsgroup headers, signature-like footers and quoted replies so
    # that only the body text of each post is kept.
    remove=('headers', 'footers', 'quotes')
)

# Keep only the first 2000 documents; every later cell works on this subset.
raw_articles = news_bundle.data[:2000]  # limit size


In [None]:
import re
import nltk
from nltk.corpus import stopwords

# Make sure the NLTK stop-word corpus is available locally (logs a short
# status message; it is a quick check when the package is already installed).
nltk.download('stopwords')
# Stored as a set so the membership test in clean_text_pipeline is cheap.
common_words = set(stopwords.words('english'))

def clean_text_pipeline(content, stop_words=None, min_length=4):
    """Normalise one document into a space-separated string of content words.

    Steps: replace every character that is not an ASCII letter with a space,
    lower-case the text, split on whitespace, then drop stop words and short
    tokens.

    Args:
        content: Raw document text. ``None`` or an empty string yields ``""``.
        stop_words: Collection of lower-case words to discard. Defaults to the
            notebook-level ``common_words`` set when omitted.
        min_length: Minimum number of characters a token must have to be kept.
            The default of 4 matches the original ``len(token) > 3`` filter.

    Returns:
        The surviving tokens joined by single spaces (possibly ``""``).
    """
    if not content:
        # Nothing to clean; also avoids a TypeError from re.sub on None.
        return ""
    if stop_words is None:
        # Resolved at call time so the notebook-level set is only needed when
        # the caller does not supply their own.
        stop_words = common_words

    letters_only = re.sub(r'[^a-zA-Z]', ' ', content).lower()
    kept_tokens = [
        token
        for token in letters_only.split()
        # Length check first: it is cheaper than the stop-word lookup.
        if len(token) >= min_length and token not in stop_words
    ]
    return " ".join(kept_tokens)

# Run every raw article through the cleaning function, preserving order.
processed_articles = list(map(clean_text_pipeline, raw_articles))


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


LDA (Latent Dirichlet Allocation)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

In [None]:
# Bag-of-words counts for LDA. Vocabulary pruning: drop terms that occur in
# more than 85% of the documents (max_df) or in fewer than 15 documents (min_df).
bow_vectorizer = CountVectorizer(max_df=0.85, min_df=15)
# Learn the vocabulary and build the document-term count matrix in one pass.
term_matrix = bow_vectorizer.fit_transform(processed_articles)

In [None]:
# LDA with 10 topics; the fixed random_state keeps the fit repeatable.
# All other hyper-parameters are left at the library defaults.
lda_model_v2 = LatentDirichletAllocation(
    n_components=10,
    random_state=101
)

In [None]:
# Fit the topic model on the bag-of-words counts. As the last expression of
# the cell, the returned estimator is also displayed.
lda_model_v2.fit(term_matrix)

0,1,2
,"n_components  n_components: int, default=10 Number of topics. .. versionchanged:: 0.19  ``n_topics`` was renamed to ``n_components``",10
,"doc_topic_prior  doc_topic_prior: float, default=None Prior of document topic distribution `theta`. If the value is None, defaults to `1 / n_components`. In [1]_, this is called `alpha`.",
,"topic_word_prior  topic_word_prior: float, default=None Prior of topic word distribution `beta`. If the value is None, defaults to `1 / n_components`. In [1]_, this is called `eta`.",
,"learning_method  learning_method: {'batch', 'online'}, default='batch' Method used to update `_component`. Only used in :meth:`fit` method. In general, if the data size is large, the online update will be much faster than the batch update. Valid options: - 'batch': Batch variational Bayes method. Use all training data in each EM  update. Old `components_` will be overwritten in each iteration. - 'online': Online variational Bayes method. In each EM update, use mini-batch  of training data to update the ``components_`` variable incrementally. The  learning rate is controlled by the ``learning_decay`` and the  ``learning_offset`` parameters. .. versionchanged:: 0.20  The default learning method is now ``""batch""``.",'batch'
,"learning_decay  learning_decay: float, default=0.7 It is a parameter that control learning rate in the online learning method. The value should be set between (0.5, 1.0] to guarantee asymptotic convergence. When the value is 0.0 and batch_size is ``n_samples``, the update method is same as batch learning. In the literature, this is called kappa.",0.7
,"learning_offset  learning_offset: float, default=10.0 A (positive) parameter that downweights early iterations in online learning. It should be greater than 1.0. In the literature, this is called tau_0.",10.0
,"max_iter  max_iter: int, default=10 The maximum number of passes over the training data (aka epochs). It only impacts the behavior in the :meth:`fit` method, and not the :meth:`partial_fit` method.",10
,"batch_size  batch_size: int, default=128 Number of documents to use in each EM iteration. Only used in online learning.",128
,"evaluate_every  evaluate_every: int, default=-1 How often to evaluate perplexity. Only used in `fit` method. set it to 0 or negative number to not evaluate perplexity in training at all. Evaluating perplexity can help you check convergence in training process, but it will also increase total training time. Evaluating perplexity in every iteration might increase training time up to two-fold.",-1
,"total_samples  total_samples: int, default=1e6 Total number of documents. Only used in the :meth:`partial_fit` method.",1000000.0


In [None]:
# Perplexity of the fitted LDA model (lower is better).
# NOTE(review): this is computed on the same matrix the model was fitted on,
# so it is a training-set figure, not a held-out estimate.
lda_perp_score = lda_model_v2.perplexity(term_matrix)
print("LDA Perplexity Score:", lda_perp_score)

LDA Perplexity Score: 943.1316042330686


NMF (Non-negative Matrix Factorization)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF

In [None]:
# TF-IDF features for NMF, using the same document-frequency cut-offs as the
# count vectorizer used for LDA (max_df=0.85, min_df=15).
tfidf_encoder = TfidfVectorizer(max_df=0.85, min_df=15)
# Learn the vocabulary/IDF weights and build the weighted document-term matrix.
tfidf_features = tfidf_encoder.fit_transform(processed_articles)

In [None]:
# NMF with 10 components (same topic count as the LDA model above), a fixed
# seed, and NNDSVD initialisation; remaining parameters use library defaults.
nmf_model_v2 = NMF(
    n_components=10,
    random_state=101,
    init='nndsvd'
)

In [None]:

# Factorise the TF-IDF matrix. As the last expression of the cell, the
# returned estimator is also displayed.
nmf_model_v2.fit(tfidf_features)



0,1,2
,"n_components  n_components: int or {'auto'} or None, default='auto' Number of components. If `None`, all features are kept. If `n_components='auto'`, the number of components is automatically inferred from W or H shapes. .. versionchanged:: 1.4  Added `'auto'` value. .. versionchanged:: 1.6  Default value changed from `None` to `'auto'`.",10
,"init  init: {'random', 'nndsvd', 'nndsvda', 'nndsvdar', 'custom'}, default=None Method used to initialize the procedure. Valid options: - `None`: 'nndsvda' if n_components <= min(n_samples, n_features),  otherwise random. - `'random'`: non-negative random matrices, scaled with:  `sqrt(X.mean() / n_components)` - `'nndsvd'`: Nonnegative Double Singular Value Decomposition (NNDSVD)  initialization (better for sparseness) - `'nndsvda'`: NNDSVD with zeros filled with the average of X  (better when sparsity is not desired) - `'nndsvdar'` NNDSVD with zeros filled with small random values  (generally faster, less accurate alternative to NNDSVDa  for when sparsity is not desired) - `'custom'`: Use custom matrices `W` and `H` which must both be provided. .. versionchanged:: 1.1  When `init=None` and n_components is less than n_samples and n_features  defaults to `nndsvda` instead of `nndsvd`.",'nndsvd'
,"solver  solver: {'cd', 'mu'}, default='cd' Numerical solver to use: - 'cd' is a Coordinate Descent solver. - 'mu' is a Multiplicative Update solver. .. versionadded:: 0.17  Coordinate Descent solver. .. versionadded:: 0.19  Multiplicative Update solver.",'cd'
,"beta_loss  beta_loss: float or {'frobenius', 'kullback-leibler', 'itakura-saito'}, default='frobenius' Beta divergence to be minimized, measuring the distance between X and the dot product WH. Note that values different from 'frobenius' (or 2) and 'kullback-leibler' (or 1) lead to significantly slower fits. Note that for beta_loss <= 0 (or 'itakura-saito'), the input matrix X cannot contain zeros. Used only in 'mu' solver. .. versionadded:: 0.19",'frobenius'
,"tol  tol: float, default=1e-4 Tolerance of the stopping condition.",0.0001
,"max_iter  max_iter: int, default=200 Maximum number of iterations before timing out.",200
,"random_state  random_state: int, RandomState instance or None, default=None Used for initialisation (when ``init`` == 'nndsvdar' or 'random'), and in Coordinate Descent. Pass an int for reproducible results across multiple function calls. See :term:`Glossary `.",101
,"alpha_W  alpha_W: float, default=0.0 Constant that multiplies the regularization terms of `W`. Set it to zero (default) to have no regularization on `W`. .. versionadded:: 1.0",0.0
,"alpha_H  alpha_H: float or ""same"", default=""same"" Constant that multiplies the regularization terms of `H`. Set it to zero to have no regularization on `H`. If ""same"" (default), it takes the same value as `alpha_W`. .. versionadded:: 1.0",'same'
,"l1_ratio  l1_ratio: float, default=0.0 The regularization mixing parameter, with 0 <= l1_ratio <= 1. For l1_ratio = 0 the penalty is an elementwise L2 penalty (aka Frobenius Norm). For l1_ratio = 1 it is an elementwise L1 penalty. For 0 < l1_ratio < 1, the penalty is a combination of L1 and L2. .. versionadded:: 0.17  Regularization parameter *l1_ratio* used in the Coordinate Descent  solver.",0.0


Coherence Score (c_v, LDA topics)

In [None]:
from gensim.corpora import Dictionary
from gensim.models.coherencemodel import CoherenceModel

In [None]:
# gensim expects each document as a list of tokens; the cleaned articles are
# already whitespace-delimited, so a plain split recovers the tokens.
tokenized_corpus = list(map(str.split, processed_articles))

In [None]:
# Build the gensim token <-> id mapping from the tokenised documents.
vocab_dictionary = Dictionary(tokenized_corpus)
# Convert each tokenised document to gensim's bag-of-words representation.
bow_corpus = [vocab_dictionary.doc2bow(text) for text in tokenized_corpus]

In [None]:
# The vocabulary array does not change between topics, so fetch it once
# instead of rebuilding it for every (topic, term) pair.
lda_feature_names = bow_vectorizer.get_feature_names_out()

# For each LDA topic, take the 10 highest-weighted terms. argsort() sorts
# ascending, so the slice lists them from the weakest to the strongest of the
# top 10 (same order as before).
lda_top_terms = [
    [lda_feature_names[i] for i in topic.argsort()[-10:]]
    for topic in lda_model_v2.components_
]

# c_v coherence of the LDA topics, estimated from the tokenised documents.
coherence_eval = CoherenceModel(
    topics=lda_top_terms,
    texts=tokenized_corpus,
    dictionary=vocab_dictionary,
    coherence='c_v'
)

In [None]:
# The coherence value is computed here, when get_coherence() is called.
# It describes the LDA topics only (the NMF model is not scored).
print("Coherence Score:", coherence_eval.get_coherence())

Coherence Score: 0.5082340337731592


pyLDAvis Visualization

In [None]:
import pyLDAvis
import pyLDAvis.lda_model

In [None]:
# Switch pyLDAvis to inline notebook rendering for the cells below.
pyLDAvis.enable_notebook()

In [None]:
# Build the interactive visualisation data from the fitted LDA model, the
# count matrix it was trained on, and the vectorizer that produced that matrix.
lda_visual = pyLDAvis.lda_model.prepare(
    lda_model_v2,
    term_matrix,
    bow_vectorizer
)

In [None]:
# Bare expression: the notebook renders the prepared visualisation.
lda_visual

BERTopic

In [None]:
from bertopic import BERTopic

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# BERTopic with default components; verbose=True logs each pipeline stage.
# NOTE(review): unlike the LDA/NMF cells, no seed is set here, so topic
# assignments may differ between runs - confirm whether that is acceptable.
semantic_topic_engine = BERTopic(verbose=True)
# Fit on the cleaned articles and get one topic label per document, plus the
# associated probabilities.
# NOTE(review): the input is the stop-word-stripped, lower-cased text; the
# embedding model may work better on the untouched raw_articles - TODO confirm.
topic_labels, topic_probs = semantic_topic_engine.fit_transform(processed_articles)

2026-02-18 12:06:54,730 - BERTopic - Embedding - Transforming documents to embeddings.
Loading weights: 100%|██████████| 103/103 [00:00<00:00, 550.98it/s, Materializing param=pooler.dense.weight]                             
[1mBertModel LOAD REPORT[0m from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m
Batches: 100%|██████████| 63/63 [00:32<00:00,  1.91it/s]
2026-02-18 12:07:31,861 - BERTopic - Embedding - Completed ✓
2026-02-18 12:07:31,861 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2026-02-18 12:07:46,002 - BERTopic - Dimensionality - Completed ✓
2026-02-18 12:07:46,004 - BERTopic - Cluster - Start clustering the reduced embeddings
2026-02-18 12:07:46,122 - BERTopic - Cluster - Completed ✓
2026-02

In [None]:
# Render BERTopic's built-in topic overview for the fitted model.
semantic_topic_engine.visualize_topics()