In [1]:
import os
import sys
import torch
from sklearn.datasets import fetch_20newsgroups

import topmost
from topmost.data import RawDataset
from topmost.preprocess import Preprocess
from topmost.trainers import BasicTrainer, FASTopicTrainer
from topmost.models import ECRTM

import gensim
from gensim import corpora
from gensim.models import LdaModel

docs = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'))['data'][:1000]

preprocess = Preprocess(vocab_size=2000)
dataset_raw = RawDataset(docs, preprocess, device="cpu")

  from .autonotebook import tqdm as notebook_tqdm
loading train texts: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:00<00:00, 3573.67it/s]
parsing texts: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:00<00:00, 8446.45it/s]
2025-12-29 10:13:48,566 - TopMost - Real vocab size: 2000
2025-12-29 10:13:48,567 - TopMost - Real training size: 1000 	 avg length: 45.851


In [2]:
# 1. LDA con Gensim
tokenized_docs = [doc.lower().split() for doc in docs if doc.strip()]
dictionary = corpora.Dictionary(tokenized_docs)
dictionary.filter_extremes(no_below=5, no_above=0.5)
corpus_gensim = [dictionary.doc2bow(text) for text in tokenized_docs]

lda_model = LdaModel(corpus=corpus_gensim, num_topics=10, id2word=dictionary, passes=10, iterations=100)
top_words_lda = lda_model.show_topics(num_topics=10, num_words=10, formatted=False)
print("\n1. LDA Top Words:")
for topic_id, words in top_words_lda:
    print(f"Topic {topic_id}: {[word for word, prob in words]}")


1. LDA Top Words:
Topic 0: [':', 'on', 'with', 'was', 'use', 'this', 'as', 'by', 'from', 'are']
Topic 1: ['they', 'with', 'was', 'are', 'not', 'had', 'you', 'would', 'on', 'be']
Topic 2: ['|', 'you', 'your', '/', 'if', 'be', 'have', 'can', 'or', '--']
Topic 3: ['-', 'you', 'not', 'but', 'we', 'as', 'with', 'are', 'have', 'be']
Topic 4: ['have', 'my', 'are', 'or', 'on', 'but', 'this', 'was', 'at', 'with']
Topic 5: ['you', 'this', 'not', 'be', 'are', 'have', 'as', '1', 'on', 'was']
Topic 6: ['from', 'not', '-', 'but', 'was', '1', 'by', 'spirit', 'has', 'this']
Topic 7: ['=', '-', 'on', 'space', '}', '*', 'with', 'shuttle', 'from', 'will']
Topic 8: ['with', 'be', 'on', 'have', 'this', 'you', 'can', 'if', 'not', 'but']
Topic 9: ['.', 'were', 'they', 'was', 'on', 'by', 'their', 'at', 'we', 'be']


In [3]:
# 2. FASTopic
trainer_fast = FASTopicTrainer(dataset_raw, num_topics=10, verbose=True)
top_words_fast, _ = trainer_fast.train()
print("\n2. FASTopic Top Words:", top_words_fast)

2025-12-29 10:13:58,677 - FASTopic - use device: cpu
2025-12-29 10:13:58,678 - FASTopic - First fit the model.
loading train texts: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:00<00:00, 13357.78it/s]
parsing texts: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:00<00:00, 16036.89it/s]
2025-12-29 10:14:06,932 - TopMost - Real vocab size: 2000
2025-12-29 10:14:06,932 - TopMost - Real training size: 1000 	 avg length: 45.851
Batches: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 32/32 [00:08<00:00,  4.00it/s]
Training FASTopic:   4%|████▍                                                                                                           | 8/200 [00:00<00:09, 20.86it/s]2025-12-29 10:14:15,510 - FASTopic - Epoch: 010 loss: 348.238
Training FAS

Topic 0: program health data use users control number university line phone medical information public want mail
Topic 1: clinton treatment pain doctor soon tax banks body legal medicine economic diet skepticism surrender doctors
Topic 2: max memory scsi dos mac video radius simms ram card mode machine color window serial
Topic 3: armenian armenians turkish genocide government killed soviet war left children land russian city jews turks
Topic 4: crypto powerful marriage homosexual pure flame contradiction homosexuals topics differ escape african jew strip fundamental
Topic 5: jesus god argument believe true truth christian matthew bible example spirit think son conclusion point
Topic 6: engine car insurance cars keys speed gun bike self carry water miles turn gas oil
Topic 7: sale sell wondering price offer looking portable printer battery normal condition asking disks working box
Topic 8: season league players play team runs period los mike win encryption game games pittsburgh teams
T

In [4]:
# 3. ECRTM
model_ecrtm = ECRTM(vocab_size=preprocess.vocab_size, num_topics=10)
trainer_ecrtm = BasicTrainer(model_ecrtm, dataset_raw)
top_words_ecrtm, _ = trainer_ecrtm.train()
print("\n3. ECRTM Top Words:", top_words_ecrtm)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 200/200 [00:14<00:00, 13.84it/s]


3. ECRTM Top Words: ['jesus people matthew radius god christians bible lord double faith passage king christian sin jewish', 'period play team games game pittsburgh jose players chicago los teams power season detroit san', 'health users volume reported persons united medical use report april washington page culture states age', 'father son spirit good excellent holy gun church missing cover council state poster included fair', 'armenian armenians turkish genocide russian soviet people turks muslim armenia killed population women argic government', 'insurance car chastity turbo gordon shameful banks thanks skepticism intellect rate surrender driving sale year', 'nasa shuttle mission space missions images orbit operations earth launch applications data science files military', 'max argument truth example bible conclusion true false god valid absolute belief occurs christians beliefs', 'windows scsi dos bios video memory controller guide disk ram motif ide motherboard microsoft mac', 'th


