In [17]:
import os
import sys
import torch
from sklearn.datasets import fetch_20newsgroups

import topmost
from topmost.data import RawDataset
from topmost.preprocess import Preprocess
from topmost.trainers import BasicTrainer, FASTopicTrainer
from topmost.models import ECRTM

import gensim
from gensim import corpora
from gensim.models import LdaModel

docs = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'))['data'][:1000]

preprocess = Preprocess(vocab_size=2000)
dataset_raw = RawDataset(docs, preprocess, device="cpu")

loading train texts: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:00<00:00, 3862.81it/s]
parsing texts: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:00<00:00, 9781.58it/s]
2025-12-29 09:33:55,656 - TopMost - Real vocab size: 2000
2025-12-29 09:33:55,656 - TopMost - Real training size: 1000 	 avg length: 45.852


In [18]:
# 1. LDA con Gensim
tokenized_docs = [doc.lower().split() for doc in docs if doc.strip()]
dictionary = corpora.Dictionary(tokenized_docs)
dictionary.filter_extremes(no_below=5, no_above=0.5)
corpus_gensim = [dictionary.doc2bow(text) for text in tokenized_docs]

lda_model = LdaModel(corpus=corpus_gensim, num_topics=10, id2word=dictionary, passes=10, iterations=100)
top_words_lda = lda_model.show_topics(num_topics=10, num_words=10, formatted=False)
print("\n1. LDA Top Words:")
for topic_id, words in top_words_lda:
    print(f"Topic {topic_id}: {[word for word, prob in words]}")


1. LDA Top Words:
Topic 0: [':', 'on', 'with', 'was', 'use', 'this', 'as', 'by', 'from', 'are']
Topic 1: ['they', 'with', 'was', 'are', 'not', 'had', 'you', 'would', 'on', 'be']
Topic 2: ['|', 'you', 'your', '/', 'if', 'be', 'have', 'can', 'or', '--']
Topic 3: ['-', 'you', 'not', 'but', 'we', 'as', 'with', 'are', 'have', 'be']
Topic 4: ['have', 'my', 'are', 'or', 'on', 'but', 'this', 'was', 'at', 'with']
Topic 5: ['you', 'this', 'not', 'be', 'are', 'have', 'as', '1', 'on', 'was']
Topic 6: ['from', 'not', '-', 'but', 'was', '1', 'by', 'spirit', 'has', 'this']
Topic 7: ['=', '-', 'on', 'space', '}', '*', 'with', 'shuttle', 'from', 'will']
Topic 8: ['with', 'be', 'on', 'have', 'this', 'you', 'can', 'if', 'not', 'but']
Topic 9: ['.', 'were', 'they', 'was', 'on', 'by', 'their', 'at', 'we', 'be']


In [19]:
# 2. FASTopic
trainer_fast = FASTopicTrainer(dataset_raw, num_topics=10, verbose=True)
top_words_fast, _ = trainer_fast.train()
print("\n2. FASTopic Top Words:", top_words_fast)

2025-12-29 09:34:05,304 - FASTopic - use device: cpu
2025-12-29 09:34:05,306 - FASTopic - First fit the model.
loading train texts: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:00<00:00, 10577.76it/s]
parsing texts: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:00<00:00, 16118.67it/s]
2025-12-29 09:34:08,872 - TopMost - Real vocab size: 2000
2025-12-29 09:34:08,873 - TopMost - Real training size: 1000 	 avg length: 45.852
Batches: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 32/32 [00:06<00:00,  5.11it/s]
Training FASTopic:   2%|██▊                                                                                                             | 5/200 [00:00<00:04, 41.94it/s]2025-12-29 09:34:15,472 - FASTopic - Epoch: 010 loss: 348.586
Training FAS

Topic 0: jesus god argument truth matthew christian bible son spirit conclusion believe true example church faith
Topic 1: simms radius window memory screen color motif bios display clock machine problems running dos write
Topic 2: scsi space shuttle mac video nasa launch orbit mission cable sale offer price asking board
Topic 3: max windows program data health file information users address files systems medical mail images access
Topic 4: good got time dont didnt look sure going like think right thats thing better gun
Topic 5: doctor soon banks gordon skepticism doctors diet chastity intellect shameful surrender effective flame trip thank
Topic 6: season players league play mike team runs teams los pittsburgh period clinton toronto win san
Topic 7: armenian armenians turkish genocide killed soviet russian war turks muslim population army government jews today
Topic 8: insurance keys cars engine air speed bike car rate miles oil water model rates driving
Topic 9: values effect sex the

In [20]:
# 3. ECRTM
model_ecrtm = ECRTM(vocab_size=preprocess.vocab_size, num_topics=10)
trainer_ecrtm = BasicTrainer(model_ecrtm, dataset_raw)
top_words_ecrtm, _ = trainer_ecrtm.train()
print("\n3. ECRTM Top Words:", top_words_ecrtm)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 200/200 [00:15<00:00, 12.94it/s]



3. ECRTM Top Words: ['jesus matthew people armenians father lord city armenian gun passage burned said israel soldiers king', 'scsi files radius dos images bios ram windows disk floppy mac motif directory video gif', 'max health mission shuttle nasa period windows los missions space orbit chicago launch pittsburgh play', 'thanks sale portable looking printer battery video card display brand price adapter cpu recommend server', 'argument father conclusion true son example spirit false bible truth valid god holy faith church', 'thanks mon printer sale looking portable video cpu display devils pittsburgh shameful chastity battery intellect', 'excellent missing good cover poster fair included gods indicates pressure condition index update flow game', 'thanks sale video printer portable looking card display battery color cpu ram brand ide appreciated', 'armenian turkish armenians people genocide turks health soviet russian killed muslim armenia army women argic', 'jesus truth god absolute 