In [11]:
import torch
from sklearn.datasets import fetch_20newsgroups

import gensim
from gensim import corpora
from gensim.models import LdaModel
from gensim.parsing.preprocessing import STOPWORDS
from gensim.utils import simple_preprocess

import topmost
from topmost.data import RawDataset
from topmost.preprocess import Preprocess
from topmost.trainers import BasicTrainer, FASTopicTrainer
from topmost.models import ECRTM

In [12]:
# Number of topics requested from every model and iterated over by the
# showTopics* helpers.
K = 10
# Number of top words requested/shown per topic.
topn = 10

def _print_topics(topics):
    """Print one ``Topic <id>: <words>`` line per entry of ``topics``.

    Args:
        topics: Iterable of word sequences. The position of each entry in
            the iterable is used as its topic id; the words are joined with
            single spaces.
    """
    for topic_id, words in enumerate(topics):
        print(f"Topic {topic_id}: {' '.join(words)}")

def showTopicsLDA():
    """Print the top ``topn`` words of topics ``0..K-1`` of ``lda_model``.

    Reads the notebook globals ``lda_model``, ``K`` and ``topn``; the
    probabilities returned by ``show_topic`` are discarded.
    """
    # Generator keeps the original behaviour of querying and printing one
    # topic at a time.
    _print_topics(
        [word for word, _ in lda_model.show_topic(topic_id, topn=topn)]
        for topic_id in range(K)
    )

def showTopicsFASTopic():
    """Print the first ``K`` entries of ``top_words_fastopic``.

    Reads the notebook globals ``top_words_fastopic`` and ``K``. Each entry
    is a whitespace-separated string of words, which is split and re-joined
    with single spaces.
    """
    _print_topics(top_words_fastopic[topic_id].split() for topic_id in range(K))

def showTopicsECRTM():
    """Print the first ``K`` entries of ``top_words_ecrtm``.

    Reads the notebook globals ``top_words_ecrtm`` and ``K``. Each entry is
    a whitespace-separated string of words, which is split and re-joined
    with single spaces.
    """
    _print_topics(top_words_ecrtm[topic_id].split() for topic_id in range(K))

In [13]:
# Load data: the first 1000 training posts of 20 Newsgroups, fetched with
# headers, footers and quotes removed.
docs = fetch_20newsgroups(
    remove=('headers', 'footers', 'quotes'),
    subset='train',
).data[:1000]

print(f"Documentos cargados: {len(docs)}")

# Standard preprocessing
def preprocess_text(text):
    """Tokenize ``text`` and drop stopwords and purely numeric tokens.

    Args:
        text: Raw document string.

    Returns:
        List of tokens produced by ``simple_preprocess`` (de-accented,
        3 to 15 characters long) that are neither in gensim's ``STOPWORDS``
        nor numeric strings.
    """
    candidates = simple_preprocess(text, deacc=True, min_len=3, max_len=15)
    return [
        tok
        for tok in candidates
        if tok not in STOPWORDS and not tok.isnumeric()
    ]

print("\nPreprocesando documentos...")
# Tokenize every document and keep only those with at least one token left.
tokenized_docs = [
    tokens for tokens in map(preprocess_text, docs) if len(tokens) > 0
]
print(f"Documentos restantes tras preprocesamiento: {len(tokenized_docs)}")

# gensim vocabulary and bag-of-words corpus used by the LDA model.
dictionary = corpora.Dictionary(tokenized_docs)
dictionary.filter_extremes(no_below=5, no_above=0.5)
corpus_gensim = list(map(dictionary.doc2bow, tokenized_docs))

# TopMost's standard preprocessing.
# NOTE(review): RawDataset is built from the raw `docs`, not from the filtered
# `tokenized_docs`, so the TopMost models and LDA see different corpora.
preprocess = Preprocess(vocab_size=2000)
dataset_raw = RawDataset(docs, preprocess, device="cpu")

Documentos cargados: 1000

Preprocesando documentos...
Documentos restantes tras preprocesamiento: 972


loading train texts: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:00<00:00, 3854.34it/s]
parsing texts: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:00<00:00, 9822.01it/s]
2025-12-30 16:55:33,542 - TopMost - Real vocab size: 2000
2025-12-30 16:55:33,544 - TopMost - Real training size: 1000 	 avg length: 45.852


In [14]:
# 1. LDA

# Fit gensim LDA on the bag-of-words corpus with K topics and a fixed seed.
lda_model = LdaModel(
    corpus=corpus_gensim,
    id2word=dictionary,
    num_topics=K,
    passes=15,
    iterations=100,
    alpha='symmetric',
    eta='auto',
    random_state=42,
)
print("\n=== Tópicos LDA ===")
showTopicsLDA()


=== Tópicos LDA ===
Topic 0: max period play power second vancouver mhz motif like louis
Topic 1: windows know cable software code problem moral like shuttle pain
Topic 2: use thanks like know files disk need time program help
Topic 3: jesus god people think matthew good time know said man
Topic 4: use health father son year years state spirit medical car
Topic 5: space nasa windows use com scsi program shuttle like data
Topic 6: good excellent think missing new fair cover israel issue land
Topic 7: edu time like memory know bit people dos video lot
Topic 8: armenian people armenians turkish genocide government soviet killed said russian
Topic 9: people argument true know like example believe think god said


In [15]:
# 2. FASTopic
# Seed torch's global RNG before training so repeated runs are more
# repeatable (same seed value as the LDA cell's random_state).
# NOTE(review): other RNG sources inside the trainer, if any, are not seeded here.
torch.manual_seed(42)
trainer_fastopic = FASTopicTrainer(dataset_raw, num_topics=K, num_top_words=topn)
# train() returns a pair; only the per-topic top-word strings are kept.
top_words_fastopic, _ = trainer_fastopic.train()
print("\n=== Tópicos FASTopic ===")
showTopicsFASTopic()

loading train texts: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:00<00:00, 11563.28it/s]
parsing texts: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:00<00:00, 16317.39it/s]
Training FASTopic: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████| 200/200 [00:11<00:00, 18.17it/s]


=== Tópicos FASTopic ===
Topic 0: max armenian armenians turkish genocide killed soviet war russian turks
Topic 1: jesus god believe argument true way think said man course
Topic 2: season players league team play mike runs teams los period
Topic 3: space data nasa shuttle files address mail images list information
Topic 4: scsi memory video mac radius dos simms ram machine card
Topic 5: sin moral humans morality religion sex agree sense belief exist
Topic 6: engine bike air values miles gas oil water normal speed
Topic 7: keys clipper sale sell price cable phone offer asking chip
Topic 8: health gun car states state medical law defense public control
Topic 9: curious shameful chastity intellect trip wow aid skepticism bullshit surrender





In [16]:
# 3. ECRTM
# Seed torch's global RNG before building the model so repeated runs are more
# repeatable (same seed value as the LDA cell's random_state).
torch.manual_seed(42)
# Size the model from the vocabulary actually built by the dataset (the
# requested cap on `preprocess` may not be reached), and use K rather than a
# hard-coded 10 so the topic count agrees with showTopicsECRTM().
model_ecrtm = ECRTM(vocab_size=dataset_raw.vocab_size, num_topics=K)
trainer_ecrtm = BasicTrainer(model_ecrtm, dataset_raw, num_top_words=topn)
# train() returns a pair; only the per-topic top-word strings are kept.
top_words_ecrtm, _ = trainer_ecrtm.train()
print("\n=== Tópicos ECRTM ===")
showTopicsECRTM()

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 200/200 [00:16<00:00, 11.93it/s]


=== Tópicos ECRTM ===
Topic 0: god moral morality values sex humans dont jesus truth pain
Topic 1: health people armenians armenian father gun users use persons state
Topic 2: max period play pittsburgh los chicago jose mon detroit power
Topic 3: jobs windows nasa mission shuttle images space launch orbit missions
Topic 4: scsi controller windows bios ram dos motherboard guide video mac
Topic 5: excellent good missing game cover fair games poster team season
Topic 6: government nsa sorts israel chip encryption palestinians clipper keys attacks
Topic 7: armenian turkish armenians genocide people turks soviet muslim killed russian
Topic 8: jesus argument matthew people god conclusion son truth bible spirit
Topic 9: thanks sale printer chastity shameful intellect battery skepticism looking ide



