In [5]:
import numpy as np
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
import re
import time

def clean_text(text):
    """Normalize a raw document to space-separated alphabetic tokens.

    The substitutions run in order: strip e-mail-like tokens, strip
    tokens starting at ``http``, strip standalone numbers, replace every
    remaining non-letter / non-whitespace character with a space, and
    finally collapse whitespace runs. The result is stripped of leading
    and trailing whitespace.
    """
    substitutions = (
        (r'\S+@\S+', ''),          # e-mail-like tokens
        (r'http\S+', ''),          # URLs
        (r'\b\d+\b', ''),          # standalone numbers
        (r'[^a-zA-Z\s]', ' '),     # anything that is not a letter or whitespace
        (r'\s+', ' '),             # collapse whitespace runs
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text.strip()

# Load the 20 Newsgroups training split without headers, footers and quoted
# replies, then clean the first 5000 documents.
newsgroups = fetch_20newsgroups(
    subset='train',
    remove=('headers', 'footers', 'quotes'),
)
texts = list(map(clean_text, newsgroups.data[:5000]))

# Bag-of-words settings: at most 3000 features, English stop words removed,
# terms must appear in at least 5 documents and in at most 75% of them, and
# tokens are purely alphabetic with length >= 3.
vectorizer_options = dict(
    max_features=3000,
    stop_words='english',
    min_df=5,
    max_df=0.75,
    token_pattern=r'\b[a-zA-Z]{3,}\b',
)
vectorizer = CountVectorizer(**vectorizer_options)

# X is the sparse document-term count matrix; vocab maps column -> term.
X = vectorizer.fit_transform(texts)
vocab = vectorizer.get_feature_names_out()
V = len(vocab)

# Expand the sparse document-term matrix X into a flat list of token
# occurrences: one (document id, word id) pair per counted word.
M = X.shape[0]  # number of documents

# Take indices and counts from a single COO view so that the three arrays are
# aligned element-for-element by construction (entries with a stored count of
# zero simply contribute no tokens below).
coo = X.tocoo()
rows, cols = coo.row, coo.col
counts = coo.data.astype(int)

# np.repeat replaces the per-entry Python loop: entry i is repeated
# counts[i] times, preserving the original entry order.
all_words = np.repeat(cols, counts).astype(np.int32)
all_doc_ids = np.repeat(rows, counts).astype(np.int32)
W = len(all_words)  # total number of token occurrences

# Model hyperparameters.
K = 20          # number of topics
alpha = 0.1     # smoothing added to document-topic counts
beta = 0.01     # smoothing added to topic-word counts
n_iter = 70     # number of full sweeps over all tokens
beta_sum = beta * V

# Count tables maintained by the sampler:
#   n_dk[d, k] - tokens of document d assigned to topic k
#   n_kw[k, w] - occurrences of word w assigned to topic k
#   n_k[k]     - total tokens assigned to topic k
n_dk = np.zeros((M, K), dtype=np.int32)
n_kw = np.zeros((K, V), dtype=np.int32)
n_k = np.zeros(K, dtype=np.int32)

# Random initial topic for every token (fixed seed for reproducibility).
np.random.seed(42)
z = np.random.randint(0, K, size=W, dtype=np.int32)

# Accumulate the initial counts in one vectorized pass. np.add.at is
# unbuffered, so repeated (row, column) index pairs are each counted,
# matching what a per-token "+= 1" loop would produce.
np.add.at(n_dk, (all_doc_ids, z), 1)
np.add.at(n_kw, (z, all_words), 1)
np.add.at(n_k, z, 1)

# Main sampling loop: for n_iter sweeps, visit every token in random order,
# remove its current topic from the count tables, draw a new topic from the
# resulting conditional weights, and add it back.
# NOTE(review): the update rule matches the usual collapsed Gibbs sampler for
# LDA - confirm if that is the intended algorithm.
start_time = time.time()

for iteration in range(n_iter):
    # Fresh random visiting order for each sweep.
    order = np.random.permutation(W)
    for idx in order:
        doc_id = all_doc_ids[idx]
        word_id = all_words[idx]
        old_topic = z[idx]

        # Exclude the current token from all counts before computing weights.
        n_dk[doc_id, old_topic] -= 1
        n_kw[old_topic, word_id] -= 1
        n_k[old_topic] -= 1

        # Unnormalized weight of each topic for this token:
        # (doc-topic count + alpha) * (topic-word count + beta) / (topic total + beta * V).
        p = (n_dk[doc_id, :] + alpha) * (n_kw[:, word_id] + beta) / (n_k + beta_sum)
        p_sum = p.sum()

        if p_sum > 0:
            # Normalize to a probability vector and sample a topic from it.
            p /= p_sum
            new_topic = np.random.choice(K, p=p)
        else:
            # Fallback: uniform draw when the weights sum to zero.
            # NOTE(review): with positive alpha and beta and non-negative
            # counts this branch should not be reachable - confirm.
            new_topic = np.random.randint(0, K)

        # Record the new assignment and restore the counts.
        z[idx] = new_topic
        n_dk[doc_id, new_topic] += 1
        n_kw[new_topic, word_id] += 1
        n_k[new_topic] += 1

    # Progress report every 10 sweeps.
    if (iteration + 1) % 10 == 0:
        print(f"Итерация {iteration + 1}/{n_iter}")

# Report total wall-clock time of the sampling loop.
print(f"\nВремя выполнения: {time.time() - start_time:.1f} сек")

# Uninformative words and tokenization fragments that are hidden when
# displaying the top words of a topic.
noise_words = {
    'wa', 'ha', 'doe', 'would', 'one', 'get', 'like', 'know', 'time',
    'people', 'say', 'think', 'just', 'don', 'good', 'way', 'really',
    'right', 'going', 'make', 'use', 'used', 'new', 'year', 'years',
    'day', 'days', 'went', 'said', 'did', 'told', 'saw', 'didn', 'left',
    'does', 'doesn', 've', 'll',
}


def is_good_word(word):
    """Return True if *word* is worth showing as a topic keyword.

    A word is rejected when it is shorter than 3 characters, is listed in
    ``noise_words``, contains a digit while being at most 4 characters
    long, starts with 1-2 lowercase letters followed by digits, or
    consists of digits followed by 1-2 lowercase letters.
    """
    too_short = len(word) < 3
    if too_short or word in noise_words:
        return False

    has_digit = any(ch.isdigit() for ch in word)
    if has_digit and len(word) <= 4:
        return False

    # Short letter/number codes such as "ab12..." or "123x".
    letters_then_digits = re.match(r'^[a-z]{1,2}\d+', word) is not None
    digits_then_letters = re.match(r'^\d+[a-z]{1,2}$', word) is not None
    return not (letters_then_digits or digits_then_letters)

# Smoothed topic-word distribution: each row is normalized by the topic's
# total token count plus beta * V.
phi = (n_kw + beta) / (n_k[:, np.newaxis] + beta_sum)

print("\nТоп-10 слов по 20 темам:")
for topic_idx in range(K):
    # The 25 highest-probability words of this topic, best first.
    ranked = np.argsort(phi[topic_idx])[::-1][:25]
    candidates = [vocab[word_idx] for word_idx in ranked]

    # Keep the first 10 candidates that pass the keyword filter.
    shown = [w for w in candidates if is_good_word(w)][:10]

    # If the filter was too aggressive, fall back to a length-only filter
    # over the top 15 candidates ...
    if len(shown) < 6:
        shown = [w for w in candidates[:15] if len(w) >= 3][:10]

    # ... and, failing that, to the unfiltered top 10.
    if len(shown) < 5:
        shown = candidates[:10]

    print(f"Тема {topic_idx+1:2d}: {', '.join(shown)}")

Итерация 10/70
Итерация 20/70
Итерация 30/70
Итерация 40/70
Итерация 50/70
Итерация 60/70
Итерация 70/70

Время выполнения: 663.7 сек

Топ-10 слов по 20 темам:
Тема  1: windows, dos, files, using, file, program, memory, package, problem, need
Тема  2: god, jesus, law, believe, world, life, matthew, christian, want, come
Тема  3: window, display, widget, application, set, value, using, data, null, color
Тема  4: launch, national, april, high, satellite, center, cost, york, washington, low
Тема  5: file, send, output, entry, line, program, check, read, article, list
Тема  6: true, question, bible, christians, things, faith, church, argument, truth, man
Тема  7: available, image, edu, graphics, ftp, version, software, sun, pub, code
Тема  8: max, giz, bhj, bxn, qax, chz, rlk, nrhj, tct, fpl
Тема  9: space, science, nasa, data, current, systems, ground, theory, large, design
Тема 10: internet, information, privacy, anonymous, public, mail, list, news, email, service
Тема 11: israel, eviden