In [None]:
# ======================================================
# 🔹 PROBABILISTIC LATENT SEMANTIC INDEXING (PLSI)
#     using 20 Newsgroups dataset
# ======================================================

import numpy as np
import pandas as pd
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer

# ---------------------------
# 1. Load dataset
# ---------------------------
# Three newsgroups are requested from the training split; headers, footers
# and quoted replies are stripped so only the message bodies remain.
categories = [
    'rec.sport.baseball',
    'sci.space',
    'talk.politics.misc',
]
newsgroups = fetch_20newsgroups(
    subset='train',
    categories=categories,
    remove=('headers', 'footers', 'quotes'),
)

# Keep only the first 300 posts so the EM loop below stays quick.
docs = newsgroups.data[:300]
print(f"Loaded {len(docs)} documents from 3 categories.")

# ---------------------------
# 2. Create term-document matrix
# ---------------------------
# Raw term counts: English stop words dropped, vocabulary capped at 1000 terms.
vectorizer = CountVectorizer(max_features=1000, stop_words='english')

# Dense count matrix with one row per document and one column per term.
X = vectorizer.fit_transform(docs)
X = X.toarray()

# Column index -> term lookup, used later to print the top words per topic.
vocab = np.array(vectorizer.get_feature_names_out())

D = X.shape[0]      # number of documents
W = X.shape[1]      # number of vocabulary words
K = 3               # number of latent topics

# ---------------------------
# 3. Initialize probabilities
# ---------------------------
# Fixed seed so the random starting point (and hence the run) is repeatable.
np.random.seed(0)

# P(w|z): one row per topic, each row rescaled to sum to 1 over the words.
P_w_z = np.random.rand(K, W)
P_w_z = P_w_z / np.sum(P_w_z, axis=1, keepdims=True)

# P(z|d): one row per document, each row rescaled to sum to 1 over the topics.
P_z_d = np.random.rand(D, K)
P_z_d = P_z_d / np.sum(P_z_d, axis=1, keepdims=True)

# ---------------------------
# 4. EM Algorithm
# ---------------------------
def _normalize_rows(mat):
    """Return ``mat`` with every row rescaled to sum to 1.

    A row whose total is not positive cannot be normalized (this happens
    for a document that contains none of the vocabulary words, or a topic
    that received no mass). Such rows are set to the uniform distribution
    instead of turning into NaNs through a 0/0 division.
    """
    totals = mat.sum(axis=1, keepdims=True)
    fallback = np.full(mat.shape, 1.0 / mat.shape[1])
    return np.divide(mat, totals, out=fallback, where=totals > 0)


n_iter = 30
for iteration in range(n_iter):
    # --- E-step ---
    # joint[d, w, z] = P(w|z) * P(z|d); normalizing over z gives P(z|d,w).
    # Entries whose denominator is not positive are left at 0.
    joint = P_z_d[:, np.newaxis, :] * P_w_z.T[np.newaxis, :, :]
    denom = joint.sum(axis=2, keepdims=True)
    P_z_dw = np.divide(joint, denom, out=np.zeros_like(joint),
                       where=denom > 0)

    # --- M-step ---
    # Both updates read the same posterior P_z_dw, weighted by the counts X.
    # Update P(w|z): sum over documents, then normalize over words.
    P_w_z[...] = _normalize_rows(np.einsum('dw,dwz->zw', X, P_z_dw))

    # Update P(z|d): sum over words, then normalize over topics.
    P_z_d[...] = _normalize_rows(np.einsum('dw,dwz->dz', X, P_z_dw))

    if iteration % 5 == 0 or iteration == n_iter - 1:
        print(f"Iteration {iteration + 1}/{n_iter} completed.")

print("\nâœ… PLSI training complete!\n")

# ---------------------------
# 5. Display Top Words per Topic
# ---------------------------
n_top_words = 10
for z, topic_row in enumerate(P_w_z[:K]):
    # Sort ascending, flip to descending, then keep the leading entries:
    # the highest-probability words for this topic, best first.
    top_idx = np.argsort(topic_row)[::-1][:n_top_words]
    top_terms = ', '.join(vocab[top_idx])
    print(f"Topic {z+1}: {top_terms}")

# ---------------------------
# 6. Document–Topic Distribution
# ---------------------------
# Wrap P(z|d) in a DataFrame with one labelled column per topic.
topic_labels = [f"Topic {i}" for i in range(1, K + 1)]
doc_topic_df = pd.DataFrame(P_z_d, columns=topic_labels)

print("\nDocumentâ€“Topic distribution (first 10 docs):\n")
# Show the first ten documents, rounded to three decimals for readability.
preview = doc_topic_df.head(10)
print(preview.round(3))


Loaded 300 documents from 3 categories.


  P_z_d[d, :] /= P_z_d[d, :].sum()


Iteration 1/30 completed.
Iteration 6/30 completed.
Iteration 11/30 completed.
Iteration 16/30 completed.
Iteration 21/30 completed.
Iteration 26/30 completed.
Iteration 30/30 completed.

âœ… PLSI training complete!

Topic 1: space, tax, station, games, use, 000, 333, gun, just, russian
Topic 2: space, edu, nasa, available, data, 02, information, colorado, won, 03
Topic 3: think, don, just, people, good, like, know, year, work, time

Documentâ€“Topic distribution (first 10 docs):

   Topic 1  Topic 2  Topic 3
0    0.000    1.000    0.000
1    0.000    0.000    1.000
2    0.010    0.000    0.990
3    0.301    0.323    0.375
4    0.631    0.000    0.369
5    0.000    0.944    0.056
6    1.000    0.000    0.000
7    0.693    0.055    0.252
8    0.715    0.285    0.000
9    0.000    0.276    0.724
