# In [None]:
import numpy as np
import pandas as pd
import random
from collections import defaultdict
import re
from sklearn.feature_extraction.text import CountVectorizer
from datasets import load_dataset
# 1 LOAD YELP REVIEWS DATASET
print("Loading Yelp dataset...")
# Only the first 10% of the training split is requested, for speed.
dataset = load_dataset("yelp_review_full", split="train[:10%]")
df = pd.DataFrame(dataset)

# Business ("asin") and user IDs are simulated: they are assigned round-robin
# from the row position, cycling through 4000 business labels and 6000 user
# labels.
df["asin"] = [f"business_{pos % 4000}" for pos in range(len(df))]
df["reviewText"] = df["text"]
df["reviewerID"] = [f"user_{pos % 6000}" for pos in range(len(df))]

# Keep only the three columns the rest of the pipeline reads, without NaNs.
df = df[["reviewerID", "asin", "reviewText"]].dropna()
print("✅ Yelp dataset loaded:", len(df), "reviews")

# 2 FILTER USERS AND DOCUMENTS
# Minimum number of reviews a user / document needs in order to be kept.
min_reviews_user = 2
min_reviews_doc = 2
user_counts = df['reviewerID'].value_counts()
doc_counts = df['asin'].value_counts()

users_keep = user_counts.index[(user_counts >= min_reviews_user).to_numpy()]
docs_keep = doc_counts.index[(doc_counts >= min_reviews_doc).to_numpy()]

# A review survives only if both its user and its document were kept.
df2 = df.loc[
    df['reviewerID'].isin(users_keep) & df['asin'].isin(docs_keep)
].copy()
print("Filtered reviews:", len(df2))

# Relabel indices: contiguous integer ids in order of first appearance.
user2idx = {
    user: pos for pos, user in enumerate(pd.unique(df2['reviewerID']))
}
doc2idx = {
    doc: pos for pos, doc in enumerate(pd.unique(df2['asin']))
}

df2['u_idx'] = df2['reviewerID'].map(user2idx)
df2['d_idx'] = df2['asin'].map(doc2idx)

U = len(user2idx)
D = len(doc2idx)
print("Num users:", U, "Num documents:", D)

# =====================================
# 3 TEXT PREPROCESSING & VOCABULARY
# =====================================
def preprocess(text):
    """Split *text* into lowercase tokens made only of the letters a-z.

    Every character that is neither a lowercase ASCII letter nor whitespace
    (digits, punctuation, ...) is deleted outright rather than replaced by a
    space, so "don't" becomes "dont". Whitespace runs are then collapsed.

    Args:
        text: the raw string to tokenise.

    Returns:
        The list of tokens, in order of appearance (empty for blank input).
    """
    letters_only = re.sub(r'[^a-z\s]', '', text.lower())
    single_spaced = re.sub(r'\s+', ' ', letters_only)
    return single_spaced.strip().split()

all_texts = df2['reviewText'].tolist()
# Bag-of-words over the 2000 most frequent non-stop-word tokens.
# token_pattern=None: a custom tokenizer is supplied, so the default token
# regex is never used; leaving it set makes scikit-learn emit an
# "unused token_pattern" UserWarning on every fit.
vectorizer = CountVectorizer(
    max_features=2000,
    stop_words='english',
    tokenizer=preprocess,
    token_pattern=None,
)
X_counts = vectorizer.fit_transform(all_texts)
vocab = vectorizer.get_feature_names_out()
W = len(vocab)
print("Vocabulary size:", W)

# Word -> column index, matching the order of `vocab`.
word2idx = {w: i for i, w in enumerate(vocab)}

# =====================================
# 4 BUILD (u, d, w) TRIPLETS
# =====================================
# Count, for every (user, document) pair, how often each in-vocabulary word
# occurs. The three columns are iterated directly instead of via
# DataFrame.iterrows(), which would build a throw-away Series per review.
triplet_counts = defaultdict(int)
for u, d, review in zip(df2['u_idx'], df2['d_idx'], df2['reviewText']):
    for t in preprocess(review):
        w = word2idx.get(t)  # None for stop words / out-of-vocabulary tokens
        if w is not None:
            triplet_counts[(u, d, w)] += 1

# Every stored count is >= 1 by construction, so no filtering is needed.
triplets = [(u, d, w, c) for (u, d, w), c in triplet_counts.items()]
print("Num non-zero triplets:", len(triplets))

# 90/10 train/test split. A dedicated, seeded generator makes the split
# reproducible from run to run without touching the global `random` state.
SPLIT_SEED = 42
random.Random(SPLIT_SEED).shuffle(triplets)
split = int(0.9 * len(triplets))
train_triplets = triplets[:split]
test_triplets  = triplets[split:]
print("Train size:", len(train_triplets), "Test size:", len(test_triplets))

# =====================================
# 5 TRIADIC PLSI TRAINING
# =====================================
def train_triadic_plsi(train_triplets, U, D, W, K=5, max_iter=20, eps=1e-12):
    """Fit a triadic (user, document, word) PLSI model with EM.

    Each observation is scored as ``sum_z P(z|u) * P(d|z) * P(w|z)``.
    Parameters are initialised from ``np.random`` (seed it beforehand for
    reproducible results) and the training log-likelihood is printed after
    every iteration.

    Args:
        train_triplets: iterable of ``(u, d, w, c)`` tuples, where ``u``,
            ``d`` and ``w`` are integer indices and ``c`` is the count.
        U: number of users.
        D: number of documents.
        W: vocabulary size.
        K: number of latent topics.
        max_iter: number of EM iterations.
        eps: guard added to denominators and inside ``log``; posteriors
            whose normaliser falls below it are replaced by a uniform one.

    Returns:
        ``(Pz_u, Pd_z, Pw_z)`` with shapes ``(U, K)``, ``(K, D)`` and
        ``(K, W)``. Every row is a probability distribution; a row that
        received no mass during the M-step (for example a user without any
        training triplet) is set to the uniform distribution rather than
        left all-zero, so it cannot force a zero probability at evaluation.
    """

    def _normalize_rows(mass):
        # Row-normalise expected counts; rows without any mass become uniform.
        totals = mass.sum(axis=1, keepdims=True)
        uniform = 1.0 / max(mass.shape[1], 1)
        return np.where(totals <= 0, uniform, mass / (totals + eps))

    def _joint(p_zu, p_dz, p_wz):
        # (N, K) matrix of P(z|u) * P(d|z) * P(w|z), one row per triplet.
        return p_zu[u_idx] * p_dz[:, d_idx].T * p_wz[:, w_idx].T

    Pz_u = np.random.rand(U, K)
    Pz_u /= Pz_u.sum(axis=1, keepdims=True)

    Pd_z = np.random.rand(K, D)
    Pd_z /= Pd_z.sum(axis=1, keepdims=True)

    Pw_z = np.random.rand(K, W)
    Pw_z /= Pw_z.sum(axis=1, keepdims=True)

    # Unpack the triplets once into index/count arrays so that each EM step
    # is a handful of NumPy operations instead of a Python loop per triplet.
    observations = list(train_triplets)
    u_idx = np.asarray([obs[0] for obs in observations], dtype=np.intp)
    d_idx = np.asarray([obs[1] for obs in observations], dtype=np.intp)
    w_idx = np.asarray([obs[2] for obs in observations], dtype=np.intp)
    counts = np.asarray([obs[3] for obs in observations], dtype=float)

    for it in range(max_iter):
        # E-step: posterior over topics for every triplet.
        joint = _joint(Pz_u, Pd_z, Pw_z)
        norm = joint.sum(axis=1, keepdims=True)
        degenerate = norm < eps
        q = np.where(
            degenerate, 1.0 / K, joint / np.where(degenerate, 1.0, norm)
        )
        weighted = counts[:, None] * q

        # M-step: accumulate expected counts. np.add.at is unbuffered, so
        # indices that repeat across triplets are all added.
        user_mass = np.zeros((U, K))
        doc_mass = np.zeros((D, K))
        word_mass = np.zeros((W, K))
        np.add.at(user_mass, u_idx, weighted)
        np.add.at(doc_mass, d_idx, weighted)
        np.add.at(word_mass, w_idx, weighted)

        # Normalize
        Pz_u = _normalize_rows(user_mass)
        Pd_z = _normalize_rows(doc_mass.T)
        Pw_z = _normalize_rows(word_mass.T)

        # Log-likelihood
        prob = _joint(Pz_u, Pd_z, Pw_z).sum(axis=1)
        ll = float(np.dot(counts, np.log(prob + eps)))
        print(f"Iter {it+1}/{max_iter}, train log-likelihood = {ll:.2f}")

    return Pz_u, Pd_z, Pw_z

# Number of latent topics used for the triadic model fitted here.
K = 5
Pz_u, Pd_z, Pw_z = train_triadic_plsi(
    train_triplets, U, D, W, K=K, max_iter=15
)

# =====================================
# 6 PERPLEXITY & EVALUATION
# =====================================
def compute_perplexity(triplets, Pz_u, Pd_z, Pw_z, eps=1e-12):
    """Return the log-likelihood and per-token perplexity of *triplets*.

    Each triplet is scored as ``sum_z Pz_u[u, z] * Pd_z[z, d] * Pw_z[z, w]``.
    The log-likelihood weights every triplet by its count, so the perplexity
    is normalised by the total count (number of tokens), not by the number
    of distinct triplets.

    Args:
        triplets: iterable of ``(u, d, w, c)`` tuples (indices and count).
        Pz_u: array indexed ``[u, z]``.
        Pd_z: array indexed ``[z, d]``.
        Pw_z: array indexed ``[z, w]``.
        eps: constant added inside ``log`` to avoid ``log(0)``.

    Returns:
        ``(ll, perp)`` where ``ll`` is the count-weighted log-likelihood and
        ``perp = exp(-ll / total_count)``. ``perp`` is ``nan`` when the
        total count is zero (for example for an empty input).
    """
    ll = 0.0
    total_count = 0
    for u, d, w, c in triplets:
        prob = np.sum(Pz_u[u, :] * Pd_z[:, d] * Pw_z[:, w])
        ll += c * np.log(prob + eps)
        total_count += c

    if total_count <= 0:
        # Nothing to average over: perplexity is undefined.
        return ll, float("nan")

    perp = np.exp(-ll / total_count)
    return ll, perp

# Score the held-out triplets with the fitted triadic model and report.
ll_test, perp_test = compute_perplexity(
    test_triplets, Pz_u, Pd_z, Pw_z
)
print(
    "Triadic PLSI — Test log-likelihood = %.2f, Perplexity = %.2f"
    % (ll_test, perp_test)
)

# =====================================
# 7 TOP WORDS PER TOPIC
# =====================================
# Commented out the original topic printing
# top_n = 10
# for z in range(K):
#     top_w_idx = np.argsort(Pw_z[z, :])[-top_n:]
#     top_words = [vocab[w] for w in top_w_idx]
#     print(f"Topic {z}: {top_words}")

# =====================================
# 8 BASELINE VANILLA PLSI (DOC–WORD)
# =====================================
# Drop the user dimension: add up the training counts of every
# (document, word) pair across all users.
dw_counts = defaultdict(int)
for _user, doc, word, count in train_triplets:
    dw_counts[doc, word] += count
dw_triplets = [
    (doc, word, total) for (doc, word), total in dw_counts.items()
]

def train_plsi_doc_word(dw_triplets, D, W, K=5, max_iter=20, eps=1e-12):
    """Fit a document-word PLSI model with EM.

    Each observation is scored as ``sum_z P(z|d) * P(w|z)``. Parameters are
    initialised from ``np.random`` (seed it beforehand for reproducible
    results) and the training log-likelihood is printed after every
    iteration.

    Args:
        dw_triplets: iterable of ``(d, w, c)`` tuples, where ``d`` and ``w``
            are integer indices and ``c`` is the count.
        D: number of documents.
        W: vocabulary size.
        K: number of latent topics.
        max_iter: number of EM iterations.
        eps: guard added to denominators and inside ``log``; posteriors
            whose normaliser falls below it are replaced by a uniform one.

    Returns:
        ``(Pz_d, Pw_z)`` with shapes ``(D, K)`` and ``(K, W)``. Every row is
        a probability distribution; a row that received no mass during the
        M-step (for example a document without any training pair) is set to
        the uniform distribution rather than left all-zero.
    """

    def _normalize_rows(mass):
        # Row-normalise expected counts; rows without any mass become uniform.
        totals = mass.sum(axis=1, keepdims=True)
        uniform = 1.0 / max(mass.shape[1], 1)
        return np.where(totals <= 0, uniform, mass / (totals + eps))

    Pz_d = np.random.rand(D, K)
    Pz_d /= Pz_d.sum(axis=1, keepdims=True)
    Pw_z = np.random.rand(K, W)
    Pw_z /= Pw_z.sum(axis=1, keepdims=True)

    # Unpack the pairs once into index/count arrays so that each EM step is
    # a handful of NumPy operations instead of a Python loop per pair.
    observations = list(dw_triplets)
    d_idx = np.asarray([obs[0] for obs in observations], dtype=np.intp)
    w_idx = np.asarray([obs[1] for obs in observations], dtype=np.intp)
    counts = np.asarray([obs[2] for obs in observations], dtype=float)

    for it in range(max_iter):
        # E-step: posterior over topics for every (d, w) pair, shape (N, K).
        joint = Pz_d[d_idx] * Pw_z[:, w_idx].T
        norm = joint.sum(axis=1, keepdims=True)
        degenerate = norm < eps
        q = np.where(
            degenerate, 1.0 / K, joint / np.where(degenerate, 1.0, norm)
        )
        weighted = counts[:, None] * q

        # M-step: accumulate expected counts. np.add.at is unbuffered, so
        # indices that repeat across pairs are all added.
        doc_mass = np.zeros((D, K))
        word_mass = np.zeros((W, K))
        np.add.at(doc_mass, d_idx, weighted)
        np.add.at(word_mass, w_idx, weighted)

        Pz_d = _normalize_rows(doc_mass)
        Pw_z = _normalize_rows(word_mass.T)

        prob = (Pz_d[d_idx] * Pw_z[:, w_idx].T).sum(axis=1)
        ll = float(np.dot(counts, np.log(prob + eps)))
        print(f"Vanilla PLSI iter {it+1}/{max_iter}, ll = {ll:.2f}")

    return Pz_d, Pw_z

Pz_d_bw, Pw_z_bw = train_plsi_doc_word(dw_triplets, D, W, K=K, max_iter=15)

# Evaluate on test: collapse the held-out triplets to (document, word) counts.
dw_test_counts = defaultdict(int)
for (u, d, w, c) in test_triplets:
    dw_test_counts[(d, w)] += c
dw_test_triplets = [(d, w, c) for (d, w), c in dw_test_counts.items()]

ll_bw = 0.0
n_test_tokens = 0
for (d, w, c) in dw_test_triplets:
    prob = np.sum(Pz_d_bw[d, :] * Pw_z_bw[:, w])
    ll_bw += c * np.log(prob + 1e-12)
    n_test_tokens += c

# Perplexity is per token: the log-likelihood is count-weighted, so it is
# divided by the total held-out count, not by the number of distinct
# (document, word) pairs. It is undefined (nan) when nothing was held out.
perp_bw = np.exp(-ll_bw / n_test_tokens) if n_test_tokens > 0 else float("nan")
print("Baseline (Vanilla PLSI) — Test Perplexity = %.2f" % perp_bw)

# Print, for each of the K triadic-PLSI topics, the entry that
# get_unique_top_words returns for it (requested with top_n=10).
# NOTE(review): get_unique_top_words is not defined in the visible part of
# this file — confirm it is defined before this code runs, otherwise the call
# below raises NameError.
# NOTE(review): the result is indexed by topic number 0..K-1 below; whether it
# is a list or a dict cannot be told from here.
unique_top_words = get_unique_top_words(Pw_z, vocab, top_n=10)
print("\nUnique Top Words per Topic (Triadic PLSI):")
for z in range(K):
    print(f"Topic {z}: {unique_top_words[z]}")