## Cluster-based recommendation model
Steps followed:
1. Load the reviews dataset, keep only ratings >= 8, and build user baskets (the list of positively rated games per user), keeping only users with at least 5 high ratings
2. Load game metadata and spectral clustering assignments, merge them by game name, and build a mapping from game id to cluster label
3. Compute the global popularity of each game (number of positive reviews), and for each cluster create a list of its games sorted by global popularity
4. For each user, split their basket into future (first 30%) and history (remaining 70%), rank the clusters by how many of the user's history games they contain, and recommend up to k popular games from those clusters that are not in the user's history
5. Evaluate the model using precision, recall and F1 for k=[1,3,5,7,10]

In [1]:
import numpy as np
import pandas as pd
import polars as pl
from pathlib import Path

# Data locations (relative to the notebook's working directory)
DATA = Path("./data")
GEN_DATA = DATA / "gen"
RAW_DATA = DATA / "raw"

# Tunable thresholds, kept in one place so that the filters, the comments and
# the printed messages cannot drift apart.
MIN_POSITIVE_RATING = 8  # a review counts as "positive" from this rating upwards
MIN_BASKET_SIZE = 5      # users need at least this many positive reviews

# Load the reviews and keep only the positive ones
reviews = pl.read_csv(DATA / "bgg-26m-reviews.csv")
reviews = reviews.filter(pl.col("rating") >= MIN_POSITIVE_RATING)

# One row per user: number of positive reviews and the list of reviewed game IDs
user_baskets = reviews.group_by("user").agg(
    pl.len().alias("basket_size"),
    pl.col("ID"),
)
print(f"Number of users: {user_baskets.height}")
# Keep only users with enough positive reviews
user_baskets = user_baskets.filter(pl.col("basket_size") >= MIN_BASKET_SIZE)
print(f"Number of users with at least {MIN_BASKET_SIZE} high ratings: {user_baskets.height}")

# Load game metadata (id+name)
games = pl.read_csv(RAW_DATA / "games_detailed_info2025.csv")
games_meta = games.select(["id", "name"]).to_pandas()

# Load spectral clustering assignments
cluster_assignments = pd.read_csv(GEN_DATA / "spectral_cluster_labels_matrix1.csv")
# "game" column has format "0_Muffin Time" -> extract name part
# (n=1 splits on the first underscore only, so underscores inside names survive)
cluster_assignments["name"] = cluster_assignments["game"].str.split("_", n=1).str[1]

# Match clusters to game IDs via name.
# NOTE: this is a join on a non-key column; a name that occurs for several ids
# in games_meta produces one output row per id, so the merged frame can have
# more rows than cluster_assignments (compare the two counts printed below).
cluster_with_ids = cluster_assignments.merge(games_meta, on="name", how="inner")
print(f"Rows in cluster_assignments: {len(cluster_assignments)}")
print(f"Rows after merge with games_meta: {len(cluster_with_ids)}")

# Restrict to games that have at least one positive review, then ensure
# uniqueness (one cluster per game id; the first occurrence of an id is kept)
ids_in_reviews = set(reviews["ID"].unique())
cluster_with_ids = cluster_with_ids[cluster_with_ids["id"].isin(ids_in_reviews)]
cluster_with_ids = cluster_with_ids.drop_duplicates(subset="id")

# Final mapping: game_id -> cluster label
game_to_cluster = dict(zip(cluster_with_ids["id"], cluster_with_ids["cluster"]))
# Derived from the largest label, i.e. assumes labels are numbered from 0
n_clusters = cluster_with_ids["cluster"].max() + 1
print(f"Loaded cluster assignments for {len(game_to_cluster)} games and {n_clusters} clusters")

Number of users: 536829
Number of users with at least 5 high ratings: 312352
Rows in cluster_assignments: 11112
Rows after merge with games_meta: 11586
Loaded cluster assignments for 11292 games and 400 clusters


In [2]:
from collections import defaultdict

# Global popularity: how many positive reviews each game received
game_popularity_pl = reviews.group_by("ID").len().rename({"len": "popularity"})
game_popularity = game_popularity_pl.to_pandas()

# Lookup table: game_id -> popularity (count of positive reviews)
game_popularity_dict = dict(
    zip(game_popularity["ID"], game_popularity["popularity"])
)

# cluster -> list of its games; filled first, ordered afterwards
cluster_to_games = defaultdict(list)
for game_id, cluster_label in game_to_cluster.items():
    cluster_to_games[cluster_label].append(game_id)

# Order every cluster's list in place, most popular game first.
# Games missing from the popularity table are treated as having popularity 0.
for game_list in cluster_to_games.values():
    game_list.sort(
        key=lambda gid: game_popularity_dict.get(gid, 0),
        reverse=True,
    )

print("Prepared cluster to games mapping, games ordered by popularity")

Prepared cluster to games mapping, games ordered by popularity


In [3]:
def recommend_from_clusters_for_user(all_games, game_to_cluster, cluster_to_items, k):
    """
    Recommend up to ``k`` games for one user using the clusters.

    The basket is split by position: the first ``ceil(0.3 * len(all_games))``
    entries are held out as "future", the remaining entries are the history.
    Clusters are ranked by how many history games they contain, and each
    cluster's item list is scanned in its stored order, skipping games that
    are already in the history.

    all_games: list of game IDs (full basket for that user)
    game_to_cluster: dict game_id -> cluster
    cluster_to_items: dict cluster -> list of game_ids sorted by popularity
    k: number of recommendations

    Returns a list of at most ``k`` distinct game IDs. The list is empty when
    the basket is empty, when ``k`` is not positive, or when none of the
    history games has a cluster. It may be shorter than ``k`` if the user's
    clusters do not hold enough unseen games.
    """
    length = len(all_games)
    # k <= 0 must yield nothing; without this guard the "== k" early exit below
    # is never reached and every candidate would be returned.
    if length == 0 or k <= 0:
        return []

    # Same split as in the other models: first 30% = future, last 70% = history
    future = int(np.ceil(0.3 * length))
    history = all_games[future:]

    # Clusters where the user already has positive games
    history_clusters = [game_to_cluster[g] for g in history if g in game_to_cluster]

    if not history_clusters:
        # None of the user's history games are in our clustered subset
        return []

    # Count how many history games fall into each cluster
    unique_c, counts_c = np.unique(history_clusters, return_counts=True)
    # Most frequent cluster first. A stable sort makes ties deterministic:
    # equally frequent clusters keep np.unique's ascending label order.
    ordered_clusters = unique_c[np.argsort(-counts_c, kind="stable")]

    recommendations = []
    # Everything that must not be recommended (again): history games plus the
    # games already picked. A set keeps the membership test O(1).
    seen = set(history)

    # Go through the clusters in order of importance and pick unseen games
    for c in ordered_clusters:
        for gid in cluster_to_items.get(int(c), []):
            if gid in seen:
                continue
            seen.add(gid)
            recommendations.append(gid)
            if len(recommendations) == k:
                return recommendations

    # Might return fewer than k if there are not enough candidates
    return recommendations


def make_prediction_cluster_based(baskets_df, game_to_cluster, cluster_to_items, k=10):
    """
    Produce cluster-based recommendations for every user in ``baskets_df``.

    baskets_df: polars DataFrame like user_baskets (columns: "user", "basket_size", "ID");
                rows are read positionally (column 0 = user, column 2 = game list)
    game_to_cluster: dict game_id -> cluster
    cluster_to_items: dict cluster -> list of game_ids sorted by popularity
    k: how many items to recommend per user

    Returns a dict user_id -> list of recommended game IDs. Users with an
    empty basket get no entry. A summary line is printed for skipped users and
    for users that received fewer than ``k`` recommendations.
    """
    predictions = {}
    n_empty_baskets = 0
    n_short_lists = 0

    for row in baskets_df.iter_rows():
        user_id, basket = row[0], row[2]

        # Nothing to learn from a user without games
        if len(basket) == 0:
            n_empty_baskets += 1
            continue

        user_recs = recommend_from_clusters_for_user(
            basket, game_to_cluster, cluster_to_items, k
        )
        # Track users whose clusters could not supply k candidates
        if len(user_recs) < k:
            n_short_lists += 1

        predictions[user_id] = user_recs

    if n_empty_baskets > 0:
        print(f"Skipped {n_empty_baskets} users (empty baskets).")
    if n_short_lists > 0:
        print(f"Could not generate {k} recommendations for {n_short_lists} users (not enough candidates).")

    return predictions



In [4]:
def precision_at_k(recommended_items, true_items, k):
    """
    Precision of the top-k recommendations.

    Parameters:
        recommended_items (list): Ranked list of recommended items.
        true_items (list): Items that count as correct.
        k (int): The cutoff rank (number of recommended items to consider).

    Returns:
        float: distinct hits among the first k recommendations divided by the
        number of recommendations actually considered (which is smaller than k
        when fewer were supplied); 0.0 when there are no recommendations.
    """
    # Slicing already copes with lists shorter than k
    top_k = recommended_items[:k]
    n_hits = len(set(top_k).intersection(true_items))
    # Guard against division by zero for an empty recommendation list
    denominator = max(len(top_k), 1)
    return n_hits / denominator

def recall_at_k(recommended_items, true_items, k):
    """
    Recall of the top-k recommendations.

    Parameters:
        recommended_items (list): Ranked list of recommended items.
        true_items (list): Items that count as correct.
        k (int): The cutoff rank (number of recommended items to consider).

    Returns:
        float: distinct hits among the first k recommendations divided by
        ``len(true_items)``; 0.0 when ``true_items`` is empty.
    """
    # Slicing already copes with lists shorter than k
    top_k = recommended_items[:k]
    n_hits = len(set(top_k).intersection(true_items))
    # Guard against division by zero when there is nothing to retrieve
    denominator = max(len(true_items), 1)
    return n_hits / denominator

def fscore_at_k(recommended_items, true_items, k):
    """
    F1-score at k: harmonic mean of precision_at_k and recall_at_k.

    Takes the same arguments as those two functions and returns 0.0 when
    precision and recall are both zero.
    """
    precision = precision_at_k(recommended_items, true_items, k)
    recall = recall_at_k(recommended_items, true_items, k)
    total = precision + recall

    # Harmonic mean is undefined for 0/0; report 0.0 in that case
    return 2 * precision * recall / total if total != 0 else 0.0

def evaluate_model(recommended_items_all, basket_test, k):
    """
    Average precision, recall and F1 at k over all users in ``basket_test``.

    recommended_items_all: dict user_id -> list of recommended game IDs
    basket_test: frame whose rows are read positionally
                 (column 0 = user, column 1 = basket size, column 2 = game list)
    k: cutoff rank passed on to the metric functions

    For each user the ground truth is the first ``ceil(0.3 * basket_size)``
    games of the basket; users without an entry in ``recommended_items_all``
    are scored against an empty recommendation list.

    Returns the tuple (mean precision, mean recall, mean F1).
    """
    scores = {"precision": [], "recall": [], "fscore": []}

    for row in basket_test.iter_rows():
        user_id, basket_size, basket = row[0], row[1], row[2]

        # Held-out part of the basket: the leading 30% (rounded up)
        n_future = int(np.ceil(0.3 * basket_size))
        held_out = basket[:n_future]
        user_recs = recommended_items_all.get(user_id, [])

        scores["precision"].append(precision_at_k(user_recs, held_out, k))
        scores["recall"].append(recall_at_k(user_recs, held_out, k))
        scores["fscore"].append(fscore_at_k(user_recs, held_out, k))

    return (
        np.mean(scores["precision"]),
        np.mean(scores["recall"]),
        np.mean(scores["fscore"]),
    )


In [5]:
# Cut-offs to evaluate. linspace(1, 10, 5) gives [1, 3.25, 5.5, 7.75, 10];
# dtype=int casts that to [1, 3, 5, 7, 10].
ks = np.linspace(1, 10, 5, dtype=int)
CV_results_clusters = {
    column: [] for column in ("k", "precision", "recall", "fscore")
}

print(f"Generating cluster-based predictions for {user_baskets.height} users")

for k in ks:
    # Recommendations are regenerated for every cut-off
    recommended_items_all = make_prediction_cluster_based(
        baskets_df=user_baskets,
        game_to_cluster=game_to_cluster,
        cluster_to_items=cluster_to_games,
        k=k,
    )
    precision, recall, fscore = evaluate_model(recommended_items_all, user_baskets, k)

    # One value per result column for this cut-off
    for column, value in zip(
        ("k", "precision", "recall", "fscore"),
        (k, precision, recall, fscore),
    ):
        CV_results_clusters[column].append(value)

CV_results_clusters_df = pd.DataFrame(CV_results_clusters)
print(CV_results_clusters_df)


Generating cluster-based predictions for 312352 users
Could not generate 1 recommendations for 10353 users (not enough candidates).
Could not generate 3 recommendations for 10353 users (not enough candidates).
Could not generate 5 recommendations for 10375 users (not enough candidates).
Could not generate 7 recommendations for 10424 users (not enough candidates).
Could not generate 10 recommendations for 10640 users (not enough candidates).
    k  precision    recall    fscore
0   1   0.019372  0.002197  0.003618
1   3   0.007788  0.002642  0.003319
2   5   0.004768  0.002706  0.002803
3   7   0.003446  0.002733  0.002437
4  10   0.002448  0.002774  0.002069
