In [1]:
import numpy as np
import pandas as pd
import polars as pl
from pathlib import Path
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors

DATA = Path("./data")

GEN_DATA = DATA / "gen"
RAW_DATA = DATA / "raw"
game_data = pl.read_csv(RAW_DATA / "games_detailed_info2025.csv")

game_data = game_data.with_columns(pl.col("description").fill_null(""))
# game_data = game_data.sample(fraction=1)

game_data.head()

nlp = spacy.load("en_core_web_sm", disable=["ner", "parser"])  # faster


data_path = Path("data")
review_data = pl.read_csv(data_path / "bgg-26m-reviews.csv")

review_data = review_data.filter(pl.col("rating") >= 8)
user_data = review_data.group_by("user").agg(
    pl.len().alias("len"),
    pl.col("ID"),
)
# keep only users with at least 5 reviews
print(f"Number of users: {user_data.height}")
user_data = user_data.filter(pl.col("len") > 1)
print(f"Number of users with at least 5 high ratings: {user_data.height}")


def spacy_preprocessing(text):
    doc = nlp(text)
    return " ".join([token.lemma_ for token in doc if token.is_alpha and not token.is_stop])


# vectorizer = TfidfVectorizer(analyzer="word", lowercase=True, strip_accents="unicode", max_features=1024, min_df=1, max_df=0.8, token_pattern=r"[A-Za-z]+", stop_words="english")
vectorizer = TfidfVectorizer(analyzer="word", lowercase=True, strip_accents="unicode", max_features=1024, min_df=1, max_df=0.8, token_pattern=r"[A-Za-z]+", preprocessor=spacy_preprocessing, stop_words=None)

X = vectorizer.fit_transform(game_data["description"])

reduced_X = X.toarray()

print()

index_to_id = {index: id for index, id in enumerate(game_data["id"])}
id_to_index = {v: k for k, v in index_to_id.items()}

# Nearest Neighbors model
nbrs = NearestNeighbors(n_neighbors=11, metric="cosine").fit(reduced_X)

Number of users: 536829
Number of users with at least 5 high ratings: 418829



In [2]:
def compute_basket_embeddings_train_only(baskets, item_vectors, id_map):
    user_vectors = []
    valid_user_indices = []

    valid_ids_set = set(id_map.keys())
    print(f"Processing {len(baskets)} user baskets...")

    for i, basket in enumerate(baskets):
        length = len(basket)
        if length == 0:
            continue

        # Split: 30% test, 70% train
        split_point = int(np.ceil(0.3 * length))
        train_items = basket[split_point:]  # remaining 70% for centroid

        # Only keep train items that exist in id_map
        train_valid_indices = [id_map[item] for item in train_items if item in valid_ids_set]

        if train_valid_indices:
            # Compute centroid from train items only
            train_vecs = item_vectors[train_valid_indices]
            user_vec = np.mean(train_vecs, axis=0)

            user_vectors.append(user_vec)
            valid_user_indices.append(i)

    return np.array(user_vectors), valid_user_indices


user_baskets = user_data["ID"].to_list()

Y, valid_user_idxs = compute_basket_embeddings_train_only(user_baskets, reduced_X, id_to_index)

print(f"Generated embeddings for {len(Y)} users (dropped {len(user_baskets) - len(Y)} users with no valid items).")

distances, indices = nbrs.kneighbors(Y, n_neighbors=11)


Processing 418829 user baskets...
Generated embeddings for 418809 users (dropped 20 users with no valid items).


In [6]:
def precision_at_k(recommended_items, true_items, k):
    """ 
    Calculates precision at k for the recommended items.
    Parameters:
        recommended_items (list): List of recommended items.
        true_items (list): List of true items.
        k (int): The cutoff rank (number of recommended items to consider).
    """
    recommended_at_k = recommended_items if len(recommended_items) < k else recommended_items[:k]
    true_positives = len(set(recommended_at_k) & set(true_items))
    precision = true_positives / (len(recommended_at_k) if len(recommended_at_k) > 0 else 1)
    return precision

def recall_at_k(recommended_items, true_items, k):
    """ 
    Calculates recall at k for the recommended items.
    Parameters:
        recommended_items (list): List of recommended items.
        true_items (list): List of true items.
        k (int): The cutoff rank (number of recommended items to consider).
    """
    recommended_at_k = recommended_items if len(recommended_items) < k else recommended_items[:k]
    true_positives = len(set(recommended_at_k) & set(true_items))
    recall = true_positives / (len(true_items) if len(true_items) > 0 else 1)
    return recall

def fscore_at_k(recommended_items, true_items, k):
    """
    Calculates F1-score at k for the recommended items.
    """
    p = precision_at_k(recommended_items, true_items, k)
    r = recall_at_k(recommended_items, true_items, k)

    if p + r == 0:
        return 0.0

    return 2 * p * r / (p + r)

def evaluate_model(recommended_items_all, basket_test, k):
    precisions, recalls, fscores = [], [], []

    for row in basket_test.iter_rows():
        user_id = row[0]
        length = row[1]

        true_items = row[2][:int(np.ceil(0.3 * length))]
        recommended_items = recommended_items_all.get(user_id, [])

        p = precision_at_k(recommended_items, true_items, k)
        r = recall_at_k(recommended_items, true_items, k)
        f = fscore_at_k(recommended_items, true_items, k)

        precisions.append(p)
        recalls.append(r)
        fscores.append(f)

    return np.mean(precisions), np.mean(recalls), np.mean(fscores)


In [4]:
import numpy as np


def make_prediction_using_embeddings(baskets_df, valid_user_idxs, nn_indices, nn_distances, index_to_id, k=10):
    """
    Args:
        baskets_df: The polars dataframe containing User and Games list.
        valid_user_idxs: The list returned by compute_basket_embeddings_robust.
                         Maps the index in Y/nn_indices back to the row index in baskets_df.
        nn_indices: The indices matrix from nbrs.kneighbors.
        nn_distances: The distances matrix from nbrs.kneighbors.
        index_to_id: Dict mapping matrix index -> Game ID.
        k: Number of recommendations to return.
    """
    recommended_items_all = {}
    skip_counter = 0
    too_short_counter = 0

    row_to_array_idx = {row_idx: i for i, row_idx in enumerate(valid_user_idxs)}

    for row_idx, row in enumerate(baskets_df.iter_rows()):
        user_id = row[0]
        all_games = row[2]

        if row_idx not in row_to_array_idx:
            skip_counter += 1
            continue

        length = len(all_games)
        future = int(np.ceil(0.3 * length))
        history = set(all_games[future:])

        array_pos = row_to_array_idx[row_idx]

        neighbor_matrix_indices = nn_indices[array_pos]


        # recommendations are already sorted by distance, select top k not in history NOTE: it can happen that run out of candidates and end up with less than k recommendations
        top_k = []
        for mat_idx in neighbor_matrix_indices:

            if mat_idx not in index_to_id:
                continue

            game_id = index_to_id[mat_idx]

            if game_id not in history:
                top_k.append(game_id)

            if len(top_k) >= k:
                break
        if len(top_k) != k:
            too_short_counter += 1
        recommended_items_all[user_id] = top_k
    if skip_counter > 0:
        print(f"Skipped {skip_counter} users (no valid embedding found).")
    if too_short_counter > 0:
        print(f"Could not generate {k} recommendations for {too_short_counter} users (not enough candidates).")
    return recommended_items_all

In [7]:

print(f"Generating predictions for {len(valid_user_idxs)} users...")

ks = np.linspace(1, 10, 5, dtype=int)
# ks = [3, 5, 7, 10]
CV_results = {"k": [], "precision": [], "recall": [], "fscore": []}

for k in ks:
    recommended_items_all = make_prediction_using_embeddings(baskets_df=user_data, valid_user_idxs=valid_user_idxs, nn_indices=indices, nn_distances=distances, index_to_id=index_to_id, k=k)

    precision, recall, fscores = evaluate_model(recommended_items_all, user_data, k)
    CV_results["k"].append(k)
    CV_results["precision"].append(precision)
    CV_results["recall"].append(recall)
    CV_results["fscore"].append(fscores)

CV_results_df = pd.DataFrame(CV_results)
print(CV_results_df)

Generating predictions for 418809 users...
Skipped 20 users (no valid embedding found).
Could not generate 1 recommendations for 48 users (not enough candidates).
Skipped 20 users (no valid embedding found).
Could not generate 3 recommendations for 3561 users (not enough candidates).
Skipped 20 users (no valid embedding found).
Could not generate 5 recommendations for 39408 users (not enough candidates).
Skipped 20 users (no valid embedding found).
Could not generate 7 recommendations for 110692 users (not enough candidates).
Skipped 20 users (no valid embedding found).
Could not generate 10 recommendations for 316709 users (not enough candidates).
    k  precision    recall    fscore
0   1   0.013726  0.004534  0.005740
1   3   0.010729  0.008357  0.007248
2   5   0.009271  0.010565  0.007448
3   7   0.008538  0.011954  0.007435
4  10   0.008183  0.013080  0.007502
