In [61]:
import numpy as np
import pandas as pd
import polars as pl
from pathlib import Path
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors

DATA = Path("./data")

GEN_DATA = DATA / "gen"
RAW_DATA = DATA / "raw"
df = pl.read_csv(RAW_DATA / "games_detailed_info2025.csv")
# drop rows with null descriptions
df = df.with_columns(pl.col("description").fill_null(""))
df = df.sample(fraction=1)
df.head()

nlp = spacy.load("en_core_web_sm", disable=["ner", "parser"])  # faster


data_path = Path("data")
data = pl.read_csv(data_path / "bgg-26m-reviews.csv")

data = data.filter(pl.col("rating") >= 8)
data = data.group_by("user").agg(
    # 1. Aggregate the "ID" column into a list
    pl.len(),
    pl.col("ID"),
)


def spacy_preprocessing(text):
    doc = nlp(text)
    return " ".join([token.lemma_ for token in doc if token.is_alpha and not token.is_stop])


# vectorizer = TfidfVectorizer(analyzer="word", lowercase=True, strip_accents="unicode", max_features=1024, min_df=1, max_df=0.8, token_pattern=r"[A-Za-z]+", stop_words="english")
vectorizer = TfidfVectorizer(analyzer="word", lowercase=True, strip_accents="unicode", max_features=1024, min_df=1, max_df=0.8, token_pattern=r"[A-Za-z]+", preprocessor=spacy_preprocessing, stop_words=None)

X = vectorizer.fit_transform(df["description"])

reduced_X = X.toarray()

index_to_id = {index: id for index, id in enumerate(df["id"])}
id_to_index = {v: k for k, v in index_to_id.items()}

# Nearest Neighbors model
nbrs = NearestNeighbors(n_neighbors=11, metric="cosine").fit(reduced_X)
distances, indices = nbrs.kneighbors(reduced_X)


In [66]:
def precision_at_k(recommended_items, true_items, k):
    recommended_at_k = recommended_items[-k:]
    true_set = set(true_items[-k:])

    true_positives = len(set(recommended_at_k) & true_set)
    return true_positives / k


def recall_at_k(recommended_items, true_items, k):
    # same as precision_at_k
    return precision_at_k(recommended_items, true_items, k)


def evaluate_model(recommended_items_all, basket_test, k):
    precisions, recalls = [], []
    for row in basket_test.iter_rows():
        user_id = row[0]
        true_items = row[2]
        recommended_items = recommended_items_all.get(user_id, [])
        precisions.append(precision_at_k(recommended_items, true_items, k))
        recalls.append(recall_at_k(recommended_items, true_items, k))
    return np.mean(precisions), np.mean(recalls)

In [67]:
import heapq

def make_prediction(baskets, k):
    recommended_items_all = {}
    skip_counter = 0
    for row in baskets.iter_rows():
        user_id = row[0]
        length = row[1]
        if length < k:
            skip_counter += 1
            continue
        
        history = row[2][: length - k]

        k_heap = []

        for item in history:
            if item not in id_to_index:
                continue
            idx = id_to_index[item]
            neighbors = indices[idx][1 : k + 1]
            scores = [1 + d for d in distances[idx][1 : k + 1]]

            for neighbor, score in zip(neighbors, scores):
                if len(k_heap) < k:
                    heapq.heappush(k_heap, (score, neighbor))
                else:
                    if score > k_heap[0][0]:
                        heapq.heapreplace(k_heap, (score, neighbor))

        top_k = [item for score, item in sorted(k_heap, reverse=True)]
        
        recommended_items_all[user_id] = top_k
    print(f"Skipped {skip_counter} users with basket size < {k}")
    return recommended_items_all

In [68]:
print(f"Processing { data.shape[0]} users' baskets")
ks = np.linspace(1, 10, 5, dtype=int)

CV_results = {"k": [], "precision": [], "recall": []}

for k in ks:
    recommended_items_all = make_prediction(data, k)

    precision, recall = evaluate_model(recommended_items_all, data, k)
    CV_results["k"].append(k)
    CV_results["precision"].append(precision)
    CV_results["recall"].append(recall)

CV_results_df = pd.DataFrame(CV_results)
print(CV_results_df)

Processing 536829 users' baskets
Skipped 0 users with basket size < 1
Skipped 165027 users with basket size < 3
Skipped 224477 users with basket size < 5
Skipped 266006 users with basket size < 7
Skipped 311645 users with basket size < 10
    k  precision    recall
0   1   0.000007  0.000007
1   3   0.000014  0.000014
2   5   0.000037  0.000037
3   7   0.000038  0.000038
4  10   0.000032  0.000032
