In [23]:
from pathlib import Path
import pandas as pd
import polars as pl
from sklearn.model_selection import KFold
import numpy as np
import warnings

In [41]:
data_path = Path('data')

data = pl.read_csv(data_path / 'bgg-26m-reviews.csv')
data = data.filter(pl.col('rating') >= 8)
data = (
    data
    .group_by(['user'])
    .agg(
        pl.col('name').alias('games')
    )
)

data = data.to_pandas()

In [51]:
def precision_at_k(recommended_items, true_items, k):
    """ 
    Calculates precision at k for the recommended items.
    Parameters:
        recommended_items (list): List of recommended items.
        true_items (list): List of true items.
        k (int): The cutoff rank (number of recommended items to consider).
    """
    recommended_at_k = recommended_items if len(recommended_items) < k else recommended_items[:k]
    true_positives = len(set(recommended_at_k) & set(true_items))
    precision = true_positives / (len(recommended_at_k) if len(recommended_at_k) > 0 else 1)
    return precision

def recall_at_k(recommended_items, true_items, k):
    """ 
    Calculates recall at k for the recommended items.
    Parameters:
        recommended_items (list): List of recommended items.
        true_items (list): List of true items.
        k (int): The cutoff rank (number of recommended items to consider).
    """
    recommended_at_k = recommended_items if len(recommended_items) < k else recommended_items[:k]
    true_positives = len(set(recommended_at_k) & set(true_items))
    recall = true_positives / (len(true_items) if len(true_items) > 0 else 1)
    return recall

In [56]:
def evaluate_model(recommended_items, basket_test, k):
    """ 
    Evaluates the model's predictions using precision and recall at k.
    Parameters: 
        predictions_df (pd.DataFrame): DataFrame from get_predictions_from_rules() function.
        k (int): The cutoff rank (number of recommended items to consider).
    """
    precisions = []
    recalls = []
    for basket in basket_test['games']:
        precisions.append(precision_at_k(recommended_items, basket, k=k))
        recalls.append(recall_at_k(recommended_items, basket, k=k))

    return np.mean(precisions), np.mean(recalls)

In [None]:
K=5
folds = KFold(n_splits=K, shuffle=True, random_state=42)
ks = np.linspace(1, 10, 5, dtype=int)

CV_results = {
    'k': [],
    'precision': [],
    'recall': []
}

fold_id = 1

with warnings.catch_warnings():
    warnings.simplefilter('ignore')
    
    for train_index, test_index in folds.split(data):
        baskets_train = data.iloc[train_index]
        baskets_test = data.iloc[test_index]
        item_counts = baskets_train.explode('games')['games'].value_counts().sort_values(ascending=False)
        top_items = item_counts.index.tolist()
        
        for k in ks:
            recommended_items = top_items[:k]
            precision, recall = evaluate_model(recommended_items, baskets_test, k)
            print(f"K: {k} => Precision: {precision}, Recall: {recall}")
            CV_results['k'].append(k)
            CV_results['precision'].append(precision)
            CV_results['recall'].append(recall)
        fold_id += 1

CV_results_df = pd.DataFrame(CV_results)
CV_results_df

K: 1 => Precision: 0.14860384106700444, Recall: 0.012580725707107894
K: 3 => Precision: 0.13571335432073467, Recall: 0.031829449752587675
K: 5 => Precision: 0.1284820147905296, Recall: 0.04944815800969068
K: 7 => Precision: 0.12272839765714603, Recall: 0.06646919072508317
K: 10 => Precision: 0.11444591397649163, Recall: 0.09029740987327449
K: 1 => Precision: 0.14858521319598383, Recall: 0.012374741727580728
K: 3 => Precision: 0.13624735328999252, Recall: 0.03166338949208312
K: 5 => Precision: 0.12838887543542646, Recall: 0.049319925267588
K: 7 => Precision: 0.12249155758274101, Recall: 0.06626594388588661
K: 10 => Precision: 0.11466199728033084, Recall: 0.09046505706919288
K: 1 => Precision: 0.14649889164167426, Recall: 0.011936784662065648
K: 3 => Precision: 0.13400579947717772, Recall: 0.030676716973223103
K: 5 => Precision: 0.12665462064340668, Recall: 0.048245209771152237
K: 7 => Precision: 0.12102527802097497, Recall: 0.06478396652002764
K: 10 => Precision: 0.11345118566399047, Re

Unnamed: 0,k,precision,recall
0,1,0.148604,0.012581
1,3,0.135713,0.031829
2,5,0.128482,0.049448
3,7,0.122728,0.066469
4,10,0.114446,0.090297
5,1,0.148585,0.012375
6,3,0.136247,0.031663
7,5,0.128389,0.04932
8,7,0.122492,0.066266
9,10,0.114662,0.090465
