In [None]:
import polars as pl
import mlxtend.frequent_patterns as fp
import mlxtend.preprocessing as pp
import pickle
import pandas as pd
from efficient_apriori import apriori as apriori_efficient
import numpy as np
import pyarrow as pa
from sklearn.model_selection import KFold
from joblib import Parallel, delayed
from pathlib import Path
import warnings

DATA_DIR = Path("data")
DOWNLOAD_DIR = DATA_DIR / "download"
RAW_DIR = DATA_DIR / "raw"
GEN_DIR = DATA_DIR / "gen"

for d in [DOWNLOAD_DIR, RAW_DIR, GEN_DIR]:
    d.mkdir(parents=True, exist_ok=True)

# Loading data

In [None]:
generated_data_path = GEN_DIR

data = pl.read_csv(RAW_DIR / 'bgg-26m-reviews.csv')
print(data.head())

shape: (5, 6)
┌─────┬─────────────┬────────┬─────────────────────────────────┬─────┬───────┐
│     ┆ user        ┆ rating ┆ comment                         ┆ ID  ┆ name  │
│ --- ┆ ---         ┆ ---    ┆ ---                             ┆ --- ┆ ---   │
│ i64 ┆ str         ┆ f64    ┆ str                             ┆ i64 ┆ str   │
╞═════╪═════════════╪════════╪═════════════════════════════════╪═════╪═══════╡
│ 0   ┆ sidehacker  ┆ 10.0   ┆ null                            ┆ 13  ┆ CATAN │
│ 1   ┆ Varthlokkur ┆ 10.0   ┆ null                            ┆ 13  ┆ CATAN │
│ 2   ┆ dougthonus  ┆ 10.0   ┆ Currently, this sits on my lis… ┆ 13  ┆ CATAN │
│ 3   ┆ cypar7      ┆ 10.0   ┆ I know it says how many plays,… ┆ 13  ┆ CATAN │
│ 4   ┆ ssmooth     ┆ 10.0   ┆ null                            ┆ 13  ┆ CATAN │
└─────┴─────────────┴────────┴─────────────────────────────────┴─────┴───────┘


## Filtering

In [3]:
data = data.filter(pl.col('rating') >= 8)
print(data.head())

shape: (5, 6)
┌─────┬─────────────┬────────┬─────────────────────────────────┬─────┬───────┐
│     ┆ user        ┆ rating ┆ comment                         ┆ ID  ┆ name  │
│ --- ┆ ---         ┆ ---    ┆ ---                             ┆ --- ┆ ---   │
│ i64 ┆ str         ┆ f64    ┆ str                             ┆ i64 ┆ str   │
╞═════╪═════════════╪════════╪═════════════════════════════════╪═════╪═══════╡
│ 0   ┆ sidehacker  ┆ 10.0   ┆ null                            ┆ 13  ┆ CATAN │
│ 1   ┆ Varthlokkur ┆ 10.0   ┆ null                            ┆ 13  ┆ CATAN │
│ 2   ┆ dougthonus  ┆ 10.0   ┆ Currently, this sits on my lis… ┆ 13  ┆ CATAN │
│ 3   ┆ cypar7      ┆ 10.0   ┆ I know it says how many plays,… ┆ 13  ┆ CATAN │
│ 4   ┆ ssmooth     ┆ 10.0   ┆ null                            ┆ 13  ┆ CATAN │
└─────┴─────────────┴────────┴─────────────────────────────────┴─────┴───────┘


## Grouping into baskets

In [4]:
baskets = (
    data
    .group_by(['user'])
    .agg(
        pl.col('name').alias('games')
    )
)

baskets_df = baskets.to_pandas()

## Functions

In [5]:
def transform_test_and_train_set(baskets_test_before, baskets_train_before):
    """
    Transforms the test set by splitting the itemsets into test and training parts.
    Adds the training parts to the training set.
    Parameters:
        baskets_test_before (pd.DataFrame): DataFrame with columns 'user' and 'games' for the test set.
        baskets_train_before (pd.DataFrame): DataFrame with columns 'user' and 'games' for the training set.
    Returns:
        baskets_test (pd.DataFrame): Transformed test set with columns 'user', 'games', and 'test'.
        baskets_train (pd.DataFrame): Updated training set with columns 'user' and 'games'.
    """
    
    baskets_test = baskets_test_before[baskets_test_before['games'].apply(len) >= 4]

    baskets_test['test'] = baskets_test['games'].apply(lambda x: x[0:int(np.ceil(0.3*len(x)))])
    baskets_test['games'] = baskets_test['games'].apply(lambda x: x[int(np.ceil(0.3*len(x))):])

    baskets_test_training = baskets_test[['user', 'games']]
    baskets_train = pd.concat([baskets_train_before, baskets_test_training], ignore_index=True)

    return baskets_test, baskets_train

In [6]:
def baskets_df_to_tuples(baskets_df):
    """
    Converts a DataFrame with columns 'user' and 'games' to a list of tuples.
    Parameters:
        baskets_df (pd.DataFrame): DataFrame with columns 'user' and 'games'.
    Returns:
        baskets_tuples (list of tuples): List where each tuple contains the games for a user.
    """
    baskets_tuples = [tuple(row) for row in baskets_df['games']]
    return baskets_tuples

In [7]:
def get_rules_df(baskets_tuples, support, apriori_confidence, rules_confidence):
    """ 
    Generates association rules from the given baskets using the efficient apriori algorithm.
    Parameters:
        baskets_tuples (list of tuples): List where each tuple contains the games for a user.
        support (float): Minimum support threshold.
        apriori_confidence (float): Minimum confidence threshold for the rules from efficient apriori algorithm.
        rules_confidence (float): Minimum confidence threshold for the rules from mlxtend function.
    """
    itemsets, rules = apriori_efficient(baskets_tuples, min_support=support, min_confidence=apriori_confidence)

    len_transactions = len(baskets_tuples)
    itemsets_flattened = []
    itemsets_supports = []
    for i in itemsets.keys():
        itemsets_flattened.extend([item[0] for item in itemsets[i].items()])
        itemsets_supports.extend([item[1]/len_transactions for item in itemsets[i].items()])
    
    itemsets_dict = {
        'support': itemsets_supports,
        'itemsets': itemsets_flattened
    }
    itemsets_df = pd.DataFrame(itemsets_dict)

    rules = fp.association_rules(itemsets_df, metric="confidence", min_threshold=rules_confidence)
    
    return rules

In [8]:
def recommender_association(rules_df, product_list, N=1):
    """ 
    Recommends products based on association rules.
    Parameters:
        rules_df (pd.DataFrame): DataFrame containing association rules with columns 'antecedents' and 'consequents'.
        product_list (set): Set of products already reviewed by the user.
        N (int): Number of recommendations to return.
    """
    candidate_rules = rules_df[rules_df['antecedents'].apply(lambda x: x.issubset(product_list))]
    candidate_rules = candidate_rules.sort_values("confidence")
    recommendation_list = []
    for i in range(len(candidate_rules)):
        for item in candidate_rules.iloc[i]['consequents']:
            if item not in product_list and item not in recommendation_list:
                recommendation_list.append(item)
            if len(recommendation_list) >= N:
                break
        if len(recommendation_list) >= N:
            break

    return recommendation_list

In [9]:
def process_row(row, rules_df):
    user = row['user']
    train_items = row['games']
    test_items = row['test']
    recommended_items = recommender_association(rules_df, train_items, N=10)
    return user, recommended_items, test_items

def get_predictions_from_rules(rules_df, baskets_test):
    """ 
    Generates predictions for the test baskets using the provided association rules.
    Parameters:
        rules_df (pd.DataFrame): DataFrame containing association rules.
        baskets_test (pd.DataFrame): DataFrame with columns 'user', 'games', and 'test'.
    """
    predictions = {
        'user': [],
        'recommended_items': [],
        'true_items': []
    }

    results = Parallel(n_jobs=-1, backend='loky')(
        delayed(lambda row: process_row(row, rules_df))(row) for _, row in baskets_test.iterrows()
    )   

    for user, recommended_items, true_items in results:
        predictions['user'].append(user)
        predictions['recommended_items'].append(recommended_items)
        predictions['true_items'].append(true_items)
        
    predictions_df = pd.DataFrame(predictions)
    return predictions_df

In [10]:
def precision_at_k(recommended_items, true_items, k):
    """ 
    Calculates precision at k for the recommended items.
    Parameters:
        recommended_items (list): List of recommended items.
        true_items (list): List of true items.
        k (int): The cutoff rank (number of recommended items to consider).
    """
    recommended_at_k = recommended_items if len(recommended_items) < k else recommended_items[:k]
    true_positives = len(set(recommended_at_k) & set(true_items))
    precision = true_positives / (len(recommended_at_k) if len(recommended_at_k) > 0 else 1)
    return precision

def recall_at_k(recommended_items, true_items, k):
    """ 
    Calculates recall at k for the recommended items.
    Parameters:
        recommended_items (list): List of recommended items.
        true_items (list): List of true items.
        k (int): The cutoff rank (number of recommended items to consider).
    """
    recommended_at_k = recommended_items if len(recommended_items) < k else recommended_items[:k]
    true_positives = len(set(recommended_at_k) & set(true_items))
    recall = true_positives / (len(true_items) if len(true_items) > 0 else 1)
    return recall

def fscore_at_k(recommended_items, true_items, k):
    """
    Calculates F1-score at k for the recommended items.
    """
    p = precision_at_k(recommended_items, true_items, k)
    r = recall_at_k(recommended_items, true_items, k)

    if p + r == 0:
        return 0.0

    return 2 * p * r / (p + r)

In [11]:
def evaluate_model(predictions_df, k):
    """ 
    Evaluates the model's predictions using precision and recall at k.
    Parameters: 
        predictions_df (pd.DataFrame): DataFrame from get_predictions_from_rules() function.
        k (int): The cutoff rank (number of recommended items to consider).
    """
    precisions = []
    recalls = []
    fscores = []
    for i in range(len(predictions_df)):
        precisions.append(precision_at_k(predictions_df['recommended_items'][i], predictions_df['true_items'][i], k=k))
        recalls.append(recall_at_k(predictions_df['recommended_items'][i], predictions_df['true_items'][i], k=k))
        fscores.append(fscore_at_k(predictions_df['recommended_items'][i], predictions_df['true_items'][i], k=k))
    return np.mean(precisions), np.mean(recalls), np.mean(fscores)

## Cross-validation

Evaluation metrics for the first 5 folds of 20-fold cross-validation are calculated. The calculation is stopped after the first 5 folds in order to reduce computation time.

In [None]:
baskets_df = baskets.to_pandas()
K=20
folds = KFold(n_splits=K, shuffle=True, random_state=42)
support = 0.005
confidence = 0.6
ks = np.linspace(1, 10, 5, dtype=int)

CV_results = {
    'support': [],
    'confidence': [],
    'k': [],
    'precision': [],
    'recall': [],
    'fscore': []
}

fold_id = 0

with warnings.catch_warnings():
    warnings.simplefilter('ignore')
    
    for train_index, test_index in folds.split(baskets_df):
        baskets_train = baskets_df.iloc[train_index]
        baskets_test = baskets_df.iloc[test_index]
        baskets_test, baskets_train = transform_test_and_train_set(baskets_test, baskets_train)

        baskets_train_as_tuples = baskets_df_to_tuples(baskets_train)

        rules_df = get_rules_df(baskets_train_as_tuples, support, apriori_confidence=confidence, rules_confidence=confidence)
        rules_df.to_csv(generated_data_path / f"rules_{support}_{confidence}_fold{fold_id + 1}.csv", index=True)
        
        predictions_df = get_predictions_from_rules(rules_df, baskets_test)
        for k in ks:
            precision, recall, fscore = evaluate_model(predictions_df, k)
            print(f"Support: {support}, Confidence: {confidence}, K: {k} => Precision: {precision}, Recall: {recall}, F-score: {fscore}")
            CV_results['support'].append(support)
            CV_results['confidence'].append(confidence)
            CV_results['k'].append(k)
            CV_results['precision'].append(precision)
            CV_results['recall'].append(recall)
            CV_results['fscore'].append(fscore)
        fold_id += 1
        if fold_id == 5:
            #Check only first 5 folds
            break
CV_results_df = pd.DataFrame(CV_results)
CV_results_df

Support: 0.005, Confidence: 0.6, K: 1 => Precision: 0.20889566952235653, Recall: 0.027654698107067777, F-score: 0.04524598853921098
Support: 0.005, Confidence: 0.6, K: 3 => Precision: 0.2147928646872433, Recall: 0.053290859826869846, F-score: 0.07661585162155528
Support: 0.005, Confidence: 0.6, K: 5 => Precision: 0.2153669365880374, Recall: 0.0653405455499887, F-score: 0.08831325031387428
Support: 0.005, Confidence: 0.6, K: 7 => Precision: 0.2153605098887343, Recall: 0.07082674851998205, F-score: 0.09299966515653015
Support: 0.005, Confidence: 0.6, K: 10 => Precision: 0.2153098414188662, Recall: 0.07365356410395528, F-score: 0.09518495605409459
Support: 0.005, Confidence: 0.6, K: 1 => Precision: 0.2168374903726524, Recall: 0.027932751979023677, F-score: 0.045976924560493314
Support: 0.005, Confidence: 0.6, K: 3 => Precision: 0.21982937377806738, Recall: 0.05376115077368232, F-score: 0.07746464982135684
Support: 0.005, Confidence: 0.6, K: 5 => Precision: 0.219084858897644, Recall: 0.066

Unnamed: 0,support,confidence,k,precision,recall,fscore
0,0.005,0.6,1,0.208896,0.027655,0.045246
1,0.005,0.6,3,0.214793,0.053291,0.076616
2,0.005,0.6,5,0.215367,0.065341,0.088313
3,0.005,0.6,7,0.215361,0.070827,0.093
4,0.005,0.6,10,0.21531,0.073654,0.095185
5,0.005,0.6,1,0.216837,0.027933,0.045977
6,0.005,0.6,3,0.219829,0.053761,0.077465
7,0.005,0.6,5,0.219085,0.066011,0.089105
8,0.005,0.6,7,0.218774,0.071574,0.093827
9,0.005,0.6,10,0.218527,0.074137,0.095725


In [None]:
# Save evaluation results
CV_results_df.to_csv(generated_data_path / "association_rules_CV_results.csv", index=False)