In [3]:
!pip install lenskit 

Collecting lenskit
  Downloading lenskit-0.14.2-py3-none-any.whl (74 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m74.0/74.0 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pandas==1.*,>=1.0 (from lenskit)
  Downloading pandas-1.5.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.1/12.1 MB[0m [31m37.9 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting numba<0.57,>=0.51 (from lenskit)
  Downloading numba-0.56.4-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (3.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.5/3.5 MB[0m [31m56.8 MB/s[0m eta [36m0:00:00[0m:00:01[0m
Collecting binpickle>=0.3.2 (from lenskit)
  Downloading binpickle-0.3.4-py3-none-any.whl (13 kB)
Collecting seedbank>=0.1.0 (from lenskit)
  Downloading seedbank-0.1.3-py3-none-any.whl (8.5 kB)
Collecting csr>=0.3.1 (from lenskit)
  Downloading csr-0.5.

In [78]:
import pandas as pd

from lenskit.algorithms import Recommender
from lenskit.algorithms.user_knn import UserUser
from lenskit.algorithms.item_knn import ItemItem

from sklearn.model_selection import train_test_split, KFold

import numpy as np
import statistics

import pickle

In [80]:
# Directory holding the CSV exports, relative to this notebook.
data_folder = '../data/'

# Ratings table; the cells below rename its 'BGGId', 'Rating' and 'Username' columns.
item_ratings_df = pd.read_csv(data_folder + 'user_ratings.csv')
# Alternative ratings files (left disabled):
# item_ratings_df = pd.read_csv(data_folder + 'user_ratings_600k.csv')
# item_ratings_df = pd.read_csv(data_folder + 'user_ratings_1.8m.csv')
# Game metadata; a later cell builds a BGGId -> Name lookup from it.
item_info = pd.read_csv(data_folder + 'games.csv')
# NOTE(review): 'item_mecahnics' is a misspelling of 'mechanics'; left unchanged in case
# other cells reference the name. The three frames below are not used in the visible cells.
item_mecahnics = pd.read_csv(data_folder + 'mechanics.csv')
item_subcategories = pd.read_csv(data_folder + 'subcategories.csv')
item_themes = pd.read_csv(data_folder + 'themes.csv')

In [81]:
# item_ratings_df = item_ratings_df.groupby('Username', group_keys=False).apply(lambda x: x.sample(frac=0.1))

In [82]:
item_ratings_df = item_ratings_df.rename(columns={
    'BGGId':'item',
    'Rating':'rating',
    'Username':'user'
})

In [83]:
# item_ratings_df_filtered = item_ratings_df_filtered.drop_duplicates(subset = ['item', 'user'])

In [84]:
user_rating_count = item_ratings_df.groupby('user').count()
user_rating_count

Unnamed: 0_level_0,item,rating
user,Unnamed: 1_level_1,Unnamed: 2_level_1
Fu_Koios,2,2
beastvol,9,9
mycroft,14,14
woh,5,5
(mostly) harmless,1,1
...,...,...
zzzuzu,39,39
zzzvone,21,21
zzzxxxyyy,36,36
zzzzzane,154,154


In [85]:
min_rating_count = 10
user_rating_count = user_rating_count[user_rating_count['rating'] >= min_rating_count]

In [86]:
user_list_filtered = list(user_rating_count.index)

In [87]:
item_ratings_df_filtered = item_ratings_df[item_ratings_df['user'].isin(user_list_filtered)]

In [88]:
item_ratings_df_filtered

Unnamed: 0,item,rating,user
0,213788,8.0,Tonydorrf
1,213788,8.0,tachyon14k
2,213788,8.0,Ungotter
3,213788,8.0,brainlocki3
4,213788,8.0,PPMP
...,...,...,...
18942210,165521,3.0,rseater
18942211,165521,3.0,Bluefox86
18942212,165521,3.0,serginator
18942213,193488,1.0,CaptainCattan


### Build

In [60]:
num_recs = 10  # Number of recommendations to generate
item_item = ItemItem(15, min_nbrs=3)  # Minimum (3) and maximum (15) number of neighbors to consider
recsys_itemitem = Recommender.adapt(item_item)

In [61]:
recsys_itemitem.fit(item_ratings_df_filtered)

<lenskit.algorithms.ranking.TopN at 0x7cb5596e7e20>

In [62]:
 recsys_itemitem.recommend('zzzzzane', 10)

Unnamed: 0,item,score
0,324856,9.89267
1,275777,9.795043
2,342942,9.745515
3,249277,9.656002
4,345976,9.612687
5,341169,9.597214
6,329082,9.491935
7,323046,9.38712
8,313776,9.334337
9,165694,9.298533


In [14]:
recsys_itemitem.predict_for_user('zzzzzane', [145888, 226522])

item
145888    6.526580
226522    7.780456
dtype: float64

### Test

In [63]:
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error

In [64]:
# Fixed seed so the shuffled folds (and therefore the reported metrics) are
# the same on every Restart & Run All.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

In [65]:
def predict(model, test_dataframe):
    """Collect true and predicted ratings for every row of a test frame.

    Parameters
    ----------
    model
        Object exposing ``predict_for_user(user, items)``; the result is
        iterated to obtain one prediction per requested item.
    test_dataframe : pandas.DataFrame
        Must contain the columns ``'user'``, ``'item'`` and ``'rating'``.

    Returns
    -------
    (list, list)
        ``real_ratings`` and ``predicted_ratings``, position-aligned. Users
        appear in order of first occurrence in ``test_dataframe``; within a
        user, rows keep their original order.
    """
    real_ratings = []
    predicted_ratings = []

    # One pass over the frame instead of a full boolean mask per user.
    # sort=False keeps users in first-appearance order, like Series.unique().
    for user, user_rows in test_dataframe.groupby('user', sort=False):
        items = list(user_rows['item'])
        predictions = model.predict_for_user(user, items)

        # Align by item label rather than by position, so a result that is
        # reordered or lacks some items cannot be paired with the wrong rating.
        # Missing items become NaN. Skipped when labels are not unique,
        # because reindexing a duplicated index raises.
        if isinstance(predictions, pd.Series) and predictions.index.is_unique:
            predictions = predictions.reindex(items)

        real_ratings += list(user_rows['rating'])
        predicted_ratings += list(predictions)

    return real_ratings, predicted_ratings

In [67]:
# Cross-validate the item-item model: for each fold, fit on the training part,
# score a small sample of the held-out part, and record MAE / RMSE together
# with how many test pairs the model could and could not score.
mae_list = []
rmse_list = []
nans_list = []
non_nans_list = []

# Drop duplicate (item, user) pairs before splitting, so the same pair cannot
# end up in both the training and the test part of a fold.
ratings_unique = item_ratings_df_filtered.drop_duplicates(subset=['item', 'user'])

for fold_no, (train_index, test_index) in enumerate(kf.split(ratings_unique), start=1):
    print(f'Doing fold no: {fold_no}')

    train_df = ratings_unique.iloc[train_index]
    # Only 0.1% of the held-out rows are scored; seeding with the fold number
    # makes the sample repeatable.
    test_df = ratings_unique.iloc[test_index].sample(frac=0.001, random_state=fold_no)

    print('Dataframes created')

    # NOTE(review): this evaluates 10 neighbours, while the models fitted in the
    # "Build" cells use 15 - confirm which configuration is meant to be reported.
    item_item = ItemItem(10, min_nbrs=3)
    recsys = Recommender.adapt(item_item)
    recsys.fit(train_df)
    print('Model is fitted')

    real_ratings, predicted_ratings = predict(recsys, test_df)

    y_test = np.asarray(real_ratings, dtype=float)
    y_pred = np.asarray(predicted_ratings, dtype=float)

    # NaN predictions are excluded from the error metrics but still counted.
    scored = ~np.isnan(y_pred)
    nans = int((~scored).sum())
    non_nans = int(scored.sum())
    y_test_not_nan = y_test[scored]
    y_pred_not_nan = y_pred[scored]

    print(f'nan values: {nans}')
    print(f'non-nan values: {non_nans}')

    mae = float(mean_absolute_error(y_test_not_nan, y_pred_not_nan))
    # sqrt(MSE) instead of squared=False: that keyword is deprecated in
    # scikit-learn 1.4 and removed in later releases.
    rmse = float(np.sqrt(mean_squared_error(y_test_not_nan, y_pred_not_nan)))

    mae_list.append(mae)
    rmse_list.append(rmse)
    nans_list.append(nans)
    non_nans_list.append(non_nans)

# Compute average results and standard deviations
results = {
    'avg_mae' : statistics.mean(mae_list),
    'stdv_mae' : statistics.stdev(mae_list),
    'avg_rmse' : statistics.mean(rmse_list),
    'stdv_rmse' : statistics.stdev(rmse_list),
    'avg_nans': statistics.mean(nans_list),
    'avg_non_nans' : statistics.mean(non_nans_list)
}

display(results)

Doing fold no: 1
Dataframes created
Model is fitted
nan values: 1
non-nan values: 3674
Doing fold no: 2
Dataframes created
Model is fitted
nan values: 0
non-nan values: 3675
Doing fold no: 3
Dataframes created
Model is fitted
nan values: 0
non-nan values: 3675
Doing fold no: 4
Dataframes created
Model is fitted
nan values: 0
non-nan values: 3675
Doing fold no: 5
Dataframes created
Model is fitted
nan values: 0
non-nan values: 3675


{'avg_mae': 0.849650260667979,
 'stdv_mae': 0.005147369534018413,
 'avg_rmse': 1.139501925417811,
 'stdv_rmse': 0.013279363421001696,
 'avg_nans': 0.2,
 'avg_non_nans': 3674.8}

{'avg_mae': 0.849650260667979,\
 'stdv_mae': 0.005147369534018413,\
 'avg_rmse': 1.139501925417811,\
 'stdv_rmse': 0.013279363421001696,\
 'avg_nans': 0.2,\
 'avg_non_nans': 3674.8}

### Build explainable Item-based CF

In [48]:
id_to_game = item_info.set_index('BGGId')['Name'].to_dict()

In [45]:
# data = {
#     'item': [101, 102, 103, 104, 105],  # Repeating items for demonstration
#     'rating': [4, 5, 3, 4, 5],  # Repeating ratings for demonstration
#     'user': [201, 202, 203, 204, 205]  # Repeating users for demonstration
# }

# # Create a DataFrame
# fake_dataset = pd.DataFrame(data)

In [184]:
# Remove (user, item) pairs that occur more than once. With keep=False every
# copy of a duplicated pair is discarded, not just the extra ones.
# NOTE(review): the cross-validation cell de-duplicates with the default
# keep='first' - confirm which behaviour is intended so both agree.
# NOTE(review): this overwrites the frame displayed by earlier cells.
item_ratings_df_filtered = item_ratings_df_filtered.drop_duplicates(subset=['user', 'item'], keep=False)
item_ratings_df_filtered

Unnamed: 0,item,rating,user
0,213788,8.0,Tonydorrf
1,213788,8.0,tachyon14k
2,213788,8.0,Ungotter
3,213788,8.0,brainlocki3
4,213788,8.0,PPMP
...,...,...,...
18942210,165521,3.0,rseater
18942211,165521,3.0,Bluefox86
18942212,165521,3.0,serginator
18942213,193488,1.0,CaptainCattan


In [12]:
# item_ratings_df_filtered = item_ratings_df_filtered.sample(frac=0.01)

In [185]:
users = item_ratings_df_filtered['user'].unique()

In [186]:
item_item = ItemItem(15, min_nbrs=3)  # Minimum (3) and maximum (15) number of neighbors to consider
recsys_itemitem = Recommender.adapt(item_item)

In [187]:
recsys_itemitem.fit(item_ratings_df_filtered)

<lenskit.algorithms.ranking.TopN at 0x2b49e551370>

In [188]:
sim_matrix = item_item.sim_matrix_
sim_matrix

<CSR 21925x21925 (198492304 nnz) {
  rowptrs=[        0     16805     30355 ... 198483357 198492081 198492304]
  colinds=[ 8823  8425  4706 ... 14553 15578 13319]
  values=[1.15406021e-01 1.08755551e-01 1.07919488e-01 ... 1.24506889e-05
 3.84494737e-06 1.55585027e-06]
  dtype=float64
}>

In [189]:
sim_matrix = sim_matrix.to_scipy()

In [190]:
sim_matrix = sim_matrix.todense()

In [191]:
# np.asarray gives a plain ndarray without duplicating the data; np.array
# would make a second full copy of this dense 21925 x 21925 float64 matrix.
sim_matrix = np.asarray(sim_matrix)

In [192]:
item_index = item_item.item_index_

In [98]:
def get_most_similar_items(matrix, item, k, indexer):
    """Return the ``k`` items with the largest similarity to ``item``.

    Parameters
    ----------
    matrix
        2-D array indexed by position; row ``i`` holds the similarities of
        the item at position ``i`` of ``indexer`` to every other position.
    item
        Label of the query item.
    k : int
        Number of neighbours to return.
    indexer : pandas.Index
        Maps item labels to row/column positions of ``matrix``.

    Returns
    -------
    (list, list) or None
        Neighbour labels and their similarity values, most similar first.
        ``None`` when ``item`` is not in ``indexer``. Two empty lists when
        ``k`` is not positive.
    """
    # Index membership is a lookup; np.isin scanned the whole index per call.
    if item not in indexer:
        return None

    # arr[-0:] would select the entire row, so k == 0 needs its own branch.
    if k <= 0:
        return [], []

    arr = matrix[indexer.get_loc(item)]

    # argsort is ascending: take the last k positions and flip them.
    indices_of_k_largest = np.argsort(arr)[-k:][::-1]
    values_of_k_largest = arr[indices_of_k_largest]

    return indexer[indices_of_k_largest].tolist(), list(values_of_k_largest)

In [99]:
a = get_most_similar_items(sim_matrix, 50381, 2, item_index)
a

([1927, 74], [0.1355229588331795, 0.12824950345992966])

In [104]:
def recommend_for_user(user_item_df, matrix, k, indexer, user_id, threshold):
    """Build a "players who liked X also tried ..." recommendation for one user.

    The user's ratings are visited from highest to lowest. The first item
    rated at least ``threshold`` that has a row in the similarity matrix
    becomes the "parent" item, and its ``k`` most similar items are returned.

    Parameters
    ----------
    user_item_df : pandas.DataFrame
        Ratings with columns ``'user'``, ``'item'`` and ``'rating'``.
    matrix, k, indexer
        Passed through to ``get_most_similar_items``.
    user_id
        Value matched against the ``'user'`` column.
    threshold
        Minimum rating for an item to count as liked.

    Returns
    -------
    tuple or None
        ``(explanation, parent_item, (similar_items, similarities))``, or
        ``None`` if no liked item can be used.
    """
    current_df = user_item_df[user_item_df['user'] == user_id]
    current_df = current_df.sort_values('rating', ascending=False)

    for current_item, rating in zip(current_df['item'], current_df['rating']):
        if not rating >= threshold:
            # Sorted descending with NaN last: nothing further can qualify.
            break

        recommendation = get_most_similar_items(matrix, current_item, k, indexer)
        if recommendation is None:
            # Not in the similarity index; try the next liked item instead of
            # returning an explanation with no recommendations attached.
            continue

        # Fall back to the raw id when the game has no metadata entry.
        title = id_to_game.get(current_item, current_item)
        return f"Players Who Liked '{title}' Also Tried", current_item, recommendation

    return None

In [105]:
user_id = 'PPMP'
threshold = 8

a = recommend_for_user(item_ratings_df_filtered, sim_matrix, 5, item_index, user_id, threshold)
a

("Players Who Liked 'CATAN 3D Collector's Edition' Also Tried",
 17419,
 ([3972, 60153, 2338, 1897, 189],
  [0.13818928128899272,
   0.10571859457706469,
   0.08840461511110898,
   0.0830674858661084,
   0.08150283601981548]))

### Grouped item-based CF

In [193]:
item_ratings_df_filtered

Unnamed: 0,item,rating,user
0,213788,8.0,Tonydorrf
1,213788,8.0,tachyon14k
2,213788,8.0,Ungotter
3,213788,8.0,brainlocki3
4,213788,8.0,PPMP
...,...,...,...
18942210,165521,3.0,rseater
18942211,165521,3.0,Bluefox86
18942212,165521,3.0,serginator
18942213,193488,1.0,CaptainCattan


In [79]:
filename = 'clusters_ids.npy'
clusters_ids = np.load(filename, allow_pickle=True)

Number of users in the group who liked an item → predicted ratings for its most similar items, aggregated with either No Misery + Additive or No Misery + Most Pleasure.

In [280]:
# min_rating_threshold = 8

# current_group = clusters_ids[0]
# current_clister_ratings = item_ratings_df_filtered[item_ratings_df_filtered['user'].isin(current_group)]
# current_clister_ratings = current_clister_ratings[current_clister_ratings['rating'] >= min_rating_threshold]

In [281]:
# current_agg = current_clister_ratings.groupby('item').agg(
#     count_similar_games = ('item','count')
# ).reset_index()
# current_agg

In [282]:
# counts_games = current_agg['count_similar_games'].unique()
# counts_games[::-1].sort()
# counts_games

In [283]:
# list(current_agg[current_agg['count_similar_games'] == 4]['item'])

In [284]:
# k = 5

# for current_count in counts_games:
#     items = list(current_agg[current_agg['count_similar_games'] == current_count]['item'])
#     all_parent_users = []
#     all_prediction_items = []
#     all_prediction_ratings = []
#     all_items = []
    
#     for item in items:
#         similar_items = get_most_similar_items(sim_matrix, item, k, item_index)[0]
#         parent_users = list(current_clister_ratings[current_clister_ratings['item'] == item]['user'])
#         parent_users = np.intersect1d(parent_users, current_group)
        
#         current_predictions = []        
#         for user_id in current_group:
#             current_predictions.append(recsys_itemitem.predict_for_user(user_id, similar_items))
        
#         temp_rating = []
#         for pred in current_predictions:
#             temp_rating.append(list(pred.values))
            
#         all_prediction_items.append(similar_items)
#         all_prediction_ratings.append(temp_rating)
#         all_items.append(item)
#         all_parent_users.append(list(parent_users))
        
#     final_recommendations = []
#     lower_threshold = 5

#     for predicted_items_set, predicted_ratings_set, parent_item, parent_users\
#     in zip(all_prediction_items, all_prediction_ratings, all_items, all_parent_users):
#         transposed_ratings = np.array(predicted_ratings_set).T.tolist()
#         for rating_set, it in zip(transposed_ratings, predicted_items_set):
#             count = 0
#             for r in rating_set:
#                 if r >= lower_threshold:
#                     count += 1
#             if count == len(rating_set):
#                 # additive
# #                 final_recommendations.append((it, np.sum(rating_set), parent_item, parent_users))
#                 # most pleasure
#                 final_recommendations.append((it, np.max(rating_set), parent_item, parent_users))
                
#     if len(final_recommendations) > 0:
#         break

In [285]:
# final_recommendations = sorted(final_recommendations, key=lambda x: x[1], reverse=True)
# final_recommendations = final_recommendations[:5]
# final_recommendations

In [278]:
# Group recommendations with a "no misery" filter and "most pleasure" scoring.
# For each group, start from the items liked by the most members, look up
# their most similar items, and keep a candidate only when every member's
# predicted rating reaches lower_threshold.
k = 5                     # similar items fetched per liked item
min_rating_threshold = 8  # a rating at or above this counts as "liked"
lower_threshold = 5       # "no misery" floor on every member's predicted rating
all_final_recommendations = []

# NOTE(review): a group with no qualifying candidate adds nothing to
# all_final_recommendations, so its positions need not line up with clusters_ids.
for current_group in clusters_ids:

    current_clister_ratings = item_ratings_df_filtered[item_ratings_df_filtered['user'].isin(current_group)]
    current_clister_ratings = current_clister_ratings[current_clister_ratings['rating'] >= min_rating_threshold]

    # How many group members liked each item.
    current_agg = current_clister_ratings.groupby('item').agg(
        count_similar_games = ('item','count')
    ).reset_index()

    # Distinct like-counts, largest first.
    counts_games = np.sort(current_agg['count_similar_games'].unique())[::-1]

    for current_count in counts_games:
        items = list(current_agg[current_agg['count_similar_games'] == current_count]['item'])
        final_recommendations = []

        for item in items:
            similar = get_most_similar_items(sim_matrix, item, k, item_index)
            if similar is None:
                # Item has no row in the similarity matrix; nothing to suggest.
                continue
            similar_items = similar[0]

            # Group members who liked this parent item.
            parent_users = list(current_clister_ratings[current_clister_ratings['item'] == item]['user'])
            parent_users = list(np.intersect1d(parent_users, current_group))

            # One row per group member, one column per candidate item.
            predicted_ratings_set = [
                list(recsys_itemitem.predict_for_user(user_id, similar_items).values)
                for user_id in current_group
            ]
            transposed_ratings = np.array(predicted_ratings_set).T.tolist()

            for rating_set, it in zip(transposed_ratings, similar_items):
                # No misery: every member must reach the floor. NaN fails the
                # comparison, so unscorable candidates are rejected.
                if all(r >= lower_threshold for r in rating_set):
                    # additive
#                     final_recommendations.append((it, np.sum(rating_set), item, parent_users))
                    # most pleasure
                    final_recommendations.append((it, np.max(rating_set), item, parent_users))

        # Stop at the first (highest) like-count that yields any candidate.
        if len(final_recommendations) > 0:
            all_final_recommendations.append(final_recommendations)
            break

In [279]:
# pickle is also imported in the imports cell at the top of the notebook.
import pickle

# Persist the per-group recommendation lists built by the previous cell.
# NOTE(review): the 'nm_mp' suffix presumably stands for "no misery + most pleasure" - confirm.
with open('cf_list_group_recs_nm_mp.pkl', 'wb') as file:
    pickle.dump(all_final_recommendations, file)