In [3]:
# !pip install lenskit

In [1]:
import pandas as pd

from lenskit.algorithms import Recommender
from lenskit.algorithms.user_knn import UserUser
from lenskit.algorithms.item_knn import ItemItem

from sklearn.model_selection import train_test_split, KFold

import numpy as np
import statistics

In [2]:
data_folder = '../data/'

item_ratings_df = pd.read_csv(data_folder + 'user_ratings.csv')
# item_ratings_df = pd.read_csv(data_folder + 'user_ratings_600k.csv')
# item_ratings_df = pd.read_csv(data_folder + 'user_ratings_1.8m.csv')
item_info = pd.read_csv(data_folder + 'games.csv')
item_mecahnics = pd.read_csv(data_folder + 'mechanics.csv')
item_subcategories = pd.read_csv(data_folder + 'subcategories.csv')
item_themes = pd.read_csv(data_folder + 'themes.csv')

In [3]:
item_ratings_df = item_ratings_df.rename(columns={
    'BGGId':'item',
    'Rating':'rating',
    'Username':'user'
})

In [4]:
item_ratings_df

Unnamed: 0,item,rating,user
0,213788,8.0,Tonydorrf
1,213788,8.0,tachyon14k
2,213788,8.0,Ungotter
3,213788,8.0,brainlocki3
4,213788,8.0,PPMP
...,...,...,...
18942210,165521,3.0,rseater
18942211,165521,3.0,Bluefox86
18942212,165521,3.0,serginator
18942213,193488,1.0,CaptainCattan


In [5]:
user_rating_count = item_ratings_df.groupby('user').count()
user_rating_count

Unnamed: 0_level_0,item,rating
user,Unnamed: 1_level_1,Unnamed: 2_level_1
Fu_Koios,2,2
beastvol,9,9
mycroft,14,14
woh,5,5
(mostly) harmless,1,1
...,...,...
zzzuzu,39,39
zzzvone,21,21
zzzxxxyyy,36,36
zzzzzane,154,154


In [6]:
min_rating_count = 10
user_rating_count = user_rating_count[user_rating_count['rating'] >= min_rating_count]

In [7]:
user_list_filtered = list(user_rating_count.index)

In [8]:
len(item_ratings_df['user'].unique())

411375

In [9]:
len(user_list_filtered)

224604

In [10]:
item_ratings_df_filtered = item_ratings_df[item_ratings_df['user'].isin(user_list_filtered)]

In [11]:
len(item_ratings_df_filtered['user'].unique())

224604

In [12]:
# item_ratings_df_filtered.drop(columns = ['Unnamed: 0'], inplace = True)

In [13]:
item_ratings_df_filtered

Unnamed: 0,item,rating,user
0,213788,8.0,Tonydorrf
1,213788,8.0,tachyon14k
2,213788,8.0,Ungotter
3,213788,8.0,brainlocki3
4,213788,8.0,PPMP
...,...,...,...
18942210,165521,3.0,rseater
18942211,165521,3.0,Bluefox86
18942212,165521,3.0,serginator
18942213,193488,1.0,CaptainCattan


### Build User CF

In [22]:
num_recs = 10  # Number of recommendations to generate
user_user = UserUser(15, min_nbrs=3)  # Minimum (3) and maximum (15) number of neighbors to consider
recsys = Recommender.adapt(user_user)
recsys.fit(item_ratings_df_filtered)

could not load LIBBLAS: Could not find module 'libblas' (or one of its dependencies). Try using the full path with constructor syntax.


<lenskit.algorithms.ranking.TopN at 0x2450e9045b0>

In [24]:
recsys.recommend('zzzabiss', 10)

Unnamed: 0,item,score
0,31790,8.515042
1,1035,8.472967
2,23418,8.434086
3,18098,8.184158
4,129556,7.91545
5,60153,7.827937
6,278292,7.727529
7,421,7.69295
8,342942,7.691776
9,225244,7.688135


#### Test User CF

In [88]:
# from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error

In [None]:
item_ratings_df_filtered

In [None]:
kf = KFold(n_splits=5, shuffle=True)

In [107]:
mae_list = list()
rmse_list = list()
nans_list = list()
non_nans_list = list()

count = 1

for train_index, test_index in kf.split(item_ratings_df_filtered):
    print(f'Doing fold no: {count}')
    count += 1
    
    train_df = item_ratings_df_filtered.iloc[train_index]
    test_df = item_ratings_df_filtered.iloc[test_index]
    print('Dataframes created')
    
    user_user = UserUser(10, min_nbrs=3)
    recsys = Recommender.adapt(user_user)
    recsys.fit(train_df)
    print('Model is fitted')
    
    predicted = recsys.predict(test_df.sample(frac = 0.001))
    
    y_test = list(test_df['rating'])
    y_pred = list(predicted)
    
    y_test_not_nan = list()
    y_pred_not_nan = list()
    
    nans = 0
    non_nans = 0
    
    for i in range(len(y_pred)):
        num = y_pred[i]
        
        if np.isnan(num):
            nans += 1
        else:
            non_nans += 1
            y_test_not_nan.append(y_test[i])
            y_pred_not_nan.append(num)
            
    print(f'nan values: {nans}')
    print(f'non-nan values: {non_nans}')
    
    mae = mean_absolute_error(y_test_not_nan, y_pred_not_nan)
    rmse = mean_squared_error(y_test_not_nan, y_pred_not_nan, squared = False)
    
    mae_list.append(mae)
    rmse_list.append(rmse)
    nans_list.append(nans)
    non_nans_list.append(non_nans)
    
# Compute average results and standard deviations
results = {
    'avg_mae' : statistics.mean(mae_list),
    'stdv_mae' : statistics.stdev(mae_list),
    'avg_rmse' : statistics.mean(rmse_list),
    'stdv_rmse' : statistics.stdev(rmse_list),
    'avg_nans': statistics.mean(nans_list),
    'avg_non_nans' : statistics.mean(non_nans_list)
}

display(results)

Doing fold no: 1
Dataframes created
Model is fitted
nan values: 167
non-nan values: 3508
Doing fold no: 2
Dataframes created
Model is fitted
nan values: 145
non-nan values: 3530
Doing fold no: 3
Dataframes created
Model is fitted
nan values: 162
non-nan values: 3513
Doing fold no: 4
Dataframes created
Model is fitted
nan values: 138
non-nan values: 3537
Doing fold no: 5
Dataframes created
Model is fitted
nan values: 130
non-nan values: 3545


{'avg_mae': 2.4439147110217045,
 'stdv_mae': 0.03579011749643922,
 'avg_rmse': 2.9253392472317494,
 'stdv_rmse': 0.04126334411082854,
 'avg_nans': 148.4,
 'avg_non_nans': 3526.6}

test_df.sample(frac = 0.001)\
user_user = UserUser(10, min_nbrs=3)\

{'avg_mae': 2.4439147110217045,\
 'stdv_mae': 0.03579011749643922,\
 'avg_rmse': 2.9253392472317494,\
 'stdv_rmse': 0.04126334411082854,\
 'avg_nans': 148.4,\
 'avg_non_nans': 3526.6}