In [26]:
import pandas as pd
import seaborn as sns
from surprise import (Reader, Dataset, KNNWithMeans)

In [27]:
orginal_df = pd.read_csv("/mnt/data/public/bgg/bgg-19m-reviews.csv")

In [28]:
orginal_df = orginal_df.iloc[:,1:]

# EDA

In [29]:
orginal_df.describe()

Unnamed: 0,rating,ID
count,18964810.0,18964810.0
mean,7.081849,110146.1
std,1.595931,93095.89
min,1.4013e-45,1.0
25%,6.0,15987.0
50%,7.0,107529.0
75%,8.0,181304.0
max,10.0,350992.0


In [30]:
orginal_df["rating"].idxmin()

2615991

In [31]:
orginal_df.iloc[2615991,:]

user                                               GeoffMack
rating                                                   0.0
comment    Avoid this game.  Unless you like unusual game...
ID                                                        42
name                                      Tigris & Euphrates
Name: 2615991, dtype: object

In [32]:
duplicates = orginal_df[orginal_df.duplicated(subset=["user", "name"], keep=False)]

In [33]:
duplicates.sort_values(by="user")

Unnamed: 0,user,rating,comment,ID,name
3386467,-=Yod@=-,6.0,,478,Citadels
9914865,-=Yod@=-,7.0,,3201,Lord of the Rings: The Confrontation
7796340,-=Yod@=-,8.0,,205398,Citadels
10456011,-=Yod@=-,7.5,,18833,Lord of the Rings: The Confrontation
7731530,-Johnny-,8.0,Original. It has more than its fair share of p...,98,Axis & Allies
...,...,...,...,...,...
18662068,zzini,6.8,,16719,Manhattan
10378681,zzyzewitsch,8.0,,121,Dune
11068435,zzyzewitsch,7.0,,283355,Dune
1916530,zzzabiss,8.0,Va muy bien como un filler introductorio. He r...,129622,Love Letter


In [34]:
sample = pd.read_csv("/mnt/data/public/bgg/games_detailed_info.csv")

  sample = pd.read_csv("/mnt/data/public/bgg/games_detailed_info.csv")


In [35]:
print(sample[sample["id"] == 129622])
print(sample[sample["id"] == 277085])

    Unnamed: 0       type      id  \
17          17  boardgame  129622   

                                            thumbnail  \
17  https://cf.geekdo-images.com/T1ltXwapFUtghS9A7...   

                                                image      primary  \
17  https://cf.geekdo-images.com/T1ltXwapFUtghS9A7...  Love Letter   

                                            alternate  \
17  ['Letters to Santa', 'List Miłosny', 'Lista Sk...   

                                          description  yearpublished  \
17  All of the eligible young men (and many of the...           2012   

    minplayers  ...  War Game Rank Customizable Rank Children's Game Rank  \
17           2  ...            NaN               NaN                  NaN   

   RPG Item Rank  Accessory Rank  Video Game Rank  Amiga Rank  \
17           NaN             NaN              NaN         NaN   

    Commodore 64 Rank Arcade Rank Atari ST Rank  
17                NaN         NaN           NaN  

[1 rows x 56 columns]


they just save the common names under names, but same name have different editions (example, the two rows below are the regular vs premium edition)

based on my opinion, i suggest to sort by item ID since premium editions may come at different price points/qualities

In [36]:
duplicates = orginal_df[orginal_df.duplicated(subset=["user", "ID"], keep=False)]

In [37]:
duplicates.sort_values(by="user")

Unnamed: 0,user,rating,comment,ID,name


if we sort by ID though, no duplicates

# Use Surprise (for neighborhood based)

In [38]:
orginal_df = orginal_df[["user","rating","ID"]]
orginal_df = orginal_df.rename(columns={"ID": "item"})

In [39]:
# Keep only games with at least 500 reviews
min_item_reviews = 1000
item_mask = orginal_df['item'].value_counts() >= min_item_reviews
item_over_min = orginal_df['item'].value_counts()[item_mask].index

# Optionally, filter users with at least 20 ratings
min_user_reviews = 20
user_mask = orginal_df['user'].value_counts() >= min_user_reviews
user_over_min = orginal_df['user'].value_counts()[user_mask].index

df = orginal_df[
        orginal_df['user'].isin(user_over_min) & 
        orginal_df['item'].isin(item_over_min)
]


In [40]:
# --- Prepare Data for Surprise ---
# Re-order the columns for Surprise: (user, item, rating)
df_for_surprise = df[['user', 'item', 'rating']]

# Define the Reader
reader = Reader(rating_scale=(0, 10))

# Load the *correctly ordered* DataFrame into the dataset
print("\nLoading data into Surprise dataset...")
dataset = Dataset.load_from_df(df_for_surprise, reader)
print("Data loaded successfully.")


Loading data into Surprise dataset...
Data loaded successfully.


In [41]:
# Show the shape and head of the filtered DataFrame 'df'
print(f"Shape of the filtered DataFrame: {df.shape}")
df.head()

Shape of the filtered DataFrame: (14354838, 3)


Unnamed: 0,user,rating,item
0,Torsten,10.0,30549
1,mitnachtKAUBO-I,10.0,30549
2,avlawn,10.0,30549
3,Mike Mayer,10.0,30549
4,Mease19,10.0,30549


In [None]:
from surprise.model_selection import cross_validate

# --- Run Cross-Validation for Item-Based CF ---

# 1. Configure for Item-based CF
sim_options = {
    'user_based': False  # This makes it item-based
}

# 2. Instantiate the algorithm
algo = KNNWithMeans(sim_options=sim_options)

# 3. Run 5-fold cross-validation
print("\nRunning 5-fold cross-validation for Item-Based KNN...")
results = cross_validate(algo, dataset, measures=['RMSE'], cv=5, verbose=True)

print("\nCross-validation finished.")


Running 5-fold cross-validation for Item-Based KNN...


In [42]:
from surprise import SVD
from surprise.model_selection import cross_validate

# --- Run Cross-Validation for Model-Based CF (SVD) ---

# 1. Instantiate the SVD algorithm
algo = SVD()

# 2. Run 5-fold cross-validation
print("\nRunning 5-fold cross-validation for SVD (Model-Based)...")
results = cross_validate(algo, dataset, measures=['RMSE'], cv=5, verbose=True)

print("\nCross-validation finished.")


Running 5-fold cross-validation for SVD (Model-Based)...
Evaluating RMSE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.1942  1.1932  1.1930  1.1936  1.1941  1.1936  0.0005  
Fit time          162.16  172.35  172.02  171.55  200.89  175.79  13.12   
Test time         32.68   40.85   41.65   33.27   48.36   39.36   5.83    

Cross-validation finished.
