In [36]:
import pandas as pd
import seaborn as sns
from surprise import (Reader, Dataset, KNNWithMeans)

In [37]:
# Read the full reviews CSV (all columns, pandas-inferred dtypes) into memory.
orginal_df = pd.read_csv(
    filepath_or_buffer="/mnt/data/public/bgg/bgg-19m-reviews.csv",
)

In [38]:
# Discard the leading column by position; every row and all remaining columns
# are kept. The frame is reassigned to itself, so each re-run of this cell
# removes one more column.
# NOTE(review): presumably the dropped column is a saved index -- confirm.
orginal_df = orginal_df.iloc[:, slice(1, None)]

# JAMIE PRE PROCESSING

In [1]:
import pandas as pd
from surprise import Reader, Dataset
from surprise.model_selection import cross_validate, GridSearchCV
from surprise import KNNWithMeans, SVD
import time

# -----------------
# 1. DATA PREPROCESSING (FULL)
# -----------------
# Loads the raw ratings, keeps only the most-reviewed items, then keeps only
# users with enough ratings inside that item subset, and finally wraps the
# result in a Surprise `Dataset` (exposed as `dataset` for the tuning cells).

# Filter settings live here so the log messages below are always generated
# from the thresholds that are actually applied.
TOP_N_ITEMS = 500       # number of most-reviewed items to keep
MIN_USER_REVIEWS = 44   # keep users with strictly MORE than this many ratings in the subset

print("Loading original CSV (FULL DATASET)...")
start_time = time.time()

# --- Use dtype optimization for faster loading and less memory ---
# Only `rating` and `ID` get an explicit (narrow) dtype; `user` is left to
# pandas' own type inference.
dtypes = {
    'rating': 'float32',
    'ID': 'int32'
}

# We only need these three columns, so we'll specify them.
original_df = pd.read_csv(
    "/mnt/data/public/bgg/bgg-19m-reviews.csv",
    usecols=["user", "rating", "ID"],
    dtype=dtypes
)

# 2. Preprocess the data
# Rename so the frame uses the user / item / rating vocabulary used below.
original_df = original_df.rename(columns={"ID": "item"})
print(f"Data loaded and renamed in {time.time() - start_time:.2f}s.")
print(f"Total original ratings: {len(original_df)}")

# -----------------
# 3. APPLY NEW FILTERS
# -----------------

# --- Get Top-N Items ---
# value_counts() sorts by frequency (descending), so head() yields the
# most-reviewed items.
print(f"Finding top {TOP_N_ITEMS} most-reviewed games...")
item_counts = original_df['item'].value_counts()
top_500_items = item_counts.head(TOP_N_ITEMS).index
print(f"Identified {len(top_500_items)} top games.")

# --- First Filter: Keep only ratings for the top games ---
print(f"Applying top {TOP_N_ITEMS} games filter...")
df_top_games = original_df[original_df['item'].isin(top_500_items)]
print(f"Ratings remaining (top {TOP_N_ITEMS} games): {len(df_top_games)}")

# --- Get users with > MIN_USER_REVIEWS reviews *within the subset* ---
# This is crucial: we count reviews *after* filtering for games.
print(
    f"Finding users with > {MIN_USER_REVIEWS} reviews "
    f"*within the top-{TOP_N_ITEMS} subset*..."
)
user_counts_subset = df_top_games['user'].value_counts()
user_mask = user_counts_subset > MIN_USER_REVIEWS
users_to_keep = user_counts_subset[user_mask].index
print(f"Users to keep (with > {MIN_USER_REVIEWS} reviews): {len(users_to_keep)}")

# --- Second Filter: Apply the user filter to the game-filtered data ---
print("Applying user filter...")
df = df_top_games[df_top_games['user'].isin(users_to_keep)]

# --- Final Check ---
print("Filtering complete.")
print(f"Final Users: {df['user'].nunique()}")
print(f"Final Items: {df['item'].nunique()}")


# 4. Prepare data for Surprise
# Dataset.load_from_df expects the columns in (user, item, rating) order.
df_for_surprise = df[['user', 'item', 'rating']]
# NOTE(review): rating scale is declared as 0-10 -- confirm it matches the data.
reader = Reader(rating_scale=(0, 10))

print("Loading filtered data into Surprise dataset...")
dataset = Dataset.load_from_df(df_for_surprise, reader)
print(f"Data loaded successfully. Total ratings for training: {len(df_for_surprise)}")

Loading original CSV (FULL DATASET)...
Data loaded and renamed in 26.45s.
Total original ratings: 18964807
Finding top 500 most-reviewed games...
Identified 500 top games.
Applying top 500 games filter...
Ratings remaining (top 500 games): 9472466
Finding users with > 20 reviews *within the top-500 subset*...
Users to keep (with > 20 reviews): 63497
Applying user filter...
Filtering complete.
Final Users: 63497
Final Items: 500
Loading filtered data into Surprise dataset...
Data loaded successfully. Total ratings for training: 6166422


# ITEM BASED

In [14]:
# --- Item-Based CF (KNNWithMeans) Hyperparameter Tuning ---
# Runs two independent 3-fold grid searches over `dataset` (built by the
# preprocessing cell): one with MSD similarity and one with
# pearson_baseline similarity, both in item-based mode.
from surprise.model_selection import GridSearchCV
from surprise import KNNWithMeans
import time


def _item_knn_grid(similarity, min_support):
    """Build the KNNWithMeans search grid for one item-based similarity.

    Args:
        similarity: Surprise similarity name (e.g. 'msd').
        min_support: value placed in the grid's `min_support` entry.

    Returns:
        A param_grid dict trying k=20 and k=60 with `user_based` False.
    """
    sim_grid = {
        'name': [similarity],
        'user_based': [False],
        'min_support': [min_support],
    }
    return {'k': [20, 60], 'sim_options': sim_grid}


print("\nStarting Item-Based CF tuning (3-fold CV, Simplified Grid)...")
start_time_item = time.time()

# --- Grid 1: MSD (min_support of 1) ---
print("Tuning KNNWithMeans (MSD)...")
param_grid_msd = _item_knn_grid('msd', 1)
gs_msd = GridSearchCV(
    KNNWithMeans,
    param_grid_msd,
    measures=['rmse'],
    cv=3,
    n_jobs=-1,
)
gs_msd.fit(dataset)

msd_elapsed = time.time() - start_time_item
print(f"\n--- MSD Tuning Complete (took {msd_elapsed:.2f}s) ---")
print(f"Best RMSE (MSD): {gs_msd.best_score['rmse']:.4f}")
print(f"Best Params (MSD): {gs_msd.best_params['rmse']}")


print("\n--- Starting Item-Based CF Tuning (Pearson) ---")
start_time_pearson = time.time()

# --- Grid 2: pearson_baseline (min_support of 10) ---
param_grid_pearson = _item_knn_grid('pearson_baseline', 10)
gs_pearson = GridSearchCV(
    KNNWithMeans,
    param_grid_pearson,
    measures=['rmse'],
    cv=3,
    n_jobs=-1,
)
gs_pearson.fit(dataset)

pearson_elapsed = time.time() - start_time_pearson
print(f"\n--- Pearson Tuning Complete (took {pearson_elapsed:.2f}s) ---")
print(f"Best RMSE (Pearson): {gs_pearson.best_score['rmse']:.4f}")
print(f"Best Params (Pearson): {gs_pearson.best_params['rmse']}")

# Total wall time is measured from the start of the MSD search.
total_elapsed = time.time() - start_time_item
print(f"\n--- Item-Based CF Tuning Finished (Total time {total_elapsed:.2f}s) ---")


Starting Item-Based CF tuning (3-fold CV, Simplified Grid)...
Tuning KNNWithMeans (MSD)...

--- MSD Tuning Complete (took 423.75s) ---
Best RMSE (MSD): 1.1874
Best Params (MSD): {'k': 20, 'sim_options': {'name': 'msd', 'user_based': False, 'min_support': 1}}

--- Starting Item-Based CF Tuning (Pearson) ---

--- Pearson Tuning Complete (took 432.07s) ---
Best RMSE (Pearson): 1.1503
Best Params (Pearson): {'k': 20, 'sim_options': {'name': 'pearson_baseline', 'user_based': False, 'min_support': 10}}

--- Item-Based CF Tuning Finished (Total time 855.82s) ---


# CGD

In [3]:
# --- Model-Based (SVD) Hyperparameter Tuning (Simplified) ---
# Grid-searches the number of latent factors for SVD on `dataset` (built by
# the preprocessing cell) and reports the best RMSE.
from surprise.model_selection import GridSearchCV
from surprise import SVD
import time

# Fold count used by BOTH the log banner and GridSearchCV, so the message can
# no longer claim a different number of folds than the search actually runs.
SVD_CV_FOLDS = 3

print(f"\nStarting Model-Based (SVD) tuning ({SVD_CV_FOLDS}-fold CV)...")
print("Tuning 'n_factors'...")
start_time_svd = time.time()

# 1. Define the simplified parameter grid
param_grid_model = {
    'n_factors': [50, 100, 150], # Test 50, 100, and 150 latent factors
}

# 2. Set up the GridSearchCV
gs_model_based = GridSearchCV(
    SVD,
    param_grid_model,
    measures=['rmse'],
    cv=SVD_CV_FOLDS,
    n_jobs=-1
)

# 3. Run the grid search
gs_model_based.fit(dataset)

print(f"\n--- Model-Based (SVD) Tuning Complete (took {time.time() - start_time_svd:.2f}s) ---")
print(f"Best RMSE: {gs_model_based.best_score['rmse']:.4f}")
print("Best Parameters:")
print(gs_model_based.best_params['rmse'])


Starting Model-Based (SVD) tuning (5-fold CV)...
Tuning 'n_factors'...

--- Model-Based (SVD) Tuning Complete (took 249.42s) ---
Best RMSE: 1.1934
Best Parameters:
{'n_factors': 50}


# FELIZE MODEL


In [None]:
# --- User-Based CF (KNNWithMeans) Hyperparameter Tuning ---
# Both grids below set `user_based` to True, so this cell tunes USER-based
# collaborative filtering; headers and log lines are labelled accordingly.
# It reuses the names `gs_msd` / `gs_pearson`, so running it replaces any
# earlier search results stored under those names.
# NOTE(review): user-based similarity is computed between users rather than
# items, which is presumably far more expensive in time and memory on this
# dataset -- confirm resources (and `n_jobs`) before running.
from surprise.model_selection import GridSearchCV
from surprise import KNNWithMeans
import time

print("\nStarting User-Based CF tuning (3-fold CV, Simplified Grid)...")
start_time_item = time.time()

# --- Grid 1: MSD ---
print("Tuning KNNWithMeans (MSD)...")
param_grid_msd = {
    'k': [20, 60], # <-- SIMPLIFIED
    'sim_options': {
        'name': ['msd'],
        'user_based': [True],
        'min_support': [1] # msd is safe
    }
}
gs_msd = GridSearchCV(KNNWithMeans, param_grid_msd, measures=['rmse'], cv=3, n_jobs=-1)
gs_msd.fit(dataset)

print(f"\n--- MSD Tuning Complete (took {time.time() - start_time_item:.2f}s) ---")
print(f"Best RMSE (MSD): {gs_msd.best_score['rmse']:.4f}")
print(f"Best Params (MSD): {gs_msd.best_params['rmse']}")


print("\n--- Starting User-Based CF Tuning (Pearson) ---")
start_time_pearson = time.time()

# --- Grid 2: Pearson ---
param_grid_pearson = {
    'k': [20, 60], # <-- SIMPLIFIED
    'sim_options': {
        'name': ['pearson_baseline'],
        'user_based': [True],
        'min_support': [10]  # Robust fix: in user-based mode this requires at least 10 common items
    }
}
gs_pearson = GridSearchCV(KNNWithMeans, param_grid_pearson, measures=['rmse'], cv=3, n_jobs=-1)
gs_pearson.fit(dataset)

print(f"\n--- Pearson Tuning Complete (took {time.time() - start_time_pearson:.2f}s) ---")
print(f"Best RMSE (Pearson): {gs_pearson.best_score['rmse']:.4f}")
print(f"Best Params (Pearson): {gs_pearson.best_params['rmse']}")

# Total wall time is measured from the start of the MSD search.
print(f"\n--- User-Based CF Tuning Finished (Total time {time.time() - start_time_item:.2f}s) ---")


Starting Item-Based CF tuning (3-fold CV, Simplified Grid)...
Tuning KNNWithMeans (MSD)...
