In [3]:
# ==========================================================
# RESEARCH-GRADE RECOMMENDER SYSTEM COMPARISON
# ==========================================================

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, precision_score, recall_score, f1_score
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import csr_matrix
from scipy.sparse.linalg import svds


# ==========================================================
# 1. LOAD DATA
# ==========================================================

df1 = pd.read_csv("users_interactions.csv")
df2 = pd.read_csv("products_catalog.csv")

df1 = df1.drop("timestamp", axis=1)

train_df, test_df = train_test_split(
    df1,
    test_size=0.2,
    random_state=42
)

threshold = 4
K = 10

print("Train size:", len(train_df))
print("Test size:", len(test_df))


# ==========================================================
# 2. CREATE USER/ITEM MAPS
# ==========================================================

user_ids = train_df["user_id"].unique()
product_ids = train_df["product_id"].unique()

user_map = {u: i for i, u in enumerate(user_ids)}
item_map = {p: i for i, p in enumerate(product_ids)}

num_users = len(user_ids)
num_items = len(product_ids)


# ==========================================================
# 3. BUILD RATING MATRIX
# ==========================================================

rows = train_df["user_id"].map(user_map)
cols = train_df["product_id"].map(item_map)
ratings = train_df["rating"]

R = csr_matrix((ratings, (rows, cols)), shape=(num_users, num_items))


# ==========================================================
# 4. EVALUATION FUNCTION
# ==========================================================

def evaluate(predicted_matrix, model_name):

    y_true = []
    y_pred = []

    for _, row in test_df.iterrows():
        u = row["user_id"]
        i = row["product_id"]

        if u in user_map and i in item_map:
            u_idx = user_map[u]
            i_idx = item_map[i]

            y_true.append(row["rating"])
            y_pred.append(predicted_matrix[u_idx, i_idx])

    rmse = np.sqrt(mean_squared_error(y_true, y_pred))

    y_true_bin = [1 if r >= threshold else 0 for r in y_true]
    y_pred_bin = [1 if r >= threshold else 0 for r in y_pred]

    precision = precision_score(y_true_bin, y_pred_bin)
    recall = recall_score(y_true_bin, y_pred_bin)
    f1 = f1_score(y_true_bin, y_pred_bin)

    # Precision@K / Recall@K
    precisions = []
    recalls = []

    for user in test_df["user_id"].unique():
        if user not in user_map:
            continue

        u_idx = user_map[user]
        user_preds = predicted_matrix[u_idx].copy()

        rated_items = R[u_idx].indices
        user_preds[rated_items] = -np.inf

        top_k_idx = np.argsort(user_preds)[-K:][::-1]
        top_k_items = set(product_ids[i] for i in top_k_idx)

        user_test = test_df[
            (test_df["user_id"] == user) &
            (test_df["rating"] >= threshold)
        ]

        relevant_items = set(user_test["product_id"])

        if len(relevant_items) == 0:
            continue

        hits = len(top_k_items & relevant_items)

        precisions.append(hits / K)
        recalls.append(hits / len(relevant_items))

    precision_k = np.mean(precisions)
    recall_k = np.mean(recalls)

    return {
        "Model": model_name,
        "RMSE": rmse,
        "Precision": precision,
        "Recall": recall,
        "F1": f1,
        "Precision@10": precision_k,
        "Recall@10": recall_k
    }


results = []


# ==========================================================
# 5. MODEL 1 — POPULARITY
# ==========================================================

item_mean = train_df.groupby("product_id")["rating"].mean()
global_mean = train_df["rating"].mean()

pop_pred = np.full((num_users, num_items), global_mean)

for item, mean in item_mean.items():
    if item in item_map:
        pop_pred[:, item_map[item]] = mean

results.append(evaluate(pop_pred, "Popularity"))


# ==========================================================
# 6. MODEL 2 — USER KNN
# ==========================================================

user_sim = cosine_similarity(R)
user_sim[np.isnan(user_sim)] = 0

user_knn_pred = user_sim @ R.toarray()
norm = np.abs(user_sim).sum(axis=1, keepdims=True)
user_knn_pred = user_knn_pred / np.where(norm == 0, 1, norm)

results.append(evaluate(user_knn_pred, "User-KNN"))


# ==========================================================
# 7. MODEL 3 — ITEM KNN
# ==========================================================

item_sim = cosine_similarity(R.T)
item_sim[np.isnan(item_sim)] = 0

item_knn_pred = R.toarray() @ item_sim
norm = np.abs(item_sim).sum(axis=1)
item_knn_pred = item_knn_pred / np.where(norm == 0, 1, norm)

results.append(evaluate(item_knn_pred, "Item-KNN"))


# ==========================================================
# 8. MODEL 4 — VANILLA SVD
# ==========================================================

U, sigma, Vt = svds(R, k=50)
sigma = np.diag(sigma)

svd_pred = U @ sigma @ Vt
svd_pred = np.clip(svd_pred, 1, 5)

results.append(evaluate(svd_pred, "Vanilla SVD"))


# ==========================================================
# 9. MODEL 5 — BIAS-SVD
# ==========================================================

global_mean = train_df["rating"].mean()

user_bias = np.array([
    R[u].data.mean() - global_mean if len(R[u].data) > 0 else 0
    for u in range(num_users)
])

item_bias = np.array([
    R[:, i].data.mean() - global_mean if len(R[:, i].data) > 0 else 0
    for i in range(num_items)
])

R_centered = R.copy().tolil()

for u in range(num_users):
    for idx in range(len(R_centered.data[u])):
        i = R_centered.rows[u][idx]
        R_centered.data[u][idx] -= (global_mean + user_bias[u] + item_bias[i])

R_centered = R_centered.tocsr()

U, sigma, Vt = svds(R_centered, k=50)
sigma = np.diag(sigma)

bias_svd_pred = U @ sigma @ Vt

for u in range(num_users):
    bias_svd_pred[u, :] += global_mean + user_bias[u] + item_bias

bias_svd_pred = np.clip(bias_svd_pred, 1, 5)

results.append(evaluate(bias_svd_pred, "Bias-SVD"))


# ==========================================================
# 10. FINAL RESULTS TABLE
# ==========================================================

results_df = pd.DataFrame(results)
print("\n========== FINAL COMPARISON ==========\n")
print(results_df)

Train size: 22001
Test size: 5501


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])




         Model      RMSE  Precision    Recall        F1  Precision@10  \
0   Popularity  1.142827   0.646688  0.056881  0.104565      0.004654   
1     User-KNN  3.694987   0.000000  0.000000  0.000000      0.022751   
2     Item-KNN  3.726156   0.000000  0.000000  0.000000      0.023681   
3  Vanilla SVD  2.969583   0.000000  0.000000  0.000000      0.019855   
4     Bias-SVD  1.180370   0.655943  0.203663  0.310819      0.005791   

   Recall@10  
0   0.013113  
1   0.058275  
2   0.061673  
3   0.049238  
4   0.016463  
