In [23]:
import pandas as pd
import numpy as np
from pathlib import Path

from sklearn.model_selection import train_test_split, ParameterGrid
from sklearn.metrics import mean_absolute_error, root_mean_squared_error
from sklearn.preprocessing import StandardScaler

from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors

import lightgbm as lgb

# 1) Paths & load
DATA_DIR = Path().resolve().parent / "data"
TRAIN_FILE = DATA_DIR / "ratings.dat"
TEST_FILE = DATA_DIR / "ratings-Test.dat"
GENDER_FILE = DATA_DIR / "gender.dat"

train_df = pd.read_csv(TRAIN_FILE)
test_df = pd.read_csv(TEST_FILE)
gender_df = pd.read_csv(GENDER_FILE)

# Merge gender into both
train_df = train_df.merge(gender_df, on="userID", how="left")
test_df = test_df.merge(gender_df, on="userID", how="left")

# 2) Split train/validation
train_sub, val_sub = train_test_split(train_df, test_size=0.2, random_state=42)

In [16]:
def compute_baseline(df, λ=10):
    μ = df.rating.mean()
    b_i = df.groupby("profileID").rating.agg(lambda g: (g - μ).sum() / (len(g) + λ))
    b_u = df.groupby("userID").apply(
        lambda g: (g.rating - μ - b_i.reindex(g.profileID).values).sum() / (len(g) + λ)
    )
    return μ, b_i, b_u


def baseline_pred(u, i, μ, b_i, b_u):
    return μ + b_u.get(u, 0.0) + b_i.get(i, 0.0)


μ_sub, b_i_sub, b_u_sub = compute_baseline(train_sub)

# ————————————————
# 4) Residuals on train_sub
train_sub = train_sub.copy()
train_sub["baseline"] = train_sub.apply(
    lambda r: baseline_pred(r.userID, r.profileID, μ_sub, b_i_sub, b_u_sub), axis=1
)
train_sub["residual"] = train_sub["rating"] - train_sub["baseline"]

# ————————————————
# 5) Profile & User stats (on train_sub)
pf = train_sub.groupby("profileID").agg(
    avg_res=("residual", "mean"),
    count_res=("residual", "count"),
    avg_rating=("rating", "mean"),
    rating_std=("rating", "std"),
)
pf["log_count"] = np.log1p(pf["count_res"])
gender_counts = train_sub.pivot_table(
    index="profileID", columns="Gender", values="residual", aggfunc="count"
).fillna(0)
pf["p_female"] = gender_counts.get("F", 0) / pf["count_res"]
pf["p_male"] = gender_counts.get("M", 0) / pf["count_res"]
pf["p_unknown"] = gender_counts.get("U", 0) / pf["count_res"]

uf = train_sub.groupby("userID").agg(
    u_mean=("rating", "mean"), u_std=("rating", "std"), u_count=("rating", "count")
)
uf["u_logcount"] = np.log1p(uf["u_count"])

# ————————————————
# 6) Scaling
profile_cols = [
    "avg_res",
    "count_res",
    "avg_rating",
    "rating_std",
    "log_count",
    "p_female",
    "p_male",
    "p_unknown",
]
user_cols = ["u_mean", "u_std", "u_logcount"]

pf_scaler = StandardScaler().fit(pf[profile_cols].to_numpy())
uf_scaler = StandardScaler().fit(uf[user_cols].to_numpy())


# ————————————————
# 7) Dataset builder
def make_dataset(
    df, μ, b_i, b_u, pf_df, uf_df, pf_scaler, uf_scaler, profile_cols, user_cols
):
    df = df.copy()
    df["baseline"] = df.apply(
        lambda r: baseline_pred(r.userID, r.profileID, μ, b_i, b_u), axis=1
    )
    merged = df.merge(
        pf_df[profile_cols], left_on="profileID", right_index=True, how="left"
    ).merge(uf_df[user_cols], left_on="userID", right_index=True, how="left")
    merged[profile_cols] = merged[profile_cols].fillna(0)
    merged[user_cols] = merged[user_cols].fillna(0)

    Xp = pf_scaler.transform(merged[profile_cols].to_numpy())
    Xu = uf_scaler.transform(merged[user_cols].to_numpy())
    X = np.hstack([Xp, Xu])

    baseline = merged["baseline"].to_numpy()
    y_true = merged["rating"].to_numpy()
    y_res = y_true - baseline

    return X, y_res, baseline, y_true


X_train, y_train_res, base_train, _ = make_dataset(
    train_sub,
    μ_sub,
    b_i_sub,
    b_u_sub,
    pf,
    uf,
    pf_scaler,
    uf_scaler,
    profile_cols,
    user_cols,
)
X_val, y_val_res, base_val, y_val = make_dataset(
    val_sub,
    μ_sub,
    b_i_sub,
    b_u_sub,
    pf,
    uf,
    pf_scaler,
    uf_scaler,
    profile_cols,
    user_cols,
)

# ————————————————
# 8) Grid‐search LightGBM
param_grid = {
    "n_estimators": [100, 200],
    "learning_rate": [0.05, 0.1],
    "max_depth": [5, 10],
}
best_mae, best_params = np.inf, None

for params in ParameterGrid(param_grid):
    mdl = lgb.LGBMRegressor(**params)
    mdl.fit(X_train, y_train_res)
    pred_res = mdl.predict(X_val)
    preds = base_val + pred_res
    mae = mean_absolute_error(y_val, preds)
    print(f"{params} → Val MAE = {mae:.4f}")
    if mae < best_mae:
        best_mae, best_params = mae, params

print(f"Best params: {best_params} → Val MAE = {best_mae:.4f}")

# ————————————————
# 9) Retrain on full training data
μ_full, b_i_full, b_u_full = compute_baseline(train_df)

# rebuild pf_full exactly as pf above, but using full df
pf_full = train_df.groupby("profileID").agg(
    avg_res=(
        "rating",
        lambda g: (g - μ_full - b_i_full.reindex(g.index).values).mean(),
    ),
    count_res=("rating", "count"),
    avg_rating=("rating", "mean"),
    rating_std=("rating", "std"),
)
pf_full["log_count"] = np.log1p(pf_full["count_res"])
gender_counts_full = train_df.pivot_table(
    index="profileID", columns="Gender", values="rating", aggfunc="count"
).fillna(0)
pf_full["p_female"] = gender_counts_full.get("F", 0) / pf_full["count_res"]
pf_full["p_male"] = gender_counts_full.get("M", 0) / pf_full["count_res"]
pf_full["p_unknown"] = gender_counts_full.get("U", 0) / pf_full["count_res"]

uf_full = train_df.groupby("userID").agg(
    u_mean=("rating", "mean"), u_std=("rating", "std"), u_count=("rating", "count")
)
uf_full["u_logcount"] = np.log1p(uf_full["u_count"])

pf_scaler_full = StandardScaler().fit(pf_full[profile_cols].to_numpy())
uf_scaler_full = StandardScaler().fit(uf_full[user_cols].to_numpy())

X_full, y_full_res, base_full, _ = make_dataset(
    train_df,
    μ_full,
    b_i_full,
    b_u_full,
    pf_full,
    uf_full,
    pf_scaler_full,
    uf_scaler_full,
    profile_cols,
    user_cols,
)

final_model = lgb.LGBMRegressor(**best_params)
final_model.fit(X_full, y_full_res)

# ————————————————
# 10) Test evaluation
X_test, y_test_res, base_test, y_test = make_dataset(
    test_df,
    μ_full,
    b_i_full,
    b_u_full,
    pf_full,
    uf_full,
    pf_scaler_full,
    uf_scaler_full,
    profile_cols,
    user_cols,
)
pred_res_test = final_model.predict(X_test)
pred_test = base_test + pred_res_test
test_mae = root_mean_squared_error(y_test, pred_test)
print(f"✨ Final Test MAE = {test_mae:.4f}")

  b_u = df.groupby("userID").apply(


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.006135 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2799
[LightGBM] [Info] Number of data points in the train set: 2576029, number of used features: 11
[LightGBM] [Info] Start training from score -0.004810




{'learning_rate': 0.05, 'max_depth': 5, 'n_estimators': 100} → Val MAE = 1.3976
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005567 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2799
[LightGBM] [Info] Number of data points in the train set: 2576029, number of used features: 11
[LightGBM] [Info] Start training from score -0.004810




{'learning_rate': 0.05, 'max_depth': 5, 'n_estimators': 200} → Val MAE = 1.3787
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005794 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2799
[LightGBM] [Info] Number of data points in the train set: 2576029, number of used features: 11
[LightGBM] [Info] Start training from score -0.004810




{'learning_rate': 0.05, 'max_depth': 10, 'n_estimators': 100} → Val MAE = 1.3866
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005768 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2799
[LightGBM] [Info] Number of data points in the train set: 2576029, number of used features: 11
[LightGBM] [Info] Start training from score -0.004810




{'learning_rate': 0.05, 'max_depth': 10, 'n_estimators': 200} → Val MAE = 1.3762
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005897 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2799
[LightGBM] [Info] Number of data points in the train set: 2576029, number of used features: 11
[LightGBM] [Info] Start training from score -0.004810




{'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 100} → Val MAE = 1.3760
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005885 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2799
[LightGBM] [Info] Number of data points in the train set: 2576029, number of used features: 11
[LightGBM] [Info] Start training from score -0.004810




{'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 200} → Val MAE = 1.3661
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.006060 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2799
[LightGBM] [Info] Number of data points in the train set: 2576029, number of used features: 11
[LightGBM] [Info] Start training from score -0.004810




{'learning_rate': 0.1, 'max_depth': 10, 'n_estimators': 100} → Val MAE = 1.3754
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005865 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2799
[LightGBM] [Info] Number of data points in the train set: 2576029, number of used features: 11
[LightGBM] [Info] Start training from score -0.004810




{'learning_rate': 0.1, 'max_depth': 10, 'n_estimators': 200} → Val MAE = 1.3655
Best params: {'learning_rate': 0.1, 'max_depth': 10, 'n_estimators': 200} → Val MAE = 1.3655


  b_u = df.groupby("userID").apply(


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.009838 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2800
[LightGBM] [Info] Number of data points in the train set: 3220037, number of used features: 11
[LightGBM] [Info] Start training from score -0.003447
✨ Final Test MAE = 2.3473




In [14]:
import pandas as pd
import numpy as np
from pathlib import Path

from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, root_mean_squared_error

# 1) Paths & load
DATA_DIR = Path().resolve().parent / "data"
TRAIN_FILE = DATA_DIR / "ratings.dat"
TEST_FILE = DATA_DIR / "ratings-Test.dat"
GENDER_FILE = DATA_DIR / "gender.dat"

train_df = pd.read_csv(TRAIN_FILE)
test_df = pd.read_csv(TEST_FILE)
gender_df = pd.read_csv(GENDER_FILE)

# Merge gender into both sets
train_df = train_df.merge(gender_df, on="userID", how="left")
test_df = test_df.merge(gender_df, on="userID", how="left")

# One-hot encode user gender
for df in (train_df, test_df):
    df["is_female"] = (df["Gender"] == "F").astype(int)
    df["is_male"] = (df["Gender"] == "M").astype(int)

# 2) Compute profile-side stats on TRAIN ONLY
pf = train_df.groupby("profileID").agg(
    rating_count=("rating", "count"),
    avg_rating=("rating", "mean"),
    rating_std=("rating", "std"),
)
pf["log_count"] = np.log1p(pf["rating_count"])

# 3) Compute user-side stats on TRAIN ONLY
uf = train_df.groupby("userID").agg(
    u_count=("rating", "count"), u_mean=("rating", "mean"), u_std=("rating", "std")
)
uf["u_logcount"] = np.log1p(uf["u_count"])


# 4) Merge back to create feature‐augmented DataFrames
def augment(df):
    # profile features
    df = df.merge(pf, left_on="profileID", right_index=True, how="left")
    # user features
    df = df.merge(uf, left_on="userID", right_index=True, how="left")
    # fill NaNs (new profiles/users) with global means
    df["rating_count"].fillna(train_df["rating"].mean(), inplace=True)
    df["avg_rating"].fillna(train_df["rating"].mean(), inplace=True)
    df["rating_std"].fillna(0, inplace=True)
    df["log_count"].fillna(0, inplace=True)
    df["u_count"].fillna(0, inplace=True)
    df["u_mean"].fillna(train_df["rating"].mean(), inplace=True)
    df["u_std"].fillna(0, inplace=True)
    df["u_logcount"].fillna(0, inplace=True)
    return df


train_aug = augment(train_df)
test_aug = augment(test_df)

# 5) Define feature columns
feature_cols = [
    "rating_count",
    "avg_rating",
    "rating_std",
    "log_count",
    "u_count",
    "u_mean",
    "u_std",
    "u_logcount",
    "is_female",
    "is_male",
]

# 6) Split train → train_sub / val_sub
train_sub, val_sub = train_test_split(train_aug, test_size=0.2, random_state=42)

# 7) Scale features
scaler = StandardScaler()
scaler.fit(train_sub[feature_cols].to_numpy())


def prepare(df):
    X = scaler.transform(df[feature_cols].to_numpy())
    y = df["rating"].to_numpy()
    return X, y


X_train, y_train = prepare(train_sub)
X_val, y_val = prepare(val_sub)
X_test, y_test = prepare(test_aug)

# 8) Fit simplest regressor (Ridge)
model = Ridge(alpha=1.0)
model.fit(X_train, y_train)

# 9) Evaluate on validation
val_preds = model.predict(X_val)
val_mae = mean_absolute_error(y_val, val_preds)
print(f"Validation MAE: {val_mae:.4f}")

# 10) Retrain on full training data
X_full, y_full = prepare(train_aug)
model.fit(X_full, y_full)

# 11) Final Test evaluation
test_preds = model.predict(X_test)
test_mae = mean_absolute_error(y_test, test_preds)
print(f"Final Test MAE (Ridge content model): {test_mae:.4f}")

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["rating_count"].fillna(train_df["rating"].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["avg_rating"].fillna(train_df["rating"].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate

Validation MAE: 1.3737
Final Test MAE (Ridge content model): 2.2323


## KNN Based

### COllaborative

In [22]:
# 2) Compute baseline biases (μ + b_u + b_i)
mu = train_df["rating"].mean()
# item (profile) bias
b_i = train_df.groupby("profileID")["rating"].agg(
    lambda g: (g - mu).sum() / (len(g) + 10)
)
# user bias
b_u = train_df.groupby("userID").apply(
    lambda g: (g.rating - mu - b_i.reindex(g.profileID).values).sum() / (len(g) + 10)
)


def baseline_pred(u, i):
    return mu + b_u.get(u, 0.0) + b_i.get(i, 0.0)


# 3) Build the item–user residual matrix
users = train_df["userID"].unique()
items = train_df["profileID"].unique()
user_to_idx = {u: idx for idx, u in enumerate(users)}
item_to_idx = {i: idx for idx, i in enumerate(items)}

# rows = item index, cols = user index
rows = train_df["profileID"].map(item_to_idx)
cols = train_df["userID"].map(user_to_idx)
data = train_df.apply(lambda r: r.rating - baseline_pred(r.userID, r.profileID), axis=1)

item_user_mat = csr_matrix((data, (rows, cols)), shape=(len(items), len(users)))

# 4) Fit sklearn’s NearestNeighbors once
K = 25
knn = NearestNeighbors(
    n_neighbors=K + 1,  # include self→we’ll skip it in prediction
    metric="cosine",
    algorithm="brute",
    n_jobs=-1,
)
knn.fit(item_user_mat)


# 5) Prediction function using sklearn’s neighbors + baseline fallback
def predict_rating(u_id, p_id):
    # fallback if new user/item
    if u_id not in user_to_idx or p_id not in item_to_idx:
        return baseline_pred(u_id, p_id)

    u_idx = user_to_idx[u_id]
    i_idx = item_to_idx[p_id]

    # find neighbors
    dist, nbrs = knn.kneighbors(item_user_mat[i_idx], return_distance=True)
    dist = dist[0][1:]  # skip self
    nbr_i = nbrs[0][1:]

    # fetch this user’s residuals on those neighbor items
    # .toarray() is faster than tolist for single-column
    user_col = item_user_mat[nbr_i, u_idx].toarray().ravel()
    mask = user_col != 0
    if not mask.any():
        return baseline_pred(u_id, p_id)

    # weight by inverse distance
    valid_dist = dist[mask]
    valid_res = user_col[mask]
    weights = 1 / (valid_dist + 1e-8)

    resid_pred = (valid_res * weights).sum() / weights.sum()
    return baseline_pred(u_id, p_id) + resid_pred


# 6) Score on test set
test_df["pred"] = test_df.apply(lambda r: predict_rating(r.userID, r.profileID), axis=1)
rmse = root_mean_squared_error(test_df["rating"], test_df["pred"])
print(f"Content‐based CF w/ sklearn KNN → Test RMSE = {rmse:.4f}")

  b_u = train_df.groupby("userID").apply(


Content‐based CF w/ sklearn KNN → Test RMSE = 2.1111


### COntent

In [24]:
mu = train_df.rating.mean()
b_i = train_df.groupby("profileID").rating.agg(
    lambda g, μ=mu: (g - μ).sum() / (len(g) + 10)
)
b_u = train_df.groupby("userID").apply(
    lambda g, μ=mu, b_i=b_i: (g.rating - μ - b_i.reindex(g.profileID).values).sum()
    / (len(g) + 10)
)


def baseline(u, p):
    return mu + b_u.get(u, 0.0) + b_i.get(p, 0.0)


# 3) Build content features (profile side)
pf = train_df.groupby("profileID").agg(
    avg_rating=("rating", "mean"),
    rating_std=("rating", "std"),
    rating_count=("rating", "count"),
    f_count=("Gender", lambda x: (x == "F").sum()),
    m_count=("Gender", lambda x: (x == "M").sum()),
)
pf["p_female"] = pf["f_count"] / pf["rating_count"]
pf["p_male"] = pf["m_count"] / pf["rating_count"]
pf["log_count"] = np.log1p(pf["rating_count"])
pf = pf.fillna(0)

# which columns to use as content vector
features = ["avg_rating", "rating_std", "log_count", "p_female", "p_male"]
X_content = pf[features].to_numpy()

# 4) Fit sklearn NearestNeighbors on content
K = 25
nn = NearestNeighbors(n_neighbors=K + 1, metric="cosine", algorithm="brute", n_jobs=-1)
nn.fit(X_content)

# mapping profileID <-> row index
profile_list = pf.index.to_list()
profile_index = {pid: i for i, pid in enumerate(profile_list)}

# 5) Precompute per-user residual history
train_df["residual"] = train_df.apply(
    lambda r: r.rating - baseline(r.userID, r.profileID), axis=1
)
user_hist = (
    train_df.groupby("userID")
    .apply(lambda g: dict(zip(g.profileID, g.residual)))
    .to_dict()
)


# 6) Prediction function
def predict(u, p):
    base = baseline(u, p)
    if p not in profile_index or u not in user_hist:
        return base

    i = profile_index[p]
    dists, nbrs = nn.kneighbors(X_content[i].reshape(1, -1), return_distance=True)
    dists = dists[0][1:]  # skip self
    nbrs = nbrs[0][1:]
    sims = 1 - dists  # cosine → similarity

    hist = user_hist[u]
    sims_list, res_list = [], []
    for sim, j in zip(sims, nbrs):
        pid = profile_list[j]
        if pid in hist:
            sims_list.append(sim)
            res_list.append(hist[pid])
    if not sims_list:
        return base

    sims_arr = np.array(sims_list)
    res_arr = np.array(res_list)
    return base + (sims_arr @ res_arr) / sims_arr.sum()


# 7) Score on test set
test_df["pred"] = test_df.apply(lambda r: predict(r.userID, r.profileID), axis=1)
rmse = root_mean_squared_error(test_df.rating, test_df.pred)
print(f"Content-based + gender features → Test RMSE = {rmse:.4f}")

  b_u = train_df.groupby("userID").apply(
  .apply(lambda g: dict(zip(g.profileID, g.residual)))


Content-based + gender features → Test RMSE = 2.1115
