# Alternating Least Squares (ALS) Matrix Factorization

- Load and validate the **subset_ratings.csv** file  
- Build an **item × user CSR matrix** (ratings as confidence)  
- Fit an **explicit‑feedback ALS** model (latent factors, regularization, 15 iter)  
- Map internal indices back to original `movieId` / `userId` for results  
- Split data per user (80 % train / 20 % test) and report **RMSE / MAE**  
- Output **Top‑N recommendations** for 1 000 sampled users to JSON

In [15]:
import pandas as pd
import numpy as np
from scipy import sparse
from implicit.als import AlternatingLeastSquares
from pathlib import Path
import json, random
from tqdm import tqdm

In [16]:
ratings = pd.read_csv("subset_ratings.csv", usecols=["userId", "movieId", "rating"])
print("Loaded:", ratings.shape)

Loaded: (2079356, 3)


In [17]:
def split_per_user(df, test_frac=0.2, seed=7):
    rng = np.random.default_rng(seed)
    train, test = [], []
    for _, grp in df.groupby("userId"):
        if len(grp) < 2:
            train.append(grp)
        else:
            mask = rng.random(len(grp)) < (1 - test_frac)
            train.append(grp[mask])
            test.append(grp[~mask])
    return pd.concat(train), pd.concat(test)

train_df, test_df = split_per_user(ratings)
print("train / test:", train_df.shape, test_df.shape)

train / test: (1664632, 3) (414724, 3)


In [None]:
user_ids = train_df.userId.unique()       
item_ids_raw = train_df.movieId.unique()  

uid_map = {u: i for i, u in enumerate(user_ids)}
mid_map = {m: i for i, m in enumerate(item_ids_raw)}

idx2mid = np.empty(len(item_ids_raw), dtype=np.int32)
for mid, idx in mid_map.items():
    idx2mid[idx] = mid


def df_to_csr(df):
    rows = df.movieId.map(mid_map)
    cols = df.userId.map(uid_map)
    data = df.rating.astype(np.float32)
    shape = (len(item_ids_raw), len(user_ids))
    return sparse.csr_matrix((data, (rows, cols)), shape=shape)

train_mat = df_to_csr(train_df)
print("CSR shape (items×users):", train_mat.shape)

CSR shape (items×users): (7020, 10000)


In [19]:
als = AlternatingLeastSquares(
    factors=64,
    regularization=0.1,
    iterations=15,
    use_native=True,
    dtype=np.float32,
)
als.fit(train_mat)

100%|██████████| 15/15 [00:00<00:00, 17.17it/s]


In [None]:
uid_test_idx = test_df.userId.map(uid_map)
mid_test_idx = test_df.movieId.map(lambda m: mid_map.get(m, -1))
mask = mid_test_idx >= 0

uid_vec = uid_test_idx.values[mask]
mid_vec = mid_test_idx.values[mask]

mask2 = uid_vec < als.user_factors.shape[0]
uid_vec, mid_vec = uid_vec[mask2], mid_vec[mask2]
y_true = test_df.rating.values[mask][mask2]

y_pred = np.sum(als.user_factors[uid_vec] * als.item_factors[mid_vec], axis=1)
rmse = np.sqrt(np.mean((y_true - y_pred) ** 2))
mae  = np.mean(np.abs(y_true - y_pred))
print(f"ALS  RMSE = {rmse:.4f}   MAE = {mae:.4f}")

ALS  RMSE = 3.6390   MAE = 3.4835


In [None]:
user_items_full = train_mat.T.tocsr() 

K, N = 10, 1_000
sample_users = random.sample(list(user_ids), min(N, len(user_ids)))

preds = {}
for u in tqdm(sample_users, desc=f"Top‑{K} recs"):
    uid = uid_map.get(u)
    if uid is None or uid >= als.user_factors.shape[0]:
        continue 

    rec_iids, _ = als.recommend(
        userid=uid,
        user_items=user_items_full[uid],  # 1‑row CSR
        N=K,
        filter_already_liked_items=True,
    )

    valid_rec_iids = [i for i in rec_iids if i < idx2mid.shape[0] and idx2mid[i] != -1]
    preds[int(u)]  = [int(idx2mid[i]) for i in valid_rec_iids]

Path("predictions").mkdir(exist_ok=True)
with open("predictions/als_top10_subset.json", "w") as fp:
    json.dump(preds, fp)

print(f"✅  Saved {len(preds)} users → predictions/als_top10_subset.json")

Top‑10 recs: 100%|██████████| 1000/1000 [00:00<00:00, 4330.34it/s]

✅  Saved 693 users → predictions/als_top10_subset.json



