In [1]:
import pandas as pd
import numpy as np
import os
import pickle

from sklearn.preprocessing import LabelEncoder
from scipy.sparse import coo_matrix, lil_matrix, save_npz
from implicit.als import AlternatingLeastSquares

Unnamed: 0,reviewerID,asin,overall,reviewText,unixReviewTime
0,A3D6FSS8RJFDE8,B006K551TO,3,I purchased this camera because we have a Sony...,1374624000
1,A3SO3G35DXC95Q,B000YB2IPK,4,I purchased this player a few days ago. I deci...,1201910400
2,A1CTVZ3W8XZWZ9,B00FXYT12G,5,Nikon DSLR cameras (with an APS-C sensor) have...,1395619200
3,AJDK7OUKLYX8,B0039UWW54,4,Modem works great but be prepared to have a ha...,1335139200
4,A1172I8JR1L2P6,B00D8JA2S0,5,"Mediabridge cables always work well for me, an...",1390780800


In [None]:
# Load the processed reviews and keep only the columns this notebook uses.
keep_columns = ["reviewerID", "asin", "overall", "reviewText", "unixReviewTime"]
df = pd.read_csv("../data/processed/electronics_subset.csv")[keep_columns]
# Replace missing review text with an empty string so the column is all str.
df["reviewText"] = df["reviewText"].fillna("").astype(str)
print(df.head())

In [None]:
# Fit one encoder per identifier column and store the encoded integer ids
# next to the raw identifiers. The encoders are kept so ids can be mapped
# back to the original strings later.
user_encoder, item_encoder = LabelEncoder(), LabelEncoder()
for encoder, raw_col, id_col in (
    (user_encoder, "reviewerID", "user_id"),
    (item_encoder, "asin", "item_id"),
):
    df[id_col] = encoder.fit_transform(df[raw_col])
print(df[["reviewerID", "user_id", "asin", "item_id"]].head())

In [None]:
# Build the user x item rating matrix: rows are encoded users, columns are
# encoded items, values are the "overall" ratings.
# NOTE(review): per the SciPy docs, converting COO to CSR sums entries that
# share a coordinate, so repeated (user, item) reviews would be added
# together -- confirm the subset has no such duplicates.
rating_values = df["overall"]
row_col_index = (df["user_id"], df["item_id"])
interactions = coo_matrix((rating_values, row_col_index)).tocsr()
print("Interaction matrix shape:", interactions.shape)
print("Non-zero entries:", interactions.nnz)

def compute_sparsity(X):
    """Return the fraction of cells in ``X`` that are not stored entries.

    Args:
        X: A matrix-like object exposing ``nnz`` (stored-entry count) and a
            ``shape`` with at least two dimensions.

    Returns:
        ``1 - nnz / (rows * cols)`` as a float.
    """
    total_cells = X.shape[0] * X.shape[1]
    density = X.nnz / total_cells
    return 1 - density
# Report the fraction of empty cells in the full interaction matrix.
print("Interactions sparsity:", compute_sparsity(interactions))

In [None]:
# Leave-one-out split: for every user with more than one interaction, move
# one item into `test_items` and blank it out of the training matrix.
# Users with a single interaction keep it and get no test entry.
# NOTE(review): LIL rows are documented by SciPy as sorted by column index,
# so the item taken here looks like the highest item id, not the most
# recent review -- confirm this is the intended hold-out rule.
train = interactions.copy().tolil()
test_items = {}
for user, items in enumerate(train.rows):
    if len(items) <= 1:
        continue
    held_out = items[-1]
    test_items[user] = held_out
    train[user, held_out] = 0
train = train.tocsr()
print("Train sparsity:", compute_sparsity(train))

In [None]:
# ALS hyper-parameters gathered in one place; random_state pins the
# initialisation so repeated runs are comparable.
als_params = {
    "factors": 50,
    "regularization": 0.1,
    "iterations": 20,
    "random_state": 42,
}
als_model = AlternatingLeastSquares(**als_params)
# Fit on the training matrix only, so held-out items stay unseen.
als_model.fit(train)

In [None]:
def precision_at_k(model, interactions, k=10, filter_already_liked_items=False):
    """Mean precision@k of ``model`` measured against ``interactions``.

    For every user with at least one stored interaction, the top ``k``
    recommendations are compared with that user's interacted items and the
    per-user precision ``|recommended & interacted| / k`` is averaged.

    Args:
        model: Object exposing ``recommend(userid, user_items, N=...,
            filter_already_liked_items=...)`` that returns ``(ids, scores)``.
        interactions: CSR user x item matrix; row ``u`` supplies both the
            ``user_items`` argument and the ground-truth item set.
        k: Number of recommendations requested per user.
        filter_already_liked_items: Forwarded to ``model.recommend``. It
            defaults to ``False`` because the ground truth here IS the set of
            already-liked items; filtering them out would leave every
            intersection empty and force the metric to 0.

    Returns:
        The average per-user precision, or ``0.0`` when no user has any
        interaction (instead of raising ``ZeroDivisionError``).
    """
    precisions = []
    for user in range(interactions.shape[0]):
        user_row = interactions[user]
        true_items = user_row.indices
        if len(true_items) == 0:
            # Nothing to score against for this user.
            continue
        rec_items, _ = model.recommend(
            user,
            user_row,
            N=k,
            filter_already_liked_items=filter_already_liked_items,
        )
        hits = len(set(rec_items) & set(true_items))
        precisions.append(hits / k)
    if not precisions:
        return 0.0
    return sum(precisions) / len(precisions)

In [None]:
def precision_at_k_test(model, train, test_items, k=10):
    """Fraction of users whose held-out item appears in their top-``k`` list.

    Despite the name, the value computed is a hit rate: each user contributes
    1 if their single held-out item is recommended and 0 otherwise.

    Args:
        model: Object exposing ``recommend(userid, user_items, N=...)`` that
            returns ``(ids, scores)``.
        train: User x item matrix; row ``u`` is passed as ``user_items``.
        test_items: Mapping of user index -> held-out item index.
        k: Number of recommendations requested per user.

    Returns:
        ``hits / len(test_items)``, or ``0.0`` when ``test_items`` is empty
        (instead of raising ``ZeroDivisionError``).
    """
    if not test_items:
        return 0.0
    hits = 0
    for user, true_item in test_items.items():
        # Relies on the model's default handling of items already in
        # train[user]; presumably they are filtered out -- confirm for the
        # model in use.
        rec_items, _ = model.recommend(user, train[user], N=k)
        if true_item in rec_items:
            hits += 1
    return hits / len(test_items)

# Score the model against all interactions, then against the held-out items.
full_precision = precision_at_k(als_model, interactions, k=10)
print("Precision@10 (full):", full_precision)
heldout_precision = precision_at_k_test(als_model, train, test_items, k=10)
print("Precision@10 (test):", heldout_precision)

In [None]:
# Persist the trained model, the id encoders and both matrices to disk.
os.makedirs("../models", exist_ok=True)
os.makedirs("../data/processed", exist_ok=True)

# Context managers make sure each pickle file is flushed and closed, even if
# serialisation raises; a bare open() passed to pickle.dump leaves the handle
# open until garbage collection.
with open("../models/als_model.pkl", "wb") as model_file:
    pickle.dump(als_model, model_file)
with open("../models/user_mapping.pkl", "wb") as user_file:
    pickle.dump(user_encoder, user_file)
with open("../models/item_mapping.pkl", "wb") as item_file:
    pickle.dump(item_encoder, item_file)

save_npz("../data/processed/interactions.npz", interactions)
save_npz("../data/processed/train.npz", train)

print("Saved model, encoders, and matrices.")