## 4. Model Building

In [1]:
import os
import numpy as np
import pandas as pd
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics.pairwise import cosine_similarity
import pickle
from scipy.sparse import coo_matrix

# Artifact and data locations, all relative to the notebooks/ directory.
ARTIFACTS_DIR = "../artifacts"
EMBEDDINGS_DIR = os.path.join(ARTIFACTS_DIR, "embeddings")
MODELS_DIR = os.path.join(ARTIFACTS_DIR, "models")
INDICES_DIR = os.path.join(ARTIFACTS_DIR, "indices")

DATA_FILTERED_PATH = "../data/processed/df_filtered.pkl"
ENCODED_ITEM_PROPS_PATH = "../data/processed/item_properties_encoded.pkl"

# Make sure every output directory exists before anything is written to it.
for _artifact_dir in (EMBEDDINGS_DIR, MODELS_DIR, INDICES_DIR):
    os.makedirs(_artifact_dir, exist_ok=True)

# Interaction events, already filtered by an earlier step.
df_filtered = pd.read_pickle(DATA_FILTERED_PATH)
print(f"Loaded filtered data: {df_filtered.shape}")

# Encoded item property table (one row per item).
encoded_item_props = pd.read_pickle(ENCODED_ITEM_PROPS_PATH)
print(f"Loaded encoded item properties: {encoded_item_props.shape}")

# 1. Popularity-based model
# Count how many rows of df_filtered mention each item; value_counts()
# returns the counts sorted in descending order, so the most popular
# items come first.
popularity = df_filtered['itemid'].value_counts()

# Turn the Series into a two-column frame: itemid, count.
popularity_df = popularity.rename_axis('itemid').reset_index(name='count')

popularity_model_path = os.path.join(MODELS_DIR, "popularity_model.parquet")
popularity_df.to_parquet(popularity_model_path)
print("Saved popularity model.")

# 2. Content-Based Filtering
# Coerce every column of encoded_item_props to a numeric dtype so it can be
# fed to cosine_similarity.
# Booleans become 0/1 integers.
bool_cols = encoded_item_props.select_dtypes(include=['bool']).columns
encoded_item_props[bool_cols] = encoded_item_props[bool_cols].astype(int)
# Object columns are parsed as numbers; anything unparseable becomes NaN.
obj_cols = encoded_item_props.select_dtypes(include=['object']).columns
encoded_item_props[obj_cols] = encoded_item_props[obj_cols].apply(
    pd.to_numeric, errors='coerce'
)
# Missing values are treated as 0, then any remaining non-numeric column is dropped.
encoded_item_props = encoded_item_props.fillna(0).select_dtypes(include=[np.number])
print(f"Cleaned encoded_item_props shape: {encoded_item_props.shape}")

# Keep only the 10k most popular items that also have a feature row.
# NOTE(review): the membership test assumes encoded_item_props is indexed by
# itemid (not a positional RangeIndex) - confirm against the preprocessing step.
top_items = popularity_df['itemid'].head(10000).tolist()
subset_items = [item for item in top_items if item in encoded_item_props.index]
encoded_subset = encoded_item_props.loc[subset_items]
print(f"Computing similarity on subset: {encoded_subset.shape}")

# Dense pairwise cosine similarity; row/column k corresponds to
# encoded_subset.index[k].
item_features = encoded_subset.values
item_sim_matrix = cosine_similarity(item_features)

# Persist the matrix together with the item ids that label its rows/columns.
sim_path = os.path.join(EMBEDDINGS_DIR, "item_similarity_sub.npy")
np.save(sim_path, item_sim_matrix)
subset_items_path = os.path.join(EMBEDDINGS_DIR, "item_similarity_sub_items.csv")
encoded_subset.index.to_series().to_csv(subset_items_path, index=False)
print(f"Saved item similarity matrix and subset items.")

def recommend_content_based(item_id, top_n=10):
    """Return the ids of the items most similar to ``item_id`` by content.

    Similarities are read from the module-level ``item_sim_matrix``, whose
    rows and columns follow the order of ``encoded_subset.index``.

    Args:
        item_id: Id of the query item; must be a label of
            ``encoded_subset.index`` to get any recommendation.
        top_n: Maximum number of recommendations to return.

    Returns:
        A list of at most ``top_n`` item ids ordered by decreasing cosine
        similarity, never containing ``item_id`` itself. The list is empty
        when ``item_id`` is unknown or ``top_n`` is not positive.
    """
    if top_n <= 0 or item_id not in encoded_subset.index:
        return []

    # Assumes the index labels are unique so get_loc returns a single
    # integer position - TODO confirm for the encoded item table.
    idx = encoded_subset.index.get_loc(item_id)
    sim_scores = item_sim_matrix[idx]

    # Stable sort on the negated scores gives a deterministic descending order.
    ranked = np.argsort(-sim_scores, kind="stable")

    # Drop the query item by position rather than assuming it sorts first:
    # items with identical features tie at 1.0, and an all-zero feature row
    # has similarity 0 with itself, so the first slot is not always the query.
    ranked = ranked[ranked != idx][:top_n]
    return encoded_subset.index[ranked].tolist()

print("Content-based filtering model built.")

# 3. KNN Collaborative Filtering
print("Building KNN collaborative filtering model...")

# Categorical encoding maps each distinct visitor / item to an integer code
# that doubles as its row / column position in the interaction matrix.
user_cat = df_filtered['visitorid'].astype('category')
item_cat = df_filtered['itemid'].astype('category')
user_codes = user_cat.cat.codes.values
item_codes = item_cat.cat.codes.values

# Number of rows of df_filtered per (visitor, item) pair.
from collections import Counter
pair_counts = Counter(zip(user_codes, item_codes))

# Unzip the (row, col, value) triplets expected by coo_matrix.
rows, cols, data = zip(
    *((u, i, count) for (u, i), count in pair_counts.items())
)
user_item_sparse = coo_matrix(
    (data, (rows, cols)),
    shape=(len(user_cat.cat.categories), len(item_cat.cat.categories)),
)

print(f"Sparse user-item matrix shape: {user_item_sparse.shape}, nnz={user_item_sparse.nnz}")

# Fit on the transpose so that each sample is an item described by the
# visitors who interacted with it (item-to-item neighbours).
knn_model = NearestNeighbors(
    metric='cosine',
    algorithm='brute',
    n_neighbors=11,
    n_jobs=-1,
)
knn_model.fit(user_item_sparse.T)

knn_model_path = os.path.join(MODELS_DIR, "knn_model.pkl")
with open(knn_model_path, "wb") as f:
    pickle.dump(knn_model, f)
print("Saved KNN model.")

def recommend_collaborative_knn_sparse(item_id, top_n=10):
    """Return the ids of the nearest item neighbours of ``item_id``.

    Neighbours are looked up with the module-level ``knn_model``, which was
    fitted on the rows of ``user_item_sparse.T`` (one row per category of
    ``item_cat``).

    Args:
        item_id: Id of the query item; must be one of
            ``item_cat.cat.categories`` to get any recommendation.
        top_n: Maximum number of recommendations to return.

    Returns:
        A list of at most ``top_n`` item ids in the order returned by
        ``knn_model.kneighbors``, never containing ``item_id`` itself. The
        list is empty when ``item_id`` is unknown or ``top_n`` is not
        positive.
    """
    item_ids = item_cat.cat.categories
    if top_n <= 0 or item_id not in item_ids:
        return []

    item_idx = item_ids.get_loc(item_id)

    # Ask for one extra neighbour to leave room for the query item, but never
    # more than the number of fitted items (kneighbors raises otherwise).
    n_neighbors = min(top_n + 1, len(item_ids))
    _, indices = knn_model.kneighbors(
        user_item_sparse.T.getrow(item_idx), n_neighbors=n_neighbors
    )

    # Remove the query item by index instead of assuming it is returned
    # first: items with identical interaction columns tie at distance 0.
    neighbour_idx = indices.ravel()
    neighbour_idx = neighbour_idx[neighbour_idx != item_idx][:top_n]
    return item_ids[neighbour_idx].tolist()

print("KNN collaborative filtering model built.")

# Save recommenders dictionary
# NOTE(review): pickle stores plain functions by reference (module and name),
# not their code or the globals they read (encoded_subset, item_sim_matrix,
# item_cat, user_item_sparse, knn_model). Loading this file therefore only
# works where functions with the same names are already importable - confirm
# this is what the consumers of recommender_functions.pkl expect.
recommender_functions_path = os.path.join(MODELS_DIR, "recommender_functions.pkl")
with open(recommender_functions_path, "wb") as f:
    pickle.dump(
        dict(
            recommend_content_based=recommend_content_based,
            recommend_collaborative_knn_sparse=recommend_collaborative_knn_sparse,
        ),
        f,
    )
print("Models and recommenders saved.")


Loaded filtered data: (833463, 12)
Loaded encoded item properties: (417053, 2272)
Saved popularity model.
Cleaned encoded_item_props shape: (417053, 2272)
Computing similarity on subset: (8966, 2272)
Saved item similarity matrix and subset items.
Content-based filtering model built.
Building KNN collaborative filtering model...
Sparse user-item matrix shape: (80112, 38977), nnz=450427
Saved KNN model.
KNN collaborative filtering model built.
Models and recommenders saved.
