In [1]:
# recommender_clustering.py
# Simple recommender built on user clustering (KMeans, Hierarchical, DBSCAN)
# Place this script and your "mixed dataset.csv" in same folder and run.
# If no interactions file exists, synthetic interactions will be generated.

import pandas as pd
import numpy as np
from pathlib import Path

# ------------------------
# Utilities / Robust OHE
# ------------------------
def make_onehot_encoder(**kwargs):
    from sklearn.preprocessing import OneHotEncoder
    try:
        return OneHotEncoder(**kwargs, sparse=False)
    except TypeError:
        return OneHotEncoder(**kwargs, sparse_output=False)

def preprocess_user_features(df):
    # encode categorical, scale numeric (simple, compatible)
    from sklearn.preprocessing import StandardScaler
    from sklearn.compose import ColumnTransformer

    numerical_cols = df.select_dtypes(exclude='object').columns.tolist()
    categorical_cols = df.select_dtypes(include='object').columns.tolist()

    ohe = make_onehot_encoder(drop='first', handle_unknown='ignore')
    scaler = StandardScaler()
    ct = ColumnTransformer(
        transformers=[
            ('ohe', ohe, categorical_cols),
            ('scale', scaler, numerical_cols)
        ],
        remainder='drop'
    )
    X = ct.fit_transform(df)
    # build names (optional)
    names = []
    if categorical_cols:
        try:
            names = list(ct.named_transformers_['ohe'].get_feature_names_out(categorical_cols))
        except Exception:
            cats = ct.named_transformers_['ohe'].categories_
            for col, cats_vals in zip(categorical_cols, cats):
                for c in cats_vals[1:]:
                    names.append(f"{col}_{c}")
    names = names + numerical_cols
    X = np.asarray(X)
    return X, names, ct

# ------------------------
# Recommendation helpers
# ------------------------
def generate_synthetic_interactions(n_users, n_items=100, min_items=5, max_items=20, seed=42):
    rng = np.random.default_rng(seed)
    rows = []
    for u in range(n_users):
        k = rng.integers(min_items, max_items + 1)
        items = rng.choice(n_items, size=k, replace=False)
        for it in items:
            # rating or purchase count â€” use implicit 1
            rows.append({'user_id': u, 'item_id': int(it), 'rating': 1})
    return pd.DataFrame(rows)

def build_user_item_matrix(interactions, user_index_map, n_items):
    # implicit feedback matrix (users x items) with counts
    n_users = len(user_index_map)
    mat = np.zeros((n_users, n_items), dtype=int)
    for _, row in interactions.iterrows():
        uid = row['user_id']
        iid = int(row['item_id'])
        if uid in user_index_map:
            uidx = user_index_map[uid]
            if 0 <= iid < n_items:
                mat[uidx, iid] += int(row.get('rating', 1))
    return mat

def recommend_topN_by_cluster(user_idx, cluster_labels, user_item_matrix, N=5):
    """
    For a user index (0-based), find users in same cluster,
    sum their item counts, and recommend top N items not already consumed by the user.
    """
    label = cluster_labels[user_idx]
    if label == -1:
        # noise: recommend global popular
        pop = user_item_matrix.sum(axis=0)
    else:
        members = np.where(cluster_labels == label)[0]
        if len(members) == 0:
            pop = user_item_matrix.sum(axis=0)
        else:
            pop = user_item_matrix[members].sum(axis=0)
    # items the user already consumed
    consumed = np.where(user_item_matrix[user_idx] > 0)[0]
    # rank items by popularity
    ranked = np.argsort(-pop)
    # filter out consumed
    recs = [int(i) for i in ranked if i not in set(consumed)]
    return recs[:N]

# ------------------------
# Main flow
# ------------------------
def main():
    # 1) Load users (features)
    path = Path.cwd()
    # find CSV file that looks like mixed dataset
    possible = [f for f in path.glob("*.csv") if "mixed" in f.name.lower() and "dataset" in f.name.lower()]
    if possible:
        user_file = possible[0].name
    else:
        # fallback exact name
        if (path / "mixed dataset.csv").exists():
            user_file = "mixed dataset.csv"
        elif (path / "mixed_dataset.csv").exists():
            user_file = "mixed_dataset.csv"
        else:
            raise FileNotFoundError("Could not find your mixed dataset CSV. Put it in this folder and include 'mixed' and 'dataset' in the filename.")
    users = pd.read_csv(user_file)
    print("Loaded user features from:", user_file, "shape:", users.shape)

    # assign user ids if none present -- use index as user_id
    users = users.reset_index(drop=True)
    users['user_id'] = users.index  # ensure a user_id column for mapping

    # 2) Load or generate interactions.csv (user_id, item_id, rating)
    if (path / "interactions.csv").exists():
        interactions = pd.read_csv("interactions.csv")
        print("Loaded interactions.csv with", len(interactions), "rows")
    else:
        print("No interactions.csv found. Generating synthetic interactions (implicit) ...")
        interactions = generate_synthetic_interactions(n_users=len(users), n_items=200, min_items=5, max_items=25)
        # user ids in synthetic are 0..n-1, match our user_id
        interactions.to_csv("interactions.csv", index=False)
        print("Saved synthetic interactions.csv with", len(interactions), "rows")

    # 3) Preprocess user features for clustering
    feature_df = users.drop(columns=['user_id'], errors='ignore')
    X, names, _ = preprocess_user_features(feature_df)
    print("Preprocessed user features shape:", X.shape)

    # 4) Build user-item matrix
    n_items = int(interactions['item_id'].max()) + 1
    user_index_map = {uid: idx for idx, uid in enumerate(users['user_id'].tolist())}
    uim = build_user_item_matrix(interactions, user_index_map, n_items)
    print("Built user-item matrix:", uim.shape)

    # Clustering & recommending functions
    from sklearn.cluster import KMeans, AgglomerativeClustering
    from sklearn.cluster import DBSCAN

    # ------ KMeans -------
    k = 8
    km = KMeans(n_clusters=k, init='k-means++', random_state=42, n_init=10)
    km_labels = km.fit_predict(X)
    print("KMeans clusters:", np.unique(km_labels, return_counts=True))
    # recommendations per user
    recs_k = []
    for uid in users['user_id']:
        idx = user_index_map[uid]
        recs = recommend_topN_by_cluster(idx, km_labels, uim, N=10)
        recs_k.append({'user_id': int(uid), 'recommendations': recs})
    pd.DataFrame(recs_k).to_csv("recommendations_kmeans.csv", index=False)
    print("Saved recommendations_kmeans.csv")

    # ------ Hierarchical -------
    H = 8
    agg = AgglomerativeClustering(n_clusters=H, linkage='ward')
    agg_labels = agg.fit_predict(X)
    print("Hierarchical clusters:", np.unique(agg_labels, return_counts=True))
    recs_h = []
    for uid in users['user_id']:
        idx = user_index_map[uid]
        recs = recommend_topN_by_cluster(idx, agg_labels, uim, N=10)
        recs_h.append({'user_id': int(uid), 'recommendations': recs})
    pd.DataFrame(recs_h).to_csv("recommendations_hierarchical.csv", index=False)
    print("Saved recommendations_hierarchical.csv")

    # ------ DBSCAN -------
    # choose eps heuristically: use median of k-distances approach simplified by scaling
    # We'll try a small eps and fallback if all noise.
    db = DBSCAN(eps=1.5, min_samples=X.shape[1]+1)
    db_labels = db.fit_predict(X)
    # if DBSCAN produced all -1 (unlikely), try a smaller eps
    if np.all(db_labels == -1):
        db = DBSCAN(eps=0.8, min_samples=5)
        db_labels = db.fit_predict(X)
    print("DBSCAN clusters unique labels:", np.unique(db_labels, return_counts=True))
    recs_d = []
    for uid in users['user_id']:
        idx = user_index_map[uid]
        recs = recommend_topN_by_cluster(idx, db_labels, uim, N=10)
        recs_d.append({'user_id': int(uid), 'recommendations': recs})
    pd.DataFrame(recs_d).to_csv("recommendations_dbscan.csv", index=False)
    print("Saved recommendations_dbscan.csv")

    # Print examples
    print("\nExample recommendations (user_id -> top 5 items):")
    for method, df_rec in [("KMeans", recs_k), ("Hierarchical", recs_h), ("DBSCAN", recs_d)]:
        sample = df_rec[0]  # first user
        print(method, "user", sample['user_id'], "->", sample['recommendations'][:5])

if __name__ == "__main__":
    main()


Loaded user features from: mixed_dataset.csv shape: (10000, 10)
No interactions.csv found. Generating synthetic interactions (implicit) ...
Saved synthetic interactions.csv with 149484 rows
Preprocessed user features shape: (10000, 19)
Built user-item matrix: (10000, 200)
KMeans clusters: (array([0, 1, 2, 3, 4, 5, 6, 7], dtype=int32), array([1162, 1429, 1424, 1189, 1130, 1314, 1308, 1044]))
Saved recommendations_kmeans.csv
Hierarchical clusters: (array([0, 1, 2, 3, 4, 5, 6, 7]), array([1785, 1771, 1265,  980,  959,  998, 1176, 1066]))
Saved recommendations_hierarchical.csv
DBSCAN clusters unique labels: (array([-1]), array([10000]))
Saved recommendations_dbscan.csv

Example recommendations (user_id -> top 5 items):
KMeans user 0 -> [141, 183, 157, 6, 108]
Hierarchical user 0 -> [105, 183, 141, 64, 147]
DBSCAN user 0 -> [105, 54, 183, 106, 72]
