In [1]:
from src.data import load_jsonl_part

items_df = load_jsonl_part('../data/processed/cleaned_items_df.jsonl', nrows=500000)
reviews_df = load_jsonl_part('../data/processed/cleaned_reviews_df.jsonl', nrows=5000000)

In [2]:
merged_df = reviews_df.merge(items_df, on='parent_asin', how='inner')
merged_df.head()

Unnamed: 0,rating,asin,parent_asin,user_id,timestamp,helpful_vote,verified_purchase,main_category,title,average_rating,rating_number,features,description,price,store,categories,details,subtitle,author
0,5,B00YQ6X8EO,B00YQ6X8EO,AGKHLEW2SOWHNMFQIJGBECAF7INQ,2020-05-05 14:08:48.923,0,True,All Beauty,Herbivore - Natural Sea Mist Texturizing Salt ...,4.3,384,[],"[If given the choice, weÕd leave most telltale...",,HERBIVORE,[],"{'Hair Type': 'Wavy', 'Material Type Free': 'D...",,
1,4,B081TJ8YS3,B081TJ8YS3,AGKHLEW2SOWHNMFQIJGBECAF7INQ,2020-05-04 18:10:55.070,1,True,All Beauty,All Natural Vegan Dry Shampoo Powder - Eco Fri...,4.0,56,[],[],,Two Goats Apothecary,[],"{'Brand': 'Two Goats Apothecary', 'Item Form':...",,
2,5,B07PNNCSP9,B097R46CSY,AE74DYR3QUGVPZJ3P7RFWBGIX7XQ,2020-05-16 21:41:06.052,2,True,All Beauty,New Road Beauty - Creamsicle - Variety 3 Pack ...,4.4,699,"[Same Great Product, NEW PACKAGING., MOISTURIZ...",[New Road Beauty Paraffin Wax is recommended f...,21.98,New Road Beauty,[],{'Package Dimensions': '10.5 x 6.4 x 1.6 inche...,,
3,1,B09JS339BZ,B09JS339BZ,AFQLNQNQYFWQZPJQZS6V3NZU4QBQ,2022-01-28 18:13:50.220,0,True,All Beauty,muaowig Ombre Body Wave Bundles 1B Grey Human ...,1.0,1,[?Hair Bundle Material?:Brazilian Virgin Human...,[Hair Material: Brazilian Virgin Human Hair Bu...,,muaowig,[],"{'Brand': 'muaowig', 'Material': 'Human Hair',...",,
4,5,B08BZ63GMJ,B08BZ63GMJ,AFQLNQNQYFWQZPJQZS6V3NZU4QBQ,2020-12-30 10:02:43.534,0,True,All Beauty,Yinhua Electric Nail Drill Kit Portable Profes...,3.5,20,[],[],,Yinhua,[],{'Package Dimensions': '8.5 x 3.82 x 2.24 inch...,,


In [3]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

df2 = merged_df.copy()

# --- Behavioral features per user ---
user_behavior = (
    df2.groupby("user_id")
       .agg(
           n_purchases=("parent_asin", "size"),
           n_products=("parent_asin", "nunique"),
           mean_rating=("rating", "mean"),
           std_rating=("rating", "std"),
           share_verified=("verified_purchase", "mean"),
           mean_helpful=("helpful_vote", "mean"),
       )
       .reset_index()
)

user_behavior["std_rating"] = user_behavior["std_rating"].fillna(0)

# reduce skew of counts/helpful votes
user_behavior["log_purchases"] = np.log1p(user_behavior["n_purchases"])
user_behavior["log_helpful"] = np.log1p(user_behavior["mean_helpful"])

# --- B) "What they buy": category distribution per user ---
# counts of purchases per (user, category)
user_cat_counts = (
    df2.groupby(["user_id", "main_category"])
       .size()
       .rename("cat_count")
       .reset_index()
)

# pivot to wide: one column per category
user_cat_wide = (
    user_cat_counts.pivot(index="user_id", columns="main_category", values="cat_count")
                  .fillna(0)
)

# convert counts to shares (so heavy buyers don't dominate purely by volume)
user_cat_share = user_cat_wide.div(user_cat_wide.sum(axis=1), axis=0).fillna(0)
user_cat_share.columns = [f"cat_share__{c}" for c in user_cat_share.columns]
user_cat_share = user_cat_share.reset_index()

# --- Combine features ---
user_features = user_behavior.merge(user_cat_share, on="user_id", how="left").fillna(0)

# Select feature columns (exclude user_id)
feature_cols = [c for c in user_features.columns if c != "user_id"]

X = user_features[feature_cols].to_numpy()

# Scale features
X_scaled = StandardScaler().fit_transform(X)

# Cluster (pick k)
k = 8
model = KMeans(n_clusters=k, random_state=42, n_init="auto")
user_features["cluster"] = model.fit_predict(X_scaled)

# --- Merge cluster labels back to the purchase-level dataset ---
df_with_clusters = df2.merge(user_features[["user_id", "cluster"]], on="user_id", how="left")

# Now you can "display clusters inside the dataset"
df_with_clusters["cluster"].value_counts().sort_index()


cluster
0    1360581
1    1207865
2     730946
3       3988
4          6
5     498143
6         26
7     144614
Name: count, dtype: int64

In [4]:
(df_with_clusters
 .sort_values(["cluster"])
 .groupby("cluster")
 .head(5)[["cluster", "user_id", "parent_asin", "main_category", "rating", "verified_purchase"]]
)

Unnamed: 0,cluster,user_id,parent_asin,main_category,rating,verified_purchase
3946152,0,AFRBBEV773SFFZEEVA724KW4G2KA,B000A2APXU,Digital Music,5,False
3946153,0,AFRBBEV773SFFZEEVA724KW4G2KA,B000A3DFZO,Digital Music,1,False
3946154,0,AFRBBEV773SFFZEEVA724KW4G2KA,B00005BGKP,Digital Music,5,False
3946155,0,AGI2EIHPIJHGLBKKJ77GDFEG3WTA,B003TJ4YUQ,Digital Music,4,True
3946156,0,AGI2EIHPIJHGLBKKJ77GDFEG3WTA,B002QECIDK,Digital Music,4,True
2568591,1,AE3LBWD3RSVHPQHVKRM5ZU7BHMNQ,B0080JIW4Y,Amazon Home,2,True
2568592,1,AGZPXHI54S5BVTUNAIOIEIVLNOAA,B0096T9LKG,Industrial & Scientific,5,True
2568469,1,AFXFAS26MBUYH7PROGAV57DQDTPQ,B08DBVYJPZ,Amazon Home,5,True
2568470,1,AE2N6PF7EFDW7OKO6YADEC57USGQ,B0BXSQN4HV,Amazon Home,5,True
2568472,1,AECUS6CK6XIKREMIUM6NGJOHYHOA,B09SCXV6TB,Industrial & Scientific,5,True


In [5]:
(user_features
 .groupby("cluster")[["n_purchases", "n_products", "mean_rating", "share_verified", "mean_helpful"]]
 .mean()
 .round(3)
 .sort_index()
)

Unnamed: 0_level_0,n_purchases,n_products,mean_rating,share_verified,mean_helpful
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,2.71,2.709,4.389,0.849,1.145
1,1.079,1.078,4.073,0.942,0.783
2,1.211,1.21,4.328,0.98,0.392
3,1.176,1.176,4.017,0.95,0.671
4,6.0,6.0,4.5,1.0,0.167
5,1.106,1.106,4.056,0.943,1.812
6,1.3,1.3,4.2,0.85,2.1
7,112.628,112.623,4.378,0.443,4.052


In [6]:
cat_cols = [c for c in user_features.columns if c.startswith("cat_share__")]

(user_features
 .groupby("cluster")[cat_cols]
 .mean()
 .apply(lambda s: s.sort_values(ascending=False).head(5), axis=1)
)

Unnamed: 0_level_0,cat_share__All Beauty,cat_share__Amazon Home,cat_share__Appliances,cat_share__Baby,cat_share__Books,cat_share__Digital Music,cat_share__GPS & Navigation,cat_share__Grocery,cat_share__Industrial & Scientific,cat_share__Movies & TV,cat_share__Software,cat_share__Tools & Home Improvement
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,0.055422,0.055241,0.047993,,,0.749973,,,,,,0.064083
1,0.49255,0.379124,,,,,,0.007779,0.062429,0.00663,,
2,0.006205,0.015578,0.023817,,,,,,0.004746,,,0.94747
3,0.019346,0.019272,0.012242,0.91411,,,,,,,,0.019543
4,0.166667,0.0,0.166667,,,,0.166667,,,,,0.5
5,0.003508,0.012988,0.972868,,,,,,0.00343,,,0.005268
6,0.05,0.05,,,,,,,,0.025,0.85,0.025
7,0.015107,,,,0.003751,0.957084,,,,0.017375,,0.002954
