In [1]:
from src.data import load_jsonl_part

# Read capped samples of the cleaned item catalogue and review log.
# NOTE(review): `nrows` presumably limits how many records are read —
# confirm against src.data.load_jsonl_part.
items_df = load_jsonl_part(
    '../data/processed/cleaned_items_df.jsonl',
    nrows=500_000,
)
reviews_df = load_jsonl_part(
    '../data/processed/cleaned_reviews_df.jsonl',
    nrows=5_000_000,
)

In [2]:
# Attach item metadata to every review. The inner join keeps only reviews
# whose parent_asin also appears in the loaded items frame.
merged_df = reviews_df.merge(
    items_df,
    how='inner',
    on='parent_asin',
)

In [3]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

# Work on a separate copy so the merged frame from the previous cell is
# never touched by this cell.
df2 = merged_df.copy()

# --- Behavioral features per user ---
# One row per user: review volume, product variety, rating level and spread,
# average helpful votes, and average item price. The log1p columns are added
# to reduce the skew of counts / helpful votes / price.
user_behavior = (
    df2.groupby("user_id")
    .agg(
        n_purchases=pd.NamedAgg(column="parent_asin", aggfunc="size"),
        n_products=pd.NamedAgg(column="parent_asin", aggfunc="nunique"),
        mean_rating=pd.NamedAgg(column="rating", aggfunc="mean"),
        std_rating=pd.NamedAgg(column="rating", aggfunc="std"),
        mean_helpful=pd.NamedAgg(column="helpful_vote", aggfunc="mean"),
        mean_price=pd.NamedAgg(column="price", aggfunc="mean"),
    )
    .reset_index()
    # The sample std is NaN for a user with a single rating, and the mean
    # price is NaN when none of the user's rows has a price; both become 0.
    .fillna({"std_rating": 0, "mean_price": 0})
    .assign(
        log_purchases=lambda d: np.log1p(d["n_purchases"]),
        log_helpful=lambda d: np.log1p(d["mean_helpful"]),
        log_price=lambda d: np.log1p(d["mean_price"]),
    )
)

# --- B) "What they buy": category distribution per user ---
# Long format: number of rows for every (user, category) pair.
user_cat_counts = (
    df2.groupby(["user_id", "main_category"])
    .size()
    .reset_index(name="cat_count")
)

# Wide format: users as rows, one column per category; pairs that never
# occur are filled with 0.
user_cat_wide = user_cat_counts.pivot(
    index="user_id",
    columns="main_category",
    values="cat_count",
).fillna(0)

# Divide each row by its total so the features are shares rather than raw
# counts (heavy buyers should not dominate purely by volume).
user_cat_share = user_cat_wide.div(
    user_cat_wide.sum(axis="columns"), axis="index"
).fillna(0)
user_cat_share = user_cat_share.set_axis(
    [f"cat_share__{c}" for c in user_cat_share.columns], axis="columns"
).reset_index()

# --- Combine features ---
# The left join keeps every user of the behaviour table; users without any
# category row get NaN shares, which are then set to 0.
user_features = pd.merge(
    user_behavior,
    user_cat_share,
    how="left",
    on="user_id",
).fillna(0)

# Every column except the identifier is a model input.
feature_cols = user_features.columns.drop("user_id").tolist()

X = user_features.loc[:, feature_cols].to_numpy()

# Standardise each feature before clustering.
X_scaled = StandardScaler().fit_transform(X)

# Cluster (pick k); the fixed random_state seeds the KMeans initialisation.
k = 5
model = KMeans(n_clusters=k, n_init="auto", random_state=42)
user_features["cluster"] = model.fit_predict(X_scaled)

# --- Merge cluster labels back to the purchase-level dataset ---
# Each row of df2 receives the cluster of the user who wrote it.
df_with_clusters = pd.merge(
    df2,
    user_features[["user_id", "cluster"]],
    how="left",
    on="user_id",
)

# Number of dataset rows per cluster, ordered by cluster label.
# These are row counts of the purchase-level frame, not user counts.
df_with_clusters["cluster"].value_counts().sort_index()

cluster
0    1336473
1     779336
2       2271
3     555620
4    1272469
Name: count, dtype: int64

### **Cluster 0: The Power Users (Loyalists)**
- **User Count:** 434,881
- **Behavior:** High `n_purchases`, high `n_products`, and high `log_purchases`.
- **Profile:** These are your most frequent shoppers. They buy across multiple categories (low concentration in any single `cat_share`). They are very familiar with the platform and use it for diverse needs—from "Health & Personal Care" to "Industrial & Scientific" tools.
- **Rating Style:** They tend to have a stable `mean_rating`, often trending towards the positive (4-5 stars), as they are satisfied repeat customers.

### **Cluster 1: The Niche Specialists**
- **User Count:** 621,144
- **Behavior:** These users typically have a low number of total purchases and low variety in products.
- **Profile:** They focus heavily on one specific category (e.g., only "CDs & Vinyl" or only "Beauty"). They are likely occasional shoppers who come to the platform for a specific type of item and don't branch out into other categories.
- **Rating Style:** Their ratings are often consistent (low standard deviation), either very high or very low, as they have fewer data points.

### **Cluster 2: The "Helpful" Critics (Power Reviewers)**
- **User Count:** 1,893
- **Behavior:** They stand out with the highest average `log_helpful` of all clusters (≈0.48) and the second-highest average `mean_helpful` (≈1.54, behind Cluster 3).
- **Profile:** These are active members of the community. They don't just buy; they write detailed reviews that other users find useful. They average only about 1.2 purchases each, so their helpfulness comes from a few well-received reviews rather than from purchase volume.
- **Rating Style:** They often have a higher `std_rating` because they are critical and objective, providing a mix of high and low scores based on actual product performance.

### **Cluster 3: The High-Value/Premium Shoppers**
- **User Count:** 479,336
- **Behavior:** This cluster is characterized by a high `mean_price` and `log_price`.
- **Profile:** These users tend to purchase more expensive items, likely concentrated in categories like "Appliances" or high-end electronics. They prioritize quality and investment over high-frequency, small-ticket items.
- **Interaction:** They make fewer purchases on average than the "Power Users" (about 1.2 vs. 3.1) but buy far more expensive items: their average `mean_price` is about 152, compared with roughly 22–46 in the other clusters.

### **Cluster 4: The Occasional/Standard Shoppers**
- **User Count:** 1,143,167
- **Behavior:** Low `n_purchases`, low `log_helpful`, and average `mean_rating`.
- **Profile:** This is usually the largest "catch-all" cluster. It contains the "average" user who has bought 1 or 2 items, hasn't written many (or any) helpful reviews, and doesn't show a strong preference for any particular category yet.
- **Recommendation Value:** Recommendations for this group are typically driven by overall "Global Trends" rather than specific personal history.

In [4]:
# Peek at the first rows to check that the cluster column was attached.
df_with_clusters.head(n=5)

Unnamed: 0,rating,asin,parent_asin,user_id,timestamp,helpful_vote,verified_purchase,main_category,title,average_rating,rating_number,features,description,price,store,categories,details,subtitle,author,cluster
0,5,B00YQ6X8EO,B00YQ6X8EO,AGKHLEW2SOWHNMFQIJGBECAF7INQ,2020-05-05 14:08:48.923,0,True,All Beauty,Herbivore - Natural Sea Mist Texturizing Salt ...,4.3,384,[],"[If given the choice, weÕd leave most telltale...",27.17,HERBIVORE,[],"{'Hair Type': 'Wavy', 'Material Type Free': 'D...",,,4
1,4,B081TJ8YS3,B081TJ8YS3,AGKHLEW2SOWHNMFQIJGBECAF7INQ,2020-05-04 18:10:55.070,1,True,All Beauty,All Natural Vegan Dry Shampoo Powder - Eco Fri...,4.0,56,[],[],27.17,Two Goats Apothecary,[],"{'Brand': 'Two Goats Apothecary', 'Item Form':...",,,4
2,5,B07PNNCSP9,B097R46CSY,AE74DYR3QUGVPZJ3P7RFWBGIX7XQ,2020-05-16 21:41:06.052,2,True,All Beauty,New Road Beauty - Creamsicle - Variety 3 Pack ...,4.4,699,"[Same Great Product, NEW PACKAGING., MOISTURIZ...",[New Road Beauty Paraffin Wax is recommended f...,21.98,New Road Beauty,[],{'Package Dimensions': '10.5 x 6.4 x 1.6 inche...,,,4
3,1,B09JS339BZ,B09JS339BZ,AFQLNQNQYFWQZPJQZS6V3NZU4QBQ,2022-01-28 18:13:50.220,0,True,All Beauty,muaowig Ombre Body Wave Bundles 1B Grey Human ...,1.0,1,[?Hair Bundle Material?:Brazilian Virgin Human...,[Hair Material: Brazilian Virgin Human Hair Bu...,27.17,muaowig,[],"{'Brand': 'muaowig', 'Material': 'Human Hair',...",,,4
4,5,B08BZ63GMJ,B08BZ63GMJ,AFQLNQNQYFWQZPJQZS6V3NZU4QBQ,2020-12-30 10:02:43.534,0,True,All Beauty,Yinhua Electric Nail Drill Kit Portable Profes...,3.5,20,[],[],27.17,Yinhua,[],{'Package Dimensions': '8.5 x 3.82 x 2.24 inch...,,,4


In [5]:
# Number of interactions (reviews) for every (cluster, product) pair,
# ordered so that each cluster's most-reviewed products come first.
cluster_popularity = (
    df_with_clusters.groupby(['cluster', 'parent_asin'])
    .size()
    .rename('interaction_count')
    .reset_index()
    .sort_values(by=['cluster', 'interaction_count'], ascending=[True, False])
)


# Recommendation function
def get_cluster_recommendations(user_id, n_recs=5):
    """Recommend the most popular unseen products from the user's cluster.

    Reads the notebook-level frames ``user_features``, ``df_with_clusters``,
    ``cluster_popularity`` and ``items_df``. ``cluster_popularity`` is expected
    to be sorted by descending ``interaction_count`` within each cluster, as
    done where it is built; the ranking here relies on that order.

    Parameters
    ----------
    user_id : str
        Identifier to look up in ``user_features['user_id']``.
    n_recs : int, default 5
        Maximum number of products to return.

    Returns
    -------
    pandas.DataFrame or str
        A frame with columns ``parent_asin``, ``title`` and ``main_category``,
        at most ``n_recs`` rows, ordered from most to least popular within the
        user's cluster. Returns the string ``"Unknown User"`` when ``user_id``
        is not present in ``user_features``.
    """
    # Find the user's cluster
    user_row = user_features[user_features['user_id'] == user_id]
    if user_row.empty:
        return "Unknown User"

    user_cluster = user_row['cluster'].iloc[0]

    # Products the user already interacted with are never recommended again.
    user_history = df_with_clusters.loc[
        df_with_clusters['user_id'] == user_id, 'parent_asin'
    ].unique()

    # Popularity ranking of the user's cluster, minus the user's own history.
    candidates = cluster_popularity[
        cluster_popularity['cluster'] == user_cluster]
    candidates = candidates[~candidates['parent_asin'].isin(user_history)]

    # Take Top N (still in popularity order)
    top_n_asins = candidates.head(n_recs)['parent_asin'].tolist()

    # Look up titles. A plain `isin` filter returns rows in items_df order,
    # which would discard the ranking, and repeated catalogue rows could
    # return more than n_recs results. Deduplicate, then restore the ranking.
    rank = {asin: position for position, asin in enumerate(top_n_asins)}
    top_n_details = (
        items_df.loc[
            items_df['parent_asin'].isin(top_n_asins),
            ['parent_asin', 'title', 'main_category'],
        ]
        .drop_duplicates(subset='parent_asin')
        .sort_values('parent_asin', key=lambda col: col.map(rank),
                     kind='stable')
    )

    return top_n_details


# Spot-check on one user: the author of the row at position 100.
test_user = df_with_clusters['user_id'].iloc[100]
test_cluster = user_features.loc[
    user_features['user_id'] == test_user, 'cluster'
].iloc[0]
print(f"Recommendations for user {test_user} (Cluster {test_cluster}) :")
get_cluster_recommendations(test_user)

Recommendations for user AHV6QCNBJNSGLATP56JAWJ3C4G2A (Cluster 0) :


Unnamed: 0,parent_asin,title,main_category
302234,B00MIA0KGY,Partners,Digital Music
320654,B00NEJ7MMI,That's Christmas To Me,Digital Music
340850,B00KLF5J64,Guardians Of The Galaxy Mix V1,Digital Music
410621,B0000CD5FR,Eagles: The Very Best Of,Digital Music
453570,B00SWBLS3C,Traveller,Digital Music


In [6]:
# Second spot-check: the author of the row at position 15030.
test_user = df_with_clusters['user_id'].iloc[15030]
test_cluster = user_features.loc[
    user_features['user_id'] == test_user, 'cluster'
].iloc[0]
print(f"Recommendations for user {test_user} (Cluster {test_cluster}) :")
get_cluster_recommendations(test_user)

Recommendations for user AEZB4NJZYK3FAWALZ2XEAPGXNRTA (Cluster 4) :


Unnamed: 0,parent_asin,title,main_category
117606,B07WTXWC32,Linda's Essentials Silicone Stove Gap Covers (...,Amazon Home
122369,B0B3DB5HTC,12 Pack Keurig Filter Replacement by K&J - Com...,Amazon Home
132149,B01KJ2FVFW,K&J 12-Pack of Cuisinart Compatible Replacemen...,Amazon Home
139042,B07RNJY499,iPartPlusMore Reusable Coffee Filters Compatib...,Amazon Home
153562,B000DLB2FI,Keurig My K-Cup Reusable Coffee Filter - Old M...,Amazon Home


In [9]:
# Per-cluster profile: feature averages plus the number of users assigned
# to each cluster.
cluster_summary = user_features.groupby('cluster').agg(
    n_purchases=('n_purchases', 'mean'),
    mean_rating=('mean_rating', 'mean'),
    mean_helpful=('mean_helpful', 'mean'),
    log_helpful=('log_helpful', 'mean'),
    mean_price=('mean_price', 'mean'),
    user_count=('user_id', 'count'),
)

print(cluster_summary)

# Dominant category per cluster: average the category-share columns within
# each cluster and report the column with the largest mean share.
cat_cols = [name for name in user_features.columns if 'cat_share__' in name]
cluster_cats = user_features.groupby('cluster')[cat_cols].mean()
print(cluster_cats.idxmax(axis='columns'))  # top category for each cluster

         n_purchases  mean_rating  mean_helpful  log_helpful  mean_price  \
cluster                                                                    
0           3.073192     4.521859      1.130002     0.423156   21.770445   
1           1.254678     4.290048      0.401777     0.149931   36.609146   
2           1.199683     4.272495      1.540060     0.482653   46.042369   
3           1.159145     4.034240      1.833261     0.381055  152.022157   
4           1.113109     4.067267      0.778089     0.255375   28.542604   

         user_count  
cluster              
0            434881  
1            621144  
2              1893  
3            479336  
4           1143167  
cluster
0                cat_share__Digital Music
1     cat_share__Tools & Home Improvement
2    cat_share__Cell Phones & Accessories
3                   cat_share__Appliances
4                   cat_share__All Beauty
dtype: object
