In [6]:
import json
import pickle
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
# Read the customer training table from disk and preview its first rows.
training_csv = "../training_data/recommendation_training_data.csv"
df = pd.read_csv(training_csv)
df.head()


Unnamed: 0,customer_id,name,age,gender,occupation,total_spent,order_count,tags,last_purchase_days,churn_probability
0,CUST0001,Customer_1,43,Other,Entrepreneur,26596.1,6,"premium,tech",40,0.9
1,CUST0002,Customer_2,74,Other,Doctor,120.4,13,"loyal,fashion",176,0.51
2,CUST0003,Customer_3,39,Other,Student,29760.68,5,"premium,creative",345,0.95
3,CUST0004,Customer_4,17,Male,Teacher,12339.58,19,"new,creative",317,0.58
4,CUST0005,Customer_5,72,Other,Teacher,15590.79,6,"budget,tech",184,0.3


In [7]:
# Columns passed to the clustering pipeline. The identifier columns
# (customer_id, name) are deliberately not part of this list.
features = [
    "age",
    "gender",
    "occupation",
    "total_spent",
    "order_count",
    "last_purchase_days",
    "tags",
    "churn_probability",
]

# Select the model inputs, in the order listed above, and preview them.
X = df.loc[:, features]
X.head()

Unnamed: 0,age,gender,occupation,total_spent,order_count,last_purchase_days,tags,churn_probability
0,43,Other,Entrepreneur,26596.1,6,40,"premium,tech",0.9
1,74,Other,Doctor,120.4,13,176,"loyal,fashion",0.51
2,39,Other,Student,29760.68,5,345,"premium,creative",0.95
3,17,Male,Teacher,12339.58,19,317,"new,creative",0.58
4,72,Other,Teacher,15590.79,6,184,"budget,tech",0.3


In [8]:
# Columns that are standardised (zero mean, unit variance) before clustering.
numeric_features = [
    "age",
    "total_spent",
    "order_count",
    "last_purchase_days",
    "churn_probability",
]

# Columns that are one-hot encoded.
# NOTE(review): `tags` holds comma-separated labels (e.g. "premium,tech"), so
# each distinct combination is encoded as a single category -- confirm that
# this is intended rather than one indicator per individual tag.
categorical_features = ["gender", "occupation", "tags"]

numeric_step = ("num", StandardScaler(), numeric_features)
# handle_unknown="ignore": categories not seen during fit do not raise at
# predict time.
categorical_step = (
    "cat",
    OneHotEncoder(handle_unknown="ignore"),
    categorical_features,
)

preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])

In [9]:
# Pin n_init explicitly: scikit-learn changed the KMeans default from 10 to
# "auto" in 1.4 (and emits a FutureWarning on 1.2/1.3 when it is left unset),
# so relying on the default makes the clusters depend on the installed version.
kmeans = KMeans(n_clusters=5, n_init=10, random_state=42)

# Preprocessing and clustering are chained so that the saved artefact can be
# applied directly to raw customer rows.
pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("cluster", kmeans),
])

# Train model
pipeline.fit(X)

# Assign cluster labels (adds/overwrites the "cluster" column on df)
df["cluster"] = pipeline.predict(X)
df.head()

Unnamed: 0,customer_id,name,age,gender,occupation,total_spent,order_count,tags,last_purchase_days,churn_probability,cluster
0,CUST0001,Customer_1,43,Other,Entrepreneur,26596.1,6,"premium,tech",40,0.9,2
1,CUST0002,Customer_2,74,Other,Doctor,120.4,13,"loyal,fashion",176,0.51,3
2,CUST0003,Customer_3,39,Other,Student,29760.68,5,"premium,creative",345,0.95,2
3,CUST0004,Customer_4,17,Male,Teacher,12339.58,19,"new,creative",317,0.58,3
4,CUST0005,Customer_5,72,Other,Teacher,15590.79,6,"budget,tech",184,0.3,1


In [None]:
# Cluster-level thresholds that drive the recommendation rules below.
HIGH_CHURN_THRESHOLD = 0.6     # mean churn_probability above this => retention offer
HIGH_SPEND_THRESHOLD = 15000   # mean total_spent above this => loyalty offer


def recommend_for_cluster(avg_spent, avg_orders, avg_churn):
    """Return the recommended action for a cluster from its average metrics.

    The rules are evaluated in priority order; the first one that matches wins:
    high churn, then high spend, then no orders at all, then a default.

    Parameters
    ----------
    avg_spent : float
        Mean ``total_spent`` of the customers in the cluster.
    avg_orders : float
        Mean ``order_count`` of the customers in the cluster.
    avg_churn : float
        Mean ``churn_probability`` of the customers in the cluster.

    Returns
    -------
    str
        The recommendation text for the cluster.
    """
    if avg_churn > HIGH_CHURN_THRESHOLD:
        return "High churn risk: Give retention offers (discounts, free shipping)"
    if avg_spent > HIGH_SPEND_THRESHOLD:
        return "Offer loyalty rewards + early access to premium products"
    # A mean of exactly 0 only occurs when every customer in the cluster has
    # zero orders (assuming order counts are non-negative).
    if avg_orders == 0:
        return "Send onboarding discount + starter bundle offer"
    return "Recommend popular products + referral coupon"


# One row of average metrics per cluster; groupby sorts the cluster ids
# ascending, so the map is filled in the same order as iterating over the
# sorted unique ids.
cluster_profiles = (
    df.groupby("cluster")[["total_spent", "order_count", "churn_probability"]]
    .mean()
)

recommendation_map = {}
for cluster_id, profile in cluster_profiles.iterrows():
    # Cast to a plain int so the keys are JSON-serialisable.
    recommendation_map[int(cluster_id)] = recommend_for_cluster(
        avg_spent=profile["total_spent"],
        avg_orders=profile["order_count"],
        avg_churn=profile["churn_probability"],
    )

# The recorded cell output shows this line, so emit it explicitly.
print("✅ Recommendation Map:", recommendation_map)

✅ Recommendation Map: {0: 'Offer loyalty rewards + early access to premium products', 1: 'Offer loyalty rewards + early access to premium products', 2: 'High churn risk: Give retention offers (discounts, free shipping)', 3: 'High churn risk: Give retention offers (discounts, free shipping)', 4: 'Offer loyalty rewards + early access to premium products'}


In [None]:
# Persist the fitted pipeline and the cluster -> recommendation lookup.
models_dir = Path("../models")
# Create the output directory if it is missing so a fresh checkout does not
# fail with FileNotFoundError.
models_dir.mkdir(parents=True, exist_ok=True)

with open(models_dir / "recommender_pipeline.pkl", "wb") as f:
    pickle.dump(pipeline, f)

# JSON object keys are always strings: the integer cluster ids are written as
# "0", "1", ... and must be converted back (or looked up as str) after loading.
with open(models_dir / "recommendation_map.json", "w") as f:
    json.dump(recommendation_map, f, indent=4)

# The recorded cell output shows this line, so emit it explicitly.
print("✅ Saved recommender_pipeline.pkl and recommendation_map.json")

✅ Saved recommender_pipeline.pkl and recommendation_map.json


In [None]:
# Smoke test: score one hand-written customer with the fitted pipeline.
# The columns are the same ones the pipeline was trained on.
sample_customer = pd.DataFrame([{
    "age": 100,
    "gender": "Female",
    # NOTE(review): if this occupation does not occur in the training data the
    # encoder ignores it (handle_unknown="ignore") -- confirm that is acceptable.
    "occupation": "Software Engineer",
    "total_spent": 1000,
    "order_count": 1,
    "last_purchase_days": 100,
    "tags": "premium,tech",
    "churn_probability": 0.9
}])

# predict returns a NumPy integer; cast it to a plain int so it has the same
# type as the recommendation_map keys (which are built with int(...)).
cluster_id = int(pipeline.predict(sample_customer)[0])
print("Cluster ID:", cluster_id)
print("Recommended Action:", recommendation_map[cluster_id])


Cluster ID: 2
Recommended Action: High churn risk: Give retention offers (discounts, free shipping)
