# Clustering – Customer Segmentation (RFM + KMeans)

In [None]:
import sys, os, warnings
# Silence all warnings so the notebook output stays readable.
warnings.filterwarnings("ignore")

# The project root is taken to be the parent of the current working directory;
# it is placed first on sys.path so the `src` package imports below resolve.
ROOT = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
sys.path.insert(0, ROOT)

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from src.utils.config import load_config
from src.features.rfm import build_rfm
from src.mining.clustering import (
    cap_outliers_iqr,
    scale_rfm,
    elbow_scores,
    train_kmeans,
    assign_clusters,
    cluster_stats,
    label_clusters,
    map_segment_names,
    save_model,
)

# Read the project parameters; the seed falls back to 42 when not configured.
config_path = os.path.join(ROOT, "configs", "params.yaml")
cfg = load_config(config_path)
seed = cfg.get("seed", 42)

# Load the cleaned dataset, report its shape and preview the first rows.
cleaned_path = os.path.join(ROOT, "data/processed/cleaned.parquet")
df = pd.read_parquet(cleaned_path)
print("Shape:", df.shape)
df.head()

## 1. Build RFM Features

In [None]:
# Derive the RFM table from the cleaned data.
rfm = build_rfm(df)

# Report the table size: its shape and its row count.
n_rows = len(rfm)
print(f"RFM Shape: {rfm.shape}")
print(f"S·ªë kh√°ch h√†ng: {n_rows}")
print()

# Summary statistics rounded to two decimals, followed by a 10-row preview.
print("üìä RFM Statistics:")
rfm_summary = rfm.describe().round(2)
display(rfm_summary)
rfm.head(10)

## 2. Khám phá RFM Distribution

In [None]:
def _draw_histogram(ax, col, n_bins, bar_color, title, x_label, as_money=False):
    """Draw one histogram of rfm[col] on `ax` with a dashed red median line.

    The median appears in the legend, formatted as a dollar amount with
    thousands separators when `as_money` is true and as a plain rounded
    number otherwise.
    """
    median_value = rfm[col].median()
    if as_money:
        median_label = f"Median=${median_value:,.0f}"
    else:
        median_label = f"Median={median_value:.0f}"

    ax.hist(rfm[col], bins=n_bins, color=bar_color, edgecolor="white")
    ax.set_title(title)
    ax.set_xlabel(x_label)
    ax.axvline(median_value, color="red", linestyle="--", label=median_label)
    ax.legend()


# One panel per RFM column, laid out side by side.
fig, axes = plt.subplots(1, 3, figsize=(15, 4))

_draw_histogram(
    axes[0], "Recency", 30, "steelblue",
    "Recency Distribution", "Days since last purchase",
)
_draw_histogram(
    axes[1], "Frequency", 20, "seagreen",
    "Frequency Distribution", "Number of orders",
)
_draw_histogram(
    axes[2], "Monetary", 30, "coral",
    "Monetary Distribution", "Total spend ($)",
    as_money=True,
)

plt.tight_layout()
plt.show()

# Fixed commentary printed under the figure (not computed from the data).
remarks = (
    "\nüí° Nh·∫≠n x√©t:",
    "  - Recency: Ph√¢n ph·ªëi l·ªách ph·∫£i, ƒëa s·ªë kh√°ch mua g·∫ßn ƒë√¢y",
    "  - Frequency: Ph√¢n ph·ªëi t∆∞∆°ng ƒë·ªëi chu·∫©n, trung b√¨nh ~6 orders",
    "  - Monetary: C√≥ outliers (kh√°ch chi ti√™u r·∫•t cao)",
)
for remark in remarks:
    print(remark)

## 3. Xử lý Outliers + Scale dữ liệu

In [None]:
# The three feature columns that are capped, scaled and fed to the clusterer.
rfm_cols = ["Recency", "Frequency", "Monetary"]

# Cap outliers with the IQR helper (each helper call receives its own list).
rfm_capped = cap_outliers_iqr(rfm, cols=list(rfm_cols))

# Show the column maxima before and after capping.
print("üìä So s√°nh tr∆∞·ªõc/sau Cap Outliers:")
print("\nTr∆∞·ªõc (max values):")
max_before = rfm[rfm_cols].max()
print(max_before)
print("\nSau cap (max values):")
max_after = rfm_capped[rfm_cols].max()
print(max_after)

# Scale the capped features; X is the plain array used for clustering.
rfm_scaled, scaler = scale_rfm(rfm_capped, cols=list(rfm_cols))
X = rfm_scaled[rfm_cols].values

# Sanity check: per-column mean and standard deviation of the scaled array.
col_means = X.mean(axis=0).round(2)
col_stds = X.std(axis=0).round(2)
print("\n‚úÖ Scaled data:")
print(f"   Mean ~ 0: {col_means}")
print(f"   Std  ~ 1: {col_stds}")

## 4. Elbow Method + Silhouette Score

In [None]:
# Score candidate cluster counts k = 2..10. The result is read below through
# its "k", "inertia" and "silhouette" entries.
scores = elbow_scores(X, k_range=range(2, 11), random_state=seed)

# Plot both curves side by side.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Elbow (inertia)
axes[0].plot(scores["k"], scores["inertia"], "bo-", linewidth=2, markersize=8)
axes[0].set_xlabel("S·ªë cluster (k)")
axes[0].set_ylabel("Inertia (SSE)")
axes[0].set_title("Elbow Method - T√¨m ƒëi·ªÉm khu·ª∑u tay")
axes[0].grid(True, alpha=0.3)

# Silhouette
axes[1].plot(scores["k"], scores["silhouette"], "go-", linewidth=2, markersize=8)
axes[1].set_xlabel("S·ªë cluster (k)")
axes[1].set_ylabel("Silhouette Score")
axes[1].set_title("Silhouette Analysis - ƒêi·ªÉm cao = t√°ch bi·ªát t·ªët")
axes[1].grid(True, alpha=0.3)

plt.tight_layout()

# Save the figure; figures_dir is reused by later plotting cells.
figures_dir = os.path.join(ROOT, "outputs", "figures")
os.makedirs(figures_dir, exist_ok=True)
fig.savefig(os.path.join(figures_dir, "elbow.png"), dpi=150)
print("‚úÖ Saved outputs/figures/elbow.png")
plt.show()

# Print the score table.
print("\nüìä Scores:")
scores_df = pd.DataFrame(scores)

# Pick the best k from the raw silhouette values. Doing this after rounding
# could turn near-equal scores into a tie, and idxmax would then return the
# first (smallest) k instead of the true maximum.
best_k = scores_df.loc[scores_df["silhouette"].idxmax(), "k"]

# Rounding is for display only.
scores_df["inertia"] = scores_df["inertia"].round(0)
scores_df["silhouette"] = scores_df["silhouette"].round(4)
display(scores_df)

# Suggest k. The configured value is read from cfg (default 4) so the hint
# cannot contradict the k that the training cell actually uses.
configured_k = cfg.get("clustering", {}).get("n_clusters", 4)
print(f"\nüí° Silhouette cao nh·∫•t t·∫°i k={best_k}")
print(f"   ‚Üí Ch·ªçn k={configured_k} (theo config) ho·∫∑c k d·ª±a tr√™n business context")

## 5. Huấn luyện KMeans

In [None]:
# Number of clusters comes from the config's "clustering" section (default 4).
clustering_cfg = cfg.get("clustering", {})
n_clusters = clustering_cfg.get("n_clusters", 4)
print(f"üîß Hu·∫•n luy·ªán KMeans v·ªõi k={n_clusters}")

# Fit on the scaled feature array and keep the per-row cluster labels.
km = train_kmeans(X, n_clusters=n_clusters, random_state=seed)
labels = km.labels_

# Training summary: inertia, iteration count and cluster sizes by cluster id.
print("\n‚úÖ KMeans trained!")
print(f"   Inertia: {km.inertia_:.2f}")
print(f"   Iterations: {km.n_iter_}")
print("\n   Cluster distribution:")
cluster_sizes = pd.Series(labels).value_counts().sort_index()
print(cluster_sizes)

## 6. Gán nhãn cluster + Thống kê

In [None]:
# Attach the KMeans labels to the capped (unscaled) RFM table.
rfm_clustered = assign_clusters(rfm_capped, labels)

# Per-cluster statistics, then the labelling step that later cells rely on
# for the "Segment" column.
stats = label_clusters(cluster_stats(rfm_clustered))

print("üìä Th·ªëng k√™ theo Cluster:")
display(stats)

# Map segment names onto the clustered rows.
rfm_final = map_segment_names(rfm_clustered, stats)

# Write both tables to outputs/tables.
# NOTE(review): index=False drops the index; confirm that build_rfm keeps the
# customer identifier as a regular column rather than as the index.
tables_dir = os.path.join(ROOT, "outputs", "tables")
os.makedirs(tables_dir, exist_ok=True)
for frame, file_name in ((stats, "cluster_stats.csv"), (rfm_final, "rfm_clustered.csv")):
    frame.to_csv(os.path.join(tables_dir, file_name), index=False)
print("\n‚úÖ Saved outputs/tables/cluster_stats.csv")
print("‚úÖ Saved outputs/tables/rfm_clustered.csv")

## 7. Visualize Clusters

In [None]:
# Scatter of Frequency vs Monetary, one series per segment.
fig, ax = plt.subplots(figsize=(10, 7))

point_style = dict(alpha=0.6, s=50, edgecolors="white", linewidths=0.5)
for segment_name in stats["Segment"].unique():
    in_segment = rfm_final["Segment"] == segment_name
    members = rfm_final[in_segment]
    ax.scatter(
        members["Frequency"],
        members["Monetary"],
        label=segment_name,
        **point_style,
    )

# Map the centroids back through the scaler so they share the plotted units.
# Columns 1 and 2 are Frequency and Monetary, following the column order
# (Recency, Frequency, Monetary) used to build X.
centers = scaler.inverse_transform(km.cluster_centers_)
centroid_frequency = centers[:, 1]
centroid_monetary = centers[:, 2]
ax.scatter(
    centroid_frequency,
    centroid_monetary,
    c="red",
    marker="X",
    s=200,
    edgecolors="black",
    linewidths=2,
    label="Centroids",
)

ax.set_xlabel("Frequency (s·ªë ƒë∆°n h√†ng)", fontsize=12)
ax.set_ylabel("Monetary (t·ªïng chi ti√™u $)", fontsize=12)
ax.set_title("Customer Segments (Frequency vs Monetary)", fontsize=14)
ax.legend()
plt.tight_layout()

# figures_dir is defined in the elbow-plot cell.
scatter_path = os.path.join(figures_dir, "cluster_scatter.png")
fig.savefig(scatter_path, dpi=150)
print("‚úÖ Saved outputs/figures/cluster_scatter.png")
plt.show()

## 8. Revenue by Segment

In [None]:
# Two horizontal bar charts per segment: customer count and total revenue.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Order segments by total revenue and give each bar its own palette colour.
stats_sorted = stats.sort_values("Monetary_sum", ascending=True)
colors = sns.color_palette("viridis", n_colors=len(stats_sorted))

# Gap between a bar's end and its value label, as 1% of the longest bar.
# Fixed data-unit offsets (previously +5 customers / +$1000) only look right
# at one data scale; a proportional gap stays readable at any scale.
count_pad = 0.01 * stats_sorted["Count"].max()
revenue_pad = 0.01 * stats_sorted["Monetary_sum"].max()

# Count
axes[0].barh(stats_sorted["Segment"], stats_sorted["Count"], color=colors)
axes[0].set_xlabel("S·ªë kh√°ch h√†ng")
axes[0].set_title("S·ªë l∆∞·ª£ng kh√°ch h√†ng theo Segment")
for i, (cnt, pct) in enumerate(zip(stats_sorted["Count"], stats_sorted["Pct"])):
    axes[0].text(cnt + count_pad, i, f"{cnt} ({pct}%)", va="center", fontsize=10)

# Revenue
axes[1].barh(stats_sorted["Segment"], stats_sorted["Monetary_sum"], color=colors)
axes[1].set_xlabel("T·ªïng doanh thu ($)")
axes[1].set_title("Doanh thu theo Segment")
for i, rev in enumerate(stats_sorted["Monetary_sum"]):
    axes[1].text(rev + revenue_pad, i, f"${rev:,.0f}", va="center", fontsize=10)

plt.tight_layout()
# figures_dir is defined in the elbow-plot cell.
fig.savefig(os.path.join(figures_dir, "revenue_by_cluster.png"), dpi=150)
print("‚úÖ Saved outputs/figures/revenue_by_cluster.png")
plt.show()

## 9. Phân tích Insight Marketing

In [None]:
# Horizontal rule reused around every section of the report.
rule = "=" * 70

print(rule)
print("üìå PH√ÇN T√çCH INSIGHT MARKETING ‚Äì CUSTOMER SEGMENTATION")
print(rule)

# One summary paragraph per row of the cluster statistics table.
for _, seg in stats.iterrows():
    print(f"\nüéØ {seg['Segment']} (Cluster {seg['Cluster']})")
    print(f"   S·ªë kh√°ch    : {seg['Count']} ({seg['Pct']}%)")
    print(f"   Doanh thu   : ${seg['Monetary_sum']:,.0f}")
    print(f"   Recency avg : {seg['Recency_mean']:.0f} ng√†y")
    print(f"   Frequency   : {seg['Frequency_mean']:.1f} ƒë∆°n/kh√°ch")
    print(f"   Monetary avg: ${seg['Monetary_mean']:,.0f}")

# Fixed recommendation text (not derived from the statistics above).
recommendations = """
üåü VIP:
   - ∆Øu ƒë√£i ri√™ng, ch∆∞∆°ng tr√¨nh loyalty cao c·∫•p
   - Personal shopping assistant
   - Early access s·∫£n ph·∫©m m·ªõi

üíé Loyal:
   - Cross-sell s·∫£n ph·∫©m premium
   - Reward points multiplier
   - Referral program

üå± Potential:
   - Upsell bundle deals
   - Email nurture campaign
   - Limited-time offers

‚ö†Ô∏è Lost/At-Risk:
   - Win-back campaign v·ªõi discount l·ªõn
   - Survey t√¨m hi·ªÉu l√Ω do r·ªùi b·ªè
   - Re-engagement email series
"""

print("\n" + rule)
print("üí° KHUY·∫æN NGH·ªä MARKETING:")
print(rule)
print(recommendations)
print(rule)

## 10. Save Model

In [None]:
# Persist the fitted KMeans model under outputs/models.
models_dir = os.path.join(ROOT, "outputs", "models")
os.makedirs(models_dir, exist_ok=True)

model_path = os.path.join(models_dir, "kmeans.pkl")
save_model(km, model_path)
print("‚úÖ Saved outputs/models/kmeans.pkl")

# Closing checklist of the artefacts written by this notebook.
summary_rule = "=" * 50
print("\n" + summary_rule)
print("üì¶ T·∫§T C·∫¢ OUTPUT ƒê√É T·∫†O:")
print(summary_rule)

summary_lines = (
    "Tables:",
    "  - outputs/tables/cluster_stats.csv",
    "  - outputs/tables/rfm_clustered.csv",
    "\nFigures:",
    "  - outputs/figures/elbow.png",
    "  - outputs/figures/cluster_scatter.png",
    "  - outputs/figures/revenue_by_cluster.png",
    "\nModels:",
    "  - outputs/models/kmeans.pkl",
)
for summary_line in summary_lines:
    print(summary_line)