In [None]:
# 1. Import libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# 2. Load processed data
# Read the customer feature table into `df`; the relative path is resolved
# against the current working directory.
df = pd.read_csv(filepath_or_buffer="../data/processed/customer_features.csv")

# 3. Visualize distributions
# histplot draws on the current Axes and returns it, so the title is set on
# that same Axes object.
income_ax = sns.histplot(df['income'])
income_ax.set_title("Income Distribution")
plt.show()

# 4. Correlation heatmap
# Correlate only numeric/boolean columns. Since pandas 2.0, DataFrame.corr()
# defaults to numeric_only=False and raises if any column cannot be converted
# to float, instead of silently dropping it as older versions did.
# NOTE(review): whether the CSV actually contains non-numeric columns is not
# visible here; selecting dtypes first is harmless if it does not.
numeric_df = df.select_dtypes(include=["number", "bool"])
sns.heatmap(numeric_df.corr(), annot=True, cmap="coolwarm")
plt.title("Correlation Matrix")
plt.show()

# 5. Normalize & PCA
# Columns fed to the clustering pipeline.
feature_columns = [
    'age',
    'gender_encoded',
    'income',
    'avg_spent',
    'total_txn',
    'region_encoded',
]
X = df[feature_columns]

# Standardize each feature (zero mean, unit variance) so that no single
# column dominates the principal components because of its scale.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

# Project the standardized features onto the first two principal components.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)

# 6. Elbow Method
# Fit KMeans on the PCA projection for each candidate k and record the
# inertia (within-cluster sum of squared distances); the "elbow" of the
# resulting curve suggests a reasonable number of clusters.
k_values = range(2, 10)  # candidate cluster counts 2..9, shared by loop and plot
inertias = []
for k in k_values:
    # n_init is pinned explicitly: scikit-learn changed its default from 10
    # to 'auto' in 1.4 (and warns when it is omitted in 1.2-1.3), so leaving
    # it out makes the curve depend on the installed library version even
    # with a fixed random_state.
    km = KMeans(n_clusters=k, n_init=10, random_state=42)
    km.fit(X_pca)
    inertias.append(km.inertia_)

plt.plot(k_values, inertias, marker='o')
plt.title("Elbow Curve")
plt.xlabel("k")
plt.ylabel("Inertia")
plt.show()

# 7. KMeans clustering
# Assign every row of the PCA projection to one of 4 clusters.
# n_init is pinned explicitly because scikit-learn's default changed from 10
# to 'auto' in 1.4; fixing it (together with random_state) keeps the labels
# and the score below reproducible across library versions.
kmeans = KMeans(n_clusters=4, n_init=10, random_state=42)
clusters = kmeans.fit_predict(X_pca)

# 8. Silhouette Score
# Mean silhouette coefficient over all samples, in [-1, 1]; higher values
# indicate tighter, better-separated clusters.
score = silhouette_score(X_pca, clusters)
print(f"Silhouette Score: {score:.2f}")

# 9. Visualize clusters
# Scatter the two principal components, colouring each point by its cluster
# label, using the object-oriented Matplotlib API.
fig, ax = plt.subplots(figsize=(8, 6))
pc1 = X_pca[:, 0]
pc2 = X_pca[:, 1]
ax.scatter(pc1, pc2, c=clusters, cmap='tab10')
ax.set_title("Customer Segments")
ax.set_xlabel("PCA 1")
ax.set_ylabel("PCA 2")
ax.grid()
plt.show()

# 10. Save segmented data
# Attach the cluster label to each customer row (in place) and write the
# augmented table without the DataFrame index.
df['cluster'] = clusters
segmented_path = "../data/processed/segmented_customers.csv"
df.to_csv(path_or_buf=segmented_path, index=False)
