# Customer Segmentation / Clustering

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import davies_bouldin_score
from sklearn.metrics import silhouette_score
from sklearn.decomposition import PCA

# Read the raw customer and transaction tables from the working directory.
customers = pd.read_csv('./Customers.csv')
transactions = pd.read_csv('./Transactions.csv')

# Step 1: Data Preprocessing
# Attach the customer columns to every transaction row. The inner join keeps
# only transactions whose CustomerID also appears in the customers table.
merged_data = pd.merge(transactions, customers, how='inner', on='CustomerID')

# Collapse the transaction rows into one row per customer:
#   total_spend                   - sum of TotalValue
#   transaction_count             - number of non-null TransactionID values
#   average_spend_per_transaction - mean of TotalValue
per_customer = merged_data.groupby('CustomerID')
customer_summary = per_customer.agg(
    total_spend=pd.NamedAgg(column='TotalValue', aggfunc='sum'),
    transaction_count=pd.NamedAgg(column='TransactionID', aggfunc='count'),
    average_spend_per_transaction=pd.NamedAgg(column='TotalValue', aggfunc='mean'),
)
# Turn the CustomerID group key back into an ordinary column.
customer_summary = customer_summary.reset_index()

# Step 2: Feature Selection
# The three per-customer spending aggregates are the clustering inputs.
feature_columns = [
    'total_spend',
    'transaction_count',
    'average_spend_per_transaction',
]
features = customer_summary[feature_columns]

# Step 3: Standardizing the data
# Learn each column's mean and standard deviation, then rescale so that every
# feature contributes on a comparable scale to the clustering that follows.
scaler = StandardScaler()
scaler.fit(features)
scaled_features = scaler.transform(features)

# Step 4: Clustering using KMeans
# A single, fixed k=4 is fitted here; no sweep over other cluster counts is
# performed.
# n_init is passed explicitly because its default changed from 10 to 'auto'
# in scikit-learn 1.4 (and omitting it raises a FutureWarning in 1.2/1.3);
# pinning it keeps the labels reproducible across library versions.
kmeans = KMeans(n_clusters=4, n_init=10, random_state=42)
customer_summary['Cluster'] = kmeans.fit_predict(scaled_features)

# Step 5: Clustering Evaluation using Davies-Bouldin Index
# Computed on the same standardized features the model was fitted on.
# Lower values indicate better-separated clusters; 0 is the minimum.
db_index = davies_bouldin_score(scaled_features, customer_summary['Cluster'])
print(f'Davies-Bouldin Index: {db_index}')

# Step 6: Visualizing the Clusters

# Project the standardized features onto two principal components so the
# customers can be drawn on a flat chart.
pca = PCA(n_components=2)
pca_components = pca.fit_transform(scaled_features)

# Scatter of every customer in PCA space, coloured by assigned cluster.
cluster_fig, cluster_ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(
    x=pca_components[:, 0],
    y=pca_components[:, 1],
    hue=customer_summary['Cluster'],
    palette="Set2",
    ax=cluster_ax,
)
cluster_ax.set_title('Customer Segmentation - KMeans Clustering')
cluster_ax.set_xlabel('Principal Component 1')
cluster_ax.set_ylabel('Principal Component 2')
cluster_ax.legend(title='Cluster')
plt.show()

# Step 7: Additional Clustering Metrics
# Optional: Silhouette Score (for clustering quality)
# Computed on the standardized features with the KMeans labels. Values lie in
# [-1, 1]; higher means points sit closer to their own cluster than to others.
# (silhouette_score is imported with the other sklearn names at the top of
# the file.)
sil_score = silhouette_score(scaled_features, customer_summary['Cluster'])
print(f'Silhouette Score: {sil_score}')

# Step 8: Cluster Center visualization
# The KMeans centroids live in the standardized feature space; push them
# through the already-fitted PCA so they share axes with the customer points.
centroids = kmeans.cluster_centers_
pca_centroids = pca.transform(centroids)

# Visualize centroids on PCA plot
# Open a fresh figure explicitly instead of relying on the preceding
# plt.show() having closed the last one: with a non-blocking show (interactive
# mode, non-GUI backends) these points would otherwise be drawn on top of the
# previous chart. The size matches the first plot for consistency.
plt.figure(figsize=(10, 6))
plt.scatter(pca_components[:, 0], pca_components[:, 1], c=customer_summary['Cluster'], cmap='viridis', alpha=0.5)
plt.scatter(pca_centroids[:, 0], pca_centroids[:, 1], s=300, c='red', marker='X', label='Centroids')
plt.title('Clusters and Centroids')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
# Only the centroid series carries a label, so the legend lists just that entry.
plt.legend()
plt.show()
