# Clustering Analysis - Customer Segmentation

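This notebook segments customers with K-Means clustering on a synthetic dataset whose features (age, annual income, and spending score) are similar to those in the provided task description. The number of clusters is explored with the elbow method and silhouette scores, and the resulting segments are visualized with PCA, a pairplot, and a centroid plot.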

In [None]:
# Import libraries
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt
import seaborn as sns

# Generate synthetic dataset
# Seed NumPy's global RNG so every run produces the same customers. The three
# feature draws below share that random stream, so their order (age, income,
# spending score) must not change.
np.random.seed(42)
n_customers = 200


def _uniform_ints(low, high):
    """Draw `n_customers` integers from [low, high) using the global NumPy RNG."""
    return np.random.randint(low, high, size=n_customers)


# randint excludes the upper bound, so the realised ranges are
# 18-69 (age), 15000-119999 (income) and 1-99 (spending score).
ages = _uniform_ints(18, 70)
annual_income = _uniform_ints(15000, 120000)
spending_score = _uniform_ints(1, 100)

# Sequential 1-based identifiers; these do not consume random numbers.
customer_ids = np.arange(n_customers) + 1

column_data = [
    ('CustomerID', customer_ids),
    ('Age', ages),
    ('AnnualIncome', annual_income),
    ('SpendingScore', spending_score),
]
df = pd.DataFrame(dict(column_data))

# Display first few rows
df.head()

In [None]:
# Data preprocessing
# K-Means is distance-based, so each feature is standardized first; otherwise
# AnnualIncome (tens of thousands) would swamp Age and SpendingScore (tens).
features = ['Age', 'AnnualIncome', 'SpendingScore']
scaler = StandardScaler()
X_scaled = scaler.fit_transform(df[features])

# Determine optimal number of clusters using Elbow Method:
# fit K-Means for every candidate k and record the within-cluster sum of
# squares (inertia). The candidate range is defined once so the loop and the
# plot's x-axis cannot drift apart.
elbow_k_values = range(1, 11)
wcss = []
for k in elbow_k_values:
    # n_init is pinned explicitly. scikit-learn 1.4 changed its default from 10
    # to 'auto' (a single run with k-means++ init), and 1.2-1.3 emit a
    # FutureWarning when it is omitted, so relying on the default makes the
    # curve depend on the installed version.
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
    kmeans.fit(X_scaled)
    wcss.append(kmeans.inertia_)

plt.figure(figsize=(8,5))
plt.plot(elbow_k_values, wcss, marker='o')
plt.title('Elbow Method for Optimal k')
plt.xlabel('Number of clusters')
plt.ylabel('WCSS')
# One tick per candidate k: cluster counts are integers.
plt.xticks(list(elbow_k_values))
plt.show()

In [None]:
# Silhouette scores for different k
# The silhouette score needs at least two clusters, so the candidates start at
# k=2. The range is defined once and shared by the loop and the plot.
silhouette_k_values = range(2, 11)
sil_scores = []
for k in silhouette_k_values:
    # n_init is pinned explicitly: its default changed from 10 to 'auto' in
    # scikit-learn 1.4 (and omitting it warns on 1.2-1.3), which would make
    # the scores depend on the installed version.
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
    labels = kmeans.fit_predict(X_scaled)
    sil_scores.append(silhouette_score(X_scaled, labels))

plt.figure(figsize=(8,5))
plt.plot(silhouette_k_values, sil_scores, marker='o', color='blue')
plt.title('Silhouette Scores for Different k')
plt.xlabel('Number of clusters')
plt.ylabel('Silhouette Score')
# One tick per candidate k: cluster counts are integers.
plt.xticks(list(silhouette_k_values))
plt.show()

In [None]:
# From the plots, choose optimal k (e.g., 4)
# NOTE: this value is set by hand after inspecting the elbow and silhouette
# plots; it is not derived from them programmatically.
optimal_k = 4
# n_init is pinned explicitly (default changed from 10 to 'auto' in
# scikit-learn 1.4) so the final segmentation is the same on every version.
kmeans = KMeans(n_clusters=optimal_k, n_init=10, random_state=42)
df['Cluster'] = kmeans.fit_predict(X_scaled)

# PCA for 2D visualization
# Project the standardized features onto their first two principal components
# purely for plotting; the clustering above was done in the full feature space.
pca = PCA(n_components=2)
components = pca.fit_transform(X_scaled)
df['PCA1'] = components[:, 0]
df['PCA2'] = components[:, 1]

# Plot clusters
plt.figure(figsize=(8,6))
sns.scatterplot(data=df, x='PCA1', y='PCA2', hue='Cluster', palette='Set2', s=60)
plt.title('Customer Segments (PCA Visualization)')
plt.show()

In [None]:
# Pairplot to visualize feature relationships within clusters
# Only the original features are plotted (vars=features); the Cluster column is
# used solely to color the points.
pair_grid = sns.pairplot(df, vars=features, hue='Cluster', palette='Set2')
# End with plt.show() like the other plotting cells, so the figure is rendered
# without the PairGrid object's repr showing up as the cell output.
plt.show()

In [None]:
# Centroid visualization
# The fitted cluster centers are in standardized units; invert the scaling so
# they can be drawn on the original AnnualIncome / SpendingScore axes.
centroids = scaler.inverse_transform(kmeans.cluster_centers_)
centroid_df = pd.DataFrame(centroids, columns=features).assign(Cluster=range(optimal_k))

fig, ax = plt.subplots(figsize=(8,6))

# Both layers share the same axes, coordinates and color mapping.
layer_kwargs = dict(x='AnnualIncome', y='SpendingScore', hue='Cluster', palette='Set2', ax=ax)

# Customers first, then the centroids on top as large 'X' markers. The centroid
# layer skips the legend so the cluster entries are not listed twice.
sns.scatterplot(data=df, s=60, **layer_kwargs)
sns.scatterplot(data=centroid_df, s=200, marker='X', legend=False, **layer_kwargs)

ax.set_title('Clusters and Centroids')
plt.show()

## Recommendations
- Target marketing campaigns to specific customer segments based on their spending behavior and income.
- Develop loyalty programs for high-spending clusters.
- Tailor product offerings to different age and income groups.
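- Use cluster insights for resource allocation and strategic planning.

**Note:** The data in this notebook are randomly generated, with each feature drawn independently and uniformly, so the clusters illustrate the workflow rather than real customer segments. Validate these recommendations on real customer data before acting on them.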