In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, davies_bouldin_score
from sklearn.decomposition import PCA

# --- Load and preprocess ------------------------------------------------------
# The path is relative, so it is resolved against the kernel's working
# directory; pd.read_csv raises FileNotFoundError when the file is not there.
DATA_PATH = 'customer.csv'

# Load the dataset and drop rows with missing income.
# Chaining (rather than dropna(inplace=True)) builds the cleaned frame in a
# single expression; .copy() makes it an independent frame so the column
# assignments below never act on a view of the raw data.
df = pd.read_csv(DATA_PATH).dropna(subset=['Income']).copy()

# Encode categorical variables as integer codes.
# Each column gets its own fitted LabelEncoder, stored in `encoders`, so the
# codes can be translated back later with
# encoders[column].inverse_transform(...). Refitting a single shared encoder
# would discard the mapping of every column but the last.
#
# NOTE(review): LabelEncoder numbers its classes in sorted order, so the
# codes for 'Dt_Customer' follow the sort order of the raw values. If that
# column holds date strings, the codes are chronological only when the
# strings themselves sort chronologically - confirm the format, and consider
# converting to a numeric date (e.g. via pd.to_datetime) instead.
categorical_columns = ['Education', 'Marital_Status', 'Dt_Customer']
encoders = {}
for column in categorical_columns:
    encoder = LabelEncoder()
    df[column] = encoder.fit_transform(df[column])
    encoders[column] = encoder

# Standardize the data.
# Every column of df is passed to the scaler, so all remaining columns must
# be numeric at this point.
# NOTE(review): identifier-like or constant columns, if the CSV has any, are
# scaled and clustered along with the real features - confirm the column set.
scaler = StandardScaler()
df_scaled = scaler.fit_transform(df)

# Perform PCA for dimensionality reduction
pca = PCA(n_components=2)  # Reduce to 2 components for easier visualization
df_pca = pca.fit_transform(df_scaled)

# Elbow Method to find optimal clusters
# The models are fitted on df_pca - the same PCA projection that the
# silhouette / Davies-Bouldin scan and the fitted cluster models use - so the
# inertia curve describes the feature space that is actually clustered.
k_values = range(1, 11)
wcss = []
for k in k_values:
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10, max_iter=300)
    kmeans.fit(df_pca)
    # inertia_ is the within-cluster sum of squared distances for this k.
    wcss.append(kmeans.inertia_)

# Plotting the Elbow Curve
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(list(k_values), wcss, marker='o')
ax.set_title("Elbow Method for Optimal Clusters", fontsize=16)
ax.set_xlabel("Number of Clusters (k)", fontsize=12)
ax.set_ylabel("WCSS (Inertia)", fontsize=12)
ax.grid(True)
plt.show()

# Score K-Means solutions for k = 2..10 on the PCA projection.
# For each k the silhouette coefficient and the Davies-Bouldin index are
# recorded (rounded to 3 decimals for the table); the k with the highest
# unrounded silhouette is remembered as best_k.
silhouette_scores_table = []
best_silhouette, best_k = -1, None

for n_clusters in range(2, 11):
    kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10, max_iter=300)
    cluster_labels = kmeans.fit_predict(df_pca)

    silhouette_avg = silhouette_score(df_pca, cluster_labels)
    db_score = davies_bouldin_score(df_pca, cluster_labels)

    table_row = {
        "Number of Clusters": n_clusters,
        "Silhouette Score": round(silhouette_avg, 3),
        "Davies-Bouldin Score": round(db_score, 3),
    }
    silhouette_scores_table.append(table_row)

    # Strict comparison: on a tie the smaller k seen first is kept.
    if silhouette_avg > best_silhouette:
        best_silhouette, best_k = silhouette_avg, n_clusters

print(best_k)

# Tabulate the per-k scores and show them in the console
silhouette_scores_df = pd.DataFrame(silhouette_scores_table)
print(silhouette_scores_df)

# Apply K-Means with the optimal number of clusters found (best_k comes from
# the silhouette scan) and store the labels on the frame.
kmeans_optimized = KMeans(n_clusters=best_k, random_state=42, n_init=10, max_iter=300)
df['Optimized_Cluster'] = kmeans_optimized.fit_predict(df_pca)

# Scores for a fixed 4-cluster solution on the same projection.
# These describe k=4 specifically, which is not necessarily the best-scoring
# k from the scan, so the printed labels say so explicitly.
kmeans_4_clusters = KMeans(n_clusters=4, random_state=42, n_init=10, max_iter=300)
df['Cluster_4'] = kmeans_4_clusters.fit_predict(df_pca)
silhouette_score_4_clusters = silhouette_score(df_pca, df['Cluster_4'])
davies_bouldin_score_4_clusters = davies_bouldin_score(df_pca, df['Cluster_4'])

print(f"Silhouette Score (k=4): {silhouette_score_4_clusters}")
print(f"Davies-Bouldin Score (k=4): {davies_bouldin_score_4_clusters}")

# Perform PCA for dimensionality reduction
# This refit replaces the earlier 2-component `pca` / `df_pca`.
# NOTE(review): the silhouette and Davies-Bouldin scores printed before this
# point were computed on the 2-component projection, so they do not describe
# the 5-component clustering fitted below - re-score here if those numbers
# are meant to characterise the final segments.
pca = PCA(n_components=5)  # Keep 5 components for clustering
df_pca = pca.fit_transform(df_scaled)

# Apply K-Means with the chosen number of clusters.
# n_init and max_iter are given explicitly, matching every other KMeans call
# in this notebook, so the fit does not depend on the library's default
# n_init (which differs between scikit-learn versions).
optimal_clusters = 4
kmeans = KMeans(n_clusters=optimal_clusters, random_state=42, n_init=10, max_iter=300)
df['Cluster_4'] = kmeans.fit_predict(df_pca)

# Visualize Clusters with PCA
# Only the first two principal components are plotted, whatever the number of
# components df_pca holds.
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(
    x=df_pca[:, 0],
    y=df_pca[:, 1],
    hue=df['Cluster_4'],
    palette="Set2",
    s=100,
    ax=ax,
)

# Add custom legend descriptions
# NOTE(review): K-Means label ids carry no inherent meaning, so this
# id -> description mapping only holds for the specific fitted model it was
# written against - re-check it whenever the data, preprocessing, or
# clustering settings change.
cluster_descriptions = {
    0: "Moderate Spenders",
    1: "Luxury Shoppers",
    2: "Digital-Native Bargain Seekers",
    3: "Budget-Conscious Families"
}
handles, labels = ax.get_legend_handles_labels()

# Filter handles and labels together so they stay paired even if the legend
# contains a non-numeric entry; an id without a description gets a generic
# fallback instead of raising KeyError.
legend_entries = [
    (handle, f"{label} - {cluster_descriptions.get(int(label), 'Unlabelled cluster')}")
    for handle, label in zip(handles, labels)
    if label.isdigit()
]
if legend_entries:
    legend_handles, updated_labels = (list(column) for column in zip(*legend_entries))
    ax.legend(legend_handles, updated_labels, title="Cluster", fontsize=9, title_fontsize=12)

ax.set_title("Customer Segments (PCA)", fontsize=16)
ax.set_xlabel("Principal Component 1", fontsize=12)
ax.set_ylabel("Principal Component 2", fontsize=12)
ax.grid(True)
plt.show()


FileNotFoundError: [Errno 2] No such file or directory: 'customer.csv'

In [55]:
pip install ace_tools

Collecting ace_tools
  Downloading ace_tools-0.0-py3-none-any.whl.metadata (300 bytes)
Downloading ace_tools-0.0-py3-none-any.whl (1.1 kB)
Installing collected packages: ace_tools
Successfully installed ace_tools-0.0
Note: you may need to restart the kernel to use updated packages.
