In [12]:
import os

# Restrict OpenMP to one thread. This cell runs before the numeric libraries
# are imported in the next cell.
# NOTE(review): presumably here to quiet scikit-learn's KMeans threading
# warning -- confirm and record the reason.
os.environ.update(OMP_NUM_THREADS="1")

In [13]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt
import seaborn as sns

In [14]:
# Read the customer table from the CSV that sits next to the notebook, then
# echo its column labels as the cell output.
data = pd.read_csv(filepath_or_buffer='Mall_Customers.csv')
data.columns

Index(['CustomerID', 'Gender', 'Age', 'Annual Income (k$)',
       'Spending Score (1-100)'],
      dtype='object')

In [15]:
def kmeans_cluster(data, n_clusters=4, random_state=42):
    """Cluster customers with K-Means on gender, age, income and spending score.

    Rows missing any of the four feature values are discarded *before* the
    gender column is encoded. Casting to ``str`` first can turn a missing
    gender into the literal category ``'nan'``, which ``dropna`` then no
    longer sees, so the row would be clustered as if it had a third gender.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns ``'Gender'``, ``'Age'``,
        ``'Annual Income (k$)'`` and ``'Spending Score (1-100)'``.
        The caller's frame is not modified.
    n_clusters : int, default 4
        Number of clusters passed to ``KMeans``.
    random_state : int, default 42
        Seed passed to ``KMeans`` so that runs are repeatable.

    Returns
    -------
    tuple of (pandas.DataFrame, KMeans)
        A copy of the complete rows of ``data`` (index reset to 0..n-1) with
        an added integer ``'Cluster'`` column, and the fitted estimator.

    Raises
    ------
    KeyError
        If any of the required feature columns is absent from ``data``.
    """
    feature_cols = ['Gender', 'Age', 'Annual Income (k$)', 'Spending Score (1-100)']

    # A boolean mask (rather than index labels) keeps the feature rows and the
    # returned rows aligned even when the incoming index has duplicate labels.
    complete_rows = data[feature_cols].notna().all(axis=1)
    features = data.loc[complete_rows, feature_cols].copy()
    clustered = data.loc[complete_rows].reset_index(drop=True)

    # Encode gender as integer codes; only genuinely observed values remain.
    features['Gender'] = LabelEncoder().fit_transform(features['Gender'].astype(str))

    # Standardise so no single feature dominates the Euclidean distances.
    X = StandardScaler().fit_transform(features)

    # Fit KMeans and attach the label of each row to the returned frame.
    kmeans = KMeans(n_clusters=n_clusters, random_state=random_state, n_init=10)
    clustered['Cluster'] = kmeans.fit_predict(X)
    return clustered, kmeans



In [16]:
# Run the clustering helper on the loaded table, asking for five clusters
# instead of the helper's default of four.
df_clusters, kmeans_model = kmeans_cluster(data=data, n_clusters=5)

In [17]:
# Preview the first labelled rows, then show the fitted estimator on its own.
# Passing both objects to a single print() joins them with a space, which
# glues the estimator repr onto the last row of the table.
print(df_clusters.head())
print(kmeans_model)

   CustomerID  Gender  Age  Annual Income (k$)  Spending Score (1-100)  \
0           1    Male   19                  15                      39   
1           2    Male   21                  15                      81   
2           3  Female   20                  16                       6   
3           4  Female   23                  16                      77   
4           5  Female   31                  17                      40   

   Cluster  
0        3  
1        3  
2        3  
3        3  
4        3   KMeans(n_clusters=5, n_init=10, random_state=42)


In [18]:
# Per-cluster profile: row count, mean and standard deviation of each
# numeric customer attribute.
cluster_summary = (
    df_clusters
    .groupby('Cluster')[['Age', 'Annual Income (k$)', 'Spending Score (1-100)']]
    .agg(['count', 'mean', 'std'])
)
print(cluster_summary)

          Age                      Annual Income (k$)                        \
        count       mean       std              count       mean        std   
Cluster                                                                       
0          39  32.692308  3.728650                 39  86.538462  16.312485   
1          29  36.482759  9.679214                 29  89.517241  17.418424   
2          43  49.813953  9.474919                 43  49.232558  15.601949   
3          54  24.907407  5.349197                 54  39.722222  16.981029   
4          35  55.714286  9.596480                 35  53.685714  18.714215   

        Spending Score (1-100)                        
                         count       mean        std  
Cluster                                               
0                           39  82.128205   9.364489  
1                           29  18.000000  10.579630  
2                           43  40.069767  15.555424  
3                           54  61.20