In [24]:
import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN, Birch
from sklearn.metrics import jaccard_score, adjusted_rand_score, calinski_harabasz_score, silhouette_score, davies_bouldin_score
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

In [25]:
X = 'mall customers.csv'

In [26]:
# Load data
df = pd.read_csv(X)

In [27]:
df.head()

Unnamed: 0,CustomerID,Genre,Age,Annual Income (k$),Spending Score (1-100)
0,1,Male,19,15,39
1,2,Male,21,15,81
2,3,Female,20,16,6
3,4,Female,23,16,77
4,5,Female,31,17,40


In [28]:
print(df.info())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 5 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   CustomerID              200 non-null    int64 
 1   Genre                   200 non-null    object
 2   Age                     200 non-null    int64 
 3   Annual Income (k$)      200 non-null    int64 
 4   Spending Score (1-100)  200 non-null    int64 
dtypes: int64(4), object(1)
memory usage: 7.9+ KB
None


In [29]:
df.rename(columns={'Genre': 'Gender'}, inplace= True)

In [32]:
df.head()

Unnamed: 0,CustomerID,Gender,Age,Annual Income (k$),Spending Score (1-100)
0,1,0,19,15,39
1,2,0,21,15,81
2,3,1,20,16,6
3,4,1,23,16,77
4,5,1,31,17,40


In [31]:
df.Gender = [1 if each == "Female" else 0 for each in df.Gender]


In [33]:
features = df[['Age', 'Spending Score (1-100)', 'Annual Income (k$)']]

In [34]:
scaler = StandardScaler()
s_data = scaler.fit_transform(df)

In [35]:
PCA = PCA(n_components = 2)
re_data = PCA.fit_transform(s_data)

In [36]:
DBscan = DBSCAN(eps= 0.5, min_samples= 5)
Birch = Birch(n_clusters= 2)
agglo = AgglomerativeClustering(n_clusters= 2)
kmean = KMeans(n_clusters= 2, random_state= 42)

In [37]:
DB_label = DBscan.fit_predict(s_data)
birch_label = Birch.fit_predict(s_data)
agglo_label = agglo.fit_predict(s_data)
kmean_label = kmean.fit_predict(s_data)

In [38]:
sil = [silhouette_score(s_data, predicted_labels) for predicted_labels in [birch_label, DB_label, agglo_label, kmean_label]]
dbscore = [davies_bouldin_score(s_data, predicted_labels) for predicted_labels in [birch_label, DB_label, agglo_label, kmean_label]]

In [39]:
print("Clustering Methods    : birch_label, DB_label, agglo_label, kmean_label")
print("Silhouette Scores     :", sil)
print("Davies Bouldin Score:", dbscore)

Clustering Methods    : birch_label, DB_label, agglo_label, kmean_label
Silhouette Scores     : [0.1940733413172533, -0.128701372094438, 0.2511952763244898, 0.25418996301027574]
Davies Bouldin Score: [1.8460238296285925, 1.5094575126273528, 1.5635131897716499, 1.5412318706224073]
