In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.cluster.hierarchy as shc

from sklearn import metrics
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA 
from sklearn.metrics import silhouette_score
from sklearn.cluster import SpectralClustering
from sklearn.cluster import AgglomerativeClustering
from sklearn.preprocessing import StandardScaler, normalize

In [None]:
# Prepare the feature table: drop the identifier column and impute two columns.
# NOTE(review): `data` must already be a DataFrame created by an earlier cell that
# is not visible here — confirm the loading cell exists so Restart & Run All works.
# errors="ignore" keeps this cell re-runnable: a second execution no longer raises
# KeyError because CUST_ID has already been removed.
data = data.drop(columns="CUST_ID", errors="ignore")

# Replace missing values in each of these columns with that column's own mean.
for column in ("CREDIT_LIMIT", "MINIMUM_PAYMENTS"):
    data[column] = data[column].fillna(data[column].mean())

In [None]:
# Pairwise correlation of every column in `data`, drawn as a heatmap.
fig, ax = plt.subplots(figsize=(9, 7))
correlations = data.corr()
sns.heatmap(correlations, cmap='coolwarm', ax=ax)
ax.set_title('Correlation Matrix')

In [None]:
# sklearn's normalize() with default arguments rescales each ROW (sample) to unit
# L2 norm; it does not standardise columns. The result is a bare array, so it is
# wrapped back into a DataFrame carrying the original column names.
data_scaled = pd.DataFrame(normalize(data), columns=data.columns)
data_scaled.head()

In [None]:
# Agglomerative (Ward) clustering on two features, drawn side by side for
# 3, 4 and 5 clusters in a single figure.
feature_idx = [0, 3]
X = data_scaled.iloc[:, feature_idx].values

# Axis labels are read from the frame so they always name the columns that are
# actually plotted; the previous hard-coded strings were not tied to feature_idx.
x_label, y_label = data_scaled.columns[feature_idx]

palette = ['red', 'blue', 'green', 'olive', 'peru']
# Legend text per panel, indexed by cluster id. Cluster ids are arbitrary, so the
# same id need not denote the same group of customers across panels.
panel_labels = {
    3: ['A Customers', 'B Customers', 'C Customers'],
    4: ['A Customers', 'B Customers', 'C Customers', 'D Customers'],
    5: ['Sensible Customers', 'A Customers', 'Careful Customers', 'B Customers', 'C Customers'],
}

# One figure with one axes per cluster count (previously three separate figures,
# each holding a single panel of an otherwise empty 2x3 grid).
fig, axes = plt.subplots(1, len(panel_labels), figsize=(18, 6))

for ax, (n_clusters, labels) in zip(axes, panel_labels.items()):
    # Ward linkage only works with Euclidean distances, which is the default, so
    # the `affinity=` keyword (removed in recent scikit-learn releases) is omitted.
    hierarchical = AgglomerativeClustering(n_clusters=n_clusters, linkage='ward')
    y_hier = hierarchical.fit_predict(X)

    for cluster_id, (colour, label) in enumerate(zip(palette, labels)):
        members = y_hier == cluster_id
        ax.scatter(X[members, 0], X[members, 1], s=100, c=colour, label=label)

    ax.set_title(f'{n_clusters} Clusters of Customers')
    ax.set_xlabel(x_label)
    ax.set_ylabel(y_label)
    ax.legend()

fig.tight_layout()
plt.show()

In [None]:
# Ward linkage over every row of data_scaled, then the resulting merge tree is
# drawn on a fresh figure.
ward_links = shc.linkage(data_scaled, method='ward')

plt.figure(figsize=(10, 7))
plt.title("Dendrograms")
dend = shc.dendrogram(ward_links)

In [None]:
# Two-column feature matrix (positional columns 0 and 3) used by the cells below.
X = data_scaled.iloc[:, [0, 3]].to_numpy()


In [None]:
# Elbow method: within-cluster sum of squares (KMeans inertia) for k = 1..10 on X.
cluster_counts = range(1, 11)
wcss = [
    KMeans(n_clusters=n, init='k-means++', random_state=42).fit(X).inertia_
    for n in cluster_counts
]

fig, ax = plt.subplots()
ax.plot(cluster_counts, wcss)
ax.set_title('Elbow Method')
ax.set_xlabel('Number of clusters')
ax.set_ylabel('WCSS')
plt.show()

In [None]:
# Silhouette score of KMeans for k = 2..10.
# NOTE(review): scores are computed on every column of data_scaled, whereas the
# elbow plot and the fitted models elsewhere in this notebook use the two-column X
# — confirm which feature set k is meant to be chosen on.
cluster_range = range(2, 11)

silhouette_scores = []
for n_cluster in cluster_range:
    # Fixed seed so the chart is identical on every run (KMeans initialisation is
    # random; without a seed the bars change between executions).
    labels = KMeans(n_clusters=n_cluster, init='k-means++', random_state=42).fit_predict(data_scaled)
    silhouette_scores.append(silhouette_score(data_scaled, labels))

# Plotting a bar graph to compare the results.
# The x positions are derived from the loop range so the two cannot drift apart.
k = list(cluster_range)
plt.bar(k, silhouette_scores)
plt.xlabel('Number of clusters', fontsize=10)
plt.ylabel('Silhouette Score', fontsize=10)
plt.show()

In [None]:
# KMeans on the two-column matrix X, drawn side by side for 3, 4 and 5 clusters,
# with the fitted centroids overlaid on each panel.

# Axis labels are read from the frame so they name the plotted columns.
# The indices must match the ones used to build X.
x_label, y_label = data_scaled.columns[[0, 3]]

palette = ['red', 'blue', 'green', 'olive', 'peru']
# Legend text per panel, indexed by cluster id. Cluster ids are arbitrary, so the
# same id need not denote the same group of customers across panels.
panel_labels = {
    3: ['Sensible Customers', 'Careful Customers', 'A Customers'],
    4: ['Careful Customers', 'Sensible Customers', 'A Customers', 'B Customers'],
    5: ['Sensible Customers', 'A Customers', 'B Customers', 'C Customers', 'Careful Customers'],
}

fig, axes = plt.subplots(1, len(panel_labels), figsize=(20, 7))

for ax, (n_clusters, labels) in zip(axes, panel_labels.items()):
    kmeans = KMeans(n_clusters=n_clusters, init='k-means++', random_state=42)
    y_kmeans = kmeans.fit_predict(X)

    for cluster_id, (colour, label) in enumerate(zip(palette, labels)):
        members = y_kmeans == cluster_id
        ax.scatter(X[members, 0], X[members, 1], s=100, c=colour, label=label)

    centres = kmeans.cluster_centers_
    ax.scatter(centres[:, 0], centres[:, 1], s=300, c='yellow', label='Centroids')

    ax.set_title(f'{n_clusters} Clusters of Customers')
    ax.set_xlabel(x_label)
    ax.set_ylabel(y_label)
    # legend() only lists artists that exist when it is called, so it must come
    # after the centroid scatter for the 'Centroids' entry to appear.
    ax.legend()

fig.tight_layout()
plt.show()

In [None]:
# Cluster labels for the two methods being compared, both fitted on X with 3 clusters.
# Ward linkage only works with Euclidean distances, which is the default, so the
# `affinity=` keyword (removed in recent scikit-learn releases) is omitted.
hierarchical_ = AgglomerativeClustering(n_clusters=3, linkage='ward').fit_predict(X)
kmeans_ = KMeans(n_clusters=3, init='k-means++', random_state=42).fit_predict(X)

In [None]:
# Euclidean silhouette score of each labelling, measured on X.
# `silhouette_score` is the same function as `metrics.silhouette_score`.
Hierarchical_Silhouette_Score = silhouette_score(X, hierarchical_, metric='euclidean')
kmeansSilhouette_Score = silhouette_score(X, kmeans_, metric='euclidean')

In [None]:
# One row per clustering method, shown best score first.
Clustering_Silhouette_Scores = pd.DataFrame({
    'Clustering Method': ['KMeans', 'Hierarchical'],
    'Silhouette Score': [kmeansSilhouette_Score, Hierarchical_Silhouette_Score],
})
Clustering_Silhouette_Scores.sort_values(by='Silhouette Score', ascending=False)

In [None]:
# Spectral clustering on the two-column matrix X, drawn side by side for
# 3, 4 and 5 clusters.

# Axis labels are read from the frame so they name the plotted columns.
# The indices must match the ones used to build X.
x_label, y_label = data_scaled.columns[[0, 3]]

palette = ['red', 'blue', 'green', 'olive', 'peru']
# Legend text per panel, indexed by cluster id. Cluster ids are arbitrary, so the
# same id need not denote the same group of customers across panels.
panel_labels = {
    3: ['A Customers', 'B Customers', 'C Customers'],
    4: ['A Customers', 'B Customers', 'C Customers', 'D Customers'],
    5: ['Sensible Customers', 'A Customers', 'B Customers', 'C Customers', 'D Customers'],
}

# A single row of panels (previously the top row of a half-empty 2x3 grid).
fig, axes = plt.subplots(1, len(panel_labels), figsize=(20, 7))

for ax, (n_clusters, labels) in zip(axes, panel_labels.items()):
    spectral = SpectralClustering(n_clusters=n_clusters, affinity="nearest_neighbors",
                                  assign_labels='discretize', random_state=40)
    y_spectral = spectral.fit_predict(X)

    for cluster_id, (colour, label) in enumerate(zip(palette, labels)):
        members = y_spectral == cluster_id
        ax.scatter(X[members, 0], X[members, 1], s=100, c=colour, label=label)

    ax.set_title(f'{n_clusters} Clusters of Customers')
    ax.set_xlabel(x_label)
    ax.set_ylabel(y_label)
    ax.legend()

fig.tight_layout()
plt.show()

In [None]:
# Cluster labels for the three methods being compared, all with 3 clusters.
# Every model is fitted on the same matrix X, because the silhouette scores are
# computed on X; spectral clustering was previously fitted on data_scaled, which
# made its score not comparable with the other two.
# Ward linkage only works with Euclidean distances, which is the default, so the
# `affinity=` keyword (removed in recent scikit-learn releases) is omitted.
hierarchical_ = AgglomerativeClustering(n_clusters=3, linkage='ward').fit_predict(X)
kmeans_ = KMeans(n_clusters=3, init='k-means++', random_state=42).fit_predict(X)
spectral_ = SpectralClustering(n_clusters=3, affinity="nearest_neighbors", assign_labels='discretize',
                               random_state=40).fit_predict(X)

In [None]:
# Euclidean silhouette score of each labelling, measured on X.
# `silhouette_score` is the same function as `metrics.silhouette_score`.
Spectral_Silhouette_Score = silhouette_score(X, spectral_, metric='euclidean')
Hierarchical_Silhouette_Score = silhouette_score(X, hierarchical_, metric='euclidean')
kmeansSilhouette_Score = silhouette_score(X, kmeans_, metric='euclidean')

In [None]:
# One row per clustering method, shown best score first.
Clustering_Silhouette_Scores = pd.DataFrame({
    'Clustering Method': ['KMeans', 'Hierarchical', 'Spectral'],
    'Silhouette Score': [
        kmeansSilhouette_Score,
        Hierarchical_Silhouette_Score,
        Spectral_Silhouette_Score,
    ],
})
Clustering_Silhouette_Scores.sort_values(by='Silhouette Score', ascending=False)