Generally, I didn't make comments. I will probably update the kernel.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

np.random.seed(42)

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans, SpectralClustering, AgglomerativeClustering, AffinityPropagation, estimate_bandwidth, MeanShift, DBSCAN
from sklearn.metrics import silhouette_score
from yellowbrick.cluster import SilhouetteVisualizer
from scipy.cluster.hierarchy import dendrogram, linkage

In [None]:
# Load the mall customers CSV into a DataFrame.
df = pd.read_csv("../input/customer-segmentation-tutorial-in-python/Mall_Customers.csv")

# Show 10 randomly sampled rows as a quick look at the data.
df.sample(10)

In [None]:
# Print column dtypes and non-null counts.
df.info()

In [None]:
# Summary statistics of the DataFrame's columns.
df.describe()

In [None]:
# Give the income and spending columns shorter, identifier-friendly names.
df.rename(
    columns={
        "Annual Income (k$)": "AnnualIncome",
        "Spending Score (1-100)": "SpendingScore",
    },
    inplace=True,
)

In [None]:
# Display the DataFrame to check the renamed columns.
df

In [None]:
def feature_distribution(df, col):
    """Plot a KDE, a boxplot and a probability plot for one column.

    Draws a 1x3 figure for ``df[col]``:

    * a filled kernel density estimate, titled with the column's skewness
      and kurtosis (both rounded to 3 decimals),
    * a boxplot,
    * a ``scipy.stats.probplot`` probability plot with restyled markers and
      fit line.

    The figure is displayed with ``plt.show()``; nothing is returned.

    Parameters
    ----------
    df : DataFrame holding the column to inspect.
    col : Name of the column to plot (also used in the figure title).
    """
    main_color = "#603F83"
    background = "#C7D3D4FF"

    series = df[col]
    skewness = np.round(series.skew(), 3)
    kurtosis = np.round(series.kurtosis(), 3)

    fig, axes = plt.subplots(1, 3, figsize=(18, 6))
    kde_ax, box_ax, prob_ax = axes[0], axes[1], axes[2]

    sns.kdeplot(data=df, x=col, fill=True, ax=kde_ax, color=main_color, linewidth=2)
    sns.boxplot(
        data=df,
        y=col,
        ax=box_ax,
        color=main_color,
        linewidth=2,
        flierprops={"marker": "x", "markersize": 3.5},
    )
    stats.probplot(series, plot=prob_ax)

    kde_ax.set_title(f"Distribution \nSkewness: {skewness}\nKurtosis: {kurtosis}")
    box_ax.set_title("Boxplot")
    prob_ax.set_title("Probability Plot")
    fig.suptitle("For Feature:  " + col)

    for panel in axes:
        panel.set_facecolor(background)
        panel.grid(linewidth=0.1)

    # probplot draws the sample points first and the fitted line second.
    prob_lines = prob_ax.get_lines()
    sample_points = prob_lines[0]
    fit_line = prob_lines[1]

    sample_points.set_markerfacecolor('#8157AE')
    sample_points.set_markeredgecolor('#603F83')
    sample_points.set_markeredgewidth(0.1)
    fit_line.set_color('#F1480F')
    fit_line.set_linewidth(3)

    sns.despine(top=True, right=True, left=True, bottom=True)
    plt.show()

In [None]:
# Inspect the distribution of each numeric feature.
numeric_features = ("Age", "AnnualIncome", "SpendingScore")
for feature in numeric_features:
    feature_distribution(df, feature)

Normalizing features before applying clustering techniques is an important step.

In [None]:
# Standardise each numeric column independently with its own StandardScaler.
for feature in ("Age", "AnnualIncome", "SpendingScore"):
    scaler = StandardScaler()
    column_2d = df[feature].to_numpy().reshape(-1, 1)  # scaler expects a 2-D array
    df[feature] = scaler.fit_transform(column_2d)

I use only two features for the model: "**AnnualIncome**" and "**SpendingScore**", which are the features most relevant to customer segmentation. Having just two features also makes it possible to create clearer visualizations and to understand the segments better.

In [None]:
# Feature matrix used by every clustering model below: income and spending score only.
X = df[["AnnualIncome", "SpendingScore"]]

In [None]:
# Hex colours for cluster hues; the plots slice this list to the number of clusters found.
palette = ["#280283", "#82005B", "#008B97", "#F1480F", "#9D9301",  "#4C00FF", 
           "#FF007B", "#00EAFF", "#9736FF", "#FFEE00", "#8992F3"]

# 1) KMeans

As we can see below, we can create many KMeans models by using different numbers of clusters. So how do we choose the optimal number of clusters?

We have two options: the elbow method and silhouette coefficients.

In [None]:
# Fit KMeans for k = 2..10 and draw each segmentation in its own panel of a 3x3 grid.
fig, axes = plt.subplots(3, 3, figsize=(25, 15), sharex=True, sharey=True)
axes = axes.ravel()
fig.suptitle("Segmentation using KMeans \nwith different Clusters")

for i, panel in zip(range(2, 11), axes):
    kmeans = KMeans(n_clusters=i, random_state=42)
    kmeans.fit(X)
    cluster = kmeans.labels_
    centers = kmeans.cluster_centers_

    # Customers coloured by their assigned cluster.
    sns.scatterplot(
        x=X["AnnualIncome"],
        y=X["SpendingScore"],
        hue=cluster,
        palette=palette[: len(set(cluster))],
        ax=panel,
        edgecolor=None,
    )

    # Cluster centres drawn on top.
    sns.scatterplot(
        x=centers[:, 0],
        y=centers[:, 1],
        s=200,
        color="#C0EB00",
        label="Centroids",
        marker="X",
        edgecolor="black",
        ax=panel,
    )

    panel.set_facecolor("#C7D3D4")
    panel.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
    panel.set_title("KMeans with " + str(i) + " Clusters")

plt.tight_layout()

# Elbow Method

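We will plot the sum of squared errors (SSE, i.e. the sum of squared distances from each point to its closest cluster centre) against the number of clusters. As the number of clusters increases, SSE tends to zero.

To choose the right number of clusters, we think of this plot as an arm: the elbow point gives our number of clusters.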

In [None]:
# SSE (KMeans inertia) for every candidate k from 1 to 10.
sse = {}
for k in range(1, 11):
    kmeans = KMeans(n_clusters=k, random_state=42).fit(X)
    sse[k] = kmeans.inertia_

candidate_ks = list(sse.keys())
sse_values = list(sse.values())

fig, ax = plt.subplots(figsize=(12, 8))
ax.set_facecolor("#C7D3D4")

sns.pointplot(x=candidate_ks, y=sse_values, ax=ax)

ax.set_title('The Elbow Method')
ax.set_xlabel('k')
ax.set_ylabel('SSE')

plt.show()

The elbow plot shows that 5 is the optimal number of clusters. Up to 5, SSE decreases significantly as the number of clusters increases. Beyond 5, SSE decreases only slightly as more clusters are added.

# Silhouette Coefficients

Silhouette coefficients or silhouette score is a metric that shows the quality of clustering.

[The silhouette value](https://en.wikipedia.org/wiki/Silhouette_(clustering)) is a measure of how similar an object is to its own cluster (cohesion) compared to other clusters (separation).

The silhouette ranges from −1 to +1, where a high value indicates that the object is well matched to its own cluster and poorly matched to neighboring clusters. 

If most objects have a high value, then the clustering configuration is appropriate. 

If many points have a low or negative value, then the clustering configuration may have too many or too few clusters.



So the number of clusters with the best silhouette score will be the right choice.

In [None]:
# Mean silhouette score for every candidate k (the score needs at least 2 clusters).
silhouette_coefficients = {}

for k in range(2, 11):
    kmeans = KMeans(n_clusters=k, random_state=42).fit(X)
    labels_for_k = kmeans.labels_
    silhouette_coefficients[k] = silhouette_score(X, labels_for_k)

In [None]:
# Plot the silhouette score obtained for each candidate number of clusters.
fig, ax = plt.subplots(figsize = (12, 8))

ax.set_facecolor("#C7D3D4")

sns.pointplot(x = list(silhouette_coefficients.keys()), y = list(silhouette_coefficients.values()))

plt.title("Silhouette Scores")
plt.xlabel("k")
# The y-axis holds silhouette scores, not SSE.
plt.ylabel("Silhouette Score")

plt.show()

If we look at the graph above, 5 is the optimal number of clusters because it has the best silhouette score.

In [None]:
# For every k in 2..10, show the silhouette plot next to the resulting segmentation.
for i in range(2, 11):

    fig, axes = plt.subplots(1, 2, figsize=(20, 6))
    silhouette_ax, scatter_ax = axes[0], axes[1]

    km = KMeans(n_clusters=i, init='k-means++', random_state=42)
    km.fit(X)
    cluster = km.labels_
    cluster_colors = palette[: len(set(cluster))]
    centers = km.cluster_centers_

    visualizer = SilhouetteVisualizer(km, colors=cluster_colors, ax=silhouette_ax)
    visualizer.fit(X)

    sns.scatterplot(
        x=X["AnnualIncome"],
        y=X["SpendingScore"],
        hue=cluster,
        palette=cluster_colors,
        ax=scatter_ax,
        edgecolor=None,
    )

    sns.scatterplot(
        x=centers[:, 0],
        y=centers[:, 1],
        s=250,
        color='#C0EB00',
        label='Centroids',
        marker="X",
        ax=scatter_ax,
        edgecolor="black",
    )

    plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)

    score = silhouette_score(X, cluster).round(3)
    silhouette_ax.set_title(f"Silhouette Plot for {i} Clusters\nSilhouette score : {score}")
    scatter_ax.set_title(f"Customer Segmentations for {i} Clusters")

    for panel in axes:
        panel.set_facecolor("#C7D3D4")

We chose the optimal cluster number. Let's create a model with 5 clusters and analyse it.

In [None]:
# Final KMeans model with the chosen number of clusters.
kmeans = KMeans(n_clusters=5, random_state=42)
kmeans.fit(X)
cluster = kmeans.labels_
centers = kmeans.cluster_centers_

fig, ax = plt.subplots(figsize=(15, 10))

# Customers coloured by cluster.
sns.scatterplot(
    x=X["AnnualIncome"],
    y=X["SpendingScore"],
    hue=cluster,
    palette=palette[: len(set(cluster))],
    ax=ax,
)

# Cluster centres on top.
sns.scatterplot(
    x=centers[:, 0],
    y=centers[:, 1],
    s=250,
    color='#C0EB00',
    label='Centroids',
    marker="X",
    ax=ax,
    edgecolor="black",
)

ax.set_title("Segmentation with KMeans - 5 Clusters")
ax.set_facecolor("#C7D3D4")
plt.show()

Boxplots will help us to specify characteristics of clusters.

In [None]:
# One boxplot per feature, grouped by the cluster labels currently held in `cluster`.
fig, axes = plt.subplots(1, 2, figsize=(18, 7))

# Style both panels before drawing, as the labels are set ahead of the boxplots.
for ax in axes:
    ax.set_facecolor("#C7D3D4")
    ax.set_xlabel("Clusters")

for feature, panel in zip(("AnnualIncome", "SpendingScore"), axes):
    sns.boxplot(x=cluster, y=feature, data=X, ax=panel)

plt.show()

**Cluster 0** --> <font color='red'>Low</font> Income, <font color='green'>High</font> Spend

**Cluster 1** --> Mid Income, Mid Spend

**Cluster 2** --> <font color='green'>High</font> Income, <font color='red'>Low</font> Spend

**Cluster 3** --> <font color='red'>Low</font> Income, <font color='red'>Low</font> Spend

**Cluster 4** --> <font color='green'>High</font> Income, <font color='green'>High</font> Spend

# 2) Hierarchical clustering

https://scikit-learn.org/stable/modules/clustering.html#hierarchical-clustering

[Hierarchical clustering](https://scikit-learn.org/stable/modules/clustering.html#hierarchical-clustering) is a general family of clustering algorithms that build nested clusters by merging or splitting them successively.

This hierarchy of clusters is represented as a tree (or dendrogram). 

The root of the tree is the unique cluster that gathers all the samples, the leaves being the clusters with only one sample. 

# 2.1) Agglomerative Clustering

https://scikit-learn.org/stable/modules/clustering.html#hierarchical-clustering

The [Agglomerative Clustering](https://scikit-learn.org/stable/modules/clustering.html#hierarchical-clustering) object performs a hierarchical clustering using a bottom-up approach: each observation starts in its own cluster, and clusters are successively merged together.

The linkage criteria determines the metric used for the merge strategy.

**Ward** minimizes the sum of squared differences within all clusters. It is a variance-minimizing approach and in this sense is similar to the k-means objective function but tackled with an agglomerative hierarchical approach.

**Maximum or complete** linkage minimizes the maximum distance between observations of pairs of clusters.

**Average** linkage minimizes the average of the distances between all observations of pairs of clusters.

**Single** linkage minimizes the distance between the closest observations of pairs of clusters.

In [None]:
# Compare the four linkage criteria of AgglomerativeClustering on a 2x2 grid,
# each producing 5 clusters.
fig, axes = plt.subplots(2, 2, figsize = (20, 15), sharex = True, sharey = True)
axes = axes.ravel()


for i, link in enumerate(["ward", "complete", "average", "single"]):
    
    # The distance metric is left at its default (Euclidean). The old
    # `affinity` keyword was renamed to `metric` in newer scikit-learn
    # releases, so omitting it keeps this cell working on both old and new versions.
    aggc = AgglomerativeClustering(n_clusters = 5, linkage = link)
    aggc.fit(X)
    cluster = aggc.labels_
    
    ax = axes[i]
    ax.set_facecolor("#C7D3D4")
    
    sns.scatterplot(x = X["AnnualIncome"], y = X["SpendingScore"], hue = cluster, palette = palette[: len(set(cluster))], ax = ax)
    
    ax.set_title("Linkage Method: " + link.capitalize())
    ax.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
    sns.despine(left = True, bottom = True)

If we look at the plots above, we can see four different clusterings. The Ward and complete-linkage clusterings look similar, but they are not the same.

# 2.2) Dendrogram

In hierarchical clustering, a dendrogram illustrates the arrangement of the clusters produced by the corresponding analyses.\*

\*Everitt, Brian (1998). Dictionary of Statistics. Cambridge, UK: Cambridge University Press. p. 96. ISBN 0-521-59346-8.

In [None]:
# Build the Ward linkage matrix and draw its dendrogram, labelling leaves by customer ID.
fig, ax = plt.subplots(figsize=(20, 10))
ax.set_title("Dendrograms")
ax.set_facecolor("#C7D3D4")

ward_links = linkage(X, method="ward")
customer_ids = df["CustomerID"].values

dend = dendrogram(ward_links, labels=customer_ids)

# 3) Spectral Clustering

In [None]:
# Spectral clustering into 5 clusters, using all available cores.
spectral = SpectralClustering(
    n_clusters=5,
    random_state=42,
    n_jobs=-1,
)
cluster = spectral.fit(X).labels_

In [None]:
# Scatter plot of the customers coloured by their spectral-clustering label.
fig, ax = plt.subplots(figsize=(15, 10))
ax.set_facecolor("#C7D3D4")
ax.set_title("Segmentation with Spectral Clustering - 5 Clusters")

sns.scatterplot(
    x=X["AnnualIncome"],
    y=X["SpendingScore"],
    hue=cluster,
    palette=palette[: len(set(cluster))],
    ax=ax,
)

plt.show()

# 4) Affinity Propagation

In [None]:
# Affinity propagation chooses the number of clusters itself; allow up to 500 iterations.
afp = AffinityPropagation(
    random_state=42,
    max_iter=500,
)
cluster = afp.fit(X).labels_

In [None]:
# Customers coloured by affinity-propagation label, with the cluster centres on top.
fig, ax = plt.subplots(figsize=(15, 10))
ax.set_facecolor("#C7D3D4")

afp_centers = afp.cluster_centers_

sns.scatterplot(
    x=X["AnnualIncome"],
    y=X["SpendingScore"],
    hue=cluster,
    palette=palette[: len(set(cluster))],
    edgecolor=None,
    ax=ax,
)

sns.scatterplot(
    x=afp_centers[:, 0],
    y=afp_centers[:, 1],
    s=250,
    color='#C0EB00',
    label='Centroids',
    marker="X",
    ax=ax,
    edgecolor="black",
)

ax.set_title("Segmentation with Affinity Propagation")
ax.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
plt.show()

# 5) Mean Shift

In [None]:
# Estimate the kernel bandwidth from the data, then cluster with Mean Shift.
bandwidth = estimate_bandwidth(
    X,
    quantile=0.1,
    random_state=42,
    n_jobs=-1,
)

ms = MeanShift(bandwidth=bandwidth, n_jobs=-1)
cluster = ms.fit(X).labels_

In [None]:
# Customers coloured by Mean Shift label, with the cluster centres on top.
fig, ax = plt.subplots(figsize=(15, 10))
ax.set_facecolor("#C7D3D4")

ms_centers = ms.cluster_centers_

sns.scatterplot(
    x=X["AnnualIncome"],
    y=X["SpendingScore"],
    hue=cluster,
    palette=palette[: len(set(cluster))],
    edgecolor=None,
    ax=ax,
)

sns.scatterplot(
    x=ms_centers[:, 0],
    y=ms_centers[:, 1],
    s=250,
    color='#C0EB00',
    label='Centroids',
    marker="X",
    edgecolor="black",
    ax=ax,
)

ax.set_title("Segmentation with Mean Shift")
ax.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
plt.show()

# 6) DBSCAN

https://scikit-learn.org/stable/modules/generated/sklearn.cluster.DBSCAN.html#sklearn.cluster.DBSCAN

[**DBSCAN**](https://scikit-learn.org/stable/modules/generated/sklearn.cluster.DBSCAN.html#sklearn.cluster.DBSCAN) - **Density-Based Spatial Clustering of Applications with Noise**

Finds core samples of high density and expands clusters from them. 

Good for data which contains clusters of similar density.



In [None]:
# Density-based clustering: neighbourhood radius 0.3, at least 5 samples per core point.
dbscan = DBSCAN(
    eps=0.3,
    min_samples=5,
    n_jobs=-1,
)
cluster = dbscan.fit(X).labels_

In [None]:
# Scatter plot of the customers coloured by their DBSCAN label.
fig, ax = plt.subplots(figsize=(15, 10))
ax.set_facecolor("#C7D3D4")

sns.scatterplot(
    x=X["AnnualIncome"],
    y=X["SpendingScore"],
    hue=cluster,
    palette=palette[: len(set(cluster))],
    ax=ax,
)

ax.set_title("Segmentation with DBSCAN")
ax.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
plt.show()