Importing Libraries.


In [1]:
import os
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly as py
import plotly.express as px
import plotly.figure_factory as ff
import plotly.graph_objs as go
import plotly.graph_objects as go
import seaborn as sns
from plotly.subplots import make_subplots
from sklearn import cluster, decomposition, preprocessing, tree
from sklearn.cluster import AffinityPropagation, AgglomerativeClustering, DBSCAN, KMeans
from sklearn.metrics import silhouette_score

%matplotlib inline
warnings.filterwarnings("ignore")
py.offline.init_notebook_mode(connected=True)

Data Exploration

In [2]:
# Read the mall-customers CSV into a DataFrame and preview its first rows.
df = pd.read_csv(
    '../input/customer-segmentation-tutorial-in-python/Mall_Customers.csv'
)
df.head()

In [3]:
# Print a summary of the frame: its columns, non-null counts and dtypes.
df.info()

In [4]:
# Summary statistics of the numeric columns, transposed so each column is a row.
df.describe().transpose()

In [5]:
# Number of distinct values found in each column.
df.nunique()

Data Visualization

In [6]:
# Overlaid distribution plot (histogram + density curve + rug) of the three
# numeric features.
x1 = df['Age']
x2 = df['Annual Income (k$)']
x3 = df['Spending Score (1-100)']

# Group the series together; labels are matched to them by position.
hist_data = [x1, x2, x3]
group_labels = ['Age', 'Annual Income (k$)', 'Spending Score (1-100)']

# Hover text for the rug marks. Each inner list carries exactly one label per
# observation of the corresponding series (built from the series index), so
# every rug point gets a label that identifies its row. The previous
# fixed-length placeholder letters were unrelated to the data and could not
# match its length.
rug_text = [[f'row {idx}' for idx in series.index] for series in hist_data]
colors = ['rgb(0, 0, 100)', 'rgb(0, 200, 200)', 'rgb(0, 0, 200)']

# Create distplot with custom bin_size
fig = ff.create_distplot(
    hist_data, group_labels, bin_size=3,
    rug_text=rug_text, colors=colors)

fig.update_layout(title_text='Customized Distplot')
fig.show()

In [7]:
# Horizontal bar chart of the number of rows per Gender value, on a wide figure.
plt.figure(num=1, figsize=(15, 5))
sns.set(style="darkgrid")
ax = sns.countplot(data=df, y="Gender")

In [8]:
# Pairwise scatter plots of the three numeric features, coloured by Gender.
fig = px.scatter_matrix(
    data_frame=df,
    dimensions=['Age', 'Annual Income (k$)', 'Spending Score (1-100)'],
    color="Gender",
)
fig.show()

In [9]:
# Age against annual income; marker colour encodes Gender, marker size encodes Age.
fig = px.scatter(
    data_frame=df,
    x="Age",
    y="Annual Income (k$)",
    color="Gender",
    size='Age',
)
fig.show()

In [10]:
# Helper column "e": one hundredth of the annual income, used below as the
# error-bar length on both axes. It is stored on df itself.
df["e"] = df["Annual Income (k$)"].div(100)

# Income against spending score with the "e" error bars, coloured by Gender.
fig = px.scatter(
    data_frame=df,
    x="Annual Income (k$)",
    y="Spending Score (1-100)",
    color="Gender",
    error_x="e",
    error_y="e",
)
fig.show()

In [11]:
# Remove the error-bar helper column and the customer identifier.
# drop() returns a new frame that is rebound to df, and errors='ignore' makes
# the cell safe to re-run: a second execution (when the columns are already
# gone) is a no-op instead of raising KeyError.
df = df.drop(columns=['e', 'CustomerID'], errors='ignore')
df.head()

In [12]:
# Age distribution per Gender: violin with an inner box plot and all points shown.
fig = px.violin(
    data_frame=df,
    x="Age",
    y="Gender",
    color="Gender",
    box=True,
    points="all",
    hover_data=df.columns,
)
fig.show()

In [13]:
# Annual-income distribution per Gender: violin with an inner box plot and all points shown.
fig = px.violin(
    data_frame=df,
    x="Annual Income (k$)",
    y="Gender",
    color="Gender",
    box=True,
    points="all",
    hover_data=df.columns,
)
fig.show()

In [14]:
# Spending-score distribution per Gender: violin with an inner box plot and all points shown.
fig = px.violin(
    data_frame=df,
    x="Spending Score (1-100)",
    y="Gender",
    color="Gender",
    box=True,
    points="all",
    hover_data=df.columns,
)
fig.show()

In [15]:
# Feature matrix for clustering: annual income and spending score.
# Columns are selected by name rather than by position, so the result does not
# depend on whether the identifier/helper columns have already been dropped
# from df (positional slicing would silently pick different columns).
X = df[['Annual Income (k$)', 'Spending Score (1-100)']].values

Selecting N Clusters With Elbow Method

In [16]:
# Elbow method: fit k-means for a range of k and record the inertia
# (within-cluster sum of squared distances) of each fit.
Sum_of_squared_distances = []

# Use k from 1 to 15 inclusive (range's stop value is exclusive, hence 16).
K = range(1, 16)
for k in K:
    km = KMeans(n_clusters=k, init='random', n_init=10, max_iter=500, tol=1e-04, random_state=0)
    km = km.fit(X)
    # Get sum of square distances by applying km.inertia_
    Sum_of_squared_distances.append(km.inertia_)

# Plot inertia against k; the "elbow" of the curve suggests a suitable k.
plt.plot(K, Sum_of_squared_distances, marker='o')
plt.xlabel('k')
plt.ylabel('Sum_of_squared_distances')
plt.title('Elbow Method For Optimal k')
plt.show()

Selecting N Clusters With Silhouette Analysis

In [17]:
# Silhouette analysis: for each candidate number of clusters, fit k-means on X
# and report the mean silhouette coefficient of the resulting labelling.
# random_state is fixed so the printed scores are reproducible between runs,
# and n_init is set explicitly so the number of restarts does not depend on
# the installed scikit-learn version's default.
for n_cluster in range(2, 11):
    kmeans = KMeans(n_clusters=n_cluster, n_init=10, random_state=0).fit(X)
    label = kmeans.labels_
    sil_coeff = silhouette_score(X, label, metric='euclidean')
    print("For n_clusters={}, The Silhouette Coefficient is {}".format(n_cluster, sil_coeff))

Hierarchical Clustering

In [18]:
import scipy.cluster.hierarchy as sch

# Dendrogram of X built with Ward linkage.
# Ward minimizes the variance of the points inside a cluster.
fig = ff.create_dendrogram(
    X,
    linkagefun=lambda observations: sch.linkage(observations, "ward"),
)

fig.update_layout(
    title='Hierarchical Clustering',
    xaxis_title='Customers',
    yaxis_title='Euclidean Distance',
    width=700,
    height=700,
)

fig.show()

Cluster With KMeans

In [19]:
# Fit a 5-cluster k-means model (k-means++ seeding, fixed seed) on X and get
# one cluster label per row.
kmeans = KMeans(
    n_clusters=5,
    init="k-means++",
    n_init=10,
    max_iter=500,
    random_state=123,
)
identified_clusters = kmeans.fit_predict(X)

# Attach the labels to a copy of df so the original frame is left unchanged.
data_with_clusters = df.assign(Cluster=identified_clusters)

# 3-D view of age / income / spending score, coloured by cluster label.
fig = px.scatter_3d(
    data_frame=data_with_clusters,
    x='Age',
    y='Annual Income (k$)',
    z='Spending Score (1-100)',
    color='Cluster',
    size='Age',
    size_max=30,
    opacity=0.8,
)
fig.show()

Cluster with AgglomerativeClustering

In [20]:
# Agglomerative (bottom-up hierarchical) clustering into 5 clusters with Ward
# linkage. The distance metric is left at its default (Euclidean, the only
# metric Ward linkage accepts): the former `affinity=` keyword was deprecated
# in favour of `metric=` and is rejected by newer scikit-learn releases, so
# omitting it keeps the cell working on both old and new versions.
hc = AgglomerativeClustering(n_clusters=5, linkage="ward")
identified_clusters = hc.fit_predict(X)

# Attach the labels to a copy of df so the original frame is left unchanged.
data_with_clusters = df.copy()
data_with_clusters['Cluster'] = identified_clusters

# 3-D view of age / income / spending score, coloured by cluster label.
fig = px.scatter_3d(data_with_clusters, x='Age', y='Annual Income (k$)', z='Spending Score (1-100)',
                    color='Cluster', opacity=0.8, size='Age', size_max=30)
fig.show()

Cluster with AffinityPropagation

In [21]:
# Affinity propagation with a fixed seed; the number of clusters is not
# specified here and is left to the algorithm.
ap = AffinityPropagation(random_state=0)
identified_clusters = ap.fit_predict(X)

# Attach the labels to a copy of df so the original frame is left unchanged.
data_with_clusters = df.assign(Cluster=identified_clusters)

# 3-D view of age / income / spending score, coloured by cluster label.
fig = px.scatter_3d(
    data_frame=data_with_clusters,
    x='Age',
    y='Annual Income (k$)',
    z='Spending Score (1-100)',
    color='Cluster',
    size='Age',
    size_max=30,
    opacity=0.8,
)
fig.show()

Cluster with DBSCAN

In [22]:
# DBSCAN on X with eps=9 and min_samples=5; fit_predict returns one label per row.
DBS = DBSCAN(eps=9, min_samples=5)
identified_clusters = DBS.fit_predict(X)

# Attach the labels to a copy of df so the original frame is left unchanged.
data_with_clusters = df.assign(Cluster=identified_clusters)

# 3-D view of age / income / spending score, coloured by cluster label.
fig = px.scatter_3d(
    data_frame=data_with_clusters,
    x='Age',
    y='Annual Income (k$)',
    z='Spending Score (1-100)',
    color='Cluster',
    size='Age',
    size_max=30,
    opacity=0.8,
)
fig.show()