In [None]:
# Import libraries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.cluster import KMeans
import sklearn.cluster as cluster

In [None]:
# Load the mall customer dataset; the path is relative to the notebook's working directory.
df = pd.read_csv('Mall_Customers.csv')

In [None]:
# Preview the first rows of the raw data.
df.head()

# Data mapping


In [None]:
# Work on a copy so the raw frame `df` is left unmodified.
df_segment = df.copy()

In [None]:
# Encode Gender numerically (Male -> 0, Female -> 1).
# The mapping reads from the untouched `df` rather than from `df_segment`
# itself, so the cell is idempotent: re-running it no longer maps the already
# encoded 0/1 values to NaN.
df_segment['Gender'] = df['Gender'].map({'Male': 0, 'Female': 1})

# Show only a preview instead of dumping the whole frame.
df_segment.head()

In [None]:
# Count missing values per column of the raw data.
df.isnull().sum()

In [None]:
# Summary statistics of the raw data.
df.describe()

In [None]:
# Pairwise correlation between the columns of df_segment, to find
# dependencies between variables (Gender is numeric here after the mapping)

df_segment.corr()

# Visualization

In [None]:
# Bar chart of the number of customers per gender.
# The column is passed by keyword (`x=` together with `data=`): newer seaborn
# releases treat the first positional argument as `data`, not as `x`.
sns.countplot(x='Gender', data=df)
plt.title('Distribution of Gender', fontsize=15)
plt.show()

In this data, there are more female customers than male customers.

In [None]:
# Distribution of customer age.
# `sns.distplot` is deprecated in seaborn; `histplot(..., kde=True)` is its
# replacement. Its y-axis shows counts by default, which matches the
# 'Frequency' label below.
plt.figure(figsize=(10,6))
sns.histplot(df_segment['Age'], kde=True)
plt.title('Distribution of Age ', fontsize=20)
plt.xlabel('Range of Customer Age')
plt.ylabel('Frequency')
plt.show()

We observe that the mall customers' ages range from 18 to 70 years. The average age of the target customers is about 30 years.

In [None]:
# Distribution of annual income (in k$).
# `sns.distplot` is deprecated in seaborn; `histplot(..., kde=True)` is its
# replacement. Its y-axis shows counts by default, which matches the
# 'Frequency' label below.
plt.figure(figsize=(10,6))
sns.histplot(df_segment['Annual Income (k$)'], kde=True)
plt.title('Distribution of Annual income ', fontsize=20)
plt.xlabel('Income of customers')
plt.ylabel('Frequency')
plt.show()

In [None]:
# Distribution of the spending score (1-100).
# `sns.distplot` is deprecated in seaborn; `histplot(..., kde=True)` is its
# replacement. Its y-axis shows counts by default, which matches the
# 'Frequency' label below.
plt.figure(figsize=(10,6))
sns.histplot(df_segment['Spending Score (1-100)'], kde=True)
plt.title('Distribution of Spending of income ', fontsize=20)
plt.xlabel('Range of Income spend')
plt.ylabel('Frequency')
plt.show()

In [None]:
# Pairwise relationships between the remaining columns, colored by gender.
# CustomerID is dropped so it does not appear in the grid.
features_by_gender = df.drop(columns=['CustomerID'])
sns.pairplot(data=features_by_gender, hue='Gender')
plt.show()

The scatter plot below shows how customers of each gender are distributed with respect to annual income and spending score.

In [None]:
# Annual income vs. spending score, colored by gender.
# x / y / hue are passed by keyword: they are keyword-only in seaborn >= 0.12,
# where the former positional call raises a TypeError. The raw `df` is the data
# source so the legend shows the original (unencoded) Gender values.
plt.figure(figsize=(15,8))
sns.scatterplot(x='Annual Income (k$)', y='Spending Score (1-100)', hue='Gender',
                data=df, palette=['red','green'], alpha=0.6)
plt.title('Distribution of Gender based on Annual Income and Spending Score', fontsize = 15)
plt.xlabel('Annual Income', fontsize = 12)
plt.ylabel('Spending Score', fontsize = 12)
plt.show()

The scatter plot below shows how customer ages are distributed with respect to annual income and spending score.

In [None]:
# Annual income vs. spending score, colored by age.
# x / y / hue are passed by keyword: they are keyword-only in seaborn >= 0.12,
# where the former positional call raises a TypeError.
plt.figure(figsize=(15,8))
sns.scatterplot(x='Annual Income (k$)', y='Spending Score (1-100)', hue='Age',
                data=df_segment)
plt.title('Distribution of Age based on Annual Income and Spending Score', fontsize = 15)
plt.xlabel('Annual Income', fontsize = 12)
plt.ylabel('Spending Score', fontsize = 12)
plt.show()

# Analysing annual income and spending score

In [None]:
# Plain scatter plot of spending score against annual income (no grouping)

fig, ax = plt.subplots(figsize=(15, 8))
sns.scatterplot(
    data=df_segment,
    x='Annual Income (k$)',
    y='Spending Score (1-100)',
    ax=ax,
)
ax.set_xlabel('Annual Income (k$)')
ax.set_ylabel('Spending Score (1-100)')
ax.set_title('Spending Score (1-100) vs Annual Income (k$)', fontsize=18)
plt.show()

In [None]:
# Feature matrix for clustering: column 0 is annual income, column 1 is spending score.
clustering_columns = ['Annual Income (k$)', 'Spending Score (1-100)']
Income_Spend = df_segment[clustering_columns].to_numpy()

In [None]:
# Within-cluster sum of squares (WCSS, exposed by scikit-learn as `inertia_`)
# for k = 1..10; used in the elbow plot to choose the number of clusters.

wcss = [
    KMeans(n_clusters=k, init='k-means++', max_iter=300, n_init=10, random_state=0)
    .fit(Income_Spend)
    .inertia_
    for k in range(1, 11)
]

In [None]:
# Elbow curve: WCSS against the number of clusters tried (1..10); the bend in
# the curve indicates a suitable value of k.

fig, ax = plt.subplots(figsize=(15, 8))
ax.plot(range(1, 11), wcss)
ax.set_title('The Elbow curve', fontsize=15)
ax.set_xlabel('No. of Clusters')
ax.set_ylabel('wcss')
plt.show()

The elbow curve stops decreasing sharply after k = 5, so we choose k = 5 and create five clusters from the Annual Income and Spending Score columns.

In [None]:
# Final K-Means model with the k chosen from the elbow curve.
km = KMeans(
    n_clusters=5,
    init='k-means++',
    max_iter=300,
    n_init=10,
    random_state=0,
)

# Fit on the income/spending matrix and keep each customer's cluster index.
y_means = km.fit_predict(Income_Spend)

In [None]:
# Store each customer's K-Means cluster index as a new 'labels' column.
# Note: from here on df_segment is no longer a pure feature table.
df_segment['labels'] = km.labels_

In [None]:
# Display the frame, now including the 'labels' column.
df_segment

In [None]:
# Number of non-null entries in the 'labels' column.
df_segment['labels'].count()

In [None]:
# Income vs. spending score, one color per K-Means cluster label

fig, ax = plt.subplots(figsize=(12, 8))
sns.scatterplot(
    data=df_segment,
    x='Annual Income (k$)',
    y='Spending Score (1-100)',
    hue="labels",
    palette=['green','orange','brown','blue','red'],
    legend='full',
    s=60,
    ax=ax,
)
ax.set_xlabel('Annual Income (k$)')
ax.set_ylabel('Spending Score (1-100)')
ax.set_title('Spending Score (1-100) vs Annual Income (k$)', fontsize=18)
plt.show()

In [None]:
# Inspect the centroids of the fitted K-Means model: one row per cluster,
# with columns in the same order as Income_Spend (income, spending score).

km.cluster_centers_

# Customer Segmentation w.r.t. Income and spending

In [None]:
# (color, segment name) for cluster indices 0..4, in that order.
# NOTE(review): the names are tied to the cluster numbering of this particular
# fit; confirm them against the plot if the model or data changes.
kmeans_segments = [
    ('green', 'Average'),
    ('orange', 'Spenders'),
    ('brown', 'Best'),
    ('blue', 'Low Budget'),
    ('red', 'Saver'),
]

plt.figure(figsize=(15,8))

# One scatter layer per cluster so each gets its own legend entry.
for cluster_id, (color, segment_name) in enumerate(kmeans_segments):
    members = Income_Spend[y_means == cluster_id]
    plt.scatter(members[:, 0], members[:, 1], s=20, c=color, label=segment_name)

# Centroids drawn last, on top of the customer points.
centers = km.cluster_centers_
plt.scatter(centers[:, 0], centers[:, 1], s=70, c='black', label='centeroid')

plt.legend()
plt.title('Customer Segmentation using Annual Income and Spending Score', fontsize = 20)
plt.xlabel('Annual Income', fontsize = 12)
plt.ylabel('Spending Score', fontsize = 12)
plt.show()

# Hierarchical clustering to compare the results

In [None]:
import scipy.cluster.hierarchy as sch

# Dendrogram of a Ward-linkage hierarchical clustering.
# The linkage is computed on Income_Spend (annual income, spending score), the
# same two features K-Means was fitted on, so the two methods are comparable.
# The whole df_segment frame must not be used here: it also contains
# CustomerID, the encoded Gender, Age and the K-Means 'labels' column, which
# would distort the Euclidean distances.
plt.figure(figsize=(10,10))
dendrogram = sch.dendrogram(sch.linkage(Income_Spend, method = 'ward'))
plt.title('Dendrogram')
plt.xlabel('Customers')
plt.ylabel('Euclidean distances')
plt.show()

In [None]:
from sklearn.cluster import AgglomerativeClustering

# Agglomerative (hierarchical) clustering into 5 groups with Ward linkage.
# - `affinity=` is omitted: the parameter was deprecated and later removed from
#   scikit-learn, and Ward linkage only works with the (default) Euclidean
#   metric anyway.
# - The model is fitted on Income_Spend, the same two features used for K-Means
#   and plotted below, rather than on df_segment, which also holds CustomerID,
#   Gender, Age and the K-Means 'labels' column.
hc = AgglomerativeClustering(n_clusters = 5, linkage = 'ward')
y_hc = hc.fit_predict(Income_Spend)

In [None]:
# Visualise the hierarchical clusters on the income / spending-score plane

# (color, segment name) for cluster indices 0..4, in that order.
# NOTE(review): the names assume a particular numbering of the clusters;
# confirm them against the plot.
hc_segments = [
    ('red', 'Average Customers'),
    ('blue', 'Spenders Customers'),
    ('green', 'Best Customers'),
    ('cyan', 'Low budget Customers'),
    ('magenta', 'Saver Customers'),
]

plt.figure(figsize=(15,8))

# One scatter layer per cluster so each gets its own legend entry.
for cluster_id, (color, segment_name) in enumerate(hc_segments):
    members = Income_Spend[y_hc == cluster_id]
    plt.scatter(members[:, 0], members[:, 1], s=40, c=color, label=segment_name)

plt.title('Segmentation of customers data using Hierarchical Clustering')
plt.xlabel('Annual Income (K$)')
plt.ylabel('Spending Score (1-100)')
plt.legend()
plt.show()