# Hierarchical Clustering

In [1]:
# importing the libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

## Problem Statement

__A large mall in a certain city holds information about its clients who subscribed to its membership card. When subscribing to the card, the clients provided information such as their gender, age, and annual income.  
Because they hold this card, they use it to buy all sorts of things in the mall, so the mall has the purchase history of each member client; that is how it obtained the last column in the dataset, the spending score.  
The spending score is a score that the mall computed for each of its clients based on several criteria, including, for example, their income, the number of times per week they show up in the mall, and of course the amount of dollars they spent in a year. Based on all this, the mall computed this metric, which takes values between 1 and 100.  
After collecting this dataset, the company hired you as a machine learning scientist to segment its clients into different groups based on annual income and spending score. Since the mall has no idea what these client segments might be, or even how many segments there would be, this is typically a clustering problem, because we don't know the answers.__

In [2]:
# importing the libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [5]:
# Read the mall customer records from disk and preview the first ten rows
dataset = pd.read_csv(filepath_or_buffer='Mall_Customers.csv')
dataset.head(n=10)

Unnamed: 0,CustomerID,Genre,Age,Annual Income (k$),Spending Score (1-100)
0,1,Male,19,15,39
1,2,Male,21,15,81
2,3,Female,20,16,6
3,4,Female,23,16,77
4,5,Female,31,17,40
5,6,Female,22,17,76
6,7,Female,35,18,6
7,8,Female,23,18,94
8,9,Male,64,19,3
9,10,Female,30,19,72


In [6]:
# Features used for clustering: annual income and spending score.
# They are selected by column label rather than by position, so the selection
# keeps working (or fails loudly with a KeyError) if the CSV's column order changes.
FEATURE_COLUMNS = ['Annual Income (k$)', 'Spending Score (1-100)']
X = dataset[FEATURE_COLUMNS].to_numpy()

![title](intuitions/10.png)

In [10]:
# Switch matplotlib to the interactive "notebook" backend for the figures in this notebook.
# NOTE(review): this backend may be unavailable in JupyterLab / Notebook 7 -- confirm the target environment.
%matplotlib notebook

In [13]:
# Using the dendrogram to find the optimal number of clusters
import scipy.cluster.hierarchy as sch

# Start a fresh figure: with the interactive notebook backend, bare pyplot calls
# draw into whichever figure is still active from an earlier cell.
fig, ax = plt.subplots()

# Ward linkage merges, at every step, the two clusters whose union gives the
# smallest increase in within-cluster variance.
dendogram = sch.dendrogram(sch.linkage(X, method = 'ward'), ax = ax)
ax.set_title('Dendrogram')
ax.set_xlabel('Customers')
ax.set_ylabel('Euclidean distances')
plt.show()  # also keeps the Text(...) repr of the last call out of the cell output

<IPython.core.display.Javascript object>

Text(0, 0.5, 'Euclidean distances')

In [16]:
# Fitting hierarchical clustering to our dataset
from sklearn.cluster import AgglomerativeClustering

# n_clusters = 5 is presumably the value read off the dendrogram -- re-check it if the data changes.
# The distance metric is left at its default (Euclidean), which is the only one
# Ward linkage accepts. The explicit `affinity` keyword was deprecated in
# scikit-learn 1.2 and removed in 1.4, where passing it raises a TypeError.
hc = AgglomerativeClustering(n_clusters = 5, linkage = 'ward')
y_hc = hc.fit_predict(X)

In [17]:
# Show the cluster index that the fitted model assigned to each row of X
y_hc

array([4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3,
       4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 1,
       4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 2, 1, 2, 0, 2, 0, 2,
       1, 2, 0, 2, 0, 2, 0, 2, 0, 2, 1, 2, 0, 2, 1, 2, 0, 2, 0, 2, 0, 2,
       0, 2, 0, 2, 0, 2, 1, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2,
       0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2,
       0, 2], dtype=int32)

In [20]:
# Visualising the clusters
# Start a fresh figure: with the interactive notebook backend, bare pyplot calls
# draw into whichever figure is still active from an earlier cell.
fig, ax = plt.subplots()

# One entry per cluster: (cluster index in y_hc, marker colour, legend label)
cluster_styles = [
    (0, 'red', 'Cluster 1'),
    (1, 'blue', 'Cluster 2'),
    (2, 'green', 'Cluster 3'),
    (3, 'cyan', 'Cluster 4'),
    (4, 'magenta', 'Cluster 5'),
]
for cluster_id, colour, legend_label in cluster_styles:
    members = X[y_hc == cluster_id]  # rows of X assigned to this cluster
    # Column 0 of X goes on the x-axis, column 1 on the y-axis
    ax.scatter(members[:, 0], members[:, 1], s = 100, c = colour, label = legend_label)

ax.set_title('Clusters of customers')
ax.set_xlabel('Annual Income (k$)')
ax.set_ylabel('Spending Score (1-100)')
ax.legend()
plt.show()  # also keeps the Legend repr out of the cell output

<IPython.core.display.Javascript object>

<matplotlib.legend.Legend at 0x1330ec10>

### Cluster 1  
__Clients in cluster 1 have a high income and a low spending score: they earn a lot but don't spend much money, so we could call this cluster of clients _'Careful'_ clients.__  
### Cluster 2
__Clients in cluster 2 have an average income and an average spending score, so let's call this cluster of clients _'Standard'_.__  
### Cluster 3
__Clients in cluster 3 have a high income and a high spending score, so this is the cluster that would be the main potential target of the mall's marketing campaigns. It would be very insightful for the mall to understand what kinds of products are bought by clients in this cluster, so we can call them _'Target'_.__  
### Cluster 4
__Clients in cluster 4 have a low income and a high spending score, so let's call them _'Careless'_.__  
### Cluster 5
__Clients in cluster 5 have a low income and a low spending score, so they are _'Sensible'_.__

In [22]:
# Visualising the clusters with the correct cluster label
# Start a fresh figure: with the interactive notebook backend, bare pyplot calls
# draw into whichever figure is still active from an earlier cell.
fig, ax = plt.subplots()

# One entry per cluster: (cluster index in y_hc, marker colour, segment name)
segment_styles = [
    (0, 'red', 'Careful'),
    (1, 'blue', 'Standard'),
    (2, 'green', 'Target'),
    (3, 'cyan', 'Careless'),
    (4, 'magenta', 'Sensible'),
]
for cluster_id, colour, segment_name in segment_styles:
    members = X[y_hc == cluster_id]  # rows of X assigned to this cluster
    # Column 0 of X goes on the x-axis, column 1 on the y-axis
    ax.scatter(members[:, 0], members[:, 1], s = 100, c = colour, label = segment_name)

ax.set_title('Clusters of customers')
ax.set_xlabel('Annual Income (k$)')
ax.set_ylabel('Spending Score (1-100)')
ax.legend()
plt.show()  # also keeps the Legend repr out of the cell output

<IPython.core.display.Javascript object>

<matplotlib.legend.Legend at 0x6171970>