In [2]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.cluster import KMeans

In [3]:
# Read the mall-customer records from the CSV in the working directory.
df = pd.read_csv(filepath_or_buffer='Mall_Customers.csv')
# Show the first five rows as a quick sanity check of the columns.
df.head(n=5)

Unnamed: 0,CustomerID,Genre,Age,Annual Income (k$),Spending Score (1-100)
0,1,Male,19,15,39
1,2,Male,21,15,81
2,3,Female,20,16,6
3,4,Female,23,16,77
4,5,Female,31,17,40


In [4]:
%matplotlib notebook
sns.jointplot(df['Spending Score (1-100)'], df['Age'], kind = 'scatter')

<IPython.core.display.Javascript object>

<seaborn.axisgrid.JointGrid at 0x1f21bf0d648>

In [5]:
# Feature matrix for clustering: annual income and spending score.
# Columns are selected by name rather than by position so that a change in
# the CSV's column order cannot silently feed the wrong features to K-Means.
x = df[['Annual Income (k$)', 'Spending Score (1-100)']].values

In [6]:
# Peek at the first two rows of the feature matrix to confirm the selected columns.
x[:2]

array([[15, 39],
       [15, 81]], dtype=int64)

### Elbow method to find the optimal number of clusters

In [7]:
# Collect the within-cluster sum of squares (KMeans inertia) for every
# candidate cluster count from 1 to 10; these values feed the elbow plot.
wcss = []
for n_clusters in range(1, 11):
    kmeans = KMeans(
        n_clusters=n_clusters,
        init='k-means++',
        max_iter=300,
        n_init=10,
        random_state=0,  # fixed seed so repeated runs give the same inertias
    ).fit(x)
    wcss.append(kmeans.inertia_)
    

In [10]:
%matplotlib notebook
plt.figure(figsize = (6,3))
plt.plot(range(1, 11), wcss, marker = 'o')
plt.xlabel('Level')
plt.ylabel('Salary')
plt.tight_layout()

<IPython.core.display.Javascript object>

In [11]:
# Train the final K-Means model with 5 clusters and assign each row of x
# to one of them; y_kmeans holds the resulting cluster label per row.
kmeans = KMeans(
    n_clusters=5,
    init='k-means++',
    max_iter=300,
    n_init=10,
    random_state=0,
)
y_kmeans = kmeans.fit_predict(x)

In [12]:
# Second feature column (index 1 of x) for the rows assigned cluster label 4.
x[y_kmeans==4,1]

array([29, 11,  9,  5,  7, 10,  5, 12, 36, 22, 17, 20, 16,  1,  1, 35,  5,
       26, 20, 27, 13, 10, 13, 15, 14, 32, 15, 39, 24, 17, 23,  8, 16, 28,
       18], dtype=int64)

In [13]:
# First feature column (index 0 of x) for the rows assigned cluster label 4.
x[y_kmeans==4, 0]

array([ 70,  71,  71,  73,  73,  74,  75,  77,  77,  78,  78,  78,  78,
        78,  78,  79,  81,  85,  86,  87,  87,  87,  88,  88,  93,  97,
        98,  99, 101, 103, 103, 113, 120, 126, 137], dtype=int64)

In [15]:
# Visualising the clusters
%matplotlib notebook
plt.scatter(x[y_kmeans == 0, 0], x[y_kmeans == 0, 1], s = 100, c = 'red', label = 'Cluster 1')
plt.scatter(x[y_kmeans == 1, 0], x[y_kmeans == 1, 1], s = 100, c = 'blue', label = 'Cluster 2')
plt.scatter(x[y_kmeans == 2, 0], x[y_kmeans == 2, 1], s = 100, c = 'green', label = 'Cluster 3')
plt.scatter(x[y_kmeans == 3, 0], x[y_kmeans == 3, 1], s = 100, c = 'cyan', label = 'Cluster 4')
plt.scatter(x[y_kmeans == 4, 0], x[y_kmeans == 4, 1], s = 100, c = 'magenta', label = 'Cluster 5')
plt.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1], s = 300, c = 'yellow', label = 'Centroids')

plt.legend()
plt.xlabel('Annual Income (k$)')
plt.ylabel('Spending Score')
plt.savefig('Clusters.png', dip = 500)

<IPython.core.display.Javascript object>