In [1]:
%matplotlib notebook

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.cluster import KMeans

# Read the data

In [2]:
# Load the customer records; the CSV path is relative to the working directory
df = pd.read_csv(filepath_or_buffer='Mall_Customers.csv')
# Preview the first five rows to sanity-check the parse
df.head(n=5)

Unnamed: 0,CustomerID,Gender,Age,Annual Income (k$),Spending Score (1-100)
0,1,Male,19,15,39
1,2,Male,21,15,81
2,3,Female,20,16,6
3,4,Female,23,16,77
4,5,Female,31,17,40


### Check for null values

In [3]:
# Print each column's dtype and non-null count; a non-null count lower than
# the number of entries in the index would indicate missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 5 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   CustomerID              200 non-null    int64 
 1   Gender                  200 non-null    object
 2   Age                     200 non-null    int64 
 3   Annual Income (k$)      200 non-null    int64 
 4   Spending Score (1-100)  200 non-null    int64 
dtypes: int64(4), object(1)
memory usage: 7.9+ KB


### Check the descriptive statistics

In [4]:
# Summary statistics for the three numeric features only; CustomerID and
# Gender are not part of the selection.
df.loc[:, ['Age', 'Annual Income (k$)', 'Spending Score (1-100)']].describe()

Unnamed: 0,Age,Annual Income (k$),Spending Score (1-100)
count,200.0,200.0,200.0
mean,38.85,60.56,50.2
std,13.969007,26.264721,25.823522
min,18.0,15.0,1.0
25%,28.75,41.5,34.75
50%,36.0,61.5,50.0
75%,49.0,78.0,73.0
max,70.0,137.0,99.0


# Process the data

In [5]:
# Feature matrix for clustering: column 0 is the annual income and column 1
# is the spending score. The columns are selected by name rather than by
# position so the selection keeps working if the CSV column order changes.
X = df[['Annual Income (k$)', 'Spending Score (1-100)']].values

# Scatter of the raw points before any clustering is applied
plt.figure()
plt.scatter(X[:, 0], X[:, 1])
plt.title('Annual Income vs. Spending Score')
plt.xlabel('Annual Income')
# The trailing ';' keeps the Text(...) repr out of the cell output
plt.ylabel('Spending Score');

<IPython.core.display.Javascript object>

Text(0, 0.5, 'Spending Score')

# Apply KMeans

In [6]:
# Within-cluster sum of squares (WCSS) for every candidate cluster count.
# KMeans exposes it as `inertia_`: the sum of squared distances from each
# sample to its closest cluster center.
wcss = []
for k in range(1, 11):
    # n_init is passed explicitly because its library default differs between
    # scikit-learn releases; pinning it keeps the curve consistent across
    # versions and avoids the warning about the changing default.
    kmeans = KMeans(n_clusters=k, init='k-means++', n_init=10, random_state=0)
    kmeans.fit(X)
    wcss.append(kmeans.inertia_)



### Elbow method

In [7]:
# Elbow plot: WCSS against the number of clusters. The x values are derived
# from len(wcss) (cluster counts start at 1) so the plot cannot drift out of
# sync with the range evaluated when wcss was built.
cluster_counts = range(1, len(wcss) + 1)

plt.figure()
plt.plot(cluster_counts, wcss, '.-')
plt.title('Elbow Method')
plt.xlabel('Number of Clusters')
# The trailing ';' keeps the Text(...) repr out of the cell output
plt.ylabel('WCSS Values');

<IPython.core.display.Javascript object>

Text(0, 0.5, 'WCSS Values')

# The elbow plot suggests n=5 as the optimal number of clusters

In [8]:
# Final KMeans model with n=5 clusters.
# n_init is passed explicitly because its library default differs between
# scikit-learn releases; pinning it keeps the cluster assignment consistent
# across versions and avoids the warning about the changing default.
kmeansmodel = KMeans(n_clusters=5, init='k-means++', n_init=10, random_state=0)

# fit_predict fits the model on X and returns one cluster label per row
y_kmeans = kmeansmodel.fit_predict(X)

# Store each customer's cluster label alongside the original record
df['Cluster'] = y_kmeans



# Visualization

In [9]:
plt.figure()

# Customers, colored by the cluster label stored in df['Cluster']
plt.scatter(df['Annual Income (k$)'], df['Spending Score (1-100)'], c=df['Cluster'])

# Cluster centers: column 0 is the annual income and column 1 is the spending
# score, the same feature order as the matrix the model was fitted on
centers = kmeansmodel.cluster_centers_
plt.scatter(centers[:, 0], centers[:, 1], marker='x', c='red', s=80,
            label='Cluster centers')

# Title, axis labels and legend so the figure can be read on its own
plt.title('Customer Clusters')
plt.xlabel('Annual Income (k$)')
plt.ylabel('Spending Score (1-100)')
# The trailing ';' keeps the object repr out of the cell output
plt.legend();

<IPython.core.display.Javascript object>

<matplotlib.collections.PathCollection at 0x1c0cb327a90>

# Customer IDs belonging to the individual clusters

In [10]:
# Map every cluster label to the list of CustomerIDs assigned to it.
# groupby splits the frame by label once (instead of re-filtering the whole
# frame for each label) and yields the labels in sorted order; int() and
# tolist() turn keys and IDs into plain Python values.
Cluster_CustomerIDs = {
    int(label): ids.tolist()
    for label, ids in df.groupby('Cluster')['CustomerID']
}

# Iterate over the labels that are actually present rather than assuming
# they are exactly 0..n-1
for label, customer_ids in Cluster_CustomerIDs.items():
    print('i={}\n\tCustomerIDs: {}\n'.format(label, customer_ids))

i=0
	CustomerIDs: [125, 129, 131, 135, 137, 139, 141, 145, 147, 149, 151, 153, 155, 157, 159, 161, 163, 165, 167, 169, 171, 173, 175, 177, 179, 181, 183, 185, 187, 189, 191, 193, 195, 197, 199]

i=1
	CustomerIDs: [44, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 127, 133, 143]

i=2
	CustomerIDs: [124, 126, 128, 130, 132, 134, 136, 138, 140, 142, 144, 146, 148, 150, 152, 154, 156, 158, 160, 162, 164, 166, 168, 170, 172, 174, 176, 178, 180, 182, 184, 186, 188, 190, 192, 194, 196, 198, 200]

i=3
	CustomerIDs: [2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 46]

i=4
	CustomerIDs: [1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31, 33, 35, 37, 39, 41, 43, 45]

