## K-Means Clustering
Build a model that creates customer groups who are frequently visiting the mall on basis of Age,Income and Spending

In [1]:
path = r'https://raw.githubusercontent.com/sindhura-nk/Datasets/refs/heads/main/Mall_Customers.csv'
import pandas as pd
df = pd.read_csv(path)
df.head()

Unnamed: 0,CustomerID,Gender,Age,Annual Income (k$),Spending Score (1-100)
0,1,Male,19,15,39
1,2,Male,21,15,81
2,3,Female,20,16,6
3,4,Female,23,16,77
4,5,Female,31,17,40


Perform basic data quality checks

In [2]:
df.shape

(200, 5)

In [3]:
df.info()

<class 'pandas.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 5 columns):
 #   Column                  Non-Null Count  Dtype
---  ------                  --------------  -----
 0   CustomerID              200 non-null    int64
 1   Gender                  200 non-null    str  
 2   Age                     200 non-null    int64
 3   Annual Income (k$)      200 non-null    int64
 4   Spending Score (1-100)  200 non-null    int64
dtypes: int64(4), str(1)
memory usage: 7.9 KB


In [4]:
df.columns

Index(['CustomerID', 'Gender', 'Age', 'Annual Income (k$)',
       'Spending Score (1-100)'],
      dtype='str')

In [5]:
## Duplicated rows
df.duplicated().sum()    

np.int64(0)

In [6]:
df=df.drop_duplicates()

In [7]:
# Missing values
df.isna().sum()

CustomerID                0
Gender                    0
Age                       0
Annual Income (k$)        0
Spending Score (1-100)    0
dtype: int64

#### Separate X features. Consider the features for which you want clusters to be formed.

You can perform EDA to decide which features to consider for cluster creation

In [8]:

X = df.drop(columns=['Gender','CustomerID'])
X.head()

Unnamed: 0,Age,Annual Income (k$),Spending Score (1-100)
0,19,15,39
1,21,15,81
2,20,16,6
3,23,16,77
4,31,17,40


In [9]:

X.columns = ['Age','Income','Spending']
X.columns

Index(['Age', 'Income', 'Spending'], dtype='str')

## Data Cleaning and Preprocessing

In [10]:

X.dtypes

Age         int64
Income      int64
Spending    int64
dtype: object

In [11]:

from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

num_pipe = make_pipeline(
    SimpleImputer(strategy='median'),
    StandardScaler()
).set_output(transform='pandas')

X_pre = num_pipe.fit_transform(X)
X_pre.head()

Unnamed: 0,Age,Income,Spending
0,-1.424569,-1.738999,-0.434801
1,-1.281035,-1.738999,1.195704
2,-1.352802,-1.70083,-1.715913
3,-1.137502,-1.70083,1.040418
4,-0.563369,-1.66266,-0.39598


## Model Building

In [12]:
pip install yellowbrick

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [13]:
from sklearn.cluster import KMeans
from yellowbrick.cluster import kelbow_visualizer

ModuleNotFoundError: No module named 'distutils'

In [17]:

import matplotlib.pyplot as plt

base_model = KMeans()
visualizer = kelbow_visualizer(base_model,X_pre,k=(2,11),timings=False)
plt.xlabel("Number of Clusters")
plt.ylabel("WCSS")
visualizer.show()

NameError: name 'kelbow_visualizer' is not defined

In [16]:
## Build the model with K=6
final_model = KMeans(n_clusters=6) # k=n_clusters
clusters = final_model.fit_predict(X_pre) # cluster creation : 6 clusters , cluster information for each datapoint
clusters

array([5, 5, 3, 5, 3, 5, 3, 5, 3, 5, 3, 5, 3, 5, 3, 5, 3, 5, 3, 5, 3, 5,
       3, 5, 3, 5, 3, 5, 3, 5, 3, 5, 3, 5, 3, 5, 3, 5, 3, 5, 4, 5, 3, 5,
       3, 5, 4, 0, 0, 0, 4, 0, 0, 4, 4, 4, 4, 4, 0, 4, 4, 0, 4, 4, 4, 0,
       4, 4, 0, 0, 4, 4, 4, 4, 4, 0, 4, 0, 0, 4, 4, 0, 4, 4, 0, 4, 4, 0,
       0, 4, 4, 0, 4, 0, 0, 0, 4, 0, 4, 0, 0, 4, 4, 0, 4, 0, 4, 4, 4, 4,
       4, 0, 0, 0, 0, 0, 4, 4, 4, 4, 0, 0, 0, 2, 0, 2, 1, 2, 1, 2, 1, 2,
       0, 2, 1, 2, 1, 2, 0, 2, 1, 2, 0, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2,
       1, 2, 1, 2, 1, 2, 4, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2,
       1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2,
       1, 2], dtype=int32)

In [18]:
final_model.cluster_centers_

array([[-0.87309844, -0.11378508, -0.09358039],
       [ 0.22171558,  1.08322527, -1.29005223],
       [-0.44191719,  0.99158305,  1.23950275],
       [ 0.47895722, -1.30822992, -1.19644353],
       [ 1.25472096, -0.24021294, -0.04399777],
       [-0.97602698, -1.32549704,  1.0371827 ]])

In [14]:
X.head()

Unnamed: 0,Age,Income,Spending
0,19,15,39
1,21,15,81
2,20,16,6
3,23,16,77
4,31,17,40


In [19]:
X['Customer_Groups'] = clusters
X.head()

Unnamed: 0,Age,Income,Spending,Customer_Groups
0,19,15,39,5
1,21,15,81,5
2,20,16,6,3
3,23,16,77,5
4,31,17,40,3


In [20]:
for i in range(6):
  print(f'Cluster {i}')
  display(X[X['Customer_Groups']==i])
  print("===============")

Cluster 0


Unnamed: 0,Age,Income,Spending,Customer_Groups
47,27,40,47,0
48,29,40,42,0
49,31,40,42,0
51,33,42,60,0
52,31,43,54,0
58,27,46,51,0
61,19,46,55,0
65,18,48,59,0
68,19,48,59,0
69,32,48,47,0


Cluster 1


Unnamed: 0,Age,Income,Spending,Customer_Groups
126,43,71,35,1
128,59,71,11,1
130,47,71,9,1
134,20,73,5,1
136,44,73,7,1
140,57,75,5,1
144,25,77,12,1
146,48,77,36,1
148,34,78,22,1
150,43,78,17,1


Cluster 2


Unnamed: 0,Age,Income,Spending,Customer_Groups
123,39,69,91,2
125,31,70,77,2
127,40,71,95,2
129,38,71,75,2
131,39,71,75,2
133,31,72,71,2
135,29,73,88,2
137,32,73,73,2
139,35,74,72,2
141,32,75,93,2


Cluster 3


Unnamed: 0,Age,Income,Spending,Customer_Groups
2,20,16,6,3
4,31,17,40,3
6,35,18,6,3
8,64,19,3,3
10,67,19,14,3
12,58,20,15,3
14,37,20,13,3
16,35,21,35,3
18,52,23,29,3
20,35,24,35,3


Cluster 4


Unnamed: 0,Age,Income,Spending,Customer_Groups
40,65,38,35,4
46,50,40,55,4
50,49,42,52,4
53,59,43,60,4
54,50,43,45,4
55,47,43,41,4
56,51,44,50,4
57,69,44,46,4
59,53,46,46,4
60,70,46,56,4


Cluster 5


Unnamed: 0,Age,Income,Spending,Customer_Groups
0,19,15,39,5
1,21,15,81,5
3,23,16,77,5
5,22,17,76,5
7,23,18,94,5
9,30,19,72,5
11,35,19,99,5
13,24,20,77,5
15,22,20,79,5
17,20,21,66,5




Clusters- Visualization

In [21]:

X.columns

Index(['Age', 'Income', 'Spending', 'Customer_Groups'], dtype='str')

In [22]:
import plotly.express as ex
fig = ex.scatter_3d(X,x='Age',y='Income',z='Spending',color='Customer_Groups')
fig.show()

## Save the X file

In [23]:
X.to_csv("Mall_Customer_Groups.csv",index=False)