In [None]:
!pip install pandas
!pip install numpy
!pip install matplotlib
!pip install sklearn
!pip install plotly


In [None]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
import plotly.express as px
import plotly.graph_objects as go
import matplotlib.pyplot as plt

In [None]:
#Load customers data
customersdata = pd.read_csv("Mall_Customers_Data.csv")

In [None]:
#viewing the dataset
customersdata

Unnamed: 0,Mall Customer ID,Gender,Age,Annual Income (k$),Spending Score (1-100)
0,1,Male,19,15,39
1,2,Male,21,15,81
2,3,Female,20,16,6
3,4,Female,23,16,77
4,5,Female,31,17,40
...,...,...,...,...,...
195,196,Female,35,120,79
196,197,Female,45,126,28
197,198,Male,32,126,74
198,199,Male,32,137,18


In [None]:
customersdata.shape

(200, 6)

In [18]:
# Information about dataset
customersdata.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 6 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   Mall Customer ID        200 non-null    int64 
 1   Gender                  200 non-null    object
 2   Age                     200 non-null    int64 
 3   Annual Income (k$)      200 non-null    int64 
 4   Spending Score (1-100)  200 non-null    int64 
 5   clusters                200 non-null    int32 
dtypes: int32(1), int64(4), object(1)
memory usage: 8.7+ KB


In [None]:
# Discribing the dataset
customersdata.describe()

Unnamed: 0,Mall Customer ID,Age,Annual Income (k$),Spending Score (1-100),clusters
count,200.0,200.0,200.0,200.0,200.0
mean,100.5,38.85,60.56,50.2,2.19
std,57.879185,13.969007,26.264721,25.823522,1.208804
min,1.0,18.0,15.0,1.0,0.0
25%,50.75,28.75,41.5,34.75,1.0
50%,100.5,36.0,61.5,50.0,3.0
75%,150.25,49.0,78.0,73.0,3.0
max,200.0,70.0,137.0,99.0,4.0


In [None]:
# Define K-means model
kmeans_model = KMeans(init='k-means++',  max_iter=400, random_state=42)

In [None]:
# Train the model
kmeans_model.fit(customersdata[['Age','Annual Income (k$)','Spending Score (1-100)']])



In [None]:
# Create the K means model for different values of K
def try_different_clusters(K, data):

    cluster_values = list(range(1, K+1))
    inertias=[]

    for c in cluster_values:
        model = KMeans(n_clusters = c,init='k-means++',max_iter=400,random_state=42)
        model.fit(data)
        inertias.append(model.inertia_)

    return inertias

In [None]:
# Find output for k values between 1 to 12
outputs = try_different_clusters(12, customersdata[['Age','Annual Income (k$)','Spending Score (1-100)']])
distances = pd.DataFrame({"clusters": list(range(1, 13)),"sum of squared distances": outputs})



In [None]:
# Finding optimal number of clusters k
figure = go.Figure()
figure.add_trace(go.Scatter(x=distances["clusters"], y=distances["sum of squared distances"]))

figure.update_layout(xaxis = dict(tick0 = 1,dtick = 1,tickmode = 'linear'),
                  xaxis_title="Number of clusters",
                  yaxis_title="Sum of squared distances",
                  title_text="Finding optimal number of clusters using elbow method")
figure.show()

In [None]:
# Re-Train K means model with k=5
kmeans_model_new = KMeans(n_clusters = 5,init='k-means++',max_iter=400,random_state=42)

kmeans_model_new.fit_predict(customersdata[['Age','Annual Income (k$)','Spending Score (1-100)']])





array([0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4,
       0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 3,
       0, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 1, 2, 1, 3, 1, 2, 1, 2, 1,
       2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1,
       2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1,
       2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1,
       2, 1], dtype=int32)

In [None]:
# Create data arrays
cluster_centers = kmeans_model_new.cluster_centers_
data = np.expm1(cluster_centers)
points = np.append(data, cluster_centers, axis=1)
points

array([[4.34173717e+19, 2.65358566e+11, 1.20898074e+09, 4.52173913e+01,
        2.63043478e+01, 2.09130435e+01],
       [1.57793399e+14, 3.82980197e+37, 4.65399911e+35, 3.26923077e+01,
        8.65384615e+01, 8.21282051e+01],
       [3.25560375e+17, 9.36317078e+37, 7.93349385e+07, 4.03243243e+01,
        8.74324324e+01, 1.81891892e+01],
       [5.36582750e+18, 6.44514846e+23, 4.39802724e+21, 4.31265823e+01,
        5.48227848e+01, 4.98354430e+01],
       [9.45814564e+10, 1.49009027e+11, 2.93217129e+34, 2.52727273e+01,
        2.57272727e+01, 7.93636364e+01]])

In [None]:
# Add "clusters" to customers data
# points = np.append(points, [[0], [1], [2], [3], [4]], axis=1)
customersdata["clusters"] = kmeans_model_new.labels_

In [None]:
# visualize clusters
figure = px.scatter_3d(customersdata,
                    color='clusters',
                    x="Age",
                    y="Annual Income (k$)",
                    z="Spending Score (1-100)",
                    # category_orders = {"clusters": ["0", "1", "2", "3", "4"]}
                    )
figure.update_layout()
figure.show()