In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for folder, _subfolders, file_names in os.walk('/kaggle/input'):
    # print the full path of every file found below the input directory
    for file_name in file_names:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/customer-segmentation-tutorial-in-python/Mall_Customers.csv


# Imports

In [2]:
# data wrangling
import pandas as pd
import numpy as np

# data visualization
import matplotlib.pyplot as plt
import plotly.express as px

# machine learning
from sklearn.preprocessing import LabelEncoder
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split

In [3]:
# load the mall customers CSV into a DataFrame
csv_path = '../input/customer-segmentation-tutorial-in-python/Mall_Customers.csv'
df = pd.read_csv(csv_path)

# Exploratory Data Analysis

In [4]:
# preview the first 5 rows to see which columns exist and how the values look
df.head()

Unnamed: 0,CustomerID,Gender,Age,Annual Income (k$),Spending Score (1-100)
0,1,Male,19,15,39
1,2,Male,21,15,81
2,3,Female,20,16,6
3,4,Female,23,16,77
4,5,Female,31,17,40


In [5]:
# (number of rows, number of columns) of the dataset
df.shape

(200, 5)

In [6]:
# number of missing values in each column
df.isna().sum()

CustomerID                0
Gender                    0
Age                       0
Annual Income (k$)        0
Spending Score (1-100)    0
dtype: int64

In [7]:
# summary statistics (count, mean, std, min, quartiles, max) of the numeric columns,
# rounded to 2 decimal places
df.describe().round(2)

Unnamed: 0,CustomerID,Age,Annual Income (k$),Spending Score (1-100)
count,200.0,200.0,200.0,200.0
mean,100.5,38.85,60.56,50.2
std,57.88,13.97,26.26,25.82
min,1.0,18.0,15.0,1.0
25%,50.75,28.75,41.5,34.75
50%,100.5,36.0,61.5,50.0
75%,150.25,49.0,78.0,73.0
max,200.0,70.0,137.0,99.0


Customers are between 18 and 70 years old and earn between 15k and 137k dollars per year.

In [8]:
def _customers_per(column):
    """Count customers for each distinct value of `column`, largest count first."""
    return (
        df.groupby(by = [column])
        .agg({'CustomerID': 'count'})
        .reset_index()
        .sort_values(by = 'CustomerID', ascending = False)
    )


# grouping by gender and counting the number of customers
df1 = _customers_per('Gender')

# grouping by age and counting the number of customers
df2 = _customers_per('Age')

# grouping by spending score and counting the number of customers
df3 = _customers_per('Spending Score (1-100)')

In [9]:
# Bar chart: number of customers per gender (counts prepared in df1)
fig = px.bar(
    data_frame = df1,
    x = 'Gender',
    y = 'CustomerID',
    text_auto = True,
    template = 'plotly_dark',
)

# titles plus white Arial text at size 18
fig.update_layout(
    title = {'text': 'Customer genders'},
    xaxis_title = 'Gender',
    yaxis_title = 'Client quantity',
    font_family = 'Arial',
    font_color = 'White',
    font = dict(size = 18),
    title_font_family = 'Arial',
    title_font_color = 'White',
    uniformtext_mode = 'hide',
)

# put the count labels outside the bars
fig.update_traces(textposition = 'outside', textfont_size = 18, cliponaxis = True)

fig.show()

As we can see, the number of female customers is slightly higher than the number of male customers.

In [10]:
# Bar chart: number of customers at each age (counts prepared in df2)
fig = px.bar(
    data_frame = df2,
    x = 'Age',
    y = 'CustomerID',
    text_auto = True,
    template = 'plotly_dark',
)

# titles plus white Arial text at size 18
fig.update_layout(
    title = {'text': 'Customer age'},
    xaxis_title = 'Age',
    yaxis_title = 'Client quantity',
    font_family = 'Arial',
    font_color = 'White',
    font = dict(size = 18),
    title_font_family = 'Arial',
    title_font_color = 'White',
    uniformtext_mode = 'hide',
)

# put the count labels outside the bars
fig.update_traces(textposition = 'outside', textfont_size = 18, cliponaxis = True)

fig.show()

In [11]:
# Bar chart: number of customers at each spending score (counts prepared in df3);
# this chart is drawn without count labels on the bars
fig = px.bar(
    data_frame = df3,
    x = 'Spending Score (1-100)',
    y = 'CustomerID',
    template = 'plotly_dark',
)

# titles plus white Arial text at size 18
fig.update_layout(
    title = {'text': 'Customer spending score'},
    xaxis_title = 'Spending Score',
    yaxis_title = 'Client quantity',
    font_family = 'Arial',
    font_color = 'White',
    font = dict(size = 18),
    title_font_family = 'Arial',
    title_font_color = 'White',
    uniformtext_mode = 'hide',
)

fig.show()

In [12]:
# label encoder used to turn the text 'Gender' column into integer codes
labelencoder = LabelEncoder()

In [13]:
# encoding gender variable: the text labels in df are overwritten by integer codes
# (the data previews in this notebook show Female -> 0 and Male -> 1)
df['Gender'] = labelencoder.fit_transform(df['Gender'])

In [14]:
# building the feature matrix for the model as a numpy array.
# CustomerID is only an identifier, so it is left out. The feature columns are
# selected by name (rather than dropping CustomerID from whatever df currently
# holds) so that re-running this cell after later cells have added columns such
# as 'Clusters' or 'Classification' still yields the same numeric matrix.
feature_columns = ['Gender', 'Age', 'Annual Income (k$)', 'Spending Score (1-100)']
X = df[feature_columns].to_numpy()

In [15]:
# calculating the inertia for each number of clusters from 1 to 14
# (range(1, 15) stops before 15)
k_inertia = [
    KMeans(n_clusters = k, init = "k-means++", max_iter = 300, n_init = 10, random_state = 0)
    .fit(X)
    .inertia_
    for k in range(1, 15)
]

In [16]:
# inspect the raw inertia values, one per tested number of clusters
k_inertia

[308862.06,
 212889.44245524297,
 143391.59236035676,
 104414.67534220166,
 75399.61541401483,
 58348.64136331505,
 51132.7032125769,
 44392.115665679354,
 41000.874221320715,
 37649.69225429743,
 34665.087277598795,
 31659.187454375693,
 29388.61288883975,
 28170.631036266233]

In [17]:
# wrap the inertia list in a single-column DataFrame so it can be plotted
df_inertia = pd.DataFrame(k_inertia)

In [18]:
# give the automatically numbered column 0 a meaningful name
df_inertia = df_inertia.rename(columns = {0:'inertias'})

In [19]:
# display the inertia table (row i holds the inertia for i + 1 clusters)
df_inertia

Unnamed: 0,inertias
0,308862.06
1,212889.442455
2,143391.59236
3,104414.675342
4,75399.615414
5,58348.641363
6,51132.703213
7,44392.115666
8,41000.874221
9,37649.692254


In [20]:
# plotting inertias against the number of clusters that produced them.
# k_inertia was filled for k = 1, 2, ... in order, so row i of df_inertia belongs
# to k = i + 1. Plotting against the default 0-based index would shift the whole
# elbow curve one cluster to the left, so an explicit cluster-count column is used.
elbow_data = df_inertia.assign(n_clusters = range(1, len(df_inertia) + 1))

fig = px.line(data_frame = elbow_data, x = 'n_clusters', y = 'inertias',
              template = 'plotly_dark', markers = True)

fig.update_layout(
    title = {
        'text': 'The Elbow Method'},
    xaxis_title = 'Clusters',
    yaxis_title = 'Inertia',
    width = 1000,
    height = 500,
    font_family="Arial",
    font_color="White",
    title_font_family="Arial",
    title_font_color= "White")

fig.show()

Based on the elbow method, I chose 6 clusters.

In [21]:
# final model configuration: 6 clusters, k-means++ initialisation,
# n_init of 20, at most 500 iterations, fixed seed for reproducibility
kmeans = KMeans(
    n_clusters = 6,
    init = 'k-means++',
    n_init = 20,
    max_iter = 500,
    random_state = 8254,
)

In [22]:
# fitting the model on the feature matrix X
kmeans.fit(X)

KMeans(max_iter=500, n_clusters=6, n_init=20, random_state=8254)

In [23]:
# creating array with the cluster label of the fitted model for each row of X
labels = kmeans.labels_

In [24]:
# creating column in dataframe with labels.
# labels holds one entry per row of X, and X was built row by row from df, so the
# labels are attached by position: the Series is given df's own index. A bare
# pd.Series(labels) would instead be aligned on index labels 0..n-1 and would
# silently produce missing values if df's index were anything else.
df['Clusters'] = pd.Series(labels, index = df.index).astype('int64')

In [25]:
# mean age, annual income and spending score of every cluster (2 decimals) ...
cluster_means = (
    df.groupby(by = 'Clusters')
    .agg({
        'Age': 'mean',
        'Annual Income (k$)': 'mean',
        'Spending Score (1-100)': 'mean',
    })
    .reset_index()
    .round(2)
)

# ... shown from the highest to the lowest mean spending score
cluster_means.sort_values(by = ['Spending Score (1-100)'], ascending = [False])

Unnamed: 0,Clusters,Age,Annual Income (k$),Spending Score (1-100)
2,2,32.69,86.54,82.13
1,1,25.27,25.73,79.36
3,3,27.0,56.66,49.13
5,5,56.16,53.38,49.09
4,4,44.14,25.14,19.52
0,0,41.69,88.23,17.29


In [26]:
# creating classification with clusters based on spending score.
# KMeans cluster ids are arbitrary: they can change with the seed, the data or the
# library version. Instead of hard-coding "cluster 2 is Exclusive", the clusters
# are ranked by their mean spending score and the tier names are handed out in
# that order, so the highest-spending cluster is always 'Exclusive' and the
# lowest-spending one is always 'Bronze'.
tier_names = ['Exclusive', 'Premium', 'Executive', 'Gold', 'Silver', 'Bronze']

clusters_by_spending = (
    df.groupby(by = 'Clusters')['Spending Score (1-100)']
    .mean()
    .sort_values(ascending = False)
    .index
)

# one tier name per cluster is required; fail loudly if the cluster count changed
assert len(clusters_by_spending) == len(tier_names), \
    'number of clusters does not match the number of tier names'

cluster_to_tier = dict(zip(clusters_by_spending, tier_names))
df['Classification'] = df['Clusters'].map(cluster_to_tier)

In [27]:
# dropping the 'Clusters' column because the classification replaces it.
# df is rebound to the result (no inplace=True) and errors='ignore' is used so
# that running this cell a second time does not raise KeyError for the column
# that is already gone.
df = df.drop(columns = ['Clusters'], errors = 'ignore')

In [28]:
# first 5 rows of the dataframe, now including the Classification column
df.head()

Unnamed: 0,CustomerID,Gender,Age,Annual Income (k$),Spending Score (1-100),Classification
0,1,1,19,15,39,Silver
1,2,1,21,15,81,Premium
2,3,0,20,16,6,Silver
3,4,0,23,16,77,Premium
4,5,0,31,17,40,Silver


# EDA on clusters

In [29]:
# one sub-dataframe per customer classification
tier = df['Classification']

df_exclusive = df.loc[tier == 'Exclusive']
df_premium = df.loc[tier == 'Premium']
df_executive = df.loc[tier == 'Executive']
df_gold = df.loc[tier == 'Gold']
df_silver = df.loc[tier == 'Silver']
df_bronze = df.loc[tier == 'Bronze']

## Exclusive

### Age

In [30]:
# Bar chart: number of Exclusive customers at each age
exclusive_by_age = (
    df_exclusive
    .groupby(by = 'Age')
    .agg({'CustomerID': 'count'})
    .reset_index()
)
fig = px.bar(
    data_frame = exclusive_by_age,
    x = 'Age',
    y = 'CustomerID',
    template = 'plotly_dark',
    text_auto = True,
)

# titles plus white Arial text at size 18
fig.update_layout(
    title = {'text': 'Age Distribuition (Exclusive Clients)'},
    xaxis_title = 'Age',
    yaxis_title = 'Client quantity',
    font_family = 'Arial',
    font_color = 'White',
    font = dict(size = 18),
    title_font_family = 'Arial',
    title_font_color = 'White',
    uniformtext_mode = 'hide',
)

# put the count labels outside the bars
fig.update_traces(textposition = 'outside', textfont_size = 18, cliponaxis = True)

fig.show()

### Annual Income

In [31]:
# Bar chart: number of Exclusive customers at each annual income
exclusive_by_income = (
    df_exclusive
    .groupby(by = 'Annual Income (k$)')
    .agg({'CustomerID': 'count'})
    .reset_index()
)
fig = px.bar(
    data_frame = exclusive_by_income,
    x = 'Annual Income (k$)',
    y = 'CustomerID',
    template = 'plotly_dark',
    text_auto = True,
)

# titles plus white Arial text at size 18
fig.update_layout(
    title = {'text': 'Anual income distribuition (Exclusive clients)'},
    xaxis_title = 'Annual Income (k$)',
    yaxis_title = 'Client quantity',
    font_family = 'Arial',
    font_color = 'White',
    font = dict(size = 18),
    title_font_family = 'Arial',
    title_font_color = 'White',
    uniformtext_mode = 'hide',
)

# put the count labels outside the bars
fig.update_traces(textposition = 'outside', textfont_size = 18, cliponaxis = True)

fig.show()

### Spending Score

In [32]:
# Bar chart: number of Exclusive customers at each spending score
exclusive_by_score = (
    df_exclusive
    .groupby(by = 'Spending Score (1-100)')
    .agg({'CustomerID': 'count'})
    .reset_index()
)
fig = px.bar(
    data_frame = exclusive_by_score,
    x = 'Spending Score (1-100)',
    y = 'CustomerID',
    template = 'plotly_dark',
    text_auto = True,
)

# titles plus white Arial text at size 18
fig.update_layout(
    title = {'text': 'Spending score distribuition (Exclusive clients)'},
    xaxis_title = 'Spending Score',
    yaxis_title = 'Client quantity',
    font_family = 'Arial',
    font_color = 'White',
    font = dict(size = 18),
    title_font_family = 'Arial',
    title_font_color = 'White',
    uniformtext_mode = 'hide',
)

# put the count labels outside the bars
fig.update_traces(textposition = 'outside', textfont_size = 18, cliponaxis = True)

fig.show()

## Premium

### Age

In [33]:
# Bar chart: number of Premium customers at each age
premium_by_age = (
    df_premium
    .groupby(by = 'Age')
    .agg({'CustomerID': 'count'})
    .reset_index()
)
fig = px.bar(
    data_frame = premium_by_age,
    x = 'Age',
    y = 'CustomerID',
    template = 'plotly_dark',
    text_auto = True,
)

# titles plus white Arial text at size 18
fig.update_layout(
    title = {'text': 'Age Distribuition (Premium Clients)'},
    xaxis_title = 'Age',
    yaxis_title = 'Client quantity',
    font_family = 'Arial',
    font_color = 'White',
    font = dict(size = 18),
    title_font_family = 'Arial',
    title_font_color = 'White',
    uniformtext_mode = 'hide',
)

# put the count labels outside the bars
fig.update_traces(textposition = 'outside', textfont_size = 18, cliponaxis = True)

fig.show()

### Annual Income

In [34]:
# Bar chart: number of Premium customers at each annual income
premium_by_income = (
    df_premium
    .groupby(by = 'Annual Income (k$)')
    .agg({'CustomerID': 'count'})
    .reset_index()
)
fig = px.bar(
    data_frame = premium_by_income,
    x = 'Annual Income (k$)',
    y = 'CustomerID',
    template = 'plotly_dark',
    text_auto = True,
)

# titles plus white Arial text at size 18
fig.update_layout(
    title = {'text': 'Anual income distribuition (Premium clients)'},
    xaxis_title = 'Annual Income (k$)',
    yaxis_title = 'Client quantity',
    font_family = 'Arial',
    font_color = 'White',
    font = dict(size = 18),
    title_font_family = 'Arial',
    title_font_color = 'White',
    uniformtext_mode = 'hide',
)

# put the count labels outside the bars
fig.update_traces(textposition = 'outside', textfont_size = 18, cliponaxis = True)

fig.show()

### Spending Score

In [35]:
# Bar chart: number of Premium customers at each spending score
premium_by_score = (
    df_premium
    .groupby(by = 'Spending Score (1-100)')
    .agg({'CustomerID': 'count'})
    .reset_index()
)
fig = px.bar(
    data_frame = premium_by_score,
    x = 'Spending Score (1-100)',
    y = 'CustomerID',
    template = 'plotly_dark',
    text_auto = True,
)

# titles plus white Arial text at size 18
fig.update_layout(
    title = {'text': 'Spending score distribuition (Premium clients)'},
    xaxis_title = 'Spending Score',
    yaxis_title = 'Client quantity',
    font_family = 'Arial',
    font_color = 'White',
    font = dict(size = 18),
    title_font_family = 'Arial',
    title_font_color = 'White',
    uniformtext_mode = 'hide',
)

# put the count labels outside the bars
fig.update_traces(textposition = 'outside', textfont_size = 18, cliponaxis = True)

fig.show()

## Executive

### Age

In [36]:
# Bar chart: number of Executive customers at each age
executive_by_age = (
    df_executive
    .groupby(by = 'Age')
    .agg({'CustomerID': 'count'})
    .reset_index()
)
fig = px.bar(
    data_frame = executive_by_age,
    x = 'Age',
    y = 'CustomerID',
    template = 'plotly_dark',
    text_auto = True,
)

# titles plus white Arial text at size 18
fig.update_layout(
    title = {'text': 'Age Distribuition (Executive Clients)'},
    xaxis_title = 'Age',
    yaxis_title = 'Client quantity',
    font_family = 'Arial',
    font_color = 'White',
    font = dict(size = 18),
    title_font_family = 'Arial',
    title_font_color = 'White',
    uniformtext_mode = 'hide',
)

# put the count labels outside the bars
fig.update_traces(textposition = 'outside', textfont_size = 18, cliponaxis = True)

fig.show()

### Annual Income

In [37]:
# Bar chart: number of Executive customers at each annual income
executive_by_income = (
    df_executive
    .groupby(by = 'Annual Income (k$)')
    .agg({'CustomerID': 'count'})
    .reset_index()
)
fig = px.bar(
    data_frame = executive_by_income,
    x = 'Annual Income (k$)',
    y = 'CustomerID',
    template = 'plotly_dark',
    text_auto = True,
)

# titles plus white Arial text at size 18
fig.update_layout(
    title = {'text': 'Anual income distribuition (Executive clients)'},
    xaxis_title = 'Annual Income (k$)',
    yaxis_title = 'Client quantity',
    font_family = 'Arial',
    font_color = 'White',
    font = dict(size = 18),
    title_font_family = 'Arial',
    title_font_color = 'White',
    uniformtext_mode = 'hide',
)

# put the count labels outside the bars
fig.update_traces(textposition = 'outside', textfont_size = 18, cliponaxis = True)

fig.show()

### Spending Score

In [38]:
# Bar chart: number of Executive customers at each spending score
executive_by_score = (
    df_executive
    .groupby(by = 'Spending Score (1-100)')
    .agg({'CustomerID': 'count'})
    .reset_index()
)
fig = px.bar(
    data_frame = executive_by_score,
    x = 'Spending Score (1-100)',
    y = 'CustomerID',
    template = 'plotly_dark',
    text_auto = True,
)

# titles plus white Arial text at size 18
fig.update_layout(
    title = {'text': 'Spending score distribuition (Executive clients)'},
    xaxis_title = 'Spending Score',
    yaxis_title = 'Client quantity',
    font_family = 'Arial',
    font_color = 'White',
    font = dict(size = 18),
    title_font_family = 'Arial',
    title_font_color = 'White',
    uniformtext_mode = 'hide',
)

# put the count labels outside the bars
fig.update_traces(textposition = 'outside', textfont_size = 18, cliponaxis = True)

fig.show()

## Gold

### Age

In [39]:
# Bar chart: number of Gold customers at each age
gold_by_age = (
    df_gold
    .groupby(by = 'Age')
    .agg({'CustomerID': 'count'})
    .reset_index()
)
fig = px.bar(
    data_frame = gold_by_age,
    x = 'Age',
    y = 'CustomerID',
    template = 'plotly_dark',
    text_auto = True,
)

# titles plus white Arial text at size 18
fig.update_layout(
    title = {'text': 'Age Distribuition (Gold Clients)'},
    xaxis_title = 'Age',
    yaxis_title = 'Client quantity',
    font_family = 'Arial',
    font_color = 'White',
    font = dict(size = 18),
    title_font_family = 'Arial',
    title_font_color = 'White',
    uniformtext_mode = 'hide',
)

# put the count labels outside the bars
fig.update_traces(textposition = 'outside', textfont_size = 18, cliponaxis = True)

fig.show()

### Annual Income

In [40]:
# Bar chart: number of Gold customers at each annual income
gold_by_income = (
    df_gold
    .groupby(by = 'Annual Income (k$)')
    .agg({'CustomerID': 'count'})
    .reset_index()
)
fig = px.bar(
    data_frame = gold_by_income,
    x = 'Annual Income (k$)',
    y = 'CustomerID',
    template = 'plotly_dark',
    text_auto = True,
)

# titles plus white Arial text at size 18
fig.update_layout(
    title = {'text': 'Anual income distribuition (Gold clients)'},
    xaxis_title = 'Annual Income (k$)',
    yaxis_title = 'Client quantity',
    font_family = 'Arial',
    font_color = 'White',
    font = dict(size = 18),
    title_font_family = 'Arial',
    title_font_color = 'White',
    uniformtext_mode = 'hide',
)

# put the count labels outside the bars
fig.update_traces(textposition = 'outside', textfont_size = 18, cliponaxis = True)

fig.show()

### Spending Score

In [41]:
# Bar chart: number of Gold customers at each spending score
gold_by_score = (
    df_gold
    .groupby(by = 'Spending Score (1-100)')
    .agg({'CustomerID': 'count'})
    .reset_index()
)
fig = px.bar(
    data_frame = gold_by_score,
    x = 'Spending Score (1-100)',
    y = 'CustomerID',
    template = 'plotly_dark',
    text_auto = True,
)

# titles plus white Arial text at size 18
fig.update_layout(
    title = {'text': 'Spending score distribuition (Gold clients)'},
    xaxis_title = 'Spending Score',
    yaxis_title = 'Client quantity',
    font_family = 'Arial',
    font_color = 'White',
    font = dict(size = 18),
    title_font_family = 'Arial',
    title_font_color = 'White',
    uniformtext_mode = 'hide',
)

# put the count labels outside the bars
fig.update_traces(textposition = 'outside', textfont_size = 18, cliponaxis = True)

fig.show()

## Silver

### Age

In [42]:
# Bar chart: number of Silver customers at each age
silver_by_age = (
    df_silver
    .groupby(by = 'Age')
    .agg({'CustomerID': 'count'})
    .reset_index()
)
fig = px.bar(
    data_frame = silver_by_age,
    x = 'Age',
    y = 'CustomerID',
    template = 'plotly_dark',
    text_auto = True,
)

# titles plus white Arial text at size 18
fig.update_layout(
    title = {'text': 'Age Distribuition (Silver Clients)'},
    xaxis_title = 'Age',
    yaxis_title = 'Client quantity',
    font_family = 'Arial',
    font_color = 'White',
    font = dict(size = 18),
    title_font_family = 'Arial',
    title_font_color = 'White',
    uniformtext_mode = 'hide',
)

# put the count labels outside the bars
fig.update_traces(textposition = 'outside', textfont_size = 18, cliponaxis = True)

fig.show()

### Annual Income

In [43]:
# Bar chart: number of Silver customers at each annual income
silver_by_income = (
    df_silver
    .groupby(by = 'Annual Income (k$)')
    .agg({'CustomerID': 'count'})
    .reset_index()
)
fig = px.bar(
    data_frame = silver_by_income,
    x = 'Annual Income (k$)',
    y = 'CustomerID',
    template = 'plotly_dark',
    text_auto = True,
)

# titles plus white Arial text at size 18
fig.update_layout(
    title = {'text': 'Anual income distribuition (Silver clients)'},
    xaxis_title = 'Annual Income (k$)',
    yaxis_title = 'Client quantity',
    font_family = 'Arial',
    font_color = 'White',
    font = dict(size = 18),
    title_font_family = 'Arial',
    title_font_color = 'White',
    uniformtext_mode = 'hide',
)

# put the count labels outside the bars
fig.update_traces(textposition = 'outside', textfont_size = 18, cliponaxis = True)

fig.show()

### Spending Score

In [44]:
# Bar chart: number of Silver customers at each spending score
silver_by_score = (
    df_silver
    .groupby(by = 'Spending Score (1-100)')
    .agg({'CustomerID': 'count'})
    .reset_index()
)
fig = px.bar(
    data_frame = silver_by_score,
    x = 'Spending Score (1-100)',
    y = 'CustomerID',
    template = 'plotly_dark',
    text_auto = True,
)

# titles plus white Arial text at size 18
fig.update_layout(
    title = {'text': 'Spending score distribuition (Silver clients)'},
    xaxis_title = 'Spending Score',
    yaxis_title = 'Client quantity',
    font_family = 'Arial',
    font_color = 'White',
    font = dict(size = 18),
    title_font_family = 'Arial',
    title_font_color = 'White',
    uniformtext_mode = 'hide',
)

# put the count labels outside the bars
fig.update_traces(textposition = 'outside', textfont_size = 18, cliponaxis = True)

fig.show()

## Bronze

### Age

In [45]:
# Bar chart: number of Bronze customers at each age
bronze_by_age = (
    df_bronze
    .groupby(by = 'Age')
    .agg({'CustomerID': 'count'})
    .reset_index()
)
fig = px.bar(
    data_frame = bronze_by_age,
    x = 'Age',
    y = 'CustomerID',
    template = 'plotly_dark',
    text_auto = True,
)

# titles plus white Arial text at size 18
fig.update_layout(
    title = {'text': 'Age Distribuition (Bronze Clients)'},
    xaxis_title = 'Age',
    yaxis_title = 'Client quantity',
    font_family = 'Arial',
    font_color = 'White',
    font = dict(size = 18),
    title_font_family = 'Arial',
    title_font_color = 'White',
    uniformtext_mode = 'hide',
)

# put the count labels outside the bars
fig.update_traces(textposition = 'outside', textfont_size = 18, cliponaxis = True)

fig.show()

### Annual Income

In [46]:
# Bar chart: number of Bronze customers at each annual income
bronze_by_income = (
    df_bronze
    .groupby(by = 'Annual Income (k$)')
    .agg({'CustomerID': 'count'})
    .reset_index()
)
fig = px.bar(
    data_frame = bronze_by_income,
    x = 'Annual Income (k$)',
    y = 'CustomerID',
    template = 'plotly_dark',
    text_auto = True,
)

# titles plus white Arial text at size 18
fig.update_layout(
    title = {'text': 'Anual income distribuition (Bronze clients)'},
    xaxis_title = 'Annual Income (k$)',
    yaxis_title = 'Client quantity',
    font_family = 'Arial',
    font_color = 'White',
    font = dict(size = 18),
    title_font_family = 'Arial',
    title_font_color = 'White',
    uniformtext_mode = 'hide',
)

# put the count labels outside the bars
fig.update_traces(textposition = 'outside', textfont_size = 18, cliponaxis = True)

fig.show()

### Spending Score

In [47]:
# Plotting the number of Bronze customers by spending score.
# This cell previously repeated the annual-income chart (it grouped by
# 'Annual Income (k$)' and used the income title and axis label); it now groups
# by the spending score, matching the section heading and the other tiers.
bronze_by_score = (
    df_bronze
    .groupby(by = 'Spending Score (1-100)')
    .agg({'CustomerID': 'count'})
    .reset_index()
)
fig = px.bar(data_frame = bronze_by_score,
             x = 'Spending Score (1-100)', y = 'CustomerID', template = 'plotly_dark', text_auto = True)

# title and axis label follow the wording used by the other tiers' spending score charts
fig.update_layout(
    title = {
        'text': 'Spending score distribuition (Bronze clients)'},
    xaxis_title = 'Spending Score',
    yaxis_title = 'Client quantity',
    font_family="Arial",
    font_color="White",
    font=dict(size = 18),
    title_font_family="Arial",
    title_font_color= "White",
    uniformtext_mode='hide')

# put the count labels outside the bars
fig.update_traces(textposition="outside", textfont_size = 18, cliponaxis=True)

fig.show()