**Import Libraries**

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.metrics import classification_report


In [89]:
# Load the raw customer segmentation data from a CSV in the working directory.
df = pd.read_csv('customer_segmentation_data.csv')
# Preview the first 10 rows to check the columns and spot obvious data issues.
df.head(10)

Unnamed: 0,Customer_ID,DOB,Gender,Annual_Income,Spending_Score,Shopping_Category,Shopping_Frequency,Total_Spent
0,CUST0001,2000-05-16,Female,119743.0,10.0,Clothing,24,287.38
1,CUST0002,1965-06-08,Female,60585.0,77.0,Clothing,22,1026.31
2,CUST0003,1945-05-24,Female,66680.0,29.0,Groceries,25,483.43
3,CUST0004,1986-03-29,Female,119682.0,55.0,Electronics,10,658.25
4,CUST0005,1983-11-26,Male,67754.0,51.0,Toys,2,69.11
5,CUST0006,1998-05-29,Female,133923.0,57.0,Clothing,23,1755.73
6,CUST0007,1977-09-19,male,54064.0,67.0,Groceries,17,615.79
7,CUST0008,1963-03-17,M,47810.0,100.0,Toys,26,1243.06
8,CUST0009,2000-06-03,Female,68413.0,78.0,Toys,15,800.43
9,CUST0010,2001-10-15,Female,53421.0,19.0,Sports,25,253.75


In [90]:
# Inspect the inferred dtype of each column.
# NOTE(review): DOB is loaded as object (string), not datetime; parse it with
# pd.to_datetime before doing any date arithmetic such as deriving age.
df.dtypes

Customer_ID            object
DOB                    object
Gender                 object
Annual_Income         float64
Spending_Score        float64
Shopping_Category      object
Shopping_Frequency      int64
Total_Spent           float64
dtype: object

# 1. Data Cleaning

In [92]:
# Numeric features screened for outliers with the 1.5 * IQR rule.
numeric = ['Annual_Income','Spending_Score','Shopping_Frequency','Total_Spent']

for col in numeric:
    # Both quartiles in a single call; unpacking yields the 25% and 75% values.
    q1, q3 = df[col].quantile([0.25, 0.75])
    spread = q3 - q1
    low, high = q1 - 1.5 * spread, q3 + 1.5 * spread

    # Comparisons against NaN are False on both sides, so rows with a missing
    # value are never counted as outliers.
    is_outlier = (df[col] < low) | (df[col] > high)
    print(f"Kolom {col} : {int(is_outlier.sum())} outlier")

Kolom Annual_Income : 0 outlier
Kolom Spending_Score : 0 outlier
Kolom Shopping_Frequency : 0 outlier
Kolom Total_Spent : 18 outlier


**Winsorization**

In [93]:
# Winsorize Total_Spent: values outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR] are capped
# at the nearest bound instead of dropping the rows.
Q1 = df['Total_Spent'].quantile(0.25)
Q3 = df['Total_Spent'].quantile(0.75)
IQR = Q3 - Q1

lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Series.clip caps both tails in one call, returns a Series aligned to df's
# index (np.where returns a bare ndarray), and leaves missing values as NaN.
df['Total_Spent'] = df['Total_Spent'].clip(lower=lower_bound, upper=upper_bound)

In [94]:
# Count missing values per column to decide which columns need imputation.
df.isna().sum()

Customer_ID            0
DOB                    0
Gender                10
Annual_Income         21
Spending_Score        24
Shopping_Category     56
Shopping_Frequency     0
Total_Spent           44
dtype: int64

**Impute Missing Values**

In [95]:
# Replacement value per column: median for Total_Spent, mean for the other
# numeric columns, and the most frequent label for Shopping_Category.
# NOTE(review): the missing-value count above shows 0 NaN in Shopping_Frequency,
# so its fill is currently a no-op and the column keeps its integer dtype.
fill_values = {
    'Total_Spent': df['Total_Spent'].median(),
    'Annual_Income': df['Annual_Income'].mean(),
    'Spending_Score': df['Spending_Score'].mean(),
    'Shopping_Frequency': df['Shopping_Frequency'].mean(),
    'Shopping_Category': df['Shopping_Category'].mode()[0],
}

# Each statistic depends only on its own column, so computing them all up
# front gives the same values as filling column by column.
for column, value in fill_values.items():
    df[column] = df[column].fillna(value)

**Standardize Gender to Male and Female Only**

In [96]:
# Canonical label for each spelling seen after trimming and lower-casing.
gender_aliases = {
    'male': 'Male',
    'm': 'Male',
    'female': 'Female',
    'f': 'Female',
}

# Trim whitespace and lower-case first so 'male', ' Male ' and 'M' all match.
# NOTE(review): replace() leaves any value not listed in gender_aliases
# unchanged (lower-cased), so unexpected spellings would survive as extra
# categories; confirm with df['Gender'].value_counts().
gender_clean = df['Gender'].str.strip().str.lower().replace(gender_aliases)

# Missing genders take the most frequent label after normalization.
df['Gender'] = gender_clean.fillna(gender_clean.mode()[0])



In [97]:
# Preview the first 10 rows again to confirm the cleaning steps took effect
# (for example, Gender values now read 'Male' / 'Female').
df.head(10)

Unnamed: 0,Customer_ID,DOB,Gender,Annual_Income,Spending_Score,Shopping_Category,Shopping_Frequency,Total_Spent
0,CUST0001,2000-05-16,Female,119743.0,10.0,Clothing,24,287.38
1,CUST0002,1965-06-08,Female,60585.0,77.0,Clothing,22,1026.31
2,CUST0003,1945-05-24,Female,66680.0,29.0,Groceries,25,483.43
3,CUST0004,1986-03-29,Female,119682.0,55.0,Electronics,10,658.25
4,CUST0005,1983-11-26,Male,67754.0,51.0,Toys,2,69.11
5,CUST0006,1998-05-29,Female,133923.0,57.0,Clothing,23,1755.73
6,CUST0007,1977-09-19,Male,54064.0,67.0,Groceries,17,615.79
7,CUST0008,1963-03-17,Male,47810.0,100.0,Toys,26,1243.06
8,CUST0009,2000-06-03,Female,68413.0,78.0,Toys,15,800.43
9,CUST0010,2001-10-15,Female,53421.0,19.0,Sports,25,253.75


**Encoding Categorical Data**