In [7]:
# ==== 1) Install ====
!pip -q install -U pycaret pandas scikit-learn

# ==== 2) Load dataset ====
import pandas as pd

df = pd.read_csv("/kaggle/input/mall-customers/Mall_Customers.csv")
print(df.shape)
df.head()


(200, 5)


Unnamed: 0,CustomerID,Genre,Age,Annual Income (k$),Spending Score (1-100)
0,1,Male,19,15,39
1,2,Male,21,15,81
2,3,Female,20,16,6
3,4,Female,23,16,77
4,5,Female,31,17,40


In [8]:
# ==== 3) Clean data ====
# Drop ID column and non-numeric columns like Gender
df = df.drop(columns=["CustomerID", "Gender"], errors="ignore")

# Remove rows with missing values just in case
df = df.dropna().reset_index(drop=True)

print("Final shape for clustering:", df.shape)


Final shape for clustering: (200, 4)


In [9]:
# ==== 4) PyCaret Clustering ====
from pycaret.clustering import *

clus = setup(
    data=df,
    session_id=42,
    normalize=True,
    use_gpu=False,
    verbose=True
)

# Create a simple KMeans model
kmeans = create_model("kmeans", num_clusters=4)

# Assign clusters back to the data
labeled = assign_model(kmeans)

print("Clustered sample:")
print(labeled.head())

# Optional plots (uncomment if you want visuals)
# plot_model(kmeans, plot='elbow')
# plot_model(kmeans, plot='cluster')


Unnamed: 0,Description,Value
0,Session id,42
1,Original data shape,"(200, 4)"
2,Transformed data shape,"(200, 4)"
3,Numeric features,3
4,Categorical features,1
5,Preprocess,True
6,Imputation type,simple
7,Numeric imputation,mean
8,Categorical imputation,mode
9,Maximum one-hot encoding,-1


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.2901,66.8561,1.3056,0,0,0


Clustered sample:
    Genre  Age  Annual Income (k$)  Spending Score (1-100)    Cluster
0    Male   19                  15                      39  Cluster 2
1    Male   21                  15                      81  Cluster 2
2  Female   20                  16                       6  Cluster 3
3  Female   23                  16                      77  Cluster 3
4  Female   31                  17                      40  Cluster 3


In [10]:
# ==== 5) Save outputs ====
labeled.to_csv("/kaggle/working/mall_clusters.csv", index=False)
save_model(kmeans, "/kaggle/working/mall_kmeans_model")

print("✅ Done — outputs saved to /kaggle/working/")
!ls -lh /kaggle/working


Transformation Pipeline and Model Successfully Saved
✅ Done — outputs saved to /kaggle/working/
total 72K
drwxr-xr-x 2 root root 4.0K Nov  2 09:35 cudf
drwxr-xr-x 3 root root 4.0K Nov  2 09:41 cuml
drwxr-xr-x 3 root root 4.0K Nov  2 09:41 cupy
-rw-r--r-- 1 root root  33K Nov  2 09:47 logs.log
-rw-r--r-- 1 root root 5.0K Nov  2 09:47 mall_clusters.csv
-rw-r--r-- 1 root root 7.3K Nov  2 09:47 mall_kmeans_model.pkl
drwxr-xr-x 3 root root 4.0K Nov  2 09:41 rmm
