In [None]:
# Load the uploaded dataset into a pandas DataFrame.
import pandas as pd

# CSV file to analyse; change this to match the name of your own file.
# (Alternative file previously referenced here:
#  'Shopping Mall Customer Segmentation Data .csv')
DATA_FILE = 'retail_data.csv'
df = pd.read_csv(DATA_FILE)

# Preview the first five rows.
df.head(5)

In [2]:
# Inspect the dataset: its (rows, columns) shape and the dtype of each column.
print("Ukuran dataset:", df.shape)
print("\nTipe data:", df.dtypes, sep="\n")

# Keep only the three numeric attributes that are used for clustering.
selected_features = ['Age', 'Annual_Income', 'Spending_Score']
data = df.loc[:, selected_features]

# Summary statistics of the selected features.
print("\nStatistik deskriptif fitur yang dipilih:", data.describe(), sep="\n")

Ukuran dataset: (200000, 19)

Tipe data:
CustomerID                int64
Age                       int64
Gender                   object
Annual_Income             int64
Spending_Score            int64
ProductCategory          object
ProductPrice            float64
PurchaseDate             object
StoreID                   int64
StoreLocation            object
PaymentMethod            object
DiscountApplied            bool
DiscountPercent           int64
ProductCost             float64
Profit                  float64
FootTraffic               int64
InventoryLevel            int64
MarketingExpenditure    float64
CompetitorPrice         float64
dtype: object

Statistik deskriptif fitur yang dipilih:
                 Age  Annual_Income  Spending_Score
count  200000.000000  200000.000000   200000.000000
mean       43.969475   85058.438785       50.417900
std        15.277401   37511.311914       28.864392
min        18.000000   20001.000000        1.000000
25%        31.000000   52577.000000

In [3]:
# Stage 2: normalise the selected features with a Min-Max scaler.
import numpy as np
from sklearn.preprocessing import MinMaxScaler

# Fit the scaler on the selected features, then rescale those same rows.
scaler = MinMaxScaler().fit(data)
data_scaled = scaler.transform(data)

# Wrap the scaled array in a DataFrame so the columns keep their feature names.
data_scaled_df = pd.DataFrame(data=data_scaled, columns=selected_features)
data_scaled_df.head()

Unnamed: 0,Age,Annual_Income,Spending_Score
0,0.769231,0.307979,0.40404
1,0.75,0.089047,0.909091
2,0.403846,0.74926,0.252525
3,0.288462,0.518719,0.939394
4,0.0,0.540189,0.090909


In [None]:
# Stage 3: choose the number of clusters with the Elbow Method.
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans

# Fit K-Means for k = 1..10 and record each model's inertia
# (within-cluster sum of squared errors).
k_range = range(1, 11)
sse = []
for k in k_range:
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10).fit(data_scaled)
    sse.append(kmeans.inertia_)

# Plot SSE against k; the "elbow" of the curve suggests a suitable k.
fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(k_range, sse, marker='o')
ax.set_xlabel('Jumlah Klaster (k)')
ax.set_ylabel('SSE')
ax.set_title('Metode Elbow untuk Menentukan Jumlah Klaster Optimal')
ax.grid(True)
plt.show()

In [None]:
# Stage 4: evaluate k = 2..10 with the Silhouette Score.
from sklearn.metrics import silhouette_score

# The silhouette coefficient is built from pairwise distances, so its cost
# grows quadratically with the number of rows. On a large dataset, scoring
# every row for nine values of k is impractical, so the score is estimated on
# a fixed-seed random subsample instead. Small datasets (at or below the cap)
# are still scored exactly, on all rows.
SILHOUETTE_SAMPLE_SIZE = 10_000
n_rows = len(data_scaled)
silhouette_sample_size = (
    SILHOUETTE_SAMPLE_SIZE if n_rows > SILHOUETTE_SAMPLE_SIZE else None
)

print("\nEvaluasi Silhouette Score untuk setiap jumlah klaster (k):")
print("=" * 50)
for k in range(2, 11):
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
    # The model is still fitted on every row; only the scoring is subsampled.
    cluster_labels = kmeans.fit_predict(data_scaled)
    sil_score = silhouette_score(
        data_scaled,
        cluster_labels,
        sample_size=silhouette_sample_size,
        random_state=42,  # same subsample for every k, so scores are comparable
    )
    print(f"Jumlah Klaster: {k} -> Silhouette Score: {sil_score:.4f}")

In [None]:
# Stage 5: final clustering with the chosen number of clusters.
optimal_k = 6  # change to the best k suggested by the Elbow/Silhouette results
kmeans = KMeans(n_clusters=optimal_k, random_state=42, n_init=10).fit(data_scaled)
clusters = kmeans.labels_

# Attach each row's cluster label to the original dataframe
# (adds/overwrites the 'cluster' column of df in place).
df['cluster'] = clusters

# Show the first labelled rows.
print("\nData dengan label klaster:")
print(df[['cluster', *selected_features]].head())

# Stage 6: summarise the clusters by the mean of each selected feature.
cluster_means = df.groupby('cluster')[selected_features].mean()
cluster_summary = cluster_means.round(2)
print("\nRata-rata tiap fitur untuk masing-masing klaster:")
print(cluster_summary)

In [None]:
from sklearn.decomposition import PCA

# Reduce the scaled feature matrix (the 3 selected features in this notebook)
# to 2 principal components so the clusters can be drawn on a plane.
# random_state is fixed to match the seeded K-Means runs: it only has an effect
# when PCA selects a randomized/arpack solver, but pinning it keeps the
# projection reproducible regardless of which solver 'auto' picks.
pca = PCA(n_components=2, random_state=42)
data_pca = pca.fit_transform(data_scaled)

# Store the two components on the dataframe
# (adds/overwrites the 'pca1' and 'pca2' columns of df in place).
df['pca1'] = data_pca[:, 0]
df['pca2'] = data_pca[:, 1]

# Scatter-plot the rows in PCA space, one series (and legend entry) per cluster.
plt.figure(figsize=(8, 6))
for cluster_id in range(optimal_k):
    cluster_data = df[df['cluster'] == cluster_id]
    plt.scatter(cluster_data['pca1'], cluster_data['pca2'], label=f'Cluster {cluster_id}')
plt.xlabel('PCA 1')
plt.ylabel('PCA 2')
plt.title('Visualisasi Klaster K-Means (2D PCA)')
plt.legend()
plt.grid(True)
plt.show()

In [None]:
from sklearn.metrics import silhouette_score

# Silhouette Score of the final clustering stored in df['cluster'].
# The metric is built from pairwise distances, so its cost grows quadratically
# with the number of rows. On large data it is estimated on a fixed-seed random
# subsample; datasets at or below the cap are scored exactly, on all rows.
SILHOUETTE_SAMPLE_SIZE = 10_000
n_rows = len(data_scaled)
sil_score = silhouette_score(
    data_scaled,
    df['cluster'].to_numpy(),
    sample_size=SILHOUETTE_SAMPLE_SIZE if n_rows > SILHOUETTE_SAMPLE_SIZE else None,
    random_state=42,  # reproducible subsample
)
print(f"\nSilhouette Score untuk k = {optimal_k}: {sil_score:.4f}")