# Clustering Mobil: K-Means dengan PCA Feature Selection
**Alur Pengerjaan:**
1. Data Cleaning & Parsing (Unit Removal)
2. Z-Score Scaling (Standardisasi Data)
3. Feature Selection dengan PCA (Eliminasi fitur yang tidak berkontribusi)
4. Reduksi Dimensi (PCA Final)
5. Penentuan K Optimal (Elbow Method Otomatis)
6. K-Means Clustering
7. Profiling & Segmentasi User

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import re
import warnings

# Silence minor warnings so the notebook output stays readable.
warnings.filterwarnings('ignore')

# Display configuration: show every DataFrame column, use seaborn's whitegrid theme.
pd.set_option('display.max_columns', None)
sns.set(style="whitegrid")

# Create the output folder. exist_ok=True replaces the check-then-create
# pattern, so the cell is safe to re-run and has no race between the
# existence check and the creation.
os.makedirs('Hasil', exist_ok=True)

# Load the dataset.
file_path = 'Dataset/CarsDataset2025.csv'  # Adjust to where the CSV lives on your machine

# Only the read itself sits in the try body, so the handler cannot mask an
# unrelated error raised while printing the summary.
try:
    df = pd.read_csv(file_path)
except FileNotFoundError:
    # NOTE: `df` stays undefined in this case; later cells need it.
    print(f"File tidak ditemukan di: {file_path}")
else:
    print("Dataset berhasil diload.")
    print(f"   Dimensi: {df.shape[0]} Baris, {df.shape[1]} Kolom")
    display(df.head(3))

In [None]:
# --- 1. DEFINISI FUNGSI CLEANING ---
def clean_currency(value):
    """Convert a price string to a float, averaging price ranges.

    Dollar signs, commas (thousands separators) and spaces are stripped
    first. A plain number is returned as a float; a ``low-high`` range is
    returned as the mean of its two bounds.

    Args:
        value: Raw cell value (string, number, or missing).

    Returns:
        The parsed price as a float, or ``np.nan`` when the value is missing
        or cannot be parsed.
    """
    if pd.isna(value):
        return np.nan
    val_str = str(value).replace('$', '').replace(',', '').replace(' ', '')

    # Try the plain-number case first so a leading minus sign or an exponent
    # ("-5", "1e-3") is not mistaken for a range separator.
    try:
        return float(val_str)
    except ValueError:
        pass

    # Range such as "10000-20000": use the midpoint. Anything that is not
    # exactly two parseable bounds counts as unparseable.
    bounds = val_str.split('-')
    if len(bounds) != 2:
        return np.nan
    try:
        low, high = (float(bound) for bound in bounds)
    except ValueError:
        return np.nan
    return (low + high) / 2

def clean_unit(value):
    """Extract the numeric part of a measurement string such as "250 km/h".

    The text is lower-cased, common unit suffixes and commas are removed,
    and only the part before the first "/" is kept. If the remaining text
    contains a hyphen it is treated as a range and the mean of all numbers
    found is returned; otherwise the first number found is returned.

    Args:
        value: Raw cell value (string, number, or missing).

    Returns:
        The parsed number as a float, or ``np.nan`` when the value is missing
        or contains no number.
    """
    if pd.isna(value):
        return np.nan
    val_str = str(value).lower()

    # Strip common units ('seconds' before 'sec' so no stray "onds" is left).
    for unit in ['cc', 'hp', 'km/h', 'seconds', 'sec', 'nm']:
        val_str = val_str.replace(unit, '')

    # Values like "4.0/5.0": keep only the first alternative.
    if '/' in val_str:
        val_str = val_str.split('/')[0]

    val_str = val_str.replace(',', '').strip()

    # Match well-formed decimals only. The looser character-class pattern
    # (digits and dots in any mix) also matched a lone "." and made float()
    # raise on inputs such as "approx. 5".
    nums = [float(tok) for tok in re.findall(r"\d+(?:\.\d+)?|\.\d+", val_str)]
    if not nums:
        return np.nan

    # A hyphen marks a range ("70-85"): average every number found.
    if '-' in val_str:
        return sum(nums) / len(nums)

    # Otherwise the first number is the value.
    return nums[0]

# --- 2. APPLY CLEANING ---
# Work on a copy so the raw DataFrame stays untouched.
df_clean = df.copy()

# Column name -> parser that turns its raw text into a number.
# Prices use the currency parser; every other column uses the unit parser.
cols_to_clean = {
    'Cars Prices': clean_currency,
    **dict.fromkeys(
        [
            'CC/Battery Capacity',
            'HorsePower',
            'Total Speed',
            'Performance(0 - 100 )KM/H',
            'Torque',
        ],
        clean_unit,
    ),
}

for col, func in cols_to_clean.items():
    parsed = df_clean[col].apply(func)
    df_clean[col] = parsed

# Seats only needs a numeric coercion (unparseable entries become NaN).
df_clean['Seats'] = pd.to_numeric(df_clean['Seats'], errors='coerce')

# --- 3. FILL MISSING VALUES ---
# Core numeric attributes used by the rest of the analysis.
initial_numeric_cols = [
    'CC/Battery Capacity',
    'HorsePower',
    'Total Speed',
    'Performance(0 - 100 )KM/H',
    'Cars Prices',
    'Torque',
    'Seats',
]

# Replace NaN with the column median (less sensitive to outliers than the mean).
for col in initial_numeric_cols:
    column = df_clean[col]
    df_clean[col] = column.fillna(column.median())

print("Data Cleaning Selesai.")
print("   Sample data bersih:")
display(df_clean[initial_numeric_cols].head())

In [None]:
from sklearn.preprocessing import StandardScaler

# --- Z-SCORE SCALING (STANDARDIZATION) ---
# Must run BEFORE PCA so attributes measured in different units
# (e.g. price vs. seconds) are put on a comparable scale.

print(" Melakukan Scaling pada atribut:", initial_numeric_cols)

# Fit the scaler on the cleaned numeric columns and transform them in one step.
scaler = StandardScaler()
numeric_frame = df_clean[initial_numeric_cols]
X_scaled_array = scaler.fit_transform(numeric_frame)

# Wrap the scaled array in a DataFrame so the column names are kept.
df_scaled_all = pd.DataFrame(data=X_scaled_array, columns=initial_numeric_cols)

print("\n Z-Score Scaling Selesai.")


# PCA

In [None]:
from sklearn.decomposition import PCA

# --- 1. CHECK FEATURE IMPORTANCE ---
# Fit a temporary 2-component PCA on the already-scaled data.
pca_temp = PCA(n_components=2)
pca_temp.fit(df_scaled_all)

# Loadings: one row per feature, one column per principal component.
loadings = pd.DataFrame(pca_temp.components_.T, index=initial_numeric_cols, columns=['PC1', 'PC2'])
# Importance = largest absolute loading of the feature across PC1/PC2.
loadings['Importance'] = loadings.abs().max(axis=1)

# --- 2. FEATURE SELECTION ---
# Features whose importance is below the threshold are dropped.
threshold = 0.40
selected_features = loadings[loadings['Importance'] >= threshold].index.tolist()
eliminated_features = loadings[loadings['Importance'] < threshold].index.tolist()

print("="*40)
print(f"FITUR TERPILIH ({len(selected_features)}): {selected_features}")
print(f"FITUR DIBUANG  ({len(eliminated_features)}): {eliminated_features}")
print("="*40)

# The final PCA below asks for 2 components, which needs at least 2 input
# features. Fail here with a clear message instead of an opaque error from
# inside the PCA fit.
if len(selected_features) < 2:
    raise ValueError(
        f"PCA 2 komponen membutuhkan minimal 2 fitur, tetapi hanya "
        f"{len(selected_features)} fitur yang lolos threshold {threshold}."
    )

# --- 3. FINAL PCA (DIMENSIONALITY REDUCTION) ---
print(f"\n Menjalankan PCA Final hanya dengan {len(selected_features)} fitur...")

# Keep only the selected columns of the scaled data.
X_scaled_final = df_scaled_all[selected_features]

# Project the selected features onto two principal components.
pca_final = PCA(n_components=2)
X_pca = pca_final.fit_transform(X_scaled_final)

# DataFrame with the PCA result, used for plotting.
pca_df = pd.DataFrame(data=X_pca, columns=['PC1', 'PC2'])
print(" PCA Final Selesai.")

In [None]:
# Two side-by-side panels: PCA scatter (left) and loadings heatmap (right).
fig, (ax_scatter, ax_heat) = plt.subplots(1, 2, figsize=(12, 5))

# Left panel: distribution of the data in PC1/PC2 space, with the explained
# variance shown in the title and on each axis label.
sns.scatterplot(x='PC1', y='PC2', data=pca_df, alpha=0.6, color='royalblue', ax=ax_scatter)
ax_scatter.set_title(f'Sebaran Data PCA\n(Explained Variance: {sum(pca_final.explained_variance_ratio_)*100:.1f}%)')
ax_scatter.set_xlabel(f'PC1 ({pca_final.explained_variance_ratio_[0]*100:.1f}%)')
ax_scatter.set_ylabel(f'PC2 ({pca_final.explained_variance_ratio_[1]*100:.1f}%)')
ax_scatter.grid(True)

# Right panel: contribution (loading) of each selected attribute to PC1/PC2.
loadings_final = pd.DataFrame(pca_final.components_.T, columns=['PC1', 'PC2'], index=selected_features)
sns.heatmap(loadings_final, annot=True, cmap='coolwarm', center=0, fmt=".2f", ax=ax_heat)
ax_heat.set_title('Kontribusi Atribut Terpilih')

fig.tight_layout()
plt.show()

# Elbow Method

In [None]:
from sklearn.cluster import KMeans
import math

# 1. Compute WCSS (within-cluster sum of squares) for every candidate K.
wcss = []
k_range = range(1, 11)

for k in k_range:
    kmeans = KMeans(n_clusters=k, init='k-means++', random_state=42, n_init=10)
    kmeans.fit(X_pca)
    wcss.append(kmeans.inertia_)

# 2. AUTOMATIC ELBOW: pick the point farthest from the straight line (chord)
#    that joins the first and last (K, WCSS) points of the curve.
k_values = list(k_range)
p1_x, p1_y = k_values[0], wcss[0]      # first point of the curve
p2_x, p2_y = k_values[-1], wcss[-1]    # last point, derived from k_range so
                                       # changing the range cannot desynchronise it

# Point-to-line distance. The denominator does not depend on the point,
# so it is computed once outside the loop.
denominator = math.sqrt((p2_y - p1_y)**2 + (p2_x - p1_x)**2)
distances = [
    abs((p2_y - p1_y) * p0_x - (p2_x - p1_x) * p0_y + p2_x * p1_y - p2_y * p1_x) / denominator
    for p0_x, p0_y in zip(k_values, wcss)
]

# Optimal K = the K whose point has the largest distance from the chord.
elbow_idx = distances.index(max(distances))
k_optimal = k_values[elbow_idx]
print(f"✅ K Optimal ditemukan secara otomatis: {k_optimal}")

# 3. Plot the elbow curve and mark the detected K.
plt.figure(figsize=(10, 4))
plt.plot(k_range, wcss, marker='o', linestyle='--')
plt.plot(k_optimal, wcss[elbow_idx], marker='*', color='red', markersize=20, label=f'Optimal K={k_optimal}')
plt.title('Elbow Method (Auto-Detection)')
plt.xlabel('Jumlah Cluster (K)')
plt.ylabel('WCSS')
plt.legend()
plt.grid(True)
plt.show()

# K Means

In [None]:
import time

start_time = time.time()

# Run the final K-Means with the automatically detected number of clusters.
kmeans_final = KMeans(
    n_clusters=k_optimal,
    init='k-means++',
    random_state=42,
    n_init=10,
)
clusters = kmeans_final.fit_predict(X_pca)

# --- STORE THE RESULT ---
# The same label array is attached to three frames:
#   pca_df   -> used for plotting
#   df_clean -> used for statistical profiling
#   df       -> the original data, used for the export
for frame, label_column in (
    (pca_df, 'Cluster'),
    (df_clean, 'Cluster_Label'),
    (df, 'Cluster_Label'),
):
    frame[label_column] = clusters

print(f"Clustering Selesai dalam {time.time() - start_time:.4f} detik.")
print(f"   Data dikelompokkan menjadi {k_optimal} cluster.")

In [None]:
from sklearn.metrics import silhouette_score

# Compute the silhouette score of the final clustering in PCA space.
score = silhouette_score(X_pca, clusters)
# The leading emoji was mojibake (UTF-8 bytes decoded with the wrong codec); restored.
print(f"📊 Silhouette Score: {score:.4f} (Range -1 s/d 1, semakin dekat 1 semakin baik)")

# Visualise the clusters: points coloured by cluster label, centroids as red X markers.
plt.figure(figsize=(10, 6))
sns.scatterplot(x='PC1', y='PC2', hue='Cluster', data=pca_df, palette='viridis', s=80, alpha=0.8)
plt.scatter(kmeans_final.cluster_centers_[:, 0], kmeans_final.cluster_centers_[:, 1],
            s=250, c='red', marker='X', label='Centroids')

plt.title(f'Hasil Clustering (K={k_optimal})\nSilhouette: {score:.3f}')
plt.legend()
plt.grid(True)
plt.show()

In [None]:
# Export the original data together with its cluster labels.
output_file = 'Hasil/Clustered_CarsDataset.csv'
# Make sure the target folder exists so this cell also works when run on its own.
os.makedirs(os.path.dirname(output_file), exist_ok=True)
df.to_csv(output_file, index=False)
# The leading emoji was mojibake (UTF-8 bytes decoded with the wrong codec); restored.
print(f"✅ File hasil export tersimpan di: {output_file}")

# Profiling atau memberi nama

In [None]:
import textwrap

# --- LOGIKA PENAMAAN SEGMEN ---
def get_segment_name(row):
    """Return a market-segment label for a row of performance figures.

    Reads 'HorsePower', 'Total Speed' and 'Performance(0 - 100 )KM/H' from
    ``row`` and checks the tiers from the most extreme downwards; the first
    tier whose condition holds wins.

    Args:
        row: Mapping-like object (e.g. a pandas Series or dict) holding the
            three keys above.

    Returns:
        One of the five segment-name strings.
    """
    horsepower = row['HorsePower']
    top_speed = row['Total Speed']
    zero_to_100 = row['Performance(0 - 100 )KM/H']

    # Tiers are tested top-down, so each check implicitly assumes that all
    # higher tiers have already been ruled out.
    if horsepower > 700 or top_speed > 330:
        return "HYPERCAR / EXTREME TRACK TOY"

    if horsepower > 500 or (top_speed > 290 and zero_to_100 < 4.0):
        return "SUPERCAR / HIGH-PERFORMANCE GT"

    if horsepower > 300 or zero_to_100 < 6.0:
        return "SPORTS / PERFORMANCE LUXURY"

    if horsepower > 170:
        return "MID-RANGE / PREMIUM DAILY"

    return "ECONOMY / ENTRY LEVEL"

# --- AGGREGATE DATA ---
# Group once by cluster label and reuse the grouping for every statistic.
grouped = df_clean.groupby('Cluster_Label')
cluster_stats = grouped[initial_numeric_cols].mean()
cluster_counts = df_clean['Cluster_Label'].value_counts().sort_index()

# Min/max seats per cluster, shown as extra information.
seats_min = grouped['Seats'].min()
seats_max = grouped['Seats'].max()

print("="*60)
print("             PROFILING CLUSTER OTOMATIS")
print("="*60)

# The emoji and bullet characters below were mojibake (UTF-8 bytes decoded
# with the wrong codec) and have been restored.
for label in cluster_stats.index:
    stats = cluster_stats.loc[label]
    count = cluster_counts[label]
    # The segment name is derived from the cluster's mean values.
    name = get_segment_name(stats)

    # Seat range: a single number when min and max coincide.
    s_min, s_max = int(seats_min[label]), int(seats_max[label])
    seat_str = f"{s_min} Seats" if s_min == s_max else f"{s_min}-{s_max} Seats"

    print(f"\n🔷 CLUSTER {label} | Jumlah: {count} Unit")
    print(f"🏷️  KATEGORI: {name}")
    print("-" * 50)
    print(f"   • HorsePower (Avg)  : {stats['HorsePower']:,.0f} HP")
    print(f"   • Top Speed (Avg)   : {stats['Total Speed']:,.0f} km/h")
    print(f"   • 0-100 km/h (Avg)  : {stats['Performance(0 - 100 )KM/H']:.2f} detik")
    print(f"   • Torque (Avg)      : {stats['Torque']:,.0f} Nm")
    print(f"   • Kapasitas Mesin   : {stats['CC/Battery Capacity']:,.0f}")
    print(f"   • Kapasitas Kursi   : {seat_str}")
    print(f"   • Kisaran Harga     : ${stats['Cars Prices']:,.2f}")

# --- COMPARISON CHARTS ---
# One bar chart per attribute, comparing the cluster means.
cols_plot = ['HorsePower', 'Total Speed', 'Performance(0 - 100 )KM/H', 'Cars Prices']
plt.figure(figsize=(20, 4))

for i, col in enumerate(cols_plot):
    plt.subplot(1, 4, i+1)
    sns.barplot(x=cluster_stats.index, y=cluster_stats[col], palette='magma')
    plt.title(f'Avg {col}')
    plt.xlabel('Cluster')
    plt.ylabel('')
    plt.grid(axis='y', linestyle='--', alpha=0.5)

plt.tight_layout()
plt.show()