# Pemberian Label Mobil dengan K-Means

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, davies_bouldin_score
import warnings
import time
import os

# Suppress warning messages: installs a blanket 'ignore' filter, so from this
# point on warnings of every category are hidden (including deprecation
# notices from the libraries imported above).
warnings.filterwarnings('ignore')

In [None]:
# Location of the raw car dataset, relative to the working directory.
file_path = 'Dataset/CarsDataset2025.csv'

# Bind `df` before attempting the read so that the later
# `if df is not None:` guard sees None, instead of raising NameError,
# when the file is missing.
df = None

try:
    # Only the read itself can raise FileNotFoundError, so keep the try minimal.
    df = pd.read_csv(file_path, encoding='latin-1')
except FileNotFoundError:
    print("File tidak ditemukan. Cek path file kembali.")
else:
    # Success path: report the size and preview the columns used later on.
    print(f"Data dimuat: {df.shape[0]} baris, {df.shape[1]} kolom.")
    display(df[['Company Names', 'Cars Names', 'Cars Prices', 'Seats']].head())

In [None]:
if df is not None:
    # Build a combined "<company> <model>" label. Both parts are cast to str
    # first so the concatenation works on non-string cells as well.
    company_names = df['Company Names'].astype(str)
    car_names = df['Cars Names'].astype(str)
    df['Company Names'] = company_names
    df['Cars Names'] = car_names
    df['Car_Full_Name'] = company_names + " " + car_names
    
    # Fungsi Bersih Harga
    def clean_price(price_str):
        """Convert a raw price cell into a float.

        Dollar signs, commas and spaces are stripped first. If the remaining
        text contains ``-`` it is treated as a ``low-high`` range and the
        mean of its first two parts is returned; otherwise the text is
        parsed as a single number.

        Args:
            price_str: Raw cell value (anything ``str()`` can render).

        Returns:
            The parsed price as a float, or ``None`` when the cell is
            missing or is not a parseable number.
        """
        if pd.isna(price_str):
            return None

        # Single pass that deletes every formatting character: '$', ',', ' '.
        clean_str = str(price_str).translate(str.maketrans('', '', '$, '))

        try:
            if '-' in clean_str:
                # A '-' guarantees at least two parts; an empty part (e.g.
                # "-" or "10000-") makes float() raise ValueError below.
                low, high = clean_str.split('-')[:2]
                return (float(low) + float(high)) / 2
            return float(clean_str)
        except ValueError:
            # Only parse failures are expected here; anything else
            # (e.g. KeyboardInterrupt) must propagate.
            return None

    # Derive the two numeric features. Prices go through clean_price;
    # seat values that are not numeric are coerced to NaN.
    feature_columns = ['Price_Cleaned', 'Seats_Cleaned']
    df['Price_Cleaned'] = df['Cars Prices'].apply(clean_price)
    df['Seats_Cleaned'] = pd.to_numeric(df['Seats'], errors='coerce')

    # Keep only rows where both features are present, then renumber the
    # rows from 0.
    complete_rows = df.dropna(subset=feature_columns)
    df_clean = complete_rows.reset_index(drop=True)

    print(f"Data bersih: {len(df_clean)} baris.")
    display(df_clean[['Car_Full_Name', *feature_columns]].head())

In [None]:
# Features used for clustering: cleaned price and cleaned seat count.
X = df_clean.loc[:, ['Price_Cleaned', 'Seats_Cleaned']]

# Standardize both features with StandardScaler; fit and transform are
# done as two explicit steps on the same data.
scaler = StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)

print("Data berhasil dinormalisasi (Z-Score).")

# Preview of the scaled values under descriptive column names.
df_scaled_preview = pd.DataFrame(X_scaled, columns=['Price_Scaled', 'Seats_Scaled'])
display(df_scaled_preview.head())

In [None]:
print("Mencari jumlah cluster (K) optimal menggunakan Elbow Method...")

# Candidate cluster counts: K = 2 .. 10 inclusive.
k_range = range(2, 11)

# WCSS (the fitted model's inertia_) per candidate K, in k_range order.
inertia_values = []
for k in k_range:
    # fit() returns the estimator itself, so the fitted model can be bound directly.
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=42).fit(X_scaled)
    inertia_values.append(kmeans.inertia_)

# Elbow plot: WCSS against K, drawn through an explicit Axes object.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(k_range, inertia_values, 'bo-', linewidth=2, markersize=8)
ax.set_xlabel('Number of Clusters (K)', fontsize=12)
ax.set_ylabel('Within-Cluster Sum of Squares (WCSS)', fontsize=12)
ax.set_title('Elbow Method untuk Menentukan K Optimal', fontsize=14, fontweight='bold')
ax.grid(True, alpha=0.3)
ax.set_xticks(k_range)

# Annotate every point with its WCSS value, offset 10 points above the marker.
for k, inertia in zip(k_range, inertia_values):
    ax.annotate(
        f'{inertia:.1f}',
        (k, inertia),
        textcoords="offset points",
        xytext=(0, 10),
        ha='center',
        fontsize=9,
    )

fig.tight_layout()
plt.show()

# Show the WCSS values as a table, with the drop relative to the previous K.
print("\nNilai WCSS (Within-Cluster Sum of Squares) untuk setiap K:")

wcss_text = [format(value, ".2f") for value in inertia_values]

# The first K has no predecessor, so its drop is shown as '-'.
drop_text = ['-']
for previous, current in zip(inertia_values, inertia_values[1:]):
    drop_text.append(f"{previous - current:.2f}")

inertia_df = pd.DataFrame({
    'K': k_range,
    'WCSS': wcss_text,
    'Penurunan WCSS': drop_text,
})
display(inertia_df)

# Elbow detection via discrete second differences of the WCSS curve.
#
#   wcss_diff[j]      = WCSS(k_range[j]) - WCSS(k_range[j + 1])
#                       (WCSS drop gained by adding one more cluster)
#   wcss_diff_diff[m] = wcss_diff[m] - wcss_diff[m + 1]
#                       (how much that drop shrinks at the next step)
#
# wcss_diff_diff[m] compares the drop INTO k_range[m + 1] with the drop OUT
# of it, so it is centred on k_range[m + 1].
wcss_diff = [prev - curr for prev, curr in zip(inertia_values, inertia_values[1:])]
wcss_diff_diff = [prev - curr for prev, curr in zip(wcss_diff, wcss_diff[1:])]

# Fallback used when fewer than three K values were evaluated.
recommended_k = 3
if wcss_diff_diff:
    # +1 maps a second-difference index to its k_range index (see above).
    # The previous +2 offset recommended one cluster too many.
    elbow_point = int(np.argmax(wcss_diff_diff)) + 1
    recommended_k = k_range[elbow_point]

    print(f"\nRekomendasi K berdasarkan Elbow Method: {recommended_k}")
    print(f"- Pada K = {recommended_k}, terjadi perlambatan signifikan dalam penurunan WCSS")
else:
    print("\nTidak dapat menentukan elbow point secara otomatis, menggunakan default K=3")

# Optional: side-by-side view that makes the elbow easier to spot.
fig, (ax_curve, ax_rate) = plt.subplots(1, 2, figsize=(12, 5))

# Left panel: the plain elbow curve.
ax_curve.plot(k_range, inertia_values, 'bo-', linewidth=2, markersize=8)
ax_curve.set_xlabel('Number of Clusters (K)')
ax_curve.set_ylabel('WCSS')
ax_curve.set_title('Elbow Curve')
ax_curve.grid(True, alpha=0.3)

# Right panel: WCSS drop per added cluster; there is no value for the
# first K, hence k_range[1:].
ax_rate.plot(k_range[1:], wcss_diff, 'ro-', linewidth=2, markersize=8)
ax_rate.set_xlabel('Number of Clusters (K)')
ax_rate.set_ylabel('Rate of Change of WCSS')
ax_rate.set_title('Rate of Change Analysis')
ax_rate.grid(True, alpha=0.3)

fig.tight_layout()
plt.show()

# Print the reading guide for the elbow plots, framed by 50-character rules.
separator = "=" * 50
print(
    "\n" + separator,
    "INTERPRETASI ELBOW METHOD:",
    "- Pilih K dimana penambahan cluster tidak lagi memberikan",
    "  penurunan WCSS yang signifikan (titik siku/elbow)",
    "- WCSS yang lebih rendah menunjukkan clustering yang lebih baik",
    "- Balance antara jumlah cluster dan kompleksitas model",
    separator,
    sep="\n",
)

In [None]:
# Use the K recommended by the elbow analysis above.
k_final = recommended_k

print(f"=== Menjalankan K-Means (K={k_final}) ===")

# 1. Start the timer. perf_counter() is a monotonic, high-resolution clock
#    intended for measuring short durations; time.time() is wall-clock time
#    and can jump if the system clock is adjusted.
start_time = time.perf_counter()

# 2. Fit the model and get one cluster label per row of X_scaled.
kmeans = KMeans(n_clusters=k_final, random_state=42, n_init=10)
labels = kmeans.fit_predict(X_scaled)

# 3. Stop the timer; only the fit is measured.
end_time = time.perf_counter()
exec_time = end_time - start_time

# 4. Evaluation metrics, computed on the same scaled features.
sil_score = silhouette_score(X_scaled, labels)
dbi_score = davies_bouldin_score(X_scaled, labels)

# Attach the labels to the cleaned dataframe.
df_clean['cluster'] = labels

print("\n=== HASIL EVALUASI ===")
print(f"• Waktu Eksekusi   : {exec_time:.6f} detik")
print(f"• Silhouette Score : {sil_score:.4f} (Semakin Mendekati 1 Lebih Baik)")
# Davies-Bouldin: lower is better and the minimum is 0. The previous hint
# ("closer to 0.5") was incorrect.
print(f"• Davies-Bouldin   : {dbi_score:.4f} (Semakin Mendekati 0 Lebih Baik)")

# 5. Visualization: seats vs. price, coloured by assigned cluster.
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(
    data=df_clean,
    x='Seats_Cleaned',
    y='Price_Cleaned',
    hue='cluster',
    palette='viridis',
    s=100,
    edgecolor='black',
    ax=ax,
)
ax.set_title(f'Hasil Clustering Mobil (K={k_final})')
ax.set_xlabel('Jumlah Kursi')
ax.set_ylabel('Harga ($)')
ax.legend(title='Cluster')
ax.grid(True, alpha=0.3)
plt.show()

# 6. Export to CSV.
output_dir = 'Hasil'
# exist_ok=True creates the folder if needed and is a no-op otherwise,
# avoiding the check-then-create race of os.path.exists() + os.makedirs().
os.makedirs(output_dir, exist_ok=True)

# Final dataframe to export: a copy of df_clean, i.e. every column it holds
# at this point (including the helper cleaning columns and 'cluster').
df_export = df_clean.copy()
file_name = f'KMeans_Result_K{k_final}.csv'
csv_path = os.path.join(output_dir, file_name)

df_export.to_csv(csv_path, index=False)
print(f"\n✅ File CSV berhasil disimpan di: {csv_path}")
print("   (Berisi atribut asli + kolom 'cluster')")

# PROFIL HASIL CLUSTER

In [None]:
# Per-cluster profile: mean / min / max / count of price and seats.
cluster_summary = df_clean.groupby('cluster')[['Price_Cleaned', 'Seats_Cleaned']].agg(['mean', 'min', 'max', 'count'])

# Readable float formatting. NOTE: this is a global pandas display option
# and affects every dataframe shown after this point.
pd.options.display.float_format = '{:,.2f}'.format
display(cluster_summary)

print("\nInterpretasi Singkat (Contoh Cara Membaca):")
# Iterate the actual cluster labels in the summary index rather than
# range(len(...)), so a label missing from the data cannot cause a KeyError.
for cluster_id in cluster_summary.index:
    avg_price = cluster_summary.loc[cluster_id, ('Price_Cleaned', 'mean')]
    avg_seat = cluster_summary.loc[cluster_id, ('Seats_Cleaned', 'mean')]
    count = cluster_summary.loc[cluster_id, ('Price_Cleaned', 'count')]
    print(f"- Cluster {cluster_id}: Rata-rata Harga ${avg_price:,.0f}, Rata-rata Kursi {avg_seat:.1f} ({count} mobil)")