# Clustering Mobil

In [19]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.metrics import silhouette_score
import scipy.cluster.hierarchy as sch
import warnings

# Silence every warning category so the cell outputs stay uncluttered
warnings.simplefilter("ignore")

print("Semua library (Pandas, NumPy, Matplotlib, Seaborn, Sklearn) telah diimpor.")

Semua library (Pandas, NumPy, Matplotlib, Seaborn, Sklearn) telah diimpor.


In [None]:
file_path = 'Dataset/CarsDataset2025.csv'

# Bind `df` before attempting the load. If the file is missing, `df` stays None,
# so the `if df is not None:` guard in the cleaning cell skips cleanly instead
# of raising NameError (fresh kernel) or reusing a stale frame (dirty kernel).
df = None

try:
    df = pd.read_csv(file_path, encoding='latin-1')
except FileNotFoundError:
    print("File tidak ditemukan. Pastikan nama file dan path sudah benar.")
else:
    # Only reached when the CSV was read successfully.
    print(f"Data dimuat dengan {df.shape[0]} baris dan {df.shape[1]} kolom.")
    print("Contoh Baris Dalam Atribut yang Akan digunakan:")
    # Preview the columns that the rest of the notebook works with.
    display(df[['Company Names', 'Cars Names', 'Cars Prices', 'Seats']].head(10))

Data dimuat dengan 1218 baris dan 11 kolom.
Contoh Baris Dalam Atribut yang Akan digunakan:


Unnamed: 0,Company Names,Cars Names,Cars Prices,Seats
0,FERRARI,SF90 STRADALE,"$1,100,000",2
1,ROLLS ROYCE,PHANTOM,"$460,000",5
2,Ford,KA+,"$12,000-$15,000",5
3,MERCEDES,GT 63 S,"$161,000",4
4,AUDI,AUDI R8 Gt,"$253,290",2
5,BMW,Mclaren 720s,"$499,000",2
6,ASTON MARTIN,VANTAGE F1,"$193,440",2
7,BENTLEY,Continental GT Azure,"$311,000",4
8,LAMBORGHINI,VENENO ROADSTER,"$4,500,000",2
9,FERRARI,F8 TRIBUTO,"$280,000",2


In [33]:
if df is not None:
    # 1. Build one display label per car: "<company> <model>".
    # Both source columns are cast to str first so the concatenation cannot
    # fail on non-string cells.
    df['Company Names'] = df['Company Names'].astype(str)
    df['Cars Names'] = df['Cars Names'].astype(str)
    df['Car_Full_Name'] = df['Company Names'].str.cat(df['Cars Names'], sep=" ")
    
    # 2. Price-cleaning helper
    def clean_price(price_str):
        """Convert a raw price label into a single numeric value.

        Dollar signs, commas and spaces are stripped before parsing. A range
        such as "$12,000-$15,000" is reduced to the mean of its first two
        bounds.

        Args:
            price_str: Raw cell value from the price column. Any type is
                accepted; it is converted with ``str`` before parsing.

        Returns:
            The parsed price as a float, or None when the value is missing,
            cannot be parsed as a number, or is not a finite number.
        """
        if pd.isna(price_str):
            return None

        # Remove the '$' symbol, thousands separators and spaces
        clean_str = str(price_str).replace('$', '').replace(',', '').replace(' ', '')

        try:
            if '-' in clean_str:
                # Price range (e.g. "12000-15000") -> use the average
                parts = clean_str.split('-')
                value = (float(parts[0]) + float(parts[1])) / 2
            else:
                value = float(clean_str)
        except ValueError:
            # float() rejects non-numeric text; anything else (e.g.
            # KeyboardInterrupt) is deliberately allowed to propagate.
            return None

        # float() also accepts "nan" / "inf" / "infinity"; those are not prices.
        return value if np.isfinite(value) else None

    # Derive the numeric feature columns; seat values that cannot be parsed
    # are coerced to NaN by pd.to_numeric
    df['Price_Cleaned'] = df['Cars Prices'].apply(clean_price)
    df['Seats_Cleaned'] = pd.to_numeric(df['Seats'], errors='coerce')

    # Drop every row that is missing either feature and renumber the index
    df_clean = (
        df
        .dropna(subset=['Price_Cleaned', 'Seats_Cleaned'])
        .reset_index(drop=True)
    )

    # Show the first 10 cleaned rows
    display_cols = ['Car_Full_Name', 'Price_Cleaned', 'Seats_Cleaned']
    display(df_clean.loc[:, display_cols].head(10))

    # Print a short summary of the cleaned data
    print("\nRingkasan:")
    print(f"• Total data valid: {len(df_clean)} mobil")
    print(f"• Rentang harga: ${df_clean['Price_Cleaned'].min():,.0f} - ${df_clean['Price_Cleaned'].max():,.0f}")
    print(f"• Rentang kursi: {df_clean['Seats_Cleaned'].min()} - {df_clean['Seats_Cleaned'].max()}")

Unnamed: 0,Car_Full_Name,Price_Cleaned,Seats_Cleaned
0,FERRARI SF90 STRADALE,1100000.0,2.0
1,ROLLS ROYCE PHANTOM,460000.0,5.0
2,Ford KA+,13500.0,5.0
3,MERCEDES GT 63 S,161000.0,4.0
4,AUDI AUDI R8 Gt,253290.0,2.0
5,BMW Mclaren 720s,499000.0,2.0
6,ASTON MARTIN VANTAGE F1,193440.0,2.0
7,BENTLEY Continental GT Azure,311000.0,4.0
8,LAMBORGHINI VENENO ROADSTER,4500000.0,2.0
9,FERRARI F8 TRIBUTO,280000.0,2.0



Ringkasan:
• Total data valid: 1200 mobil
• Rentang harga: $4,000 - $18,000,000
• Rentang kursi: 1.0 - 20.0


In [26]:
# Feature matrix for clustering: cleaned price and seat count
X = df_clean.loc[:, ['Price_Cleaned', 'Seats_Cleaned']]

# Fit the scaler on the features, then standardise them
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

# Wrap the scaled array in a DataFrame so it is easy to inspect
df_scaled = pd.DataFrame(data=X_scaled, columns=['Price_Scaled', 'Seats_Scaled'])

print("✅ CELL 4 BERHASIL: Data telah dinormalisasi dengan Z-Score.")
print(df_scaled.iloc[:3])

✅ CELL 4 BERHASIL: Data telah dinormalisasi dengan Z-Score.
   Price_Scaled  Seats_Scaled
0      1.343128     -1.900769
1      0.448876      0.093626
2     -0.175004      0.093626
