In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report
import joblib

# Impor semua model yang akan digunakan
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

# --- 1. Load and prepare the data ---
# Location of the uploaded CSV file.
file_path = 'C:\\Bisa.AI\\BMW_Car_Sales\\archive\\BMW_Car_Sales_Classification.csv'
df = pd.read_csv(file_path)

# Split the frame into the target (y) and all remaining columns as features (X).
# NOTE(review): the recorded output of this cell shows 0.9987-1.0000 test
# accuracy for some models. Scores that high often mean a feature column
# directly encodes the target -- confirm that no such column is left in X.
y = df['Sales_Classification']
X = df.drop(columns='Sales_Classification')

# Encode the string labels as integers. The fitted classes are exposed in
# `le.classes_`; the recorded classification report (which uses them as
# target names) lists 'High' before 'Low', i.e. 'High' -> 0 and 'Low' -> 1.
le = LabelEncoder()
le.fit(y)
y_encoded = le.transform(y)

# --- 2. Preprocessing ---
# Partition the feature columns into numeric and non-numeric groups.
# Selecting on the generic 'number' kind (instead of the exact
# 'int64'/'float64'/'object' dtypes) puts every column in exactly one group.
# This matters because ColumnTransformer drops any column that is not listed
# in a transformer (remainder='drop' is its default), so a column with another
# dtype (e.g. int32, bool, category or a dedicated string dtype) would
# otherwise be discarded silently. For frames holding only
# int64/float64/object columns the two groups are the same as before.
numerical_features = X.select_dtypes(include='number').columns
categorical_features = X.select_dtypes(exclude='number').columns

# Preprocessing shared by every model pipeline:
#   - numeric columns are standardised with StandardScaler;
#   - non-numeric columns are one-hot encoded. handle_unknown='ignore' makes
#     categories that were not seen during fit encode as all zeros instead
#     of raising at predict time.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
    ])

# --- 3. Split the data ---
# Hold out 20% of the rows for testing and train on the remaining 80%.
# The split is seeded for reproducibility and stratified on the encoded
# target so both parts keep the same class proportions.
split_options = dict(test_size=0.2, random_state=42, stratify=y_encoded)
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, **split_options)

# --- 4. Define, train and evaluate the five candidate models ---
# Candidate classifiers, keyed by the display name used in the printed report.
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "K-Nearest Neighbors": KNeighborsClassifier(),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(random_state=42),
}

# Test-set accuracy of each candidate, keyed by model name.
results = {}

print("--- Hasil Evaluasi Model ---")
for model_name, estimator in models.items():
    # Full pipeline: the shared preprocessing step followed by the classifier.
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('classifier', estimator),
    ])

    # Fit on the training split, then predict the held-out test split.
    y_pred = pipeline.fit(X_train, y_train).predict(X_test)

    # Score the predictions; only the accuracy is kept for model selection.
    accuracy = accuracy_score(y_test, y_pred)
    results[model_name] = accuracy
    report = classification_report(y_test, y_pred, target_names=le.classes_)

    # One print call with a newline separator emits the same lines as
    # printing each item separately.
    print(
        f"\nModel: {model_name}",
        f"Akurasi: {accuracy:.4f}",
        "Laporan Klasifikasi:",
        report,
        "-" * 30,
        sep="\n",
    )

# --- 5. Select and save the best model ---
# Pick the model with the highest test accuracy. max() returns the first
# maximal key, so on a tie the model defined earliest in `models` wins.
# NOTE: because the winner is chosen using the test split itself, the accuracy
# printed here is an optimistic estimate of its real-world performance.
best_model_name = max(results, key=results.get)
best_model_accuracy = results[best_model_name]
print("\n--- Pemilihan Model Terbaik ---")
print(f"Model terbaik adalah: {best_model_name} dengan akurasi {best_model_accuracy:.4f}")

# Estimator instance of the winning model (calling fit again refits it).
best_model_arch = models[best_model_name]

# Final pipeline for the winning model: preprocessing -> classifier.
final_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                                 ('classifier', best_model_arch)])

# Refit the winner on the ENTIRE dataset (train + test) before saving it.
final_pipeline.fit(X, y_encoded)
print("\nModel terbaik telah dilatih ulang menggunakan seluruh dataset.")

# Persist the fitted pipeline to a .pkl file.
model_filename = 'best_bmw_sales_model.pkl'
joblib.dump(final_pipeline, model_filename)
print(f"Model terbaik berhasil disimpan sebagai '{model_filename}'")

# The pipeline was trained on the integer codes produced by `le`, so its
# predictions are integers. Save the fitted encoder next to the model so
# consumers can map predictions back to the original labels with
# `le.inverse_transform(...)` instead of guessing the code -> label mapping.
label_encoder_filename = 'best_bmw_sales_label_encoder.pkl'
joblib.dump(le, label_encoder_filename)
print(f"Label encoder berhasil disimpan sebagai '{label_encoder_filename}'")

--- Hasil Evaluasi Model ---

Model: Logistic Regression
Akurasi: 0.9987
Laporan Klasifikasi:
              precision    recall  f1-score   support

        High       1.00      1.00      1.00      3049
         Low       1.00      1.00      1.00      6951

    accuracy                           1.00     10000
   macro avg       1.00      1.00      1.00     10000
weighted avg       1.00      1.00      1.00     10000

------------------------------

Model: K-Nearest Neighbors
Akurasi: 0.9249
Laporan Klasifikasi:
              precision    recall  f1-score   support

        High       0.89      0.87      0.88      3049
         Low       0.94      0.95      0.95      6951

    accuracy                           0.92     10000
   macro avg       0.91      0.91      0.91     10000
weighted avg       0.92      0.92      0.92     10000

------------------------------

Model: Decision Tree
Akurasi: 1.0000
Laporan Klasifikasi:
              precision    recall  f1-score   support

        Hig