In [4]:
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.preprocessing import LabelEncoder
import joblib # Buat simpan model

# 1. Load the dataset.
df = pd.read_csv('co2.csv')

# 2. Preprocessing: turn the text columns into integer codes.
# 'Model' is deliberately not encoded because it is too specific (too many
# distinct values); 'Make', 'Vehicle Class', etc. are kept.
categorical_cols = ['Make', 'Vehicle Class', 'Transmission', 'Fuel Type']

# Fit one encoder per text column and keep them all, so the MVP app can
# later translate user input with the same mapping.
encoders = {col: LabelEncoder().fit(df[col]) for col in categorical_cols}

# Overwrite each text column in place with its encoded values.
for col, le in encoders.items():
    df[col] = le.transform(df[col])

# Choose the features (X) and the target (y).
# Only the combined fuel-consumption figure (Comb) is used, as it is the
# most relevant one.
numeric_cols = ['Engine Size(L)', 'Cylinders', 'Fuel Consumption Comb (L/100 km)']
features = categorical_cols + numeric_cols
y = df['CO2 Emissions(g/km)']
X = df[features]

# 3. Split the data (80% training, 20% test) with a fixed seed so the
# split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# 4. Train an XGBoost regressor on the training split.
# `random_state` is the documented seed parameter of XGBoost's scikit-learn
# wrapper; the previous `seed=` keyword is not part of that interface.
model = xgb.XGBRegressor(
    objective='reg:squarederror',
    n_estimators=100,
    random_state=42,
)
model.fit(X_train, y_train)

# 5. Check quality on the held-out test split (the "excellent" requirement
# is a score above 85%).
predictions = model.predict(X_test)
# R2 is reported multiplied by 100, i.e. as a percentage.
print(f"R2: {r2_score(y_test, predictions) * 100:.2f}%")
# MAE is reported in the unit of the target column.
print(f"MAE: {mean_absolute_error(y_test, predictions):.2f} gram/km")

# 6. Persist the model and the encoders for the MVP app.
# The mapping keeps insertion order, so the model is written first and the
# encoders second.
artifacts = {
    'co2_xgb_model.pkl': model,
    'label_encoders.pkl': encoders,
}
for filename, artifact in artifacts.items():
    joblib.dump(artifact, filename)
print("Model berhasil disimpan! Siap dipakai buat bikin Web App.")

R2: 99.75%
MAE: 1.92 gram/km
Model berhasil disimpan! Siap dipakai buat bikin Web App.
