In [2]:
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.preprocessing import LabelEncoder
import joblib

# 1. Load the raw emissions table.
df = pd.read_csv('co2.csv')

# 2. Preprocessing: replace each categorical column with integer codes.
#    One fitted LabelEncoder is kept per column in `encoders`, keyed by the
#    column name.
categorical_cols = ['Make', 'Vehicle Class', 'Transmission', 'Fuel Type']
numeric_cols = ['Engine Size(L)', 'Cylinders', 'Fuel Consumption Comb (L/100 km)']

encoders = {name: LabelEncoder() for name in categorical_cols}
for name, encoder in encoders.items():
    df[name] = encoder.fit_transform(df[name])

# Model inputs: the encoded categorical columns first, then the numeric ones.
features = categorical_cols + numeric_cols
X = df[features]
y = df['CO2 Emissions(g/km)']

# 3. Hold out 20% of the rows as a test set; the fixed random_state keeps the
#    split the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# Hyperparameter tuning: exhaustive grid search with 3-fold cross-validation
# on the training split, scored by R^2.
print("Sedang tuning")

# Candidate settings to try; the full grid is 2 * 3 * 3 * 2 = 36 combinations.
param_grid = {
    'n_estimators': [100, 200],         # number of boosted trees
    'learning_rate': [0.01, 0.1, 0.2],  # how fast the model learns
    'max_depth': [3, 5, 7],             # depth of each tree
    'subsample': [0.8, 1.0]             # fraction of the training rows used
}

# Base model. The seed is passed as `random_state`, the documented
# scikit-learn-API parameter; a bare `seed=` is only forwarded through
# **kwargs, which XGBoost does not guarantee to interact properly with
# scikit-learn utilities.
xgb_model = xgb.XGBRegressor(objective='reg:squarederror', random_state=42)

# Use GridSearchCV to evaluate every combination in the grid.
# NOTE(review): n_jobs=-1 here on top of XGBoost's own multithreading may cause
# thread contention; consider n_jobs=1 on the estimator if tuning is slow.
grid_search = GridSearchCV(
    estimator=xgb_model,
    param_grid=param_grid,
    cv=3,
    n_jobs=-1,
    verbose=1,
    scoring='r2',
)

grid_search.fit(X_train, y_train)

# Take the best model found by the search.
best_model = grid_search.best_estimator_

print("Tuning Selesai!")
print(f"Settingan Terbaik: {grid_search.best_params_}")

# 4. Evaluate the best model on the held-out test split.
predictions = best_model.predict(X_test)

# Compute both metrics first, then report them. R^2 is printed multiplied by
# 100; MAE is printed in the target's units.
test_r2 = r2_score(y_test, predictions)
test_mae = mean_absolute_error(y_test, predictions)

print(f"\nAkurasi R2 Score (Tuned): {test_r2 * 100:.2f}%")
print(f"MAE: {test_mae:.2f} gram/km")

# 5. Save the best model together with the fitted label encoders.
for artifact, filename in (
    (best_model, 'co2_xgb_model.pkl'),
    (encoders, 'label_encoders.pkl'),
):
    joblib.dump(artifact, filename)

print("Model hasil tuning berhasil disimpan!")

Sedang melakukan tuning..
Fitting 3 folds for each of 36 candidates, totalling 108 fits
Tuning Selesai!
Settingan Terbaik: {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 200, 'subsample': 0.8}

Akurasi R2 Score (Tuned): 99.75%
MAE: 2.01 gram/km
Model hasil tuning berhasil disimpan!
