In [16]:
import numpy as np
import pandas as pd

# Load the train/test feature matrices and their target files.
# DATA_DIR is the single place to edit if the CSVs move.
DATA_DIR = '../data_csv'

X_train = pd.read_csv(f'{DATA_DIR}/X_train_final.csv')
X_test = pd.read_csv(f'{DATA_DIR}/X_test_final.csv')
y_train = pd.read_csv(f'{DATA_DIR}/y_train_final.csv')
y_test = pd.read_csv(f'{DATA_DIR}/y_test_final.csv')

# Fail fast if the splits are misaligned, instead of surfacing a confusing
# error (or silently wrong metrics) deep inside the model fit/evaluation.
assert len(X_train) == len(y_train), "X_train / y_train row counts differ"
assert len(X_test) == len(y_test), "X_test / y_test row counts differ"
assert list(X_train.columns) == list(X_test.columns), "train/test feature columns differ"

# Check the size
print(X_train.shape)
print(X_test.shape)

(227986, 51)
(56997, 51)


## Hyperparameter tuning with RandomizedSearchCV

In [17]:
from sklearn.model_selection import RandomizedSearchCV

# Candidate values for each tuned XGBoost hyperparameter; the randomized
# search draws combinations from these lists.
param_grid = dict(
    n_estimators=[100, 200, 300, 500, 700],
    learning_rate=[0.01, 0.05, 0.1, 0.2],
    max_depth=[3, 5, 7, 10],
    subsample=[0.7, 0.8, 0.9, 1.0],
)

In [18]:
from xgboost import XGBRegressor

# Base regressor: XGBoost uses every available core for each individual fit.
xgb_base = XGBRegressor(random_state=42, n_jobs=-1)

# Randomized search over the candidate values in `param_grid`:
# 20 sampled configurations x 3-fold CV = 60 fits, ranked by negated MAE.
# NOTE(review): the MAE is computed on the scale y_train is stored in; the
# evaluation cell applies expm1 to predictions, so this is presumably a
# log1p-scale error rather than a dollar error -- confirm this is intended.
random_search = RandomizedSearchCV(
    estimator=xgb_base,
    param_distributions=param_grid,
    n_iter=20,
    cv=3,
    scoring='neg_mean_absolute_error',
    # Run the CV fits one at a time. The estimator already parallelises each
    # fit across all cores (n_jobs=-1); parallelising the search as well makes
    # the two thread pools compete for the same CPUs and slows both down.
    n_jobs=1,
    random_state=42,
    verbose=1
)

# Fit (takes time)
print("Start trying")
random_search.fit(X_train, y_train)
print("Done!")

Start trying
Fitting 3 folds for each of 20 candidates, totalling 60 fits
Done!


In [19]:
# Keep a handle on the best estimator found by the search
best_xgb_model = random_search.best_estimator_

# Show the hyperparameter combination that scored best
print(random_search.best_params_)

{'subsample': 0.9, 'n_estimators': 300, 'max_depth': 10, 'learning_rate': 0.1}


In [20]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
# Predict on the test features. The outputs are passed through expm1 below,
# i.e. they are treated as log1p-scale values.
y_pred_log_best_xgb = best_xgb_model.predict(X_test)

# Map both predictions and ground truth back through expm1 before scoring
y_pred_original_best_xgb = np.expm1(y_pred_log_best_xgb)
y_test_original_best_xgb = np.expm1(y_test)

# Compute the new metrics
r2_best_xgb = r2_score(
    y_true=y_test_original_best_xgb, y_pred=y_pred_original_best_xgb
)
mae_best_xgb = mean_absolute_error(
    y_true=y_test_original_best_xgb, y_pred=y_pred_original_best_xgb
)
rmse_best_xgb = np.sqrt(
    mean_squared_error(
        y_true=y_test_original_best_xgb, y_pred=y_pred_original_best_xgb
    )
)

# Single print call; sep="\n" yields the same four-line report
print(
    "\n--- XGBoost Results ---",
    f"R-squared (R²): {r2_best_xgb:.4f}",
    f"MAE: ${mae_best_xgb:,.2f}",
    f"RMSE: ${rmse_best_xgb:,.2f}",
    sep="\n",
)


--- XGBoost Results ---
R-squared (R²): 0.8547
MAE: $2,890.03
RMSE: $5,552.48


In [21]:
import joblib
import os

# Ensure the output directory exists (no error if it is already there)
os.makedirs(name='../models', exist_ok=True)

# Persist the tuned model; as the cell's last expression, the list of
# written file paths returned by joblib.dump is displayed.
joblib.dump(value=best_xgb_model, filename='../models/best_xgb_model.pkl')

['../models/best_xgb_model.pkl']