In [1]:
import numpy as np
import pandas as pd

# Load the train/test feature matrices and their targets from CSV.
X_train, X_test = (
    pd.read_csv('../data_csv/X_train_final.csv'),
    pd.read_csv('../data_csv/X_test_final.csv'),
)
y_train, y_test = (
    pd.read_csv('../data_csv/y_train_final.csv'),
    pd.read_csv('../data_csv/y_test_final.csv'),
)

# Sanity check: show (rows, columns) of each feature matrix, one per line.
print(X_train.shape, X_test.shape, sep='\n')

(227986, 51)
(56997, 51)


In [2]:
# Candidate hyperparameter values for the random-forest search.
# Keys are RandomForestRegressor constructor arguments; each list holds the
# values the search is allowed to pick from.
param_grid_rf = dict(
    n_estimators=[100, 150, 200],
    max_depth=[10, 15, 20],
    max_features=['sqrt', 0.5],
    min_samples_leaf=[2, 4, 6],
)

In [3]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
# Base estimator for the search; the fixed random_state keeps runs repeatable.
# NOTE(review): both the forest and the search below request n_jobs=-1 —
# confirm this nesting does not oversubscribe the CPU on the target machine.
rf_base = RandomForestRegressor(random_state=42, n_jobs=-1)

# 2. Initialize the search tool: 10 candidates sampled from param_grid_rf,
# each scored with R² under 3-fold cross-validation.
random_search_rf = RandomizedSearchCV(
    estimator=rf_base,
    param_distributions=param_grid_rf,
    n_iter=10,
    cv=3,
    scoring='r2',
    n_jobs=-1,
    random_state=42,
    verbose=1
)

print("Start training: Loading....")
# y_train comes from pd.read_csv and is therefore a DataFrame. Passing a
# single-column frame makes scikit-learn emit a DataConversionWarning
# ("A column-vector y was passed when a 1d array was expected"), so collapse
# the column axis to a 1-D Series first. A frame with several target columns
# is left unchanged by squeeze("columns").
random_search_rf.fit(X_train, y_train.squeeze("columns"))
print("Done!")

Start training: Loading....
Fitting 3 folds for each of 10 candidates, totalling 30 fits


  return fit_method(estimator, *args, **kwargs)


Done!


In [4]:
# Show the hyperparameter combination that won the search.
best_rf_params = random_search_rf.best_params_
print(best_rf_params)

# Keep a handle on the estimator built with those hyperparameters.
best_rf_model = random_search_rf.best_estimator_

{'n_estimators': 150, 'min_samples_leaf': 4, 'max_features': 0.5, 'max_depth': 20}


In [5]:
# Evaluate the model
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Predict on the held-out test features.
# NOTE(review): the variable names and the expm1 calls below suggest the model
# was trained on a log1p-transformed target — confirm against the preprocessing.
y_pred_log = best_rf_model.predict(X_test)

# Undo the log transform (expm1 is the inverse of log1p) so the metrics are
# computed on the original target scale rather than the log scale.
y_test_original = np.expm1(y_test)
y_pred_original = np.expm1(y_pred_log)

# Compute the evaluation metrics on the original scale.
r2 = r2_score(y_test_original, y_pred_original)
mae = mean_absolute_error(y_test_original, y_pred_original)
rmse = np.sqrt(mean_squared_error(y_test_original, y_pred_original))

# Report. The header previously said "Ridge Regression", but the model being
# evaluated here is the tuned random forest (best_rf_model).
print("--- Result of Random Forest Regression ---")
print(f"R-squared (R²): {r2:.4f}")
print(f"MAE: ${mae:,.2f}")
print(f"RMSE: ${rmse:,.2f}")

--- Result of Ridge Regression ---
R-squared (R²): 0.8371
MAE: $2,851.01
RMSE: $5,880.05


### Feature Importance

In [6]:
# Pair every training column with the importance score the fitted forest
# reports for it.
feature_names = X_train.columns
importances_rf = best_rf_model.feature_importances_

# Build the table and sort by importance (highest first) in one chained
# expression.
rf_importance_df = pd.DataFrame(
    {'feature': feature_names, 'importance': importances_rf}
).sort_values(by='importance', ascending=False)

# Show the 15 highest-ranked features.
print(rf_importance_df.head(15))

                        feature  importance
49                    te__model    0.321001
0              num__year_to_now    0.175276
1             log_num__odometer    0.129142
50         te__state_of_listing    0.067712
3                ord__cylinders    0.064920
2                     ord__size    0.042667
48             te__manufacturer    0.033585
17      ohe__transmission_other    0.029014
18       ohe__type_of_drive_4wd    0.019137
19       ohe__type_of_drive_fwd    0.017376
42             ohe__color_other    0.016016
4              ohe__fuel_diesel    0.007450
15  ohe__transmission_automatic    0.006987
20     ohe__type_of_drive_other    0.006636
30      ohe__generic_type_sedan    0.005626


In [7]:
import joblib
import os

# Target directory and file for the serialized model.
MODEL_DIR = '../models'
MODEL_PATH = '../models/best_rf_model.pkl'

# Create the directory if it is missing, then persist the tuned forest.
# joblib.dump stays the last expression so the cell still displays the list of
# written file names.
os.makedirs(MODEL_DIR, exist_ok=True)
joblib.dump(best_rf_model, MODEL_PATH)

['../models/best_rf_model.pkl']