In [1]:
import xgboost as xgb
import sys
import pandas as pd
import optuna
import mlflow
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from numpy import sqrt

print(sys.executable)

  from .autonotebook import tqdm as notebook_tqdm


g:\DEV_ENV\Real Estate Pricing\.venv\Scripts\python.exe


In [2]:
train_df = pd.read_csv('../data/processed/HouseTS_train_FE.csv')
validation_df = pd.read_csv('../data/processed/HouseTS_validation_FE.csv')

In [3]:
target = "price"
X_train = train_df.drop(columns=[target])
y_train = train_df[target]

X_validation = validation_df.drop(columns=[target])
y_validation = validation_df[target]

print(f"Training data shape: {X_train.shape}, Validation data shape: {X_validation.shape}")

Training data shape: (610193, 39), Validation data shape: (131846, 39)


### Defining Optuna with MLflow Callback

In [4]:
mlflow.set_tracking_uri("../mlruns")
mlflow.set_experiment("xgboost_real_estate_pricing")

  return FileStore(store_uri, store_uri)
2026/01/15 00:21:28 INFO mlflow.tracking.fluent: Experiment with name 'xgboost_real_estate_pricing' does not exist. Creating a new experiment.


<Experiment: artifact_location='file:///g:/DEV_ENV/Real Estate Pricing/notebooks/../mlruns/803831897141185574', creation_time=1768429288799, experiment_id='803831897141185574', last_update_time=1768429288799, lifecycle_stage='active', name='xgboost_real_estate_pricing', tags={}>

In [5]:
def optuna_objective(trial):
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 200, 1000),
        "max_depth": trial.suggest_int("max_depth", 3, 10),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 10),
        "gamma": trial.suggest_float("gamma", 0.0, 5.0),
        "reg_alpha": trial.suggest_float("reg_alpha", 1e-8, 10.0, log=True),
        "reg_lambda": trial.suggest_float("reg_lambda", 1e-8, 10.0, log=True),
        "random_state": 42,
        "n_jobs": -1,
        "tree_method": "hist",
    }

    with mlflow.start_run():
        xgb_model = xgb.XGBRegressor(**params)
        xgb_model.fit(X_train, y_train)
        
        y_pred = xgb_model.predict(X_validation)
        mae = mean_absolute_error(y_validation, y_pred)
        rmse = sqrt(mean_squared_error(y_validation, y_pred))
        r2 = r2_score(y_validation, y_pred)

        mlflow.log_params(params)
        mlflow.log_metrics({"rmse": rmse, "mae": mae, "r2": r2})
    return rmse


In [6]:
study = optuna.create_study(direction="minimize")
study.optimize(optuna_objective, n_trials=15)

print("Best params:", study.best_trial.params)

[I 2026-01-15 00:21:28,844] A new study created in memory with name: no-name-1868616c-68b0-46be-871e-65b17dfe8b1b
[I 2026-01-15 00:21:52,431] Trial 0 finished with value: 79348.61155282879 and parameters: {'n_estimators': 838, 'max_depth': 6, 'learning_rate': 0.014544350930263412, 'subsample': 0.7918263144898543, 'colsample_bytree': 0.9221274407232388, 'min_child_weight': 10, 'gamma': 1.1985471196513071, 'reg_alpha': 1.1998034626899048e-07, 'reg_lambda': 5.5037593551630434e-05}. Best is trial 0 with value: 79348.61155282879.
[I 2026-01-15 00:21:57,826] Trial 1 finished with value: 89563.69906548833 and parameters: {'n_estimators': 244, 'max_depth': 3, 'learning_rate': 0.04780141171194553, 'subsample': 0.6795534266083456, 'colsample_bytree': 0.9726103049249191, 'min_child_weight': 8, 'gamma': 1.4775270686478426, 'reg_alpha': 8.467877255902732, 'reg_lambda': 0.0020250028869350204}. Best is trial 0 with value: 79348.61155282879.
[I 2026-01-15 00:22:06,827] Trial 2 finished with value: 731

Best params: {'n_estimators': 490, 'max_depth': 6, 'learning_rate': 0.0948538916494201, 'subsample': 0.697135179850908, 'colsample_bytree': 0.5044451046278728, 'min_child_weight': 1, 'gamma': 3.4068111632208415, 'reg_alpha': 3.858451831155388e-06, 'reg_lambda': 3.6955724315668136e-06}


In [11]:
# training final model with best hyperparameters
best_params = study.best_trial.params
xgb_best_model = xgb.XGBRegressor(**best_params)
xgb_best_model.fit(X_train, y_train)

y_pred = xgb_best_model.predict(X_validation)
mae = mean_absolute_error(y_validation, y_pred)
rmse = sqrt(mean_squared_error(y_validation, y_pred))
r2 = r2_score(y_validation, y_pred)
print(f"Final Model Performance on Validation Set: RMSE={rmse}, MAE={mae}, R2={r2}")

# log final model
with mlflow.start_run(run_name="final_model"):
    mlflow.log_params(best_params)
    mlflow.log_metrics({"rmse": rmse, "mae": mae, "r2": r2})
    mlflow.sklearn.log_model(
    xgb_best_model,
    artifact_path="xgboost_model"
)



Final Model Performance on Validation Set: RMSE=71676.08143417657, MAE=39433.34650741196, R2=0.9630621806832369




run `mlflow ui` in terminal to visualize results

In [13]:
# Save the trained model
import joblib
joblib.dump(xgb_best_model, '../models/xgb_best_model.pkl')

['../models/xgb_best_model.pkl']