2025/05/27 00:08:00 INFO mlflow.system_metrics.system_metrics_monitor: Started monitoring system metrics.
[I 2025-05-27 00:08:00,906] A new study created in memory with name: no-name-0a35b904-3909-44bc-80b6-52c4efdbc1ee
[I 2025-05-27 00:26:17,236] Trial 0 finished with value: 1.343638745170265 and parameters: {'depth': 6, 'learning_rate': 0.22648248189516848, 'l2_leaf_reg': 0.8471801418819978, 'subsample': 0.7993292420985183, 'min_data_in_leaf': 16}. Best is trial 0 with value: 1.343638745170265.
[I 2025-05-27 04:40:26,420] Trial 1 finished with value: 2.3688507459365455 and parameters: {'depth': 5, 'learning_rate': 0.0013927723945289009, 'l2_leaf_reg': 2.9154431891537547, 'subsample': 0.8005575058716043, 'min_data_in_leaf': 71}. Best is trial 0 with value: 1.343638745170265.
[I 2025-05-27 04:55:11,683] Trial 2 finished with value: 1.4367631721281549 and parameters: {'depth': 4, 'learning_rate': 0.2526878207508456, 'l2_leaf_reg': 2.1368329072358767, 'subsample': 0.6061695553391381, 'mi

In [None]:
import gc, random, polars as pl, numpy as np, pandas as pd, optuna, mlflow, mlflow.catboost
from pathlib import Path
from catboost import CatBoostRegressor
from catboost.utils import get_gpu_device_count
from mlflow.models import infer_signature
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_squared_error
from optuna.integration.mlflow import MLflowCallback
from optuna.pruners import HyperbandPruner
import warnings, optuna.exceptions as optuna_w

# Silence Optuna's ExperimentalWarning so it does not flood the cell output
# (presumably emitted by the Optuna/MLflow integration used further down).
warnings.filterwarnings("ignore", category=optuna_w.ExperimentalWarning)

# Fixed seed for Python's and NumPy's global RNGs; also reused below for the
# Optuna sampler and CatBoost.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

DATA_DIR = Path("data_sampled")     # directory scanned for *.parquet inputs
TARGET = "tip_amount"               # regression target column
EXPERIMENT = "CatBoost_Optuna"      # MLflow experiment name
TRIALS = 10                         # maximum number of Optuna trials
TIMEOUT_MIN = 90                    # optimisation time budget, in minutes
# CV folds, CatBoost iteration cap, early-stopping patience (in rounds).
SPLITS, MAX_ITERS, EARLY_STOP = 3, 7500, 30

# Best-effort GPU detection: any failure of the probe falls back to CPU.
# `except Exception` (rather than a bare `except`) keeps the fallback while
# letting KeyboardInterrupt / SystemExit propagate.
try:
    gpu_cnt = get_gpu_device_count()
except Exception:
    gpu_cnt = 0
TASK_TYPE, DEVICES = ("GPU", "0") if gpu_cnt else ("CPU", None)

def prep(f: Path) -> pl.DataFrame:
    """Load one parquet file and normalise it to the shared modelling schema.

    Steps, in order:
      1. cast any recognised pickup/drop-off timestamp columns to
         nanosecond datetimes;
      2. drop rows whose ``TARGET`` value is negative;
      3. derive ``trip_duration_min`` and the pickup month/day/hour/weekday
         from the first available pickup/drop-off pair, then drop that pair;
         when no pair is available, add zero placeholders instead;
      4. add zero-filled Float32 fee columns that are absent;
      5. coerce the integer-coded categoricals to Int32, using -1 for nulls
         and for columns that are absent altogether;
      6. make ``store_and_fwd_flag`` a Categorical with "missing" for nulls.

    The resulting shape is printed as a side effect.

    Args:
        f: Path of the parquet file to read.

    Returns:
        The prepared polars DataFrame.
    """
    pickup_names = ("tpep_pickup_datetime", "pickup_datetime")
    dropoff_names = ("tpep_dropoff_datetime", "dropoff_datetime")

    frame = pl.read_parquet(f, low_memory=True)

    # Normalise whichever timestamp columns this file carries.
    stamp_cols = [c for c in pickup_names + dropoff_names if c in frame.columns]
    if stamp_cols:
        frame = frame.with_columns(
            [pl.col(c).cast(pl.Datetime("ns")) for c in stamp_cols]
        )

    frame = frame.filter(pl.col(TARGET) >= 0)

    # First matching name wins for each side of the trip.
    present_pickups = [c for c in pickup_names if c in frame.columns]
    present_dropoffs = [c for c in dropoff_names if c in frame.columns]
    pick = present_pickups[0] if present_pickups else None
    drop = present_dropoffs[0] if present_dropoffs else None

    if pick is not None and drop is not None:
        nanos_per_minute = 60000000000
        elapsed_ns = pl.col(drop).cast(pl.Int64) - pl.col(pick).cast(pl.Int64)
        duration = (
            (elapsed_ns / nanos_per_minute)
            .cast(pl.Float32)
            .alias("trip_duration_min")
        )
        pickup_ts = pl.col(pick)
        calendar_parts = [
            pickup_ts.dt.month().cast(pl.Int8).alias("pickup_month"),
            pickup_ts.dt.day().cast(pl.Int8).alias("pickup_day"),
            pickup_ts.dt.hour().cast(pl.Int8).alias("pickup_hour"),
            pickup_ts.dt.weekday().cast(pl.Int8).alias("pickup_dow"),
        ]
        frame = (
            frame.with_columns(duration)
            .with_columns(calendar_parts)
            .drop([pick, drop])
        )
    else:
        # No usable timestamp pair: keep the schema stable with zero columns.
        frame = frame.with_columns([
            pl.lit(0).cast(pl.Float32).alias("trip_duration_min"),
            pl.lit(0).cast(pl.Int8).alias("pickup_month"),
            pl.lit(0).cast(pl.Int8).alias("pickup_day"),
            pl.lit(0).cast(pl.Int8).alias("pickup_hour"),
            pl.lit(0).cast(pl.Int8).alias("pickup_dow"),
        ])

    # Fee columns that this file lacks are added as Float32 zeros.
    for fee in ("cbd_congestion_fee", "airport_fee", "congestion_surcharge"):
        if fee not in frame.columns:
            frame = frame.with_columns(pl.lit(0).cast(pl.Float32).alias(fee))

    # Integer-coded categoricals: null (or entirely absent) becomes -1.
    int_coded = (
        "VendorID", "RatecodeID", "PULocationID", "DOLocationID", "payment_type",
        "pickup_month", "pickup_day", "pickup_hour", "pickup_dow",
    )
    for name in int_coded:
        if name in frame.columns:
            source = pl.col(name).fill_null(-1)
        else:
            source = pl.lit(-1)
        frame = frame.with_columns(source.cast(pl.Int32).alias(name))

    flag = "store_and_fwd_flag"
    if flag not in frame.columns:
        frame = frame.with_columns(pl.lit("missing").cast(pl.Utf8).alias(flag))
    frame = frame.with_columns(
        pl.col(flag).fill_null("missing").cast(pl.Categorical)
    )

    print(frame.shape)
    return frame

# Prepare every parquet file under DATA_DIR (in sorted file-name order), stack
# the results, and hand the data to pandas. The polars frame is released
# straight away to keep peak memory down.
ddf = pl.concat([prep(f) for f in sorted(DATA_DIR.glob("*.parquet"))])
pdf = ddf.to_pandas(use_pyarrow_extension_array=True)
del ddf
gc.collect()

# Split target from features.
y = pdf[TARGET]
X = pdf.drop(columns=[TARGET])

# Columns handed to CatBoost as categorical features: converted to strings,
# with "missing" standing in for nulls.
cat_cols = [
    "VendorID", "RatecodeID", "PULocationID", "DOLocationID", "payment_type",
    "pickup_month", "pickup_day", "pickup_hour", "pickup_dow",
    "store_and_fwd_flag",
]
for c in cat_cols:
    X[c] = X[c].astype("string").fillna("missing")

# Every remaining column is treated as numeric: nulls -> 0, stored as float32.
num_cols = X.columns.difference(cat_cols)
X[num_cols] = X[num_cols].fillna(0).astype("float32")

# NOTE(review): TimeSeriesSplit splits by row position, so the folds are only
# chronological if the concatenated rows are in time order — confirm.
tscv = TimeSeriesSplit(n_splits=SPLITS)
mlflow.set_experiment(EXPERIMENT)

# Close any run left open by a previous execution before starting the parent
# run that the per-trial nested runs attach to.
if mlflow.active_run():
    mlflow.end_run()
root = mlflow.start_run(run_name="optuna_catboost", log_system_metrics=True)

mlcb = MLflowCallback(
    metric_name="val_rmse",
    create_experiment=False,
    mlflow_kwargs={"nested": True},
)
pruner = HyperbandPruner()

@mlcb.track_in_mlflow()
def objective(trial):
    """Optuna objective: mean validation RMSE of CatBoost across the CV folds.

    For each split produced by ``tscv`` a fresh ``CatBoostRegressor`` is
    fitted on the training rows with the validation rows as ``eval_set``
    (enabling early stopping), and the validation RMSE is recorded.

    Side effects:
        * names the current MLflow run ``trial_<number>``;
        * stores the mean best iteration (truncated to int) in the trial's
          ``best_iterations`` user attribute;
        * logs ``rmse_cv`` and ``best_iterations`` to MLflow.

    Args:
        trial: The Optuna trial used to sample hyper-parameters.

    Returns:
        Mean RMSE over the folds (lower is better).
    """
    mlflow.set_tag("mlflow.runName", f"trial_{trial.number}")
    params = {"depth": trial.suggest_int("depth",4,10),
              "learning_rate": trial.suggest_float("learning_rate",1e-3,0.3,log=True),
              "l2_leaf_reg": trial.suggest_float("l2_leaf_reg",1e-3,10,log=True),
              "subsample": trial.suggest_float("subsample",0.5,1.0),
              "min_data_in_leaf": trial.suggest_int("min_data_in_leaf",1,100),
              "bootstrap_type":"Bernoulli",
              "iterations":MAX_ITERS,
              "early_stopping_rounds":EARLY_STOP,
              "eval_metric":"RMSE",
              "random_seed":SEED,
              "task_type":TASK_TYPE,
              "devices":DEVICES,
              "verbose":0,
              "cat_features":cat_cols}
    fold_rmse, fold_best_iter = [], []
    for tr, vl in tscv.split(X):
        m = CatBoostRegressor(**params)
        m.fit(X.iloc[tr], y.iloc[tr], eval_set=(X.iloc[vl], y.iloc[vl]), verbose=False)
        # mean_squared_error returns the MSE; take the square root so the
        # value really is the RMSE that the metric names advertise. np.sqrt
        # is used instead of a `squared=`/root_* helper so this works across
        # scikit-learn versions.
        fold_mse = mean_squared_error(y.iloc[vl], m.predict(X.iloc[vl]))
        fold_rmse.append(float(np.sqrt(fold_mse)))
        fold_best_iter.append(m.get_best_iteration())
        # Free the fitted model before the next fold to limit memory use.
        del m
        gc.collect()
    cv = float(np.mean(fold_rmse))
    best_iterations = int(np.mean(fold_best_iter))
    trial.set_user_attr("best_iterations", best_iterations)
    mlflow.log_metric("rmse_cv", cv)
    mlflow.log_metric("best_iterations", best_iterations)
    return cv

study = optuna.create_study(study_name="CatBoostOptunaStudy",
                            direction="minimize",
                            sampler=optuna.samplers.TPESampler(seed=SEED),
                            pruner=pruner)

# Everything from here to the end of the MLflow run is guarded so that a
# failure (including a manual interrupt) closes the active run, marked as
# FAILED, instead of leaving it and its system-metrics monitor open.
try:
    study.optimize(objective,
                   n_trials=TRIALS,
                   timeout=TIMEOUT_MIN*60,
                   callbacks=[mlcb],
                   show_progress_bar=True)

    best_params = study.best_trial.params
    # The stored value is the mean of CatBoost's zero-based best-iteration
    # indices; the number of trees to train is that index + 1. This also
    # keeps `iterations` from ever being 0.
    final_iter  = study.best_trial.user_attrs["best_iterations"] + 1
    final = {**best_params,
             "iterations": final_iter,
             "random_seed": SEED,
             "task_type": TASK_TYPE,
             "devices": DEVICES,
             "verbose": 0,
             "bootstrap_type":"Bernoulli",
             "cat_features": cat_cols}

    # Refit on the full data set with the best hyper-parameters, then log the
    # model (with signature and input example) to the active MLflow run.
    model = CatBoostRegressor(**final).fit(X, y, verbose=False)
    signature = infer_signature(X.head(100), model.predict(X.head(100)))
    mlflow.catboost.log_model(model, "model", signature=signature, input_example=X.head(5))
    if Path("catboost_info").exists(): mlflow.log_artifacts("catboost_info", artifact_path="catboost_info")
    mlflow.log_metric("best_rmse_cv", study.best_value)
except BaseException:
    mlflow.end_run(status="FAILED")
    raise
else:
    mlflow.end_run()

print(f"Best CV RMSE: {study.best_value:.6f}")

(592901, 24)
(601480, 24)
(716508, 24)
(702827, 24)
(744735, 24)
(707810, 24)
(615358, 24)
(595823, 24)
(726593, 24)
(766731, 24)
(729250, 24)
(733650, 24)


2025/05/27 12:57:44 INFO mlflow.system_metrics.system_metrics_monitor: Stopping system metrics monitoring...
2025/05/27 12:57:44 INFO mlflow.system_metrics.system_metrics_monitor: Successfully terminated system metrics monitoring!
2025/05/27 12:57:44 INFO mlflow.system_metrics.system_metrics_monitor: Started monitoring system metrics.
[I 2025-05-27 12:57:44,947] A new study created in memory with name: CatBoostOptunaStudy


  0%|          | 0/10 [00:00<?, ?it/s]

In [None]:
# plt.figure(figsize=(6, 6))
# sns.scatterplot(x=y_test, y=preds, alpha=0.3)
# plt.xlabel("Actual")
# plt.ylabel("Predicted")
# plt.tight_layout()
# plt.savefig("scatter_actual_vs_pred.png", dpi=300)

# residuals = y_test - preds
# plt.figure(figsize=(6, 4))
# sns.histplot(residuals, bins=100, kde=True)
# plt.xlabel("Residuals")
# plt.tight_layout()
# plt.savefig("residual_hist.png", dpi=300)

# importances = model.get_feature_importance(type="PredictionValuesChange")
# imp_df = pd.DataFrame({"feature": features, "importance": importances}).sort_values("importance", ascending=False)
# plt.figure(figsize=(8, 6))
# sns.barplot(x="importance", y="feature", data=imp_df.head(20))
# plt.tight_layout()
# plt.savefig("feature_importance.png", dpi=300)

# model.save_model("tip_model_catboost.cbm")
# imp_df.to_csv("feature_importance.csv", index=False)