In [6]:
import gc, random, psutil, pynvml, polars as pl, numpy as np, pandas as pd, optuna, mlflow, mlflow.catboost
from pathlib import Path
from catboost import CatBoostRegressor
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_squared_error
from optuna.integration.mlflow import MLflowCallback

SEED = 42
random.seed(SEED)
np.random.seed(SEED)

DATA_DIR = Path("data")
TARGET   = "tip_amount"
EXPERIMENT = "CatBoost_TimeSeries_Optuna"
TRIALS, SPLITS, MAX_ITERS, EARLY_STOP = 50, 5, 10_000, 100

def log_sys(prefix=""):
    mlflow.log_metric(f"{prefix}cpu_pct", psutil.cpu_percent())
    mlflow.log_metric(f"{prefix}mem_pct", psutil.virtual_memory().percent)
    try:
        pynvml.nvmlInit()
        h = pynvml.nvmlDeviceGetHandleByIndex(0)
        u = pynvml.nvmlDeviceGetUtilizationRates(h)
        m = pynvml.nvmlDeviceGetMemoryInfo(h)
        mlflow.log_metric(f"{prefix}gpu_util_pct", u.gpu)
        mlflow.log_metric(f"{prefix}gpu_mem_used_mb", m.used / 2**20)
        pynvml.nvmlShutdown()
    except Exception:
        pass

def gpu_info():
    try:
        pynvml.nvmlInit()
        h  = pynvml.nvmlDeviceGetHandleByIndex(0)
        nm = pynvml.nvmlDeviceGetName(h)
        name = nm.decode() if isinstance(nm, (bytes, bytearray)) else str(nm)
        mem = round(pynvml.nvmlDeviceGetMemoryInfo(h).total / 2**30, 2)
        pynvml.nvmlShutdown()
        return {"gpu_name": name, "gpu_mem_total_gb": mem}
    except Exception:
        return {"gpu_name": "NA", "gpu_mem_total_gb": 0}

def prep(f):
    df = pl.read_parquet(f, low_memory=True)
    for c in ("tpep_pickup_datetime", "pickup_datetime", "tpep_dropoff_datetime", "dropoff_datetime"):
        if c in df.columns:
            df = df.with_columns(pl.col(c).cast(pl.Datetime("ns")))
    df = df.filter(pl.col(TARGET) >= 0)
    ns = 60_000_000_000
    pick = next((c for c in ("tpep_pickup_datetime", "pickup_datetime") if c in df.columns), None)
    drop = next((c for c in ("tpep_dropoff_datetime", "dropoff_datetime") if c in df.columns), None)
    if pick and drop:
        df = (
            df.with_columns(((pl.col(drop).cast(pl.Int64) - pl.col(pick).cast(pl.Int64)) / ns).cast(pl.Float32).alias("trip_duration_min"))
            .with_columns([
                pl.col(pick).dt.month().cast(pl.Int8).alias("pickup_month"),
                pl.col(pick).dt.day().cast(pl.Int8).alias("pickup_day"),
                pl.col(pick).dt.hour().cast(pl.Int8).alias("pickup_hour"),
                pl.col(pick).dt.weekday().cast(pl.Int8).alias("pickup_dow"),
            ])
            .drop([pick, drop])
        )
    else:
        df = df.with_columns([
            pl.lit(0).cast(pl.Float32).alias("trip_duration_min"),
            pl.lit(0).cast(pl.Int8).alias("pickup_month"),
            pl.lit(0).cast(pl.Int8).alias("pickup_day"),
            pl.lit(0).cast(pl.Int8).alias("pickup_hour"),
            pl.lit(0).cast(pl.Int8).alias("pickup_dow"),
        ])
    for c, t in {"cbd_congestion_fee": pl.Float32, "airport_fee": pl.Float32, "congestion_surcharge": pl.Float32}.items():
        if c not in df.columns:
            df = df.with_columns(pl.lit(0).cast(t).alias(c))
    int_cats = ["VendorID","RatecodeID","PULocationID","DOLocationID","payment_type",
                "pickup_month","pickup_day","pickup_hour","pickup_dow"]
    str_cats = ["store_and_fwd_flag"]
    for c in int_cats:
        if c not in df.columns:
            df = df.with_columns(pl.lit(-1).cast(pl.Int32).alias(c))
        else:
            df = df.with_columns(pl.col(c).cast(pl.Int32).fill_null(-1))
    for c in str_cats:
        if c not in df.columns:
            df = df.with_columns(pl.lit("missing").cast(pl.Utf8).alias(c))
        df = df.with_columns(pl.col(c).fill_null("missing").cast(pl.Categorical))
    return df

ddf = pl.concat([prep(f) for f in sorted(DATA_DIR.glob("*.parquet"))])
pdf = ddf.to_pandas(use_pyarrow_extension_array=True); del ddf; gc.collect()

y = pdf[TARGET]
X = pdf.drop(columns=[TARGET])

cat_cols = ["VendorID","RatecodeID","PULocationID","DOLocationID","payment_type",
            "pickup_month","pickup_day","pickup_hour","pickup_dow","store_and_fwd_flag"]

for c in cat_cols:
    X[c] = X[c].astype("string").fillna("missing")

num_cols = X.columns.difference(cat_cols)
X[num_cols] = X[num_cols].fillna(0).astype("float32")

tscv = TimeSeriesSplit(n_splits=SPLITS)
mlflow.set_experiment(EXPERIMENT)

system = {"cpu_cores": psutil.cpu_count(logical=True),
          "mem_total_gb": round(psutil.virtual_memory().total / 2**30, 2)}
system.update(gpu_info())

mlcb = MLflowCallback(metric_name="val_rmse", create_experiment=False, mlflow_kwargs={"nested": True})

@mlcb.track_in_mlflow()
def objective(trial):
    p = {
        "depth": trial.suggest_int("depth", 4, 10),
        "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.3, log=True),
        "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1e-3, 10, log=True),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 1, 100),
        "bootstrap_type":"Bernoulli",
        "iterations": MAX_ITERS,
        "early_stopping_rounds": EARLY_STOP,
        "eval_metric": "RMSE",
        "random_seed": SEED,
        "task_type": "GPU",
        "verbose": 0,
        "cat_features": cat_cols,
    }
    scores, iters = [], []
    for tr, vl in tscv.split(X):
        m = CatBoostRegressor(**p)
        m.fit(X.iloc[tr], y.iloc[tr], eval_set=(X.iloc[vl], y.iloc[vl]), verbose=False)
        scores.append(mean_squared_error(y.iloc[vl], m.predict(X.iloc[vl])))
        iters.append(m.get_best_iteration())
        del m; gc.collect()
    rmse = float(np.mean(scores))
    trial.set_user_attr("best_iterations", int(np.mean(iters)))
    mlflow.log_metric("rmse_cv", rmse)
    mlflow.log_metric("best_iterations", trial.user_attrs["best_iterations"])
    log_sys()
    return rmse

with mlflow.start_run(run_name="optuna_catboost"):
    mlflow.log_params(system)
    study = optuna.create_study(direction="minimize", sampler=optuna.samplers.TPESampler(seed=SEED))
    study.optimize(objective, n_trials=TRIALS, callbacks=[mlcb])
    best_p    = study.best_trial.params
    final_it  = study.best_trial.user_attrs["best_iterations"]
    final_p   = {**best_p, "iterations": final_it, "random_seed": SEED,
                 "task_type": "GPU", "verbose": 0, "cat_features": cat_cols}
    model = CatBoostRegressor(**final_p)
    model.fit(X, y, verbose=False)
    mlflow.log_params(final_p)
    mlflow.catboost.log_model(model, "model")
    log_sys("final_")

print(f"Best CV RMSE: {study.best_value:.6f}")

  mlcb = MLflowCallback(metric_name="val_rmse", create_experiment=False, mlflow_kwargs={"nested": True})
  @mlcb.track_in_mlflow()
[I 2025-05-26 16:21:33,186] A new study created in memory with name: no-name-d90df6a9-ece4-4d96-91d7-49acb2a462e6
[W 2025-05-26 16:23:00,918] Trial 0 failed with parameters: {'depth': 6, 'learning_rate': 0.22648248189516848, 'l2_leaf_reg': 0.8471801418819978, 'subsample': 0.7993292420985183, 'min_data_in_leaf': 16} because of the following error: CatBoostError('catboost/cuda/cuda_lib/cuda_base.h:281: CUDA error 100: no CUDA-capable device is detected').
Traceback (most recent call last):
  File "/home/vatereal/.cache/pypoetry/virtualenvs/pad-final-project-uFfiJNJD-py3.12/lib/python3.12/site-packages/optuna/study/_optimize.py", line 197, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^
  File "/home/vatereal/.cache/pypoetry/virtualenvs/pad-final-project-uFfiJNJD-py3.12/lib/python3.12/site-packages/optuna_integration/mlflow/ml

CatBoostError: catboost/cuda/cuda_lib/cuda_base.h:281: CUDA error 100: no CUDA-capable device is detected

In [4]:
import gc, random, psutil, pynvml, polars as pl, numpy as np, pandas as pd, optuna, mlflow, mlflow.catboost
from pathlib import Path
from catboost import CatBoostRegressor
from catboost.utils import get_gpu_device_count
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_squared_error
from optuna.integration.mlflow import MLflowCallback

SEED = 42
random.seed(SEED); np.random.seed(SEED)

DATA_DIR = Path("data_sampled")
TARGET = "tip_amount"
EXPERIMENT = "CatBoost_TimeSeries_Optuna"
TRIALS, SPLITS, MAX_ITERS, EARLY_STOP = 1, 5, 10000, 100

try: gpu_count = get_gpu_device_count()
except: gpu_count = 0
TASK_TYPE, DEVICES = ("GPU", "0") if gpu_count else ("CPU", None)

def log_sys(p=""):
    mlflow.log_metric(f"{p}cpu_pct", psutil.cpu_percent())
    mlflow.log_metric(f"{p}mem_pct", psutil.virtual_memory().percent)
    try:
        pynvml.nvmlInit(); h = pynvml.nvmlDeviceGetHandleByIndex(0)
        u = pynvml.nvmlDeviceGetUtilizationRates(h); m = pynvml.nvmlDeviceGetMemoryInfo(h)
        mlflow.log_metric(f"{p}gpu_util_pct", u.gpu)
        mlflow.log_metric(f"{p}gpu_mem_used_mb", m.used/2**20); pynvml.nvmlShutdown()
    except: pass

def prep(f: Path) -> pl.DataFrame:
    df = pl.read_parquet(f, low_memory=True)
    for c in ("tpep_pickup_datetime","pickup_datetime","tpep_dropoff_datetime","dropoff_datetime"):
        if c in df.columns: df = df.with_columns(pl.col(c).cast(pl.Datetime("ns")))
    df = df.filter(pl.col(TARGET) >= 0)
    ns = 60000000000
    pick = next((c for c in ("tpep_pickup_datetime","pickup_datetime") if c in df.columns), None)
    drop = next((c for c in ("tpep_dropoff_datetime","dropoff_datetime") if c in df.columns), None)
    if pick and drop:
        df = (df.with_columns(((pl.col(drop).cast(pl.Int64)-pl.col(pick).cast(pl.Int64))/ns)
                              .cast(pl.Float32).alias("trip_duration_min"))
                .with_columns([
                    pl.col(pick).dt.month().cast(pl.Int8).alias("pickup_month"),
                    pl.col(pick).dt.day().cast(pl.Int8).alias("pickup_day"),
                    pl.col(pick).dt.hour().cast(pl.Int8).alias("pickup_hour"),
                    pl.col(pick).dt.weekday().cast(pl.Int8).alias("pickup_dow")])
                .drop([pick, drop]))
    else:
        df = df.with_columns([
            pl.lit(0).cast(pl.Float32).alias("trip_duration_min"),
            pl.lit(0).cast(pl.Int8).alias("pickup_month"),
            pl.lit(0).cast(pl.Int8).alias("pickup_day"),
            pl.lit(0).cast(pl.Int8).alias("pickup_hour"),
            pl.lit(0).cast(pl.Int8).alias("pickup_dow")])
    for c,t in {"cbd_congestion_fee":pl.Float32,"airport_fee":pl.Float32,"congestion_surcharge":pl.Float32}.items():
        if c not in df.columns: df = df.with_columns(pl.lit(0).cast(t).alias(c))
    int_cats = ["VendorID","RatecodeID","PULocationID","DOLocationID","payment_type",
                "pickup_month","pickup_day","pickup_hour","pickup_dow"]
    for c in int_cats:
        df = df.with_columns((pl.col(c).fill_null(-1) if c in df.columns else pl.lit(-1)).cast(pl.Int32).alias(c))
    if "store_and_fwd_flag" not in df.columns:
        df = df.with_columns(pl.lit("missing").cast(pl.Utf8).alias("store_and_fwd_flag"))
    df = df.with_columns(pl.col("store_and_fwd_flag").fill_null("missing").cast(pl.Categorical))
    return df

ddf = pl.concat([prep(f) for f in sorted(DATA_DIR.glob("*.parquet"))])
pdf = ddf.to_pandas(use_pyarrow_extension_array=True); del ddf; gc.collect()

y = pdf[TARGET]; X = pdf.drop(columns=[TARGET])
cat_cols = ["VendorID","RatecodeID","PULocationID","DOLocationID","payment_type",
            "pickup_month","pickup_day","pickup_hour","pickup_dow","store_and_fwd_flag"]
for c in cat_cols: X[c] = X[c].astype("string").fillna("missing")
num_cols = X.columns.difference(cat_cols); X[num_cols] = X[num_cols].fillna(0).astype("float32")

tscv = TimeSeriesSplit(n_splits=SPLITS)
mlflow.set_experiment(EXPERIMENT)

if mlflow.active_run(): mlflow.end_run()
with mlflow.start_run(run_name="optuna_catboost", log_system_metrics=True):
    mlflow.log_params({"cpu_cores": psutil.cpu_count(logical=True),
                       "mem_total_gb": round(psutil.virtual_memory().total/2**30,2),
                       "task_type": TASK_TYPE, "gpu_count": gpu_count})
    log_sys("startup_")

    mlcb = MLflowCallback(metric_name="val_rmse", create_experiment=False, mlflow_kwargs={"nested": True})
    @mlcb.track_in_mlflow()
    def objective(t):
        p = {"depth": t.suggest_int("depth",4,10),
             "learning_rate": t.suggest_float("learning_rate",1e-3,0.3,log=True),
             "l2_leaf_reg": t.suggest_float("l2_leaf_reg",1e-3,10,log=True),
             "subsample": t.suggest_float("subsample",0.5,1.0),
             "min_data_in_leaf": t.suggest_int("min_data_in_leaf",1,100),
             "bootstrap_type":"Bernoulli",
             "iterations":MAX_ITERS,
             "early_stopping_rounds":EARLY_STOP,
             "eval_metric":"RMSE",
             "random_seed":SEED,
             "task_type":TASK_TYPE,
             "devices":DEVICES,
             "verbose":0,
             "cat_features":cat_cols}
        rms, it = [], []
        for tr,vl in tscv.split(X):
            m = CatBoostRegressor(**p)
            m.fit(X.iloc[tr], y.iloc[tr], eval_set=(X.iloc[vl], y.iloc[vl]), verbose=False)
            rms.append(mean_squared_error(y.iloc[vl], m.predict(X.iloc[vl])))
            it.append(m.get_best_iteration()); del m; gc.collect()
        cv = float(np.mean(rms)); t.set_user_attr("best_iterations", int(np.mean(it)))
        mlflow.log_metric("rmse_cv", cv); mlflow.log_metric("best_iterations", t.user_attrs["best_iterations"]); log_sys("trial_")
        return cv

    study = optuna.create_study(direction="minimize", sampler=optuna.samplers.TPESampler(seed=SEED))
    study.optimize(objective, n_trials=TRIALS, callbacks=[mlcb])

    best = study.best_trial.params
    final_iter = study.best_trial.user_attrs["best_iterations"]
    final = {**best, "iterations": final_iter, "random_seed": SEED, "task_type": TASK_TYPE, "devices": DEVICES, "verbose": 0, "bootstrap_type":"Bernoulli", "cat_features": cat_cols}
    model = CatBoostRegressor(**final).fit(X, y, verbose=False)
    mlflow.log_params(final); mlflow.catboost.log_model(model, "model")
    if Path("catboost_info").exists(): mlflow.log_artifacts("catboost_info", artifact_path="catboost_info")
    mlflow.log_metric("best_rmse_cv", study.best_value); log_sys("final_")

print(f"Best CV RMSE: {study.best_value:.6f}")

2025/05/26 23:05:26 INFO mlflow.system_metrics.system_metrics_monitor: Started monitoring system metrics.
  mlcb = MLflowCallback(metric_name="val_rmse", create_experiment=False, mlflow_kwargs={"nested": True})
  @mlcb.track_in_mlflow()
[I 2025-05-26 23:05:26,468] A new study created in memory with name: no-name-304ffaae-5ee9-425d-a30f-94170ecf8bad
[I 2025-05-26 23:24:08,816] Trial 0 finished with value: 1.34083213049935 and parameters: {'depth': 6, 'learning_rate': 0.22648248189516848, 'l2_leaf_reg': 0.8471801418819978, 'subsample': 0.7993292420985183, 'min_data_in_leaf': 16}. Best is trial 0 with value: 1.34083213049935.
2025/05/26 23:32:13 INFO mlflow.system_metrics.system_metrics_monitor: Stopping system metrics monitoring...
2025/05/26 23:32:13 INFO mlflow.system_metrics.system_metrics_monitor: Successfully terminated system metrics monitoring!


Best CV RMSE: 1.340832


In [None]:
# plt.figure(figsize=(6, 6))
# sns.scatterplot(x=y_test, y=preds, alpha=0.3)
# plt.xlabel("Actual")
# plt.ylabel("Predicted")
# plt.tight_layout()
# plt.savefig("scatter_actual_vs_pred.png", dpi=300)

# residuals = y_test - preds
# plt.figure(figsize=(6, 4))
# sns.histplot(residuals, bins=100, kde=True)
# plt.xlabel("Residuals")
# plt.tight_layout()
# plt.savefig("residual_hist.png", dpi=300)

# importances = model.get_feature_importance(type="PredictionValuesChange")
# imp_df = pd.DataFrame({"feature": features, "importance": importances}).sort_values("importance", ascending=False)
# plt.figure(figsize=(8, 6))
# sns.barplot(x="importance", y="feature", data=imp_df.head(20))
# plt.tight_layout()
# plt.savefig("feature_importance.png", dpi=300)

# model.save_model("tip_model_catboost.cbm")
# imp_df.to_csv("feature_importance.csv", index=False)