In [8]:
import argparse
import gc
from pathlib import Path

from catboost import CatBoostError, CatBoostRegressor
import matplotlib.pyplot as plt
import mlflow
import numpy as np
import optuna
import pandas as pd
import pyarrow.dataset as ds
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import TimeSeriesSplit, train_test_split

In [9]:
def load_your_data(data_dir="data", test_size=0.20):
    """Load the parquet trip records and build a chronological train/test split.

    Parameters
    ----------
    data_dir : str or Path, default "data"
        Directory holding the parquet files to read.
    test_size : float, default 0.20
        Fraction of the (time-ordered) rows held out as the test set.

    Returns
    -------
    tuple
        ``(X_train, y_train, categorical, X_test, y_test)`` where
        ``categorical`` is the list of column names to treat as categorical.

    Notes
    -----
    Rows are sorted by ``tpep_pickup_datetime`` and split without shuffling,
    so the test set is the most recent ``test_size`` share of trips and any
    ordered cross-validation over the training rows sees them in time order.
    """
    df = (
        ds.dataset(Path(data_dir), format="parquet")
          .to_table()
          .to_pandas(use_threads=True, self_destruct=True)
    )
    gc.collect()

    df["tpep_pickup_datetime"] = pd.to_datetime(df["tpep_pickup_datetime"])
    df["tpep_dropoff_datetime"] = pd.to_datetime(df["tpep_dropoff_datetime"])

    # Copy after filtering: the columns added below must land on an
    # independent frame, not on a slice of the unfiltered one.
    df = df[df["tip_amount"] >= 0].copy()
    # Put rows in time order so the unshuffled split below is chronological.
    df = df.sort_values("tpep_pickup_datetime", kind="stable", ignore_index=True)

    df["trip_duration_min"] = (
        (df["tpep_dropoff_datetime"] - df["tpep_pickup_datetime"])
        .dt.total_seconds() / 60
    )
    df["pickup_month"] = df["tpep_pickup_datetime"].dt.month.astype("int8")
    df["pickup_day"]   = df["tpep_pickup_datetime"].dt.day.astype("int8")
    df["pickup_hour"]  = df["tpep_pickup_datetime"].dt.hour.astype("int8")
    df["pickup_dow"]   = df["tpep_pickup_datetime"].dt.dayofweek.astype("int8")

    # Fee columns are not present in every file; default them to zero.
    for col in ["cbd_congestion_fee", "airport_fee", "congestion_surcharge"]:
        if col not in df.columns:
            df[col] = 0.0

    categorical = [
        "VendorID", "RatecodeID", "store_and_fwd_flag",
        "PULocationID", "DOLocationID", "payment_type",
        "pickup_month", "pickup_day", "pickup_hour", "pickup_dow",
    ]
    string_categorical = ["store_and_fwd_flag"]
    numeric_categorical = [c for c in categorical if c not in string_categorical]

    # Categorical codes become integers, with -1 standing in for
    # missing or unparseable values.
    for col in numeric_categorical:
        df[col] = (
            pd.to_numeric(df[col], errors="coerce")
              .fillna(-1)
              .astype("int32")
        )
    for col in string_categorical:
        df[col] = df[col].fillna("missing").astype("string")

    # "total_amount" is deliberately NOT a feature: in the TLC schema it
    # already includes the card tip, so using it to predict "tip_amount"
    # leaks the target into the inputs.
    features = [
        "VendorID", "RatecodeID", "store_and_fwd_flag",
        "PULocationID", "DOLocationID", "payment_type",
        "passenger_count", "trip_distance", "fare_amount",
        "extra", "mta_tax", "tolls_amount", "improvement_surcharge",
        "congestion_surcharge", "airport_fee", "cbd_congestion_fee",
        "trip_duration_min",
        "pickup_month", "pickup_day", "pickup_hour", "pickup_dow",
    ]
    target = "tip_amount"

    # shuffle=False keeps the time order: train = earlier trips, test = latest.
    X_train, X_test, y_train, y_test = train_test_split(
        df[features], df[target], test_size=test_size, shuffle=False
    )

    return X_train, y_train, categorical, X_test, y_test

In [10]:
# total_rows = len(df)
# rows = []
# for col in df.columns:
#     miss_pct = df[col].isna().mean() * 100
#     zero_pct = ((df[col] == 0).sum() / total_rows) * 100 if pd.api.types.is_numeric_dtype(df[col]) else np.nan
#     top_freq = df[col].value_counts(dropna=False).iloc[0]
#     mode_pct = (top_freq / total_rows) * 100
#     rows.append({"column": col, "%missing": round(miss_pct, 2),
#                  "%zeros": round(zero_pct, 2) if not np.isnan(zero_pct) else np.nan,
#                  "%mode_share": round(mode_pct, 2)})

# eda_df = pd.DataFrame(rows).sort_values("%missing", ascending=False)
# eda_df

In [12]:
def objective(trial, X, y, categorical):
    """Optuna objective: mean cross-validated RMSE of a CatBoost regressor.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample hyperparameters and to report per-fold scores.
    X, y : pandas objects
        Training features and target; rows are split positionally by
        ``TimeSeriesSplit``, so they are expected to be in the order the
        folds should respect.
    categorical : list
        Column names passed to CatBoost as ``cat_features``.

    Returns
    -------
    float
        Mean RMSE over the folds, or ``inf`` when CatBoost raised
        ``CatBoostError`` (the message is stored in the trial's
        ``fail_reason`` user attribute and tagged on the MLflow run).

    Raises
    ------
    optuna.TrialPruned
        When the study's pruner decides to stop the trial after a fold.
    """
    params = {
        "loss_function": "RMSE",
        "iterations": trial.suggest_int("iterations", 100, 2000),
        "depth": trial.suggest_int("depth", 4, 10),
        "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.3, log=True),
        "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1e-3, 100, log=True),
        "bagging_temperature": trial.suggest_float("bagging_temperature", 0, 1),
        "random_strength": trial.suggest_float("random_strength", 0, 1),
        "random_seed": 42,
        "verbose": 0,
        "early_stopping_rounds": 20,
        "task_type": "GPU",
        "devices": "0",
    }
    with mlflow.start_run(nested=True):
        mlflow.log_params(params)
        tscv = TimeSeriesSplit(n_splits=4)
        rmses = []
        pruned = False
        try:
            for fold, (train_idx, val_idx) in enumerate(tscv.split(X)):
                X_tr, X_val = X.iloc[train_idx], X.iloc[val_idx]
                y_tr, y_val = y.iloc[train_idx], y.iloc[val_idx]
                model = CatBoostRegressor(**params)
                model.fit(
                    X_tr, y_tr,
                    cat_features=categorical,
                    eval_set=(X_val, y_val),
                )
                preds = model.predict(X_val)
                # mean_squared_error returns the MSE; take the root so the
                # value matches the "rmse" name it is logged under.
                rmses.append(float(np.sqrt(mean_squared_error(y_val, preds))))

                # Report the running mean so a pruner configured on the
                # study can stop unpromising trials between folds.
                trial.report(float(np.mean(rmses)), step=fold)
                if trial.should_prune():
                    pruned = True
                    break
        except CatBoostError as e:
            reason = str(e)
            trial.set_user_attr("fail_reason", reason)
            # Surface the failure in MLflow too; truncated to keep the tag short.
            mlflow.set_tag("fail_reason", reason[:500])
            return float("inf")

        if not pruned:
            score = float(np.mean(rmses))
            mlflow.log_metric("rmse_cv", score)
            return score
        mlflow.set_tag("pruned", "true")

    # Raised outside the MLflow context so the run is closed normally
    # rather than being ended by an exception.
    raise optuna.TrialPruned()

def main(n_trials: int):
    """Tune CatBoost with Optuna, refit on the training data, and report test RMSE.

    Parameters
    ----------
    n_trials : int
        Number of Optuna trials to run.

    Raises
    ------
    RuntimeError
        If no trial finished with a finite score, in which case there are no
        usable hyperparameters to train the final model with.
    """
    mlflow.set_experiment("Taxi_Duration_Experiment")
    X, y, categorical, X_test, y_test = load_your_data()
    study = optuna.create_study(
        direction="minimize",
        pruner=optuna.pruners.HyperbandPruner(),
    )

    # One parent run for the whole study: the per-trial runs opened with
    # nested=True attach to it, and the final metric is logged to a run
    # that is closed when the block exits.
    with mlflow.start_run(nested=mlflow.active_run() is not None):
        study.optimize(
            lambda t: objective(t, X, y, categorical),
            n_trials=n_trials,
            show_progress_bar=True,
        )

        # A trial that failed inside the objective reports inf; do not build
        # a final model from the parameters of a trial that never trained.
        usable = [
            t for t in study.trials
            if t.state == optuna.trial.TrialState.COMPLETE
            and t.value is not None
            and np.isfinite(t.value)
        ]
        if not usable:
            reasons = sorted(
                {
                    str(t.user_attrs["fail_reason"])
                    for t in study.trials
                    if "fail_reason" in t.user_attrs
                }
            )
            detail = "; ".join(reasons) if reasons else "no failure reason recorded"
            raise RuntimeError(
                f"No trial finished with a finite score ({detail})"
            )

        best_params = dict(study.best_trial.params)
        # Re-apply the fixed settings used during the search, including the
        # GPU task type, so the final model matches the tuned configuration.
        best_params.update(
            loss_function="RMSE",
            random_seed=42,
            verbose=0,
            task_type="GPU",
            devices="0",
        )
        final = CatBoostRegressor(**best_params)
        # No eval_set here: passing the test set would let CatBoost choose
        # the best iteration on the data used for the final score.
        final.fit(X, y, cat_features=categorical, verbose=False)
        preds = final.predict(X_test)
        # mean_squared_error returns the MSE; take the root to report RMSE.
        rmse_test = float(np.sqrt(mean_squared_error(y_test, preds)))
        mlflow.log_metric("rmse_test", rmse_test)

    print(f"Final test RMSE: {rmse_test:.4f}")

In [None]:
if __name__ == "__main__":
    # Command-line entry point: read the trial budget and start the tuning run.
    cli = argparse.ArgumentParser()
    cli.add_argument("--n-trials", type=int, default=1)
    # parse_known_args returns unrecognized argv entries instead of
    # erroring on them; they are ignored here.
    known_args, _unrecognized = cli.parse_known_args()
    main(known_args.n_trials)

[I 2025-05-20 18:07:33,214] A new study created in memory with name: no-name-1ed363de-1343-4208-ae89-7358201064c6


  0%|          | 0/1 [00:00<?, ?it/s]

[I 2025-05-20 19:21:37,848] Trial 0 finished with value: inf and parameters: {'iterations': 1258, 'depth': 4, 'learning_rate': 0.1977286300157578, 'l2_leaf_reg': 2.3062642583309003, 'bagging_temperature': 0.4346096140968354, 'random_strength': 0.31535489117217486}. Best is trial 0 with value: inf.


In [None]:
# plt.figure(figsize=(6, 6))
# sns.scatterplot(x=y_test, y=preds, alpha=0.3)
# plt.xlabel("Actual")
# plt.ylabel("Predicted")
# plt.tight_layout()
# plt.savefig("scatter_actual_vs_pred.png", dpi=300)

# residuals = y_test - preds
# plt.figure(figsize=(6, 4))
# sns.histplot(residuals, bins=100, kde=True)
# plt.xlabel("Residuals")
# plt.tight_layout()
# plt.savefig("residual_hist.png", dpi=300)

# importances = model.get_feature_importance(type="PredictionValuesChange")
# imp_df = pd.DataFrame({"feature": features, "importance": importances}).sort_values("importance", ascending=False)
# plt.figure(figsize=(8, 6))
# sns.barplot(x="importance", y="feature", data=imp_df.head(20))
# plt.tight_layout()
# plt.savefig("feature_importance.png", dpi=300)

# model.save_model("tip_model_catboost.cbm")
# imp_df.to_csv("feature_importance.csv", index=False)