In [8]:
from pathlib import Path
import gc, numpy as np, pandas as pd, pyarrow.dataset as ds
from sklearn.model_selection import train_test_split, TimeSeriesSplit
from sklearn.metrics import mean_squared_error
from catboost import CatBoostRegressor, CatBoostError
import matplotlib.pyplot as plt
import optuna

In [9]:
DATA_DIR = Path("data")

df = (
    ds.dataset(DATA_DIR, format="parquet")
      .to_table()
      .to_pandas(use_threads=True, self_destruct=True)
)
print(df.memory_usage(deep=True).sum() / 1e9, "GB in RAM")
gc.collect()

7.38651714 GB in RAM


231

In [10]:
df["tpep_pickup_datetime"]  = pd.to_datetime(df["tpep_pickup_datetime"])
df["tpep_dropoff_datetime"] = pd.to_datetime(df["tpep_dropoff_datetime"])

df = df[df["tip_amount"] >= 0]

df["trip_duration_min"] = (
    (df["tpep_dropoff_datetime"] - df["tpep_pickup_datetime"]).dt.total_seconds() / 60
)
df["pickup_month"] = df["tpep_pickup_datetime"].dt.month.astype("int8")
df["pickup_day"]   = df["tpep_pickup_datetime"].dt.day.astype("int8")
df["pickup_hour"]  = df["tpep_pickup_datetime"].dt.hour.astype("int8")
df["pickup_dow"]   = df["tpep_pickup_datetime"].dt.dayofweek.astype("int8")

for col in ["cbd_congestion_fee", "airport_fee", "congestion_surcharge"]:
    if col not in df.columns:
        df[col] = 0.0

In [11]:
categorical = [
    "VendorID", "RatecodeID", "store_and_fwd_flag",
    "PULocationID", "DOLocationID", "payment_type",
    "pickup_month", "pickup_day", "pickup_hour", "pickup_dow",
]

string_categorical  = ["store_and_fwd_flag"]
numeric_categorical = [c for c in categorical if c not in string_categorical]

for col in numeric_categorical:
    df[col] = (
        pd.to_numeric(df[col], errors="coerce")
          .fillna(-1)
          .astype("int32")
    )

for col in string_categorical:
    df[col] = df[col].fillna("missing").astype("string")

print(df.memory_usage(deep=True).sum() / 1e9, "GB in RAM")
gc.collect()

9.163929744 GB in RAM


33

In [12]:
features = [
    "VendorID", "RatecodeID", "store_and_fwd_flag",
    "PULocationID", "DOLocationID", "payment_type",
    "passenger_count", "trip_distance", "fare_amount",
    "extra", "mta_tax", "tolls_amount", "improvement_surcharge",
    "congestion_surcharge", "airport_fee", "cbd_congestion_fee",
    "total_amount", "trip_duration_min",
    "pickup_month", "pickup_day", "pickup_hour", "pickup_dow",
]
target = "tip_amount"

X_train, X_test, y_train, y_test = train_test_split(
    df[features], df[target], test_size=0.20, random_state=42)

In [13]:
total_rows = len(df)
rows = []
for col in df.columns:
    miss_pct = df[col].isna().mean() * 100
    zero_pct = ((df[col] == 0).sum() / total_rows) * 100 if pd.api.types.is_numeric_dtype(df[col]) else np.nan
    top_freq = df[col].value_counts(dropna=False).iloc[0]
    mode_pct = (top_freq / total_rows) * 100
    rows.append({"column": col, "%missing": round(miss_pct, 2),
                 "%zeros": round(zero_pct, 2) if not np.isnan(zero_pct) else np.nan,
                 "%mode_share": round(mode_pct, 2)})

eda_df = pd.DataFrame(rows).sort_values("%missing", ascending=False)
eda_df

Unnamed: 0,column,%missing,%zeros,%mode_share
3,passenger_count,9.94,0.97,69.55
17,congestion_surcharge,9.94,7.24,81.62
18,Airport_fee,9.94,82.01,82.01
2,tpep_dropoff_datetime,0.0,,0.0
1,tpep_pickup_datetime,0.0,,0.0
5,RatecodeID,0.0,0.0,84.17
6,store_and_fwd_flag,0.0,,89.64
7,PULocationID,0.0,0.0,4.83
4,trip_distance,0.0,1.88,1.88
0,VendorID,0.0,0.0,76.39


In [14]:
del eda_df, df

In [None]:
tscv = TimeSeriesSplit(n_splits=4)

def objective(trial):
    params = {
        "loss_function": "RMSE",
        "iterations": trial.suggest_int("iterations", 100, 2000),
        "depth": trial.suggest_int("depth", 4, 10),
        "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.3, log=True),
        "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1e-3, 100, log=True),
        "bagging_temperature": trial.suggest_float("bagging_temperature", 0, 1),
        "random_strength": trial.suggest_float("random_strength", 0, 1),
        "random_seed": 42,
        "verbose": 0,
        "early_stopping_rounds": 20,
        "task_type": "GPU",
        "devices": "0",
    }
    rmses = []
    try:
        for train_idx, val_idx in tscv.split(X_train):
            X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
            y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]
            model = CatBoostRegressor(**params)
            model.fit(
                X_tr, y_tr,
                cat_features=categorical,
                eval_set=(X_val, y_val)
            )
            preds = model.predict(X_val)
            rmses.append(mean_squared_error(y_val, preds))
        return np.mean(rmses)
    except CatBoostError as e:
        trial.set_user_attr("fail_reason", str(e))
        return float("inf")

study = optuna.create_study(
    direction="minimize",
    pruner=optuna.pruners.HyperbandPruner()
)
study.optimize(objective, n_trials=1, show_progress_bar=True)

[I 2025-05-20 17:24:15,965] A new study created in memory with name: no-name-0ceed55f-8814-4224-95d8-d61f69c08a07


  0%|          | 0/1 [00:00<?, ?it/s]

In [None]:
best_params = study.best_trial.params
best_params.update({"loss_function": "RMSE", "random_seed": 42, "verbose": 0})

model = CatBoostRegressor(**best_params)
model.fit(X_train, y_train, cat_features=categorical, eval_set=(X_test, y_test))

preds = model.predict(X_test)
rmse_test = mean_squared_error(y_test, preds)
print(f"RMSE on hold-out test set: {rmse_test:.4f}")

In [None]:
plt.figure(figsize=(6, 6))
sns.scatterplot(x=y_test, y=preds, alpha=0.3)
plt.xlabel("Actual")
plt.ylabel("Predicted")
plt.tight_layout()
plt.savefig("scatter_actual_vs_pred.png", dpi=300)

residuals = y_test - preds
plt.figure(figsize=(6, 4))
sns.histplot(residuals, bins=100, kde=True)
plt.xlabel("Residuals")
plt.tight_layout()
plt.savefig("residual_hist.png", dpi=300)

importances = model.get_feature_importance(type="PredictionValuesChange")
imp_df = pd.DataFrame({"feature": features, "importance": importances}).sort_values("importance", ascending=False)
plt.figure(figsize=(8, 6))
sns.barplot(x="importance", y="feature", data=imp_df.head(20))
plt.tight_layout()
plt.savefig("feature_importance.png", dpi=300)

model.save_model("tip_model_catboost.cbm")
imp_df.to_csv("feature_importance.csv", index=False)