In [None]:
!git clone https://github.com/rapidsai/rapidsai-csp-utils.git
!python rapidsai-csp-utils/colab/pip-install.py

# Sample 10% High Volume Vehicles

In [1]:
! pip install catboost
! pip install bayesian-optimization

Collecting catboost
  Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m26.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.8
Collecting bayesian-optimization
  Downloading bayesian_optimization-2.0.3-py3-none-any.whl.metadata (9.0 kB)
Collecting colorama<0.5.0,>=0.4.6 (from bayesian-optimization)
  Downloading colorama-0.4.6-py2.py3-none-any.whl.metadata (17 kB)
Downloading bayesian_optimization-2.0.3-py3-none-any.whl (31 kB)
Downloading colorama-0.4.6-py2.py3-none-any.whl (25 kB)
Installing collected packages: colorama, bayesian-optimization
Successfully installed bayesian-optimization-2.0.3 colorama-0.4.6


 Stream & sample the FHVHV parquet with Dask

In [1]:
import dask.dataframe as dd

# Lazy‐load the big file
fhv_ddf = dd.read_parquet(
    "/content/drive/MyDrive/fhvhv_trip_data/combined_fhvhv.parquet",
    engine="pyarrow"
)

# 10% random sample (lazy)
fhv_sampled_ddf = fhv_ddf.sample(frac=0.10, random_state=42)

# (Optional) Persist to speed up downstream ops
fhv_sampled_ddf = fhv_sampled_ddf.persist()


In [2]:
fhv_sampled_ddf = fhv_sampled_ddf.rename(columns={'company_name': 'vehicle_type'})

In [3]:
fhv_sampled_ddf = fhv_sampled_ddf[[
    "pickup_date",
    "pickup_time",
    "pickup_location_id",
    "dropoff_location_id",
    "trip_distance",
    "vehicle_type",
    "total_amount"
]]

 Split each dataset into train/test (80:20)

In [4]:
# Using Dask’s random_split gives two lazy DataFrames
train_fhv_ddf, test_fhv_ddf = fhv_sampled_ddf.random_split([0.8, 0.2], random_state=42)


#20% Yellow and Full Green

Green (smaller; load with Pandas)

In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load
green_df  = pd.read_parquet("/content/drive/MyDrive/green_trip_data/green_tripdata_processed.parquet")

Tag with 'vehicle_type'

In [6]:
green_df["vehicle_type"]  = "Green Taxi"

In [7]:
green_df = green_df[[
    "pickup_date",
    "pickup_time",
    "pickup_location_id",
    "dropoff_location_id",
    "trip_distance",
    "vehicle_type",
    "total_amount"
]]

# Split
train_green_df, test_green_df = train_test_split(green_df,  test_size=0.2, random_state=42)


Yellow (20%)

In [8]:
yellow_ddf = dd.read_parquet(
    "/content/drive/MyDrive/yellow_trip_data/combined_yellow.parquet",
    engine="pyarrow"
)

# 20% random sample (lazy)
yellow_sampled_ddf = yellow_ddf.sample(frac=0.20, random_state=42)

# (Optional) Persist to speed up downstream ops
yellow_sampled_ddf = yellow_sampled_ddf.persist()

In [10]:
yellow_sampled_ddf["vehicle_type"]  = "Yellow Taxi"

In [11]:
yellow_sampled_ddf = yellow_sampled_ddf[[
    "pickup_date",
    "pickup_time",
    "pickup_location_id",
    "dropoff_location_id",
    "trip_distance",
    "vehicle_type",
    "total_amount"
]]

train_yellow_ddf, test_yellow_ddf = yellow_sampled_ddf.random_split([0.8, 0.2], random_state=42)


# Concatenate into final train & test, select only the features needed

In [12]:
FEATURES = [
    "pickup_date",
    "pickup_time",           # e.g. "14:23"
    "pickup_location_id",
    "dropoff_location_id",
    "trip_distance",
    "vehicle_type",          # will add this column for green/yellow
]
TARGET = "total_amount"


Convert all to Dask or Pandas consistently.

In [13]:
train_fhv_df = train_fhv_ddf.compute()
test_fhv_df  = test_fhv_ddf.compute()

train_yellow_df = train_yellow_ddf.compute()
test_yellow_df = test_yellow_ddf.compute()
#train_green_df, test_green_df

Concatenate

In [14]:
# delete unneeded variables from ram

import gc

del fhv_ddf
del fhv_sampled_ddf
del green_df
del yellow_ddf
del yellow_sampled_ddf

gc.collect()


189

In [15]:
%load_ext cudf.pandas
import pandas as pd
import gc
from numba import cuda
import rmm

# --- Step 1: combine FHV + Green into a partial train_df ---
partial_train = pd.concat(
    [train_fhv_df[FEATURES + [TARGET]],
     train_green_df[FEATURES + [TARGET]]],
    ignore_index=True
)

del train_fhv_df, train_green_df
gc.collect()
# reset RMM pool and CUDA context
rmm.reinitialize(pool_allocator=True)
cuda.select_device(0)

<weakproxy at 0x7ff0bc324540 to Device at 0x7ff0502900d0>

In [16]:
# --- Step 2: append Yellow to get the final train_df ---
train_df = pd.concat(
    [partial_train,
     train_yellow_df[FEATURES + [TARGET]]],
    ignore_index=True
)

del partial_train, train_yellow_df
gc.collect()
rmm.reinitialize(pool_allocator=True)
cuda.select_device(0)

<weakproxy at 0x7ff0bc324540 to Device at 0x7ff0502900d0>

In [18]:
# prompt: convert and save train_df to avoid crash
train_df['pickup_date'] = pd.to_datetime(train_df['pickup_date'])
train_df['pickup_time'] = pd.to_datetime(train_df['pickup_time'], format='%H:%M:%S').dt.time

train_df.to_parquet('/content/drive/MyDrive/train_trip_data.parquet', engine='pyarrow')

# Optionally, free up memory after saving
del train_df
gc.collect()
rmm.reinitialize(pool_allocator=True)
cuda.select_device(0)


<weakproxy at 0x7ff0bc324540 to Device at 0x7ff0502900d0>

In [19]:
#Repeat for test set
partial_test = pd.concat(
    [test_fhv_df[FEATURES + [TARGET]],
     test_green_df[FEATURES + [TARGET]]],
    ignore_index=True
)
del test_fhv_df, test_green_df
gc.collect()
rmm.reinitialize(pool_allocator=True)
cuda.select_device(0)

<weakproxy at 0x7ff0bc324540 to Device at 0x7ff0502900d0>

In [20]:
test_df = pd.concat(
    [partial_test,
     test_yellow_df[FEATURES + [TARGET]]],
    ignore_index=True
)
del partial_test, test_yellow_df
gc.collect()
rmm.reinitialize(pool_allocator=True)
cuda.select_device(0)

<weakproxy at 0x7ff0bc324540 to Device at 0x7ff0502900d0>

In [21]:
test_df['pickup_date'] = pd.to_datetime(test_df['pickup_date'])
test_df['pickup_time'] = pd.to_datetime(test_df['pickup_time'], format='%H:%M:%S').dt.time

test_df.to_parquet('/content/drive/MyDrive/test_trip_data.parquet', engine='pyarrow')

# Optionally, free up memory after saving
del test_df
gc.collect()
rmm.reinitialize(pool_allocator=True)
cuda.select_device(0)

<weakproxy at 0x7ff0bc324540 to Device at 0x7ff0502900d0>

Load train_df and test_df back after clearing all memory

In [1]:
# %load_ext cudf.pandas
import pandas as pd

train_df = pd.read_parquet('/content/drive/MyDrive/train_trip_data.parquet', engine='pyarrow')
test_df = pd.read_parquet('/content/drive/MyDrive/test_trip_data.parquet', engine='pyarrow')

In [2]:
train_df = train_df.sample(frac=0.5, random_state=42)

In [3]:
# format as “HH:MM” string, then category
train_df['pickup_time'] = pd.to_datetime(train_df['pickup_time'], format='%H:%M:%S').dt.strftime('%H:%M')      # yields strings like "14:23"
test_df['pickup_time'] = pd.to_datetime(test_df['pickup_time'], format='%H:%M:%S').dt.strftime('%H:%M')

In [4]:
train_df["month"] = train_df["pickup_date"].dt.month  # Extract month
train_df["day"] = train_df["pickup_date"].dt.day  # Extract day

test_df["month"] = test_df["pickup_date"].dt.month
test_df["day"] = test_df["pickup_date"].dt.day

In [5]:
train_df['pickup_date'] = train_df['pickup_date'].dt.strftime('%Y-%m-%d')       # yields strings like "2023-10-26"
test_df['pickup_date'] = test_df['pickup_date'].dt.strftime('%Y-%m-%d')

In [15]:
display(train_df.head(1))
display(test_df.head(1))

Unnamed: 0,pickup_date,pickup_time,pickup_location_id,dropoff_location_id,trip_distance,vehicle_type,total_amount,month,day
74823165,2021-01-16,12:32,40,265,23.97,Uber,64.78,1,16


Unnamed: 0,pickup_date,pickup_time,pickup_location_id,dropoff_location_id,trip_distance,vehicle_type,total_amount,month,day
0,2020-01-10,00:49,77,77,1.502,Lyft,10.2,1,10


Set categorical dtype (for CatBoost):

In [6]:
cat_features = ["month", "day", "pickup_time",
                "pickup_location_id",
                "dropoff_location_id", "vehicle_type"]

for col in cat_features:
    train_df[col] = train_df[col].astype("category")
    test_df[col]  = test_df[col].astype("category")


# Model training with CatBoost (+ Bayesian tuning)

Baseline CatBoostRegressor

In [7]:
FEATURES = [
    "month",
    "day",
    "pickup_time",           # e.g. "14:23"
    "pickup_location_id",
    "dropoff_location_id",
    "trip_distance",
    "vehicle_type",
]
TARGET = "total_amount"

In [8]:
from catboost import CatBoostRegressor, Pool

train_pool = Pool(train_df[FEATURES], train_df[TARGET], cat_features=cat_features)
test_pool  = Pool(test_df[FEATURES],  test_df[TARGET],  cat_features=cat_features)

model = CatBoostRegressor(
    iterations=1000,
    learning_rate=0.1,
    depth=6,
    eval_metric="RMSE",
    task_type="GPU",
    devices="0",
    random_seed=42,
)

model.fit(
    train_pool,
    eval_set=test_pool,
    early_stopping_rounds=50,
    verbose=50
)


0:	learn: 33.5372532	test: 22.7238602	best: 22.7238602 (0)	total: 1.47s	remaining: 24m 32s
50:	learn: 27.2417070	test: 11.5334301	best: 11.5334301 (50)	total: 1m 15s	remaining: 23m 22s
100:	learn: 27.0850274	test: 11.3402362	best: 11.3402362 (100)	total: 2m 22s	remaining: 21m 12s
150:	learn: 26.2246409	test: 11.2736791	best: 11.2736791 (150)	total: 3m 31s	remaining: 19m 48s
200:	learn: 25.8103522	test: 11.2091487	best: 11.2091487 (200)	total: 4m 40s	remaining: 18m 34s
250:	learn: 25.2181008	test: 11.2108550	best: 11.1820377 (246)	total: 5m 50s	remaining: 17m 25s
bestTest = 11.18203772
bestIteration = 246
Shrink model to first 247 iterations.


<catboost.core.CatBoostRegressor at 0x79184f20fd90>

In [9]:
# prompt: save the catboost model pickle

import pickle

# Save the model to a file
with open('/content/drive/MyDrive/catboost_model.pkl', 'wb') as f:
    pickle.dump(model, f)


Bayesian hyperparameter tuning

In [None]:
import catboost as cb
import numpy as np


X = train_df[FEATURES]
y = train_df[TARGET]

cat_features = ["month", "day", "pickup_time", "pickup_location_id", "dropoff_location_id", "vehicle_type"]

cv_dataset = cb.Pool(data=X,
                  label=y,
                  cat_features=cat_features)

def hyp_cat(depth, bagging_temperature):
    params = {"iterations": 300,
              "learning_rate": 0.05,
              "eval_metric": "R2",
              "loss_function": "RMSE",
              "verbose": False}
    params[ "depth"] = int(round(depth))
    params["bagging_temperature"] = bagging_temperature

    scores = cb.cv(cv_dataset,
                params,
                fold_count=3)
    return np.max(scores['test-R2-mean'])

pds = {'depth': (6, 10),
          'bagging_temperature': (1,5),
          }


# Surrogate model
optimizer = BayesianOptimization(hyp_cat, pds, random_state=100)

# Optimize
optimizer.maximize(init_points=3, n_iter=7)

|   iter    |  target   | baggin... |   depth   |
-------------------------------------------------
Training on fold [0/3]


In [20]:
from bayes_opt import BayesianOptimization
from catboost import cv, Pool

def catboost_cv(depth, learning_rate, l2_leaf_reg, border_count):
    params = {
        "iterations": 1000,
        "depth": int(depth),
        "learning_rate": float(learning_rate),
        "l2_leaf_reg": float(l2_leaf_reg),
        "border_count": int(border_count),
        "task_type": "GPU",
        "eval_metric": "RMSE",
        "random_seed": 42,
        "logging_level": "Silent",
    }
    # Recreate train_pool inside the function
    train_pool = Pool(train_df[FEATURES], train_df[TARGET], cat_features=cat_features)
    cv_data = cv(
        params,
        train_pool, # Pass the newly created train_pool
        fold_count=3,
        early_stopping_rounds=30,
        verbose=False
    )
    # we minimize RMSE, so return negative
    return -cv_data["test-RMSE-mean"].min()

pbounds = {
    "depth": (4, 10),
    "learning_rate": (0.01, 0.3),
    "l2_leaf_reg": (1, 10),
    "border_count": (32, 255)
}

optimizer = BayesianOptimization(
    f=catboost_cv,
    pbounds=pbounds,
    random_state=42,
)
optimizer.maximize(init_points=5, n_iter=25)

best_params = optimizer.max["params"]
print("Best hyperparameters:", best_params)

|   iter    |  target   | border... |   depth   | l2_lea... | learni... |
-------------------------------------------------------------------------


CatBoostError: Can't deepcopy _PoolBase object