In [1]:
!pip install tableauhyperapi



#Green

In [9]:
import pyarrow.parquet as pq

# point this at whichever file you want to inspect
parquet_path = "/content/drive/MyDrive/green_trip_data/green_tripdata_processed.parquet"

'''
Path("/content/drive/MyDrive/fhvhv_trip_data/combined_fhvhv.parquet"),
Path("/content/drive/MyDrive/yellow_trip_data/combined_yellow.parquet"),
Path("/content/drive/MyDrive/green_trip_data/green_tripdata_processed.parquet"),
'''
# create a ParquetFile object
pq_file = pq.ParquetFile(parquet_path)

# print the Arrow schema (field names, types, nullability)
print(pq_file.schema_arrow)


pickup_date: date32[day]
pickup_time: time64[us]
dropoff_date: date32[day]
dropoff_time: time64[us]
trip_time: double
pickup_datetime: timestamp[us]
dropoff_datetime: timestamp[us]
pickup_location_id: int64
dropoff_location_id: int64
passenger_count: double
trip_distance: double
fare_amount: double
extra: double
mta_tax: double
tip_amount: double
tolls_amount: double
ehail_fee: null
improvement_surcharge: double
total_amount: double
congestion_surcharge: double
cbd_congestion_fee: double
fare_per_mile: double
RatecodeID: string
__null_dask_index__: int64
-- schema metadata --
pandas: '{"index_columns": ["__null_dask_index__"], "column_indexes": [{"' + 3259


In [21]:
from datetime import date
from pathlib import Path

from tableauhyperapi import HyperProcess, Telemetry, \
    Connection, CreateMode, \
    NULLABLE, NULLABLE, SqlType, TableDefinition, TableName, \
    Inserter, \
    escape_name, escape_string_literal, \
    HyperException


def run_create_hyper_file_from_parquet(
        parquet_file_path: Path,
        table_definition: TableDefinition,
        hyper_database_path: Path):
    """
    An example demonstrating how to load rows from an Apache Parquet file (`parquet_file_path`)
    into a new Hyper file (`hyper_database_path`) using the COPY command. Currently the
    table definition of the data to copy needs to be known and explicitly specified.

    Reading Parquet data is analogous to reading CSV data. For more details, see:
    https://tableau.github.io/hyper-db/docs/guides/hyper_file/insert_csv
    """

    # Start the Hyper process.
    #
    # * Sending telemetry data to Tableau is encouraged when trying out an experimental feature.
    #   To opt out, simply set `telemetry=Telemetry.DO_NOT_SEND_USAGE_DATA_TO_TABLEAU` below.
    with HyperProcess(telemetry=Telemetry.SEND_USAGE_DATA_TO_TABLEAU) as hyper:

        # Open a connection to the Hyper process. This will also create the new Hyper file.
        # The `CREATE_AND_REPLACE` mode causes the file to be replaced if it
        # already exists.
        with Connection(endpoint=hyper.endpoint,
                        database=hyper_database_path,
                        create_mode=CreateMode.CREATE_AND_REPLACE) as connection:

            # Create the target table.
            connection.catalog.create_table(table_definition=table_definition)

            # Execute a COPY command to insert the data from the Parquet file.
            copy_command = f"COPY {table_definition.table_name} FROM {escape_string_literal(str(parquet_file_path))} WITH (FORMAT PARQUET)"
            print(copy_command)
            count_inserted = connection.execute_command(copy_command)
            print(f"-- {count_inserted} rows have been copied from '{parquet_file_path}' to the table {table_definition.table_name} in '{hyper_database_path}'.")


if __name__ == "__main__":
  try:
    pq_path    = Path("/content/drive/MyDrive/green_trip_data/green_tripdata_processed.parquet")
    hyper_path = pq_path.with_suffix(".hyper")    # → /…/green_tripdata_processed.hyper

    table_definition = TableDefinition(
        table_name=TableName("green_tripdata_processed"),
        columns=[
            TableDefinition.Column("pickup_date",          SqlType.date(),                          NULLABLE),
            TableDefinition.Column("pickup_time",          SqlType.time(),                          NULLABLE),
            TableDefinition.Column("dropoff_date",         SqlType.date(),                          NULLABLE),
            TableDefinition.Column("dropoff_time",         SqlType.time(),                          NULLABLE),
            TableDefinition.Column("trip_time",            SqlType.double(),                        NULLABLE),
            TableDefinition.Column("pickup_location_id",   SqlType.big_int(),                       NULLABLE),
            TableDefinition.Column("dropoff_location_id",  SqlType.big_int(),                       NULLABLE),
            TableDefinition.Column("trip_distance",        SqlType.double(),                        NULLABLE),
            TableDefinition.Column("fare_amount",          SqlType.double(),                        NULLABLE),
            TableDefinition.Column("extra",                SqlType.double(),                        NULLABLE),
            TableDefinition.Column("mta_tax",              SqlType.double(),                        NULLABLE),
            TableDefinition.Column("tip_amount",           SqlType.double(),                        NULLABLE),
            TableDefinition.Column("tolls_amount",         SqlType.double(),                        NULLABLE),
            TableDefinition.Column("improvement_surcharge",SqlType.double(),                        NULLABLE),
            TableDefinition.Column("total_amount",         SqlType.double(),                        NULLABLE),
            TableDefinition.Column("congestion_surcharge", SqlType.double(),                        NULLABLE),
            TableDefinition.Column("cbd_congestion_fee",   SqlType.double(),                        NULLABLE),
            TableDefinition.Column("fare_per_mile",        SqlType.double(),                        NULLABLE),
            TableDefinition.Column("RatecodeID",           SqlType.text(),                          NULLABLE),
        ]
    )

    run_create_hyper_file_from_parquet(pq_path, table_definition, hyper_path)

  except HyperException as ex:
      print(ex)
      exit(1)




COPY "green_tripdata_processed" FROM '/content/drive/MyDrive/green_trip_data/green_tripdata_processed.parquet' WITH (FORMAT PARQUET)
-- 4937066 rows have been copied from '/content/drive/MyDrive/green_trip_data/green_tripdata_processed.parquet' to the table "green_tripdata_processed" in '/content/drive/MyDrive/green_trip_data/green_tripdata_processed.hyper'.


# Yellow

In [31]:
import dask.dataframe as dd
ddf = dd.read_parquet("/content/drive/MyDrive/yellow_trip_data/combined_yellow.parquet")
print(ddf.head())  # Ensure readable data

  pickup_date pickup_time dropoff_date dropoff_time  trip_time  \
0  2020-01-01    00:28:15   2020-01-01     00:33:03      288.0   
1  2020-01-01    00:35:39   2020-01-01     00:43:04      445.0   
2  2020-01-01    00:47:41   2020-01-01     00:53:52      371.0   
3  2020-01-01    00:55:23   2020-01-01     01:00:14      291.0   
5  2020-01-01    00:09:44   2020-01-01     00:10:37       53.0   

   passenger_count  trip_distance  pickup_location_id  dropoff_location_id  \
0              1.0           1.20                 238                  239   
1              1.0           1.20                 239                  238   
2              1.0           0.60                 238                  238   
3              1.0           0.80                 238                  151   
5              1.0           0.03                   7                  193   

   fare_amount  ...  mta_tax  tip_amount  tolls_amount  improvement_surcharge  \
0          6.0  ...      0.5        1.47           0.

In [32]:
import gc
del ddf
gc.collect()

150

In [22]:
import pyarrow.parquet as pq

# point this at whichever file you want to inspect
parquet_path = "/content/drive/MyDrive/yellow_trip_data/combined_yellow.parquet"
# create a ParquetFile object
pq_file = pq.ParquetFile(parquet_path)

# print the Arrow schema (field names, types, nullability)
print(pq_file.schema_arrow)


pickup_date: string
pickup_time: string
dropoff_date: string
dropoff_time: string
trip_time: double
passenger_count: double
trip_distance: double
pickup_location_id: int64
dropoff_location_id: int64
fare_amount: double
extra: double
mta_tax: double
tip_amount: double
tolls_amount: double
improvement_surcharge: double
total_amount: double
congestion_surcharge: double
Airport_fee: double
cbd_congestion_fee: double
fare_per_mile: double
RatecodeID: string
index: int64
-- schema metadata --
pandas: '{"index_columns": ["index"], "column_indexes": [{"name": null, "' + 2953


In [25]:
def run_create_hyper_file_from_parquet(
        parquet_file_path: Path,
        table_definition: TableDefinition,
        hyper_database_path: Path):

    with HyperProcess(telemetry=Telemetry.SEND_USAGE_DATA_TO_TABLEAU) as hyper:

        with Connection(endpoint=hyper.endpoint,
                        database=hyper_database_path,
                        create_mode=CreateMode.CREATE_AND_REPLACE) as connection:

            # Create the target table.
            connection.catalog.create_table(table_definition=table_definition)

            # Execute a COPY command to insert the data from the Parquet file.
            copy_command = f"COPY {table_definition.table_name} FROM {escape_string_literal(str(parquet_file_path))} WITH (FORMAT PARQUET)"
            print(copy_command)
            count_inserted = connection.execute_command(copy_command)
            print(f"-- {count_inserted} rows have been copied from '{parquet_file_path}' to the table {table_definition.table_name} in '{hyper_database_path}'.")


if __name__ == "__main__":
  try:
    pq_path    = Path("/content/drive/MyDrive/yellow_trip_data/combined_yellow.parquet")
    hyper_path = pq_path.with_suffix(".hyper")    # → /…/green_tripdata_processed.hyper

    table_definition = TableDefinition(
        table_name=TableName("combined_yellow"),
        columns=[

            TableDefinition.Column("pickup_date",           SqlType.text(),    NULLABLE),
            TableDefinition.Column("pickup_time",           SqlType.text(),    NULLABLE),
            TableDefinition.Column("dropoff_date",          SqlType.text(),    NULLABLE),
            TableDefinition.Column("dropoff_time",          SqlType.text(),    NULLABLE),

            TableDefinition.Column("trip_time",             SqlType.double(),  NULLABLE),
            TableDefinition.Column("passenger_count",       SqlType.double(),  NULLABLE),
            TableDefinition.Column("trip_distance",         SqlType.double(),  NULLABLE),


            TableDefinition.Column("pickup_location_id",    SqlType.big_int(),  NULLABLE),
            TableDefinition.Column("dropoff_location_id",   SqlType.big_int(),  NULLABLE),

            TableDefinition.Column("fare_amount",           SqlType.double(),  NULLABLE),
            TableDefinition.Column("extra",                 SqlType.double(),  NULLABLE),
            TableDefinition.Column("mta_tax",               SqlType.double(),  NULLABLE),
            TableDefinition.Column("tip_amount",            SqlType.double(),  NULLABLE),
            TableDefinition.Column("tolls_amount",          SqlType.double(),  NULLABLE),
            TableDefinition.Column("improvement_surcharge", SqlType.double(),  NULLABLE),
            TableDefinition.Column("total_amount",          SqlType.double(),  NULLABLE),
            TableDefinition.Column("congestion_surcharge",  SqlType.double(),  NULLABLE),
            TableDefinition.Column("Airport_fee",           SqlType.double(),  NULLABLE),
            TableDefinition.Column("cbd_congestion_fee",    SqlType.double(),  NULLABLE),
            TableDefinition.Column("fare_per_mile",         SqlType.double(),  NULLABLE),

            TableDefinition.Column("RatecodeID",            SqlType.text(),    NULLABLE),
        ]
    )

    run_create_hyper_file_from_parquet(pq_path, table_definition, hyper_path)

  except HyperException as ex:
      print(ex)
      exit(1)




COPY "combined_yellow" FROM '/content/drive/MyDrive/yellow_trip_data/combined_yellow.parquet' WITH (FORMAT PARQUET)
current transaction is aborted, commands ignored until end of transaction block
Context: 0xfa6b0e2f


In [None]:
from dask_cuda import LocalCUDACluster
from dask.distributed import Client
import dask_cudf
from pathlib import Path
from tableauhyperapi import (
    HyperProcess, Telemetry,
    Connection, CreateMode,
    SchemaName, TableName,
    NULLABLE, SqlType, TableDefinition,
    Inserter, HyperException
)

# Cluster
cluster = LocalCUDACluster()
client  = Client(cluster)

# Constants
INT_COLS = ["pickup_location_id", "dropoff_location_id"]
TEXT_COLS = ["pickup_date", "pickup_time", "dropoff_date", "dropoff_time"]

START_PARTITION = 3

def run_create_hyper_file_from_parquet(parquet_file_path: Path,
        table_definition: TableDefinition,
        hyper_database_path: Path,
        npartitions: int = None):

    ddf = dask_cudf.read_parquet(str(parquet_file_path))
    if npartitions:
        ddf = ddf.repartition(npartitions=npartitions)

    for c in INT_COLS:
        ddf[c] = ddf[c].astype('float64')
        mask   = ddf[c].isnull() | ((ddf[c] % 1) == 0)
        ddf    = ddf[mask]
        ddf[c]  = ddf[c].astype('int64')


    for c in TEXT_COLS:
        ddf[c] = ddf[c].astype('str')

    ddf = ddf.persist()

    with HyperProcess(telemetry=Telemetry.SEND_USAGE_DATA_TO_TABLEAU) as hyper, \
         Connection(endpoint=hyper.endpoint,
                    database=hyper_database_path,
                    create_mode=CreateMode.CREATE_AND_REPLACE if not hyper_database_path.exists() else CreateMode.NONE) as conn:

        # conn.catalog.create_schema(schema=SchemaName("Extract"))
        # conn.catalog.create_table(table_definition=table_definition)
        total_inserted = total_skipped = 0

        for i, part in enumerate(ddf.to_delayed()):
            if i != START_PARTITION:
                print(f"Skipping partition {i} (already processed)")
                continue

            try:
                gdf = part.compute()
                pdf = gdf.to_pandas()
                rows = list(pdf.itertuples(index=False, name=None))
                with Inserter(conn, table_definition) as inserter:
                    inserter.add_rows(rows)
                    inserter.execute()
                    total_inserted += len(rows)
                    print(f"Inserted partition {i} ({len(rows)} rows)")
            except HyperException as e:
                print(f"Skipped partition {i} due to HyperException: {e}")
                total_skipped += 1

        print(f"\n✅ Finished. Inserted: {total_inserted}, Skipped: {total_skipped}")


if __name__ == "__main__":
    pq_path    = Path("/content/drive/MyDrive/yellow_trip_data/combined_yellow.parquet")
    hyper_path = pq_path.with_suffix(".hyper")

    table_definition = TableDefinition(
        table_name=TableName("Extract", "combined_yellow"),
        columns=[
            TableDefinition.Column("pickup_date",           SqlType.text(),    NULLABLE),
            TableDefinition.Column("pickup_time",           SqlType.text(),    NULLABLE),
            TableDefinition.Column("dropoff_date",          SqlType.text(),    NULLABLE),
            TableDefinition.Column("dropoff_time",          SqlType.text(),    NULLABLE),
            TableDefinition.Column("trip_time",             SqlType.double(),  NULLABLE),
            TableDefinition.Column("passenger_count",       SqlType.double(),  NULLABLE),
            TableDefinition.Column("trip_distance",         SqlType.double(),  NULLABLE),
            TableDefinition.Column("pickup_location_id",    SqlType.big_int(), NULLABLE),
            TableDefinition.Column("dropoff_location_id",   SqlType.big_int(), NULLABLE),
            TableDefinition.Column("fare_amount",           SqlType.double(),  NULLABLE),
            TableDefinition.Column("extra",                 SqlType.double(),  NULLABLE),
            TableDefinition.Column("mta_tax",               SqlType.double(),  NULLABLE),
            TableDefinition.Column("tip_amount",            SqlType.double(),  NULLABLE),
            TableDefinition.Column("tolls_amount",          SqlType.double(),  NULLABLE),
            TableDefinition.Column("improvement_surcharge", SqlType.double(),  NULLABLE),
            TableDefinition.Column("total_amount",          SqlType.double(),  NULLABLE),
            TableDefinition.Column("congestion_surcharge",  SqlType.double(),  NULLABLE),
            TableDefinition.Column("Airport_fee",           SqlType.double(),  NULLABLE),
            TableDefinition.Column("cbd_congestion_fee",    SqlType.double(),  NULLABLE),
            TableDefinition.Column("fare_per_mile",         SqlType.double(),  NULLABLE),
            TableDefinition.Column("RatecodeID",            SqlType.text(),    NULLABLE),
        ]
    )

    run_create_hyper_file_from_parquet(
        pq_path,
        table_definition,
        hyper_path,
        npartitions=8
    )


# FHVHV

In [1]:
import pyarrow.parquet as pq

# point this at whichever file you want to inspect
parquet_path = "/content/drive/MyDrive/fhvhv_trip_data/combined_fhvhv.parquet"

# create a ParquetFile object
pq_file = pq.ParquetFile(parquet_path)

# print the Arrow schema (field names, types, nullability)
print(pq_file.schema)


<pyarrow._parquet.ParquetSchema object at 0x7cbbc3dcfb00>
required group field_id=-1 schema {
  optional int64 field_id=-1 pickup_location_id;
  optional int64 field_id=-1 dropoff_location_id;
  optional double field_id=-1 trip_distance;
  optional int64 field_id=-1 trip_time;
  optional double field_id=-1 base_passenger_fare;
  optional double field_id=-1 tolls;
  optional double field_id=-1 bcf;
  optional double field_id=-1 sales_tax;
  optional double field_id=-1 congestion_surcharge;
  optional double field_id=-1 airport_fee;
  optional double field_id=-1 tips;
  optional double field_id=-1 cbd_congestion_fee;
  optional binary field_id=-1 company_name (String);
  optional double field_id=-1 total_amount;
  optional double field_id=-1 fare_per_mile;
  optional binary field_id=-1 pickup_date (String);
  optional binary field_id=-1 pickup_time (String);
  optional binary field_id=-1 dropoff_date (String);
  optional binary field_id=-1 dropoff_time (String);
  optional int64 field_id

In [2]:
def run_create_hyper_file_from_parquet(
        parquet_file_path: Path,
        table_definition: TableDefinition,
        hyper_database_path: Path):

    with HyperProcess(telemetry=Telemetry.SEND_USAGE_DATA_TO_TABLEAU) as hyper:

        with Connection(endpoint=hyper.endpoint,
                        database=hyper_database_path,
                        create_mode=CreateMode.CREATE_AND_REPLACE) as connection:

            # Create the target table.
            connection.catalog.create_table(table_definition=table_definition)

            # Execute a COPY command to insert the data from the Parquet file.
            copy_command = f"COPY {table_definition.table_name} FROM {escape_string_literal(str(parquet_file_path))} WITH (FORMAT PARQUET)"
            print(copy_command)
            count_inserted = connection.execute_command(copy_command)
            print(f"-- {count_inserted} rows have been copied from '{parquet_file_path}' to the table {table_definition.table_name} in '{hyper_database_path}'.")


if __name__ == "__main__":
  try:
    pq_path    = Path("/content/drive/MyDrive/fhvhv_trip_data/combined_fhvhv.parquet")
    hyper_path = pq_path.with_suffix(".hyper")    # → /…/green_tripdata_processed.hyper

    table_definition = TableDefinition(
        table_name=TableName("combined_fhvhv"),
        columns=[

            TableDefinition.Column("pickup_date",           SqlType.text(),    NULLABLE),
            TableDefinition.Column("pickup_time",           SqlType.text(),    NULLABLE),
            TableDefinition.Column("dropoff_date",          SqlType.text(),    NULLABLE),
            TableDefinition.Column("dropoff_time",          SqlType.text(),    NULLABLE),

            TableDefinition.Column("trip_time",             SqlType.big_int(),  NULLABLE),
            TableDefinition.Column("trip_distance",         SqlType.double(),  NULLABLE),


            TableDefinition.Column("pickup_location_id",    SqlType.big_int(),  NULLABLE),
            TableDefinition.Column("dropoff_location_id",   SqlType.big_int(),  NULLABLE),

            TableDefinition.Column("base_passenger_fare",           SqlType.double(),  NULLABLE),
            TableDefinition.Column("bcf",                 SqlType.double(),  NULLABLE),
            TableDefinition.Column("sales_tax",               SqlType.double(),  NULLABLE),
            TableDefinition.Column("tips",            SqlType.double(),  NULLABLE),
            TableDefinition.Column("tolls",          SqlType.double(),  NULLABLE),
            TableDefinition.Column("total_amount",          SqlType.double(),  NULLABLE),
            TableDefinition.Column("congestion_surcharge",  SqlType.double(),  NULLABLE),
            TableDefinition.Column("airport_fee",           SqlType.double(),  NULLABLE),
            TableDefinition.Column("cbd_congestion_fee",    SqlType.double(),  NULLABLE),
            TableDefinition.Column("fare_per_mile",         SqlType.double(),  NULLABLE),
            TableDefinition.Column("company_name",            SqlType.text(),    NULLABLE),
        ]
    )

    run_create_hyper_file_from_parquet(pq_path, table_definition, hyper_path)

  except HyperException as ex:
      print(ex)
      exit(1)


COPY "combined_fhvhv" FROM '/content/drive/MyDrive/fhvhv_trip_data/combined_fhvhv.parquet' WITH (FORMAT PARQUET)
canceled
Context: 0xfa6b0e2f


In [2]:
from dask_cuda import LocalCUDACluster
from dask.distributed import Client
import dask_cudf
import pandas as pd
from pathlib import Path
from tableauhyperapi import (
    HyperProcess, Telemetry,
    Connection, CreateMode,
    SchemaName, TableName,
    NULLABLE, SqlType, TableDefinition,
    Inserter, HyperException
)

# start one GPU worker per card
cluster = LocalCUDACluster()
client  = Client(cluster)

# your int/text columns
INT_COLS  = ["trip_time", "pickup_location_id", "dropoff_location_id"]
TEXT_COLS = ["pickup_date", "pickup_time", "dropoff_date", "dropoff_time"]

def run_create_hyper_file_from_parquet(
    parquet_file_path: Path,
    table_definition: TableDefinition,
    hyper_database_path: Path,
    npartitions: int = None
):
    # 1) GPU-backed read
    ddf = dask_cudf.read_parquet(str(parquet_file_path))
    if npartitions:
        ddf = ddf.repartition(npartitions=npartitions)

    # 2) GPU-side casts & filters
    for c in INT_COLS:
        # cast to float64 so non-whole get filtered
        ddf[c] = ddf[c].astype("float64")
        mask   = ddf[c].isnull() | ((ddf[c] % 1) == 0)
        ddf     = ddf[mask]
        ddf[c] = ddf[c].astype("int64")

    for c in TEXT_COLS:
        ddf[c] = ddf[c].astype("str")

    # ←— Persist once, so following compute() is cached
    ddf = ddf.persist()

    # 3) Hyper load
    with HyperProcess(telemetry=Telemetry.SEND_USAGE_DATA_TO_TABLEAU) as hyper, \
         Connection(endpoint=hyper.endpoint,
                    database=hyper_database_path,
                    create_mode=CreateMode.CREATE_AND_REPLACE) as conn:

        conn.catalog.create_schema(schema=SchemaName("Extract"))
        conn.catalog.create_table(table_definition=table_definition)

        total_inserted = total_skipped = 0

        # 4) Stream each partition back to CPU
        for part in ddf.to_delayed():
            # gdf is a GPU DataFrame
            gdf = part.compute()

            # convert to pandas (this moves data over PCIe)
            pdf = gdf.to_pandas()

            # — ensure Python ints for Hyper
            for c in INT_COLS:
                # pdf[c] is pd.Series dtype int64
                # convert to object dtype with Python int or None
                pdf[c] = pdf[c].where(pdf[c].isna(), pdf[c].astype(int))
                pdf[c] = pdf[c].where(pdf[c].isna(), pdf[c])  # now dtype=object


            rows = list(pdf.itertuples(index=False, name=None))

            # 6) Insert
            with Inserter(conn, table_definition) as inserter:
                try:
                    inserter.add_rows(rows)
                    inserter.execute()
                    total_inserted += len(rows)
                    print(f"Inserted {len(rows)} rows (total so far: {total_inserted})")

                except HyperException as e:
                    total_skipped += len(rows)
                    print(f"⚠️ Skipped partition due to error: {e}")
                    conn.rollback()

        print(f"\n✅ Finished. Inserted: {total_inserted}, Skipped: {total_skipped}")


if __name__ == "__main__":
    pq_path    = Path("/content/drive/MyDrive/fhvhv_trip_data/combined_fhvhv.parquet")
    hyper_path = pq_path.with_suffix(".hyper")

    table_definition = TableDefinition(
        table_name=TableName("Extract", "combined_fhvhv"),
        columns=[
            TableDefinition.Column("pickup_location_id",  SqlType.big_int(), NULLABLE),
            TableDefinition.Column("dropoff_location_id", SqlType.big_int(), NULLABLE),
            TableDefinition.Column("trip_distance",       SqlType.double(),  NULLABLE),
            TableDefinition.Column("trip_time",           SqlType.big_int(), NULLABLE),
            TableDefinition.Column("base_passenger_fare", SqlType.double(),  NULLABLE),
            TableDefinition.Column("tolls",               SqlType.double(),  NULLABLE),
            TableDefinition.Column("bcf",                 SqlType.double(),  NULLABLE),
            TableDefinition.Column("sales_tax",           SqlType.double(),  NULLABLE),
            TableDefinition.Column("congestion_surcharge",SqlType.double(),  NULLABLE),
            TableDefinition.Column("airport_fee",         SqlType.double(),  NULLABLE),
            TableDefinition.Column("tips",                SqlType.double(),  NULLABLE),
            TableDefinition.Column("cbd_congestion_fee",  SqlType.double(),  NULLABLE),
            TableDefinition.Column("company_name",        SqlType.text(),    NULLABLE),
            TableDefinition.Column("total_amount",        SqlType.double(),  NULLABLE),
            TableDefinition.Column("fare_per_mile",       SqlType.double(),  NULLABLE),
            TableDefinition.Column("pickup_date",         SqlType.text(),    NULLABLE),
            TableDefinition.Column("pickup_time",         SqlType.text(),    NULLABLE),
            TableDefinition.Column("dropoff_date",        SqlType.text(),    NULLABLE),
            TableDefinition.Column("dropoff_time",        SqlType.text(),    NULLABLE),
        ]
    )

    run_create_hyper_file_from_parquet(
        pq_path,
        table_definition,
        hyper_path,
        npartitions=60  # adjust for ~600 MB chunks on 30 GB
    )


INFO:distributed.http.proxy:To route to workers diagnostics web server please install jupyter-server-proxy: python -m pip install jupyter-server-proxy
INFO:distributed.scheduler:State start
INFO:distributed.diskutils:Found stale lock file and directory '/tmp/dask-scratch-space/scheduler-rp65wty3', purging
INFO:distributed.diskutils:Found stale lock file and directory '/tmp/dask-scratch-space/scheduler-er2h5rz4', purging
INFO:distributed.scheduler:  Scheduler at:     tcp://127.0.0.1:37973
INFO:distributed.scheduler:  dashboard at:  http://127.0.0.1:8787/status
INFO:distributed.scheduler:Registering Worker plugin shuffle
INFO:distributed.nanny:        Start Nanny at: 'tcp://127.0.0.1:38461'
INFO:distributed.scheduler:Register worker addr: tcp://127.0.0.1:41173 name: 0
INFO:distributed.scheduler:Starting worker compute stream, tcp://127.0.0.1:41173
INFO:distributed.core:Starting established connection to tcp://127.0.0.1:59720
INFO:distributed.scheduler:Receive client connection: Client-c9

Inserted 34603008 rows (total so far: 34603008)


INFO:distributed.core:Event loop was unresponsive in Scheduler for 4.89s.  This is often caused by long-running GIL-holding functions or moving large chunks of data. This can cause timeouts and instability.
INFO:distributed.core:Event loop was unresponsive in Nanny for 5.21s.  This is often caused by long-running GIL-holding functions or moving large chunks of data. This can cause timeouts and instability.
INFO:distributed.core:Event loop was unresponsive in Scheduler for 4.48s.  This is often caused by long-running GIL-holding functions or moving large chunks of data. This can cause timeouts and instability.
INFO:distributed.core:Event loop was unresponsive in Nanny for 4.17s.  This is often caused by long-running GIL-holding functions or moving large chunks of data. This can cause timeouts and instability.
INFO:distributed.core:Connection to tcp://127.0.0.1:59720 has been closed.
INFO:distributed.scheduler:Remove worker addr: tcp://127.0.0.1:41173 name: 0 (stimulus_id='handle-worker-

⚠️ Skipped partition due to error: current transaction is aborted, commands ignored until end of transaction block
Context: 0xfa6b0e2f


AttributeError: 'Inserter' object has no attribute 'clear'