In [9]:
from harpy import (
    Session,
    MapTask,
    ReduceTask,
    TransformTask,
    Result,
    TaskSetResults
)

# Create the harpy session object; later cells call session.sql, session.fs.ls,
# session.create_task_set and session.close on it.
session = Session().create_session()

In [10]:
# SQL text for the "silver" projection: reads every parquet file under
# /Volumes/data/motor_colisions/, combines "CRASH DATE" and "CRASH TIME" into a
# single crash_datetime via strptime, and renames the remaining source columns
# to snake_case. The SQL-commented FROM line keeps an alternative relative path.
silver_sql = """
    SELECT 
        strptime(CONCAT("CRASH DATE", ' - ', "CRASH TIME"), '%m/%d/%Y - %H:%M') as crash_datetime,
        "BOROUGH" as borough,
        "ZIP CODE" as zip_code,
        "LATITUDE" as latitude,
        "LONGITUDE" as longitude,
        "LOCATION" as location,
        "ON STREET NAME" as on_street_name,
        "CROSS STREET NAME" as cross_street_name,
        "OFF STREET NAME" as off_street_name,
        "NUMBER OF PERSONS INJURED" as number_of_persons_injured,
        "NUMBER OF PERSONS KILLED" as number_of_persons_killed,
        "NUMBER OF PEDESTRIANS INJURED" as number_of_pedestrians_injured,
        "NUMBER OF PEDESTRIANS KILLED" as number_of_pedestrians_killed,
        "NUMBER OF CYCLIST INJURED" as number_of_cyclist_injured,
        "NUMBER OF CYCLIST KILLED" as number_of_cyclist_killed,
        "NUMBER OF MOTORIST INJURED" as number_of_motorist_injured,
        "NUMBER OF MOTORIST KILLED" as number_of_motorist_killed,
        "CONTRIBUTING FACTOR VEHICLE 1" as contributing_factor_vehicle_1,
        "CONTRIBUTING FACTOR VEHICLE 2" as contributing_factor_vehicle_2,
        "CONTRIBUTING FACTOR VEHICLE 3" as contributing_factor_vehicle_3,
        "CONTRIBUTING FACTOR VEHICLE 4" as contributing_factor_vehicle_4,
        "CONTRIBUTING FACTOR VEHICLE 5" as contributing_factor_vehicle_5,
        "COLLISION_ID" as collision_id,
        "VEHICLE TYPE CODE 1" as vehicle_type_code_1,
        "VEHICLE TYPE CODE 2" as vehicle_type_code_2,
        "VEHICLE TYPE CODE 3" as vehicle_type_code_3,
        "VEHICLE TYPE CODE 4" as vehicle_type_code_4,
        "VEHICLE TYPE CODE 5" as vehicle_type_code_5
    FROM read_parquet('/Volumes/data/motor_colisions/*.parquet')
    --FROM read_parquet('../_example_data/motor_colisions/*.parquet')
"""

In [14]:
# Delta tables 
from deltalake import write_deltalake, DeltaTable
import pyarrow as pa
from harpy.quack import QuackContext

def fetch_arrow_sql(sql: str) -> pa.Table:
    """Run ``sql`` inside a QuackContext and return the result as an Arrow table.

    Args:
        sql: Query text handed to ``QuackContext.sql``.

    Returns:
        The object produced by calling ``.arrow()`` on the query result
        (annotated as ``pa.Table``).
    """
    with QuackContext() as ctx:
        # Use ``.arrow()``: it is the accessor this notebook already calls
        # successfully on a QuackContext query, whereas ``.to_arrow()`` is not
        # exercised anywhere here.
        # NOTE(review): assumes ctx.sql returns a DuckDB relation, which
        # exposes ``arrow`` but no ``to_arrow`` -- confirm against harpy.quack.
        return ctx.sql(sql).arrow()
    

def write_deltalake_from_pa(df: pa.Table, path: str, mode: str) -> None:
    """Write an Arrow table to a Delta table location.

    Args:
        df: Arrow table to persist.
        path: Target passed as the first positional argument of
            ``write_deltalake``.
        mode: Forwarded unchanged as ``write_deltalake(..., mode=mode)``.

    NOTE(review): the output saved for the cell that registers this function
    as a TransformTask is ``InvalidTransformFunction: Transform function must
    have typed input and output``. Which part of this signature harpy rejects
    cannot be determined from this notebook -- confirm against harpy's
    transform validation.
    """
    write_deltalake(path, df, mode=mode)
    return None

# Build a task set with one map task that runs the silver query and one
# transform task that writes to the 'silver' Delta path in overwrite mode.
# NOTE(review): presumably harpy feeds the map output to the transform as
# `df` -- confirm against the TransformTask contract.
ts = session.create_task_set()
ts.add_maps(
    [
        MapTask(
            name="fetch-pa-table",
            fun=fetch_arrow_sql,
            args=[],
            kwargs={'sql': silver_sql},
        )
    ]
)
ts.add_transform(
    TransformTask(
        name="write-pa-table",
        fun=write_deltalake_from_pa,
        args=[],
        kwargs={'path': 'silver', 'mode': 'overwrite'},
    )
)

result = ts.execute()

TaskSetDefinitionError: InvalidTransformFunction: 
 - Transform function must have typed input and output

In [2]:
# List the entries under /Volumes/data/ through the session's filesystem helper.
# NOTE(review): the saved output for this cell is a FileNotFoundError for this
# path, raised from harpy's fs_ls (os.listdir) -- confirm the volume is mounted
# where the call executes.
session.fs.ls("/Volumes/data/")

TaskSet ts-b5a9ce52-78b4-4075-a705-454e3409655a: running
TaskGroup tg-14f7daf4-a716-4b73-84f3-7e698aee44c3 made progress
TaskGroup tg-14f7daf4-a716-4b73-84f3-7e698aee44c3 made progress
Task tg-14f7daf4-a716-4b73-84f3-7e698aee44c3-tr-0: queued
TaskGroup tg-14f7daf4-a716-4b73-84f3-7e698aee44c3 made progress
Task tg-14f7daf4-a716-4b73-84f3-7e698aee44c3-tr-0: running
Task tg-14f7daf4-a716-4b73-84f3-7e698aee44c3-tr-0: done
Task tg-14f7daf4-a716-4b73-84f3-7e698aee44c3-tr-0: fetching
TaskGroup tg-14f7daf4-a716-4b73-84f3-7e698aee44c3 made progress
TaskSet ts-b5a9ce52-78b4-4075-a705-454e3409655a: compleated
Getting results


FileSystemException: FileSystemException
Traceback (most recent call last):
  File "/home/ghhwer/project-quack/harpy/remote-runner/./py_src/isolated-session-585b419a-6c78-4af3-afa4-c53c9a7807aa/main.py", line 47, in <module>
    main()
  File "/home/ghhwer/project-quack/harpy/remote-runner/./py_src/isolated-session-585b419a-6c78-4af3-afa4-c53c9a7807aa/main.py", line 40, in main
    return_object = unpickled_func(*args, **kwargs)
  File "/home/ghhwer/project-quack/harpy/remote-runner/py_src/venv/lib/python3.10/site-packages/harpy/session/FileSystem.py", line 11, in fs_ls
    files = os.listdir(path)
FileNotFoundError: [Errno 2] No such file or directory: '/Volumes/data/'


In [14]:
from deltalake import DeltaTable, write_deltalake
from harpy.quack import QuackContext
import pandas as pd
import pyarrow as pa

# Split the source CSV into N row ranges of (nearly) equal size.
N = 10
df_count = session.sql("SELECT COUNT(*) as count FROM read_csv('/Volumes/data/Motor_Vehicle_Collisions_-_Crashes.csv')")
total_count = df_count.iloc[0]['count']
# divmod gives the base number of rows per partition and the leftover rows
# (to be spread over the first partitions) in a single step.
partition_size, remainder = divmod(total_count, N)

def repart_map(location: str, partition_size: int, index: int, remainder: int) -> None:
    """Copy one contiguous row slice of a CSV file into its own Parquet file.

    The rows are divided into partitions of ``partition_size`` rows; the first
    ``remainder`` partitions each carry one extra row so that all rows are
    covered exactly once.

    Args:
        location: Path of the source CSV, read with ``ALL_VARCHAR=True``.
        partition_size: Base number of rows per partition (total // N).
        index: Zero-based partition number; also used in the output file name.
        remainder: Number of leftover rows (total % N).

    Returns:
        None. The slice is written to
        ``/Volumes/data/motor_colisions/file_<index>.parquet``.
    """
    # Every earlier partition with index < remainder took one extra row, so this
    # slice has to start after those extras as well. Using only
    # partition_size * index makes neighbouring slices overlap and leaves the
    # last `remainder` rows unwritten, while the total row count still matches.
    extra_rows_before = min(index, remainder)
    offset = partition_size * index + extra_rows_before
    limit = partition_size + (1 if index < remainder else 0)
    with QuackContext() as q:
        q.sql("""
            COPY (
                SELECT * FROM read_csv('{0}', ALL_VARCHAR=True) LIMIT {1} OFFSET {2}
            ) TO '/Volumes/data/motor_colisions/file_{3}.parquet' (FORMAT PARQUET, ROW_GROUP_SIZE 1024, COMPRESSION SNAPPY)
        """.format(location, limit, offset, index))

# Register one "split" map task per partition index and run them.
ts = session.create_task_set()
ts.add_maps([
    MapTask(
        "split",
        repart_map,
        args=[],
        kwargs={
            'location': '/Volumes/data/Motor_Vehicle_Collisions_-_Crashes.csv',
            'partition_size': partition_size,
            'index': i,
            'remainder': remainder,
        },
    )
    for i in range(N)
])

results = ts.execute()

# Verify the output: sum the row counts of the N written files and compare with
# the input count. This only compares totals, so it cannot tell whether two
# files contain overlapping rows.
output_counts = [
    session.sql("SELECT COUNT(*) as count FROM read_parquet('/Volumes/data/motor_colisions/file_{0}.parquet')".format(i)).iloc[0]['count']
    for i in range(N)
]
total_output_count = sum(output_counts)

print(f"Total input count: {total_count}")
print(f"Total output count: {total_output_count}")

TaskSet ts-a07cdcaf-3a8a-4a88-9ff3-f79a0cce26fc: running
TaskGroup tg-8a6cc4d8-7b5b-4513-af8e-c68b3647eea9 made progress
TaskGroup tg-8a6cc4d8-7b5b-4513-af8e-c68b3647eea9 made progress
Task tg-8a6cc4d8-7b5b-4513-af8e-c68b3647eea9-tr-0: queued
TaskGroup tg-8a6cc4d8-7b5b-4513-af8e-c68b3647eea9 made progress
Task tg-8a6cc4d8-7b5b-4513-af8e-c68b3647eea9-tr-0: running
Task tg-8a6cc4d8-7b5b-4513-af8e-c68b3647eea9-tr-0: done
TaskGroup tg-8a6cc4d8-7b5b-4513-af8e-c68b3647eea9 made progress
TaskSet ts-a07cdcaf-3a8a-4a88-9ff3-f79a0cce26fc: compleated
Getting results
TaskSet ts-fb614a56-86b4-4513-b0b9-d3b2f4c8051f: running
TaskGroup tg-d7b7909f-c58e-474e-be33-c29cc5437c5d made progress
TaskGroup tg-d7b7909f-c58e-474e-be33-c29cc5437c5d made progress
Task tg-d7b7909f-c58e-474e-be33-c29cc5437c5d-tr-7: queued
Task tg-d7b7909f-c58e-474e-be33-c29cc5437c5d-tr-8: queued
Task tg-d7b7909f-c58e-474e-be33-c29cc5437c5d-tr-0: queued
Task tg-d7b7909f-c58e-474e-be33-c29cc5437c5d-tr-1: queued
Task tg-d7b7909f-c58e

In [25]:
# Preview the first partition. The LIMIT is pushed into the query so only the
# five rows that .head() displays are fetched, not the whole file.
session.sql("SELECT * FROM read_parquet('/Volumes/data/motor_colisions/file_0.parquet') LIMIT 5").head()

TaskSet ts-e1eebf97-60b0-4325-8fa6-dcead612382f: running
TaskGroup tg-242ba8e6-7ece-4e7a-9fc8-560c8914169a made progress
TaskGroup tg-242ba8e6-7ece-4e7a-9fc8-560c8914169a made progress
Task tg-242ba8e6-7ece-4e7a-9fc8-560c8914169a-tr-0: queued
TaskGroup tg-242ba8e6-7ece-4e7a-9fc8-560c8914169a made progress
Task tg-242ba8e6-7ece-4e7a-9fc8-560c8914169a-tr-0: running
Task tg-242ba8e6-7ece-4e7a-9fc8-560c8914169a-tr-0: done
TaskGroup tg-242ba8e6-7ece-4e7a-9fc8-560c8914169a made progress
TaskSet ts-e1eebf97-60b0-4325-8fa6-dcead612382f: compleated
Getting results


Unnamed: 0,CRASH DATE,CRASH TIME,BOROUGH,ZIP CODE,LATITUDE,LONGITUDE,LOCATION,ON STREET NAME,CROSS STREET NAME,OFF STREET NAME,...,CONTRIBUTING FACTOR VEHICLE 2,CONTRIBUTING FACTOR VEHICLE 3,CONTRIBUTING FACTOR VEHICLE 4,CONTRIBUTING FACTOR VEHICLE 5,COLLISION_ID,VEHICLE TYPE CODE 1,VEHICLE TYPE CODE 2,VEHICLE TYPE CODE 3,VEHICLE TYPE CODE 4,VEHICLE TYPE CODE 5
0,09/11/2021,2:39,,,,,,WHITESTONE EXPRESSWAY,20 AVENUE,,...,Unspecified,,,,4455765,Sedan,Sedan,,,
1,03/26/2022,11:45,,,,,,QUEENSBORO BRIDGE UPPER,,,...,,,,,4513547,Sedan,,,,
2,06/29/2022,6:55,,,,,,THROGS NECK BRIDGE,,,...,Unspecified,,,,4541903,Sedan,Pick-up Truck,,,
3,09/11/2021,9:35,BROOKLYN,11208.0,40.667202,-73.8665,"(40.667202, -73.8665)",,,1211 LORING AVENUE,...,,,,,4456314,Sedan,,,,
4,12/14/2021,8:13,BROOKLYN,11233.0,40.683304,-73.917274,"(40.683304, -73.917274)",SARATOGA AVENUE,DECATUR STREET,,...,,,,,4486609,,,,,


In [None]:
# Preview the second partition. The query previously ended in a truncated
# `LIM`, which SQL reads as a table alias rather than a row limit, so the whole
# file was requested; the saved output for that run is a gRPC DEADLINE_EXCEEDED
# during fetching. An explicit LIMIT keeps the transfer to the rows shown.
session.sql("SELECT * FROM read_parquet('/Volumes/data/motor_colisions/file_1.parquet') LIMIT 5").head()

TaskSet ts-b605de98-20d9-40be-84b0-893c2ac40709: running
TaskGroup tg-128d49ab-4fcc-409e-b2e9-c6a9dea900d7 made progress
TaskGroup tg-128d49ab-4fcc-409e-b2e9-c6a9dea900d7 made progress
Task tg-128d49ab-4fcc-409e-b2e9-c6a9dea900d7-tr-0: queued
TaskGroup tg-128d49ab-4fcc-409e-b2e9-c6a9dea900d7 made progress
Task tg-128d49ab-4fcc-409e-b2e9-c6a9dea900d7-tr-0: running
Task tg-128d49ab-4fcc-409e-b2e9-c6a9dea900d7-tr-0: done
Task tg-128d49ab-4fcc-409e-b2e9-c6a9dea900d7-tr-0: fetching
Task tg-128d49ab-4fcc-409e-b2e9-c6a9dea900d7-tr-0: fetching
Task tg-128d49ab-4fcc-409e-b2e9-c6a9dea900d7-tr-0: fetching
Task tg-128d49ab-4fcc-409e-b2e9-c6a9dea900d7-tr-0: fetching


_MultiThreadedRendezvous: <_MultiThreadedRendezvous of RPC that terminated with:
	status = StatusCode.DEADLINE_EXCEEDED
	details = "context deadline exceeded"
	debug_error_string = "UNKNOWN:Error received from peer ipv6:%5B::1%5D:50051 {grpc_message:"context deadline exceeded", grpc_status:4, created_time:"2024-10-15T16:37:15.986066226+00:00"}"
>

In [22]:
# Same projection as silver_sql (crash_datetime built with strptime, remaining
# columns renamed to snake_case), but reading the parquet files from the
# relative ../_example_data/motor_colisions/ directory.
sql_exp = """
SELECT 
    strptime(CONCAT("CRASH DATE", ' - ', "CRASH TIME"), '%m/%d/%Y - %H:%M') as crash_datetime,
    "BOROUGH" as borough,
    "ZIP CODE" as zip_code,
    "LATITUDE" as latitude,
    "LONGITUDE" as longitude,
    "LOCATION" as location,
    "ON STREET NAME" as on_street_name,
    "CROSS STREET NAME" as cross_street_name,
    "OFF STREET NAME" as off_street_name,
    "NUMBER OF PERSONS INJURED" as number_of_persons_injured,
    "NUMBER OF PERSONS KILLED" as number_of_persons_killed,
    "NUMBER OF PEDESTRIANS INJURED" as number_of_pedestrians_injured,
    "NUMBER OF PEDESTRIANS KILLED" as number_of_pedestrians_killed,
    "NUMBER OF CYCLIST INJURED" as number_of_cyclist_injured,
    "NUMBER OF CYCLIST KILLED" as number_of_cyclist_killed,
    "NUMBER OF MOTORIST INJURED" as number_of_motorist_injured,
    "NUMBER OF MOTORIST KILLED" as number_of_motorist_killed,
    "CONTRIBUTING FACTOR VEHICLE 1" as contributing_factor_vehicle_1,
    "CONTRIBUTING FACTOR VEHICLE 2" as contributing_factor_vehicle_2,
    "CONTRIBUTING FACTOR VEHICLE 3" as contributing_factor_vehicle_3,
    "CONTRIBUTING FACTOR VEHICLE 4" as contributing_factor_vehicle_4,
    "CONTRIBUTING FACTOR VEHICLE 5" as contributing_factor_vehicle_5,
    "COLLISION_ID" as collision_id,
    "VEHICLE TYPE CODE 1" as vehicle_type_code_1,
    "VEHICLE TYPE CODE 2" as vehicle_type_code_2,
    "VEHICLE TYPE CODE 3" as vehicle_type_code_3,
    "VEHICLE TYPE CODE 4" as vehicle_type_code_4,
    "VEHICLE TYPE CODE 5" as vehicle_type_code_5
FROM read_parquet('../_example_data/motor_colisions/*.parquet')
"""

In [23]:
from harpy.quack import QuackContext

# Run sql_exp through a QuackContext opened directly in this cell (not via
# `session`) and convert the result to Arrow, passing rows_per_batch=1000 to
# .arrow(). `arrow_table` is used after the context has exited.
with QuackContext() as q:
    query = q.sql(sql_exp)
    arrow_table = query.arrow(rows_per_batch=1000)

QuackContext entered
QuackContext exited


In [24]:
# Number of record batches that make up arrow_table.
len(arrow_table.to_batches())

2127

In [7]:
# Close the harpy session; the saved output shows the call returning the
# Session object.
session.close()

<harpy.session.Session at 0x7fa33818c160>