link: https://github.com/DataTalksClub/nyc-tlc-data/releases/download/yellow/yellow_tripdata_2019-01.csv.gz

The notebook expects the decompressed CSV at `./data/raw/yellow_tripdata_2019-01.csv`; the first step converts it to Parquet.


In [None]:
import duckdb
import pathlib
import os
import pandas as pd
import time, psutil

In [None]:
# Input CSV and the Parquet file derived from it.
raw_path = './data/raw/yellow_tripdata_2019-01.csv'
parquet_path = './data/parquet/yellow_tripdata_2019-01.parquet'
# Derive the output directory from parquet_path itself so the two cannot drift apart.
os.makedirs(os.path.dirname(parquet_path), exist_ok=True)

In [None]:
# One-off conversion: read the raw CSV with read_csv_auto and write every row
# out as a Snappy-compressed Parquet file at parquet_path.
csv_to_parquet_sql = f"""
    COPY (SELECT * FROM read_csv_auto('{raw_path}'))
    TO '{parquet_path}'
    (FORMAT 'parquet', CODEC 'snappy')
"""
duckdb.sql(csv_to_parquet_sql)

In [None]:
# DuckDB connection shared by the query and benchmark cells.
con = duckdb.connect()

# Average tip and number of trips per passenger_count, restricted to trips with
# trip_distance > 2. The Parquet location is taken from parquet_path so the query
# reads exactly the file the conversion step wrote, instead of a second hard-coded copy
# of the path.
query = f"""
    WITH trips AS (
        SELECT
            passenger_count,
            tip_amount,
            trip_distance,
            tpep_pickup_datetime AS pickup_ts
        FROM read_parquet('{parquet_path}')
        WHERE trip_distance > 2
    )
    SELECT
        passenger_count,
        AVG(tip_amount) AS avg_tip,
        COUNT(*)        AS trips
    FROM trips
    GROUP BY passenger_count
    ORDER BY passenger_count;
"""

# Materialise the aggregate as a pandas DataFrame.
duckdf = con.sql(query).df()

In [None]:
# Display the aggregated result (one row per passenger_count).
duckdf

In [None]:


def run_benchmark(f, label):
    """Run ``f`` once, print its wall-clock time and RSS change, and return its result.

    Parameters
    ----------
    f : callable
        Zero-argument callable to measure.
    label : str
        Name printed at the start of the report line (left-aligned, padded to 7 chars).

    Returns
    -------
    object
        Whatever ``f()`` returned.

    Notes
    -----
    ΔRAM is the difference between this process's resident set size sampled before
    and after the call. It is not a peak: memory allocated and released inside ``f``
    is not captured.
    """
    proc = psutil.Process()
    mem_before = proc.memory_info().rss
    # Start the clock after the bookkeeping above and stop it as soon as f() returns,
    # so the reported time covers only the measured call and not the RSS sampling.
    t0 = time.perf_counter()
    out = f()
    elapsed = time.perf_counter() - t0
    mem_after = proc.memory_info().rss
    print(
        f"{label:<7} | time {elapsed:6.2f}s "
        f"| ΔRAM {(mem_after-mem_before)/1e6:6.1f} MB"
    )
    return out

def duckdb_aggregate():
    """Execute the aggregation query on the shared connection and return a DataFrame."""
    return con.sql(query).df()

run_benchmark(duckdb_aggregate, "DuckDB")


Running the aggregation and fetching the result took 0.02s. Let's compare:

DuckDB  | time   0.03s | ΔRAM    1.2 MB



DuckDB to pandas DataFrame (full table load)

In [None]:
def load_full_df():
    """Read every row and column of the Parquet file through DuckDB into a DataFrame."""
    full_scan_sql = f"SELECT * FROM read_parquet('{parquet_path}')"
    relation = con.sql(full_scan_sql)
    return relation.df()

# Measure loading the whole Parquet file into a DataFrame via DuckDB.
run_benchmark(load_full_df, 'Duck -> pandas full')

Full pandas

In [None]:
import pandas as pd
import psutil 
import time

# Columns passed as `usecols` to pd.read_csv, so only these are loaded from the CSV.
COLS  = ["passenger_count", "tip_amount", "trip_distance"]

def pandas_full():
    """Load the CSV in one go and aggregate tips per passenger_count with pandas.

    Reads only the columns in ``COLS`` from ``raw_path``, keeps rows with
    ``trip_distance > 2``, and returns a DataFrame with the columns
    ``passenger_count``, ``avg_tip`` (mean tip_amount) and ``trips`` (row count),
    sorted by ``passenger_count``.
    """
    trips_all = pd.read_csv(
        raw_path,
        usecols=COLS,
        # Nullable 8-bit integer keeps the grouping column small.
        dtype={"passenger_count": "Int8"},
    )
    long_trips = trips_all[trips_all["trip_distance"] > 2]
    per_passenger = long_trips.groupby("passenger_count", observed=True)
    summary = per_passenger.agg(
        avg_tip=("tip_amount", "mean"),
        trips=("tip_amount", "size"),
    )
    return summary.reset_index().sort_values("passenger_count")


Pandas chunked

In [None]:

from collections import defaultdict

def pandas_chunked(chunksize=1_000_000, path=None, usecols=None):
    """Aggregate tips per passenger_count while streaming the CSV in chunks.

    Parameters
    ----------
    chunksize : int
        Number of CSV rows read per chunk.
    path : str, optional
        CSV file to read. Defaults to the module-level ``raw_path``.
    usecols : list of str, optional
        Columns to load. Defaults to the module-level ``COLS``.

    Returns
    -------
    pandas.DataFrame
        Columns ``passenger_count``, ``avg_tip`` and ``trips``, sorted by
        ``passenger_count``. ``avg_tip`` is the mean of the non-null tips and
        ``trips`` is the number of rows, matching ``mean`` / ``size`` in
        ``pandas_full``. Only rows with ``trip_distance > 2`` are counted.
    """
    csv_path = raw_path if path is None else path
    columns = COLS if usecols is None else usecols

    # key → [tip_sum, non-null tip count, trip count]
    totals = defaultdict(lambda: [0.0, 0, 0])

    for chunk in pd.read_csv(csv_path,
                             usecols=columns,
                             dtype={"passenger_count": "Int8"},
                             chunksize=chunksize):
        chunk = chunk[chunk.trip_distance > 2]
        if chunk.empty:
            # Every row of this chunk was filtered out; nothing to accumulate.
            continue
        gb = (chunk
              .groupby("passenger_count", observed=True)["tip_amount"]
              .agg(["sum", "count", "size"]))
        # name=None yields plain (index, sum, count, size) tuples. The default
        # namedtuple rows cannot be unpacked as `pc, row`, and their `count`
        # attribute would collide with the built-in tuple.count method.
        for pc, tip_sum, tip_count, n_trips in gb.itertuples(name=None):
            acc = totals[pc]
            acc[0] += tip_sum
            acc[1] += tip_count
            acc[2] += n_trips

    # collapse dict → tidy DataFrame
    out = (pd.DataFrame({
             "passenger_count": list(totals.keys()),
             # A group whose tips are all missing has no defined average.
             "avg_tip": [s / c if c else float("nan") for s, c, _ in totals.values()],
             "trips"  : [n for _, _, n in totals.values()]
           })
           .sort_values("passenger_count")
           .reset_index(drop=True))
    return out


In [None]:
# Benchmark the load-everything pandas variant.
run_benchmark(pandas_full,    "Pandas ALL")
# The chunked variant is left disabled; the results summary lists it as DNF.
#run_benchmark(pandas_chunked, "Pandas CHUNK") 

Pandas ALL | time   3.15s | ΔRAM  475.3 MB


Final results:

DuckDB  | time   0.03s | ΔRAM    1.2 MB


Pandas ALL | time   3.15s | ΔRAM  475.3 MB


Pandas CHUNK | time   DNF | ΔRAM  DNF

Duck -> pandas full | time   0.92s | ΔRAM   29.7 MB