In [None]:
# 1BRC repository: https://github.com/gunnarmorling/1brc
# Thanks to the repo author and contributors for providing the tools.
# Clone the 1brc GitHub repo and generate the measurements file with its Python script
# (or any other generator the repo provides), then copy it to the remote JupyterHub
# server as measurements.csv, which is the file name this notebook reads.
# List the file to confirm it is present and to see its size.
%ls -lh measurements.csv

-rw-r--r-- 1 jovyan users 15G Oct 18 17:01 measurements.csv


In [10]:
!head measurements.csv

Citrus Park;89.4
Katsina;17.3
Javānrūd;32.0
Khānah Sūr;47.7
Delmas;-71.7
Matadepera;-53.9
Gurupá;72.5
Çerkezköy;-44.9
Dylym;87.7
Kyzyl-Adyr;85.2


In [11]:
%pip install duckdb --quiet

Note: you may need to restart the kernel to use updated packages.


In [12]:
import os

import duckdb
from dotenv import load_dotenv

# Read settings from a .env file into the process environment.
# The trailing semicolon keeps the call's return value out of the cell output.
load_dotenv();

In [13]:
def _require_env(name):
    """Return the value of environment variable ``name``.

    Raises:
        RuntimeError: if the variable is not set, so a missing setting fails
            here instead of being formatted into the SQL as the text "None".
    """
    value = os.getenv(name)
    if value is None:
        raise RuntimeError("Environment variable {!r} is not set".format(name))
    return value


def _sql_literal(value):
    """Return ``value`` as a single-quoted SQL string literal.

    Embedded single quotes are doubled so a value such as a password
    containing ``'`` cannot terminate the literal early.
    """
    return "'" + str(value).replace("'", "''") + "'"


# Name of the Postgres database that holds the DuckLake catalog; used both in
# the Postgres secret and in the ATTACH connection string.
CATALOG_DB = "ducklake_catalog"

# Resolve all settings up front so a missing variable is reported before any
# SQL is sent.
pg_host = _require_env("POSTGRES_HOST")
pg_port = int(_require_env("POSTGRES_PORT"))  # int() rejects non-numeric ports
pg_user = _require_env("DUCKLAKE_PG_LOGIN")
pg_password = _require_env("DUCKLAKE_PG_PASS")
s3_key_id = _require_env("S3_AWS_ACCESS_KEY_ID")
s3_secret = _require_env("S3_AWS_SECRET_ACCESS_KEY")

con = duckdb.connect()
con.execute("install ducklake;")
con.execute("install postgres;")

# Credentials for the Postgres database holding the DuckLake catalog.
con.execute(
    """
        CREATE SECRET (
            TYPE postgres,
            HOST {host},
            PORT {port},
            DATABASE {db},
            USER {user},
            PASSWORD {password}
        );
    """.format(
        host=_sql_literal(pg_host),
        port=pg_port,
        user=_sql_literal(pg_user),
        password=_sql_literal(pg_password),
        db=CATALOG_DB,
    )
)

# Credentials for the S3-compatible endpoint used as DATA_PATH below
# (path-style URLs, SSL disabled).
con.execute(
    """
        CREATE OR REPLACE SECRET secret (
            TYPE s3,
            ENDPOINT {endpoint},
            KEY_ID {key_id},
            SECRET {secret},
            URL_STYLE 'path',
            USE_SSL 'false'
        );
    """.format(
        endpoint=_sql_literal("seaweedfs-s3.seaweedfs:8333"),
        key_id=_sql_literal(s3_key_id),
        secret=_sql_literal(s3_secret),
    )
)

# Attach the DuckLake catalog and make it the default for the following cells.
con.execute(
    """
        ATTACH {catalog} AS my_ducklake
            (DATA_PATH 's3://ducklake/');
        USE my_ducklake;
    """.format(
        catalog=_sql_literal(
            "ducklake:postgres:dbname={db} host={host}".format(
                db=CATALOG_DB, host=pg_host
            )
        )
    )
)

<_duckdb.DuckDBPyConnection at 0x7303e03f7830>

In [None]:
# Look up the table in the DuckLake metadata catalog; only rows whose
# end_snapshot is NULL are considered.
matching_tables = con.execute(
    """
            SELECT table_name, path
            FROM __ducklake_metadata_my_ducklake.ducklake_table
            WHERE table_name = 'brc_measurements' and end_snapshot IS NULL;
        """
).fetch_df()

# True when at least one matching metadata row was returned.
is_created = len(matching_tables) > 0

In [None]:
# Load the CSV into the `brc_measurements` table, but only when `is_created`
# is false, so re-running the notebook does not attempt to create it again.
if not is_created:
    create_table_sql = """
            CREATE TABLE brc_measurements AS
            FROM read_csv('measurements.csv', delim=';', header=False, names=['station', 'measurement']);
        """
    con.execute(create_table_sql).fetch_df()

In [16]:
# Sanity check: total number of rows in the table.
row_count_sql = """
        SELECT count(*)
        FROM brc_measurements;
    """
# count(*) yields exactly one row with one column; unpack it directly.
[(row_count,)] = con.execute(row_count_sql).fetchall()
row_count

1000000000

In [None]:
%%time

# first run 1m 13s
# second run 27s
# third run 22s

con.execute(
    """
        SELECT 
            station,
            MIN(measurement) AS min_measurement,
            MAX(measurement) AS max_measurement,
            AVG(measurement) AS avg_measurement
        FROM brc_measurements
        GROUP BY station
        ORDER BY station;
    """
).fetch_df()

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

CPU times: user 20.8 s, sys: 0 ns, total: 20.8 s
Wall time: 21.9 s


Unnamed: 0,station,min_measurement,max_measurement,avg_measurement
0,A Yun Pa,-99.9,99.9,0.027729
1,Aadorf,-99.9,99.9,-0.068436
2,Aalsmeer,-99.9,99.9,0.183946
3,Aalten,-99.9,99.9,-0.148162
4,Aasiaat,-99.9,99.9,0.244987
...,...,...,...,...
8855,‘Anbarābād,-99.9,99.9,-0.233353
8856,‘Aynkāwah,-99.9,99.9,0.081314
8857,’Aïn Arnat,-99.9,99.9,-0.037782
8858,’Tlat Bni Oukil,-99.9,99.9,0.179272
