## Prerequisites

In [1]:
%pip install duckdb --quiet

Note: you may need to restart the kernel to use updated packages.


In [2]:
import duckdb
import os
from dotenv import load_dotenv

# Load variables from a .env file into the process environment.
# The trailing semicolon stops the cell from echoing load_dotenv()'s return value.
load_dotenv();

In [3]:
# Open the DuckDB session that every later cell shares through `con`.
con = duckdb.connect()

# Install the two extensions used further down. execute() hands back the
# connection itself, so the calls can be chained.
con.execute("install ducklake;").execute("install postgres;")

<_duckdb.DuckDBPyConnection at 0x7b930f4fa430>

## Populate required Postgres and SeaweedFS secrets

In [4]:
# Read the Postgres settings up front. os.environ[...] raises KeyError for a
# missing variable, whereas os.getenv() would return None and the literal text
# "None" would be formatted into the secret.
pg_settings = {
    "host": os.environ["POSTGRES_HOST"],
    "user": os.environ["DUCKLAKE_PG_LOGIN"],
    "password": os.environ["DUCKLAKE_PG_PASS"],
}
# PORT is emitted unquoted, so insist on a real integer.
pg_port = int(os.environ["POSTGRES_PORT"])

# Double every single quote so a value (typically the password) that contains
# ' cannot terminate its SQL string literal early.
pg_sql_values = {
    field: value.replace("'", "''") for field, value in pg_settings.items()
}

# OR REPLACE lets the cell be re-run in a live session; without it the second
# run fails because the default Postgres secret already exists.
con.execute(
    """
        CREATE OR REPLACE SECRET (
            TYPE postgres,
            HOST '{host}',
            PORT {port},
            DATABASE {db},
            USER '{user}',
            PASSWORD '{password}'
        );
    """.format(
        port=pg_port,
        db="ducklake_catalog",
        **pg_sql_values,
    )
)

<_duckdb.DuckDBPyConnection at 0x7b930f4fa430>

In [5]:
# Read the S3 credentials up front. os.environ[...] raises KeyError for a
# missing variable, whereas os.getenv() would return None and the literal text
# "None" would be stored as the key or secret.
s3_credentials = {
    "key_id": os.environ["S3_AWS_ACCESS_KEY_ID"],
    "secret": os.environ["S3_AWS_SECRET_ACCESS_KEY"],
}

# Double every single quote so a credential that contains ' cannot terminate
# its SQL string literal early.
s3_sql_values = {
    field: value.replace("'", "''") for field, value in s3_credentials.items()
}

# Register the S3 secret for the SeaweedFS endpoint: path-style URLs, no TLS.
con.execute(
    """
        CREATE OR REPLACE SECRET secret (
            TYPE s3,
            ENDPOINT '{endpoint}',
            KEY_ID '{key_id}',
            SECRET '{secret}',
            URL_STYLE 'path',
            USE_SSL 'false'
        );
    """.format(
        endpoint="seaweedfs-s3.seaweedfs:8333",
        **s3_sql_values,
    )
)

<_duckdb.DuckDBPyConnection at 0x7b930f4fa430>

## Test DuckDB S3 read

In [6]:
# Smoke test: count the rows of one raw CSV straight from the S3 bucket to
# confirm the S3 secret works.
s3_probe_sql = """
        SELECT count(*)
        FROM 's3://data-raw/Backblaze-Hard-Drive-Data/data_Q2_2025/2025-04-01.csv';
    """
con.execute(s3_probe_sql).fetchall()

[(312812,)]

## Create/Attach DuckLake

In [7]:
# Fail fast with KeyError when the host is not configured; os.getenv() would
# return None and produce the connection string "host=None".
pg_host = os.environ["POSTGRES_HOST"]

# Attach the DuckLake whose catalog lives in Postgres and whose data files go
# to the s3://ducklake/ bucket, then make it the default database.
# IF NOT EXISTS keeps the cell re-runnable: a plain ATTACH errors when
# my_ducklake is already attached in this session.
con.execute(
    """
        ATTACH IF NOT EXISTS 'ducklake:postgres:dbname=ducklake_catalog host={host}' AS my_ducklake
            (DATA_PATH 's3://ducklake/');
        USE my_ducklake;
    """.format(host=pg_host)
)

<_duckdb.DuckDBPyConnection at 0x7b930f4fa430>

## Tests on nl_stations

In [8]:
# Start from a clean slate so the CREATE TABLE in the next cell can be re-run.
drop_stations_sql = """
        DROP TABLE IF EXISTS nl_train_stations;
    """
con.execute(drop_stations_sql).fetchall()

[]

In [9]:
# Build the table directly from the public stations CSV; the result row
# reports how many rows were inserted.
create_stations_sql = """
        CREATE TABLE nl_train_stations AS
        FROM 'https://blobs.duckdb.org/nl_stations.csv';
    """
con.execute(create_stations_sql).fetchall()

[(578,)]

In [10]:
# Read the row count back from the table to check it against the insert count.
count_stations_sql = """
    SELECT count(*)
    FROM nl_train_stations
    """
con.execute(count_stations_sql).fetchall()

[(578,)]

In [11]:
# Preview the first five rows as a DataFrame for rich display.
preview_stations_sql = """
    SELECT *
    FROM nl_train_stations
    LIMIT 5
    """
con.execute(preview_stations_sql).fetch_df()

Unnamed: 0,id,code,uic,name_short,name_medium,name_long,slug,country,type,geo_lat,geo_lng
0,266,HT,8400319,Den Bosch,'s-Hertogenbosch,'s-Hertogenbosch,s-hertogenbosch,NL,knooppuntIntercitystation,51.69048,5.29362
1,269,HTO,8400320,Dn Bosch O,'s-Hertogenb. O.,'s-Hertogenbosch Oost,s-hertogenbosch-oost,NL,stoptreinstation,51.700554,5.318333
2,227,HDE,8400388,'t Harde,'t Harde,'t Harde,t-harde,NL,stoptreinstation,52.409168,5.893611
3,8,AHBF,8015345,Aachen,Aachen Hbf,Aachen Hbf,aachen-hbf,D,knooppuntIntercitystation,50.7678,6.091499
4,818,AW,8015199,Aachen W,Aachen West,Aachen West,aachen-west,D,stoptreinstation,50.78036,6.070715


## Load Backblaze-Hard-Drive-Data into DuckLake

In [15]:
# Load every Backblaze CSV matched by the glob into one DuckLake table.
# The catalog persists in Postgres, so the table survives kernel restarts and a
# bare CREATE TABLE fails on any re-run. Drop it first, the same way
# nl_train_stations is rebuilt earlier in this notebook.
# NOTE: re-running this cell reloads all matched files, which is expensive.
con.execute(
    """
        DROP TABLE IF EXISTS hard_drive_data;
        CREATE TABLE hard_drive_data AS
        SELECT * FROM read_csv('s3://data-raw/Backblaze-Hard-Drive-Data/*/*.csv');
    """
)

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

<_duckdb.DuckDBPyConnection at 0x7b930f4fa430>

In [17]:
# Sanity check: total number of rows loaded into hard_drive_data.
count_drives_sql = """
        SELECT count(*)
        FROM hard_drive_data;
    """
con.execute(count_drives_sql).fetchall()

[(56608028,)]