## Check database connectivity

In [22]:
# Install the PostgreSQL client packages into the kernel's environment.
# connectorx and adbc_driver_postgresql match the "connectorx" and "adbc"
# engines that the connectivity check passes to polars.
# NOTE(review): versions are unpinned, and psycopg2-binary is not referenced by
# the cells reviewed here — confirm it is still required and consider pinning.
%pip install connectorx adbc_driver_postgresql psycopg2-binary --quiet

Note: you may need to restart the kernel to use updated packages.


In [23]:
# Upgrade s3fs to the newest available release in the kernel's environment.
# NOTE(review): no cell reviewed here imports s3fs directly — presumably it
# backs s3:// access somewhere; confirm, and consider pinning a version so a
# fresh run is reproducible.
%pip install --upgrade s3fs --quiet

Note: you may need to restart the kernel to use updated packages.


In [24]:
import polars as pl
from dotenv import load_dotenv
import os
import boto3

# Load settings from a .env file (if any) into the process environment.
load_dotenv()

# PostgreSQL connection URI used by every database read/write in this notebook.
POSTGRES_CONN_STR = os.getenv("POSTGRES_CONN_STR")

# S3 endpoint and credentials. The keys are chosen so the same mapping can be
# handed to polars as `storage_options` and to boto3 as client keyword arguments.
storage_options = {
    option: os.getenv(env_name)
    for option, env_name in (
        ("endpoint_url", "S3_AWS_URL"),
        ("aws_access_key_id", "S3_AWS_ACCESS_KEY_ID"),
        ("aws_secret_access_key", "S3_AWS_SECRET_ACCESS_KEY"),
    )
}

# NOTE(review): verify=False switches off SSL certificate verification for this
# client — presumably the endpoint uses a self-signed certificate; confirm
# before pointing this at a production endpoint.
s3 = boto3.client("s3", verify=False, **storage_options)

In [25]:
# Smoke-test both polars database engines against PostgreSQL by running a
# trivial query through each one.
#
# A real connectivity or driver problem surfaces as an exception rather than a
# wrong query result, so exceptions are reported as "fail" instead of aborting
# the cell; that way the second engine is still checked when the first breaks.
# Only the exception type is printed, to avoid echoing connection details.
for label, engine in (("ADBC", "adbc"), ("ConnectorX", "connectorx")):
    try:
        probe = pl.read_database_uri(
            uri=POSTGRES_CONN_STR,
            query="SELECT 1 as test;",
            engine=engine,
        )
        status = "ok" if probe["test"].min() == 1 else "fail"
    except Exception as exc:
        status = f"fail ({type(exc).__name__})"
    print(f"{label} - {status}")

ADBC - ok
ConnectorX - ok


## Sample load speed for one CSV file (~125 MB)

In [26]:
# Collect every object under the dataset prefix.
# A single list_objects_v2 call returns at most 1,000 keys, so walk all pages
# with a paginator; otherwise a larger listing would be silently truncated.
objects = [
    obj
    for page in s3.get_paginator("list_objects_v2").paginate(
        Bucket="data-raw", Prefix="Backblaze-Hard-Drive-Data/"
    )
    # A page carries no "Contents" entry when nothing matches the prefix.
    for obj in page.get("Contents", [])
]
print(f"{len(objects)} files")

181 filles


In [27]:
# Read the first listed CSV from S3 into an in-memory DataFrame.
# Timing note carried over from the original cell: ~1.2s
first_csv_uri = "s3://data-raw/" + objects[0]["Key"]
lazy_csv = pl.scan_csv(first_csv_uri, storage_options=storage_options)
df = lazy_csv.collect()

In [28]:
# Select the smart_* columns whose name also contains "_raw" or "_normalized".
smart_columns = [
    name
    for name in df.columns
    if name.startswith("smart_")
    and any(tag in name for tag in ("_raw", "_normalized"))
]

# Cast each selected column to Int64 and the "date" column to a Date.
cols = [pl.col(name).cast(pl.Int64) for name in smart_columns] + [
    pl.col("date").cast(pl.Date)
]

df = df.with_columns(cols)

In [29]:
# Write the sample frame to a test table through the ADBC engine, replacing any
# existing table. Notes carried over from the original cell: default, ~4s
rows_written = df.write_database(
    table_name="raw.hard_drive_data_test",
    connection=POSTGRES_CONN_STR,
    engine="adbc",
    if_table_exists="replace",
)
# Keep the call's return value as the cell's displayed result.
rows_written

304957

In [30]:
# Write the same frame to the green and blue tables (in that order) through the
# ADBC engine, replacing any existing table.
# Notes carried over from the original cell: default, ~4s
for target_table in ("raw.tbl-green", "raw.tbl-blue"):
    rows_written = df.write_database(
        table_name=target_table,
        connection=POSTGRES_CONN_STR,
        engine="adbc",
        if_table_exists="replace",
    )
# Display the result of the last write, as the original cell did.
rows_written

304957

## Load all data

In [11]:
# Load every CSV under the dataset prefix into the raw.hard_drive_data table.

# A single list_objects_v2 call returns at most 1,000 keys, so walk all pages
# with a paginator; otherwise a larger listing would be silently truncated.
objects = [
    obj
    for page in s3.get_paginator("list_objects_v2").paginate(
        Bucket="data-raw", Prefix="Backblaze-Hard-Drive-Data/"
    )
    # A page carries no "Contents" entry when nothing matches the prefix.
    for obj in page.get("Contents", [])
]
if not objects:
    # Fail with a clear message instead of an IndexError on objects[0] below.
    raise RuntimeError(
        "No objects found under s3://data-raw/Backblaze-Hard-Drive-Data/"
    )

# Only the column names of the first file are needed to build the casts, so no
# rows are collected here; the loop below reads the file's data.
header = pl.scan_csv(
    f"s3://data-raw/{objects[0]['Key']}",
    storage_options=storage_options,
).head(0).collect()

# Select the smart_* columns whose name also contains "_raw" or "_normalized".
smart_columns = [
    col
    for col in header.columns
    if col.startswith("smart_") and ("_raw" in col or "_normalized" in col)
]

# Cast each selected column to Int64 and the "date" column to a Date.
# NOTE(review): these expressions come from the first file only — this assumes
# every later file has the same columns; confirm against the dataset.
cols = [pl.col(col).cast(pl.Int64) for col in smart_columns]
cols.append(pl.col("date").cast(pl.Date))

for i, obj in enumerate(objects, start=1):
    print(f"Processing {i}/{len(objects)}: {obj['Key']}")
    df = (
        pl.scan_csv(
            f"s3://data-raw/{obj['Key']}",
            storage_options=storage_options,
        )
        .with_columns(cols)
        .collect()
    )

    df.write_database(
        connection=POSTGRES_CONN_STR,
        table_name="raw.hard_drive_data",
        # The first file recreates the table; every later file is appended.
        if_table_exists="replace" if i == 1 else "append",
        engine="adbc",
    )

Processing 1/181: Backblaze-Hard-Drive-Data/data_Q1_2025/2025-01-01.csv
Processing 2/181: Backblaze-Hard-Drive-Data/data_Q1_2025/2025-01-02.csv
Processing 3/181: Backblaze-Hard-Drive-Data/data_Q1_2025/2025-01-03.csv
Processing 4/181: Backblaze-Hard-Drive-Data/data_Q1_2025/2025-01-04.csv
Processing 5/181: Backblaze-Hard-Drive-Data/data_Q1_2025/2025-01-05.csv
Processing 6/181: Backblaze-Hard-Drive-Data/data_Q1_2025/2025-01-06.csv
Processing 7/181: Backblaze-Hard-Drive-Data/data_Q1_2025/2025-01-07.csv
Processing 8/181: Backblaze-Hard-Drive-Data/data_Q1_2025/2025-01-08.csv
Processing 9/181: Backblaze-Hard-Drive-Data/data_Q1_2025/2025-01-09.csv
Processing 10/181: Backblaze-Hard-Drive-Data/data_Q1_2025/2025-01-10.csv
Processing 11/181: Backblaze-Hard-Drive-Data/data_Q1_2025/2025-01-11.csv
Processing 12/181: Backblaze-Hard-Drive-Data/data_Q1_2025/2025-01-12.csv
Processing 13/181: Backblaze-Hard-Drive-Data/data_Q1_2025/2025-01-13.csv
Processing 14/181: Backblaze-Hard-Drive-Data/data_Q1_2025/20