## Prerequisites

In [38]:
%pip install clickhouse-connect --quiet

Note: you may need to restart the kernel to use updated packages.


In [None]:
import clickhouse_connect
import os
import polars as pl
from dotenv import load_dotenv

# Let python-dotenv populate the process environment before settings are read.
load_dotenv()

# Connection settings are taken from environment variables; an unset variable
# yields None, which is handed to the client unchanged.
host, username, password = (
    os.getenv(name)
    for name in ("CLICKHOUSE_HOST", "CLICKHOUSE_USER", "CLICKHOUSE_PASSWORD")
)

# Single client object reused by the query cells of this notebook.
client = clickhouse_connect.get_client(
    host=host, port=8123, username=username, password=password
)

In [None]:
# Preview the first few rows read directly from the raw CSV files in S3,
# before anything is loaded into a table.
#
# The S3 endpoint and credentials are read with `os.environ[...]` rather than
# `os.getenv(...)`: an unset variable now raises a KeyError naming the missing
# variable, instead of being interpolated into the SQL as the text 'None'.
s3_preview_sql = f"""
            SELECT *
            FROM s3(
                '{os.environ["S3_AWS_URL_NODE_PORT"]}/data-raw/Backblaze-Hard-Drive-Data/*/*.csv',
                '{os.environ["S3_AWS_ACCESS_KEY_ID"]}',
                '{os.environ["S3_AWS_SECRET_ACCESS_KEY"]}',
                'CSVWithNames'
            )
            LIMIT 5
    """

# query_df returns a pandas frame; convert it to polars for display.
df = pl.from_pandas(client.query_df(s3_preview_sql))
df

date,serial_number,model,capacity_bytes,failure,datacenter,cluster_id,vault_id,pod_id,pod_slot_num,is_legacy_format,smart_1_normalized,smart_1_raw,smart_2_normalized,smart_2_raw,smart_3_normalized,smart_3_raw,smart_4_normalized,smart_4_raw,smart_5_normalized,smart_5_raw,smart_7_normalized,smart_7_raw,smart_8_normalized,smart_8_raw,smart_9_normalized,smart_9_raw,smart_10_normalized,smart_10_raw,smart_11_normalized,smart_11_raw,smart_12_normalized,smart_12_raw,smart_13_normalized,smart_13_raw,smart_15_normalized,smart_15_raw,…,smart_230_raw,smart_231_normalized,smart_231_raw,smart_232_normalized,smart_232_raw,smart_233_normalized,smart_233_raw,smart_234_normalized,smart_234_raw,smart_235_normalized,smart_235_raw,smart_240_normalized,smart_240_raw,smart_241_normalized,smart_241_raw,smart_242_normalized,smart_242_raw,smart_244_normalized,smart_244_raw,smart_245_normalized,smart_245_raw,smart_246_normalized,smart_246_raw,smart_247_normalized,smart_247_raw,smart_248_normalized,smart_248_raw,smart_250_normalized,smart_250_raw,smart_251_normalized,smart_251_raw,smart_252_normalized,smart_252_raw,smart_254_normalized,smart_254_raw,smart_255_normalized,smart_255_raw
datetime[ns],str,str,i64,i64,str,i64,i64,i64,i64,str,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,str,str,str,str,…,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,str,str,i64,i64,i64,i64,i64,i64,str,str,str,str,str,str,i64,i64,str,str
2025-01-01 00:00:00,"""2207E60CC65A""","""CT250MX500SSD1""",250059350016,0,"""sac0""",0,1028,13,,"""False""",100,0,,,,,,,100,0,,,,,100,16790,,,,,100,3,,,,,…,,,,,,,,,,,,,,,,,,,,,,100.0,33742664104.0,100.0,959739128.0,100.0,1603481389.0,,,,,,,,,,
2025-01-01 00:00:00,"""2340E87B92B5""","""CT250MX500SSD1""",250059350016,0,"""sac0""",0,1028,14,,"""False""",100,0,,,,,,,100,0,,,,,100,3364,,,,,100,5,,,,,…,,,,,,,,,,,,,,,,,,,,,,100.0,6105396832.0,100.0,73589604.0,100.0,158733589.0,,,,,,,,,,
2025-01-01 00:00:00,"""2EGK64VX""","""HGST HUH728080ALE604""",8001563222016,0,"""sac0""",0,1028,4,12.0,"""False""",100,0,134.0,104.0,236.0,113.0,100.0,66.0,100,0,100.0,0.0,128.0,18.0,96,32538,100.0,0.0,,,100,61,,,,,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2025-01-01 00:00:00,"""2EHZAKAX""","""HGST HUH728080ALE604""",8001563222016,0,"""sac0""",0,1028,12,30.0,"""False""",100,0,134.0,105.0,155.0,418.0,100.0,23.0,100,0,100.0,0.0,128.0,18.0,90,71139,100.0,0.0,,,100,18,,,,,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2025-01-01 00:00:00,"""2EJ02A1X""","""HGST HUH728080ALE604""",8001563222016,0,"""sac0""",0,1028,10,14.0,"""False""",100,0,133.0,108.0,151.0,409.0,100.0,22.0,100,0,100.0,0.0,128.0,18.0,90,71183,100.0,0.0,,,100,17,,,,,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


## Loading data from S3

In [45]:
# Load the raw Backblaze CSV files from S3 into a MergeTree table.
#
# - `IF NOT EXISTS` makes this cell safe to re-run (e.g. Restart & Run All):
#   a plain CREATE TABLE fails on the second run because the table already
#   exists. When the table exists ClickHouse skips the statement, so the data
#   is not loaded a second time.
# - The stray comma after `*` was removed so the SELECT does not depend on the
#   server tolerating a trailing comma.
# - `os.environ[...]` raises a KeyError for an unset variable instead of
#   interpolating the text 'None' into the S3 URL or credentials.
# - NOTE(review): assumes the `dwh` database already exists; confirm.
client.command(
    f"""
    CREATE TABLE IF NOT EXISTS dwh.backblaze_hard_drive_data
    ENGINE = MergeTree()
    ORDER BY assumeNotNull(date)
    AS SELECT * FROM s3(
        '{os.environ["S3_AWS_URL_NODE_PORT"]}/data-raw/Backblaze-Hard-Drive-Data/*/*.csv',
        '{os.environ["S3_AWS_ACCESS_KEY_ID"]}',
        '{os.environ["S3_AWS_SECRET_ACCESS_KEY"]}',
        'CSVWithNames'
    )
    """
)

<clickhouse_connect.driver.summary.QuerySummary at 0x74ad16600d10>

In [48]:
# Fetch ten rows from the table created above to check what was stored.
table_preview_sql = """
        SELECT * 
        FROM dwh.backblaze_hard_drive_data 
        LIMIT 10
        """

# query_df returns a pandas frame; convert it to polars for display.
table_preview_pd = client.query_df(table_preview_sql)
df = pl.from_pandas(table_preview_pd)
df

date,serial_number,model,capacity_bytes,failure,datacenter,cluster_id,vault_id,pod_id,pod_slot_num,is_legacy_format,smart_1_normalized,smart_1_raw,smart_2_normalized,smart_2_raw,smart_3_normalized,smart_3_raw,smart_4_normalized,smart_4_raw,smart_5_normalized,smart_5_raw,smart_7_normalized,smart_7_raw,smart_8_normalized,smart_8_raw,smart_9_normalized,smart_9_raw,smart_10_normalized,smart_10_raw,smart_11_normalized,smart_11_raw,smart_12_normalized,smart_12_raw,smart_13_normalized,smart_13_raw,smart_15_normalized,smart_15_raw,…,smart_230_raw,smart_231_normalized,smart_231_raw,smart_232_normalized,smart_232_raw,smart_233_normalized,smart_233_raw,smart_234_normalized,smart_234_raw,smart_235_normalized,smart_235_raw,smart_240_normalized,smart_240_raw,smart_241_normalized,smart_241_raw,smart_242_normalized,smart_242_raw,smart_244_normalized,smart_244_raw,smart_245_normalized,smart_245_raw,smart_246_normalized,smart_246_raw,smart_247_normalized,smart_247_raw,smart_248_normalized,smart_248_raw,smart_250_normalized,smart_250_raw,smart_251_normalized,smart_251_raw,smart_252_normalized,smart_252_raw,smart_254_normalized,smart_254_raw,smart_255_normalized,smart_255_raw
datetime[ns],str,str,i64,i64,str,i64,i64,i64,i64,str,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,str,str,str,str,…,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,str,str,i64,i64,i64,i64,i64,i64,str,str,str,str,str,str,i64,i64,str,str
2025-01-01 00:00:00,"""2207E60CC65A""","""CT250MX500SSD1""",250059350016,0,"""sac0""",0,1028,13,,"""False""",100,0,,,,,,,100.0,0.0,,,,,100,16790,,,,,100,3,,,,,…,,,,,,,,,,,,,,,,,,,,,,100.0,33742664104.0,100.0,959739128.0,100.0,1603481389.0,,,,,,,,,,
2025-01-01 00:00:00,"""2340E87B92B5""","""CT250MX500SSD1""",250059350016,0,"""sac0""",0,1028,14,,"""False""",100,0,,,,,,,100.0,0.0,,,,,100,3364,,,,,100,5,,,,,…,,,,,,,,,,,,,,,,,,,,,,100.0,6105396832.0,100.0,73589604.0,100.0,158733589.0,,,,,,,,,,
2025-01-01 00:00:00,"""2EGK64VX""","""HGST HUH728080ALE604""",8001563222016,0,"""sac0""",0,1028,4,12.0,"""False""",100,0,134.0,104.0,236.0,113.0,100.0,66.0,100.0,0.0,100.0,0.0,128.0,18.0,96,32538,100.0,0.0,,,100,61,,,,,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2025-01-01 00:00:00,"""2EHZAKAX""","""HGST HUH728080ALE604""",8001563222016,0,"""sac0""",0,1028,12,30.0,"""False""",100,0,134.0,105.0,155.0,418.0,100.0,23.0,100.0,0.0,100.0,0.0,128.0,18.0,90,71139,100.0,0.0,,,100,18,,,,,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2025-01-01 00:00:00,"""2EJ02A1X""","""HGST HUH728080ALE604""",8001563222016,0,"""sac0""",0,1028,10,14.0,"""False""",100,0,133.0,108.0,151.0,409.0,100.0,22.0,100.0,0.0,100.0,0.0,128.0,18.0,90,71183,100.0,0.0,,,100,17,,,,,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2025-01-01 00:00:00,"""7LZ021LA""","""Seagate BarraCuda SSD ZA250CM1…",250059350016,0,"""sac0""",0,1028,6,,"""False""",100,0,,,,,,,,,,,,,100,42538,,,,,100,15,,,,,…,,100.0,109951162777649.0,100.0,463856467968.0,100.0,120960.0,,,100.0,253672511712.0,,,100.0,65786.0,100.0,33891.0,,,,,,,,,,,,,,,,,,,,
2025-01-01 00:00:00,"""S2ZYJ9CF511681""","""ST500LM012 HN""",500107862016,0,"""sac0""",0,1028,10,,"""False""",100,2815,252.0,0.0,95.0,1798.0,100.0,26.0,252.0,0.0,252.0,0.0,252.0,0.0,100,78798,252.0,0.0,90.0,11102.0,100,34,,,,,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2025-01-01 00:00:00,"""S2ZYJ9GGB01000""","""ST500LM012 HN""",500107862016,0,"""sac0""",0,1028,0,,"""False""",100,17,252.0,0.0,95.0,1794.0,100.0,11.0,252.0,0.0,252.0,0.0,252.0,0.0,100,62739,252.0,0.0,99.0,1412.0,100,12,,,,,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2025-01-01 00:00:00,"""S2ZYJ9GGB01001""","""ST500LM012 HN""",500107862016,0,"""sac0""",0,1028,4,,"""False""",100,2,252.0,0.0,94.0,1830.0,100.0,11.0,252.0,0.0,252.0,0.0,252.0,0.0,100,47676,252.0,0.0,100.0,19.0,100,11,,,,,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2025-01-01 00:00:00,"""S2ZYJ9GGB01020""","""ST500LM012 HN""",500107862016,0,"""sac0""",0,1028,2,,"""False""",100,60,252.0,0.0,95.0,1779.0,100.0,23.0,252.0,0.0,252.0,0.0,252.0,0.0,100,65007,252.0,0.0,98.0,2682.0,100,37,,,,,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [49]:
# Count the rows in the loaded table; the result is a one-row frame with a
# single `total_rows` column.
row_count_sql = """
        SELECT count(*) as total_rows
        FROM dwh.backblaze_hard_drive_data
        """

row_count_pd = client.query_df(row_count_sql)
df = pl.from_pandas(row_count_pd)
df

total_rows
u64
56608028
