## Prerequisites

In [2]:
# Make sure the packages below are installed in the kernel's environment.
# NOTE(review): the notebook imports the standard-library `zipfile`; `zipfile36`
# does not appear to be imported anywhere in the visible cells - confirm it is needed.
# NOTE(review): boto3, pandas, polars, s3fs and python-dotenv are imported below but
# not installed here - presumably they are preinstalled in the image; verify.
%pip install requests zipfile36 --quiet

Note: you may need to restart the kernel to use updated packages.


In [3]:
import io
import os
import shutil
import zipfile

import boto3
import pandas as pd
import polars as pl
import requests
import s3fs
import urllib3
from dotenv import load_dotenv

# Pull settings from a .env file (if one is found) into the process environment,
# so the os.getenv(...) lookups in later cells can see them.
load_dotenv()

# Silence urllib3's InsecureRequestWarning; the S3 client in this notebook is
# created with verify=False, which would otherwise emit it on every request.
urllib3.disable_warnings(category=urllib3.exceptions.InsecureRequestWarning)

In [4]:
# Endpoint and credentials are read from environment variables, populated either
# from a local .env file or from the .env provisioned in the user's Jupyter
# session (connected to JupyterHub).

# Settings for the S3 client
s3_client_settings = dict(
    endpoint_url=os.getenv("S3_AWS_URL"),
    aws_access_key_id=os.getenv("S3_AWS_ACCESS_KEY_ID"),
    aws_secret_access_key=os.getenv("S3_AWS_SECRET_ACCESS_KEY"),
    verify=False,  # TLS certificate verification is switched off for this endpoint
)

# Create S3 client
s3 = boto3.client("s3", **s3_client_settings)

# Connectivity check: show the KeyCount reported by one listing call on the bucket
# (falls back to 0 when the response carries no KeyCount).
raw_bucket_listing = s3.list_objects_v2(Bucket='data-raw')
raw_bucket_listing.get('KeyCount', 0)

181

## Load Backblaze-Hard-Drive-Data data to S3

In [5]:
# Set to True to download the Backblaze archives and mirror their contents into S3.
# Kept off by default so that "Restart & Run All" does not pull gigabytes of data.
download = False
if download:
    # Each quarterly archive is ~1GB zipped / ~10GB unzipped;
    # only ~20GB (two quarters) are loaded for now.
    hard_drive_data_urls = [
        "https://f001.backblazeb2.com/file/Backblaze-Hard-Drive-Data/data_Q2_2025.zip",
        "https://f001.backblazeb2.com/file/Backblaze-Hard-Drive-Data/data_Q1_2025.zip",
        # "https://f001.backblazeb2.com/file/Backblaze-Hard-Drive-Data/data_Q4_2024.zip",
        # "https://f001.backblazeb2.com/file/Backblaze-Hard-Drive-Data/data_Q3_2024.zip",
        # "https://f001.backblazeb2.com/file/Backblaze-Hard-Drive-Data/data_Q2_2024.zip",
        # "https://f001.backblazeb2.com/file/Backblaze-Hard-Drive-Data/data_Q1_2024.zip",
    ]

    TMP_DIR = 'tmp'
    DEST_BUCKET = 'data-raw'
    ROOT_FOLDER = 'Backblaze-Hard-Drive-Data'
    os.makedirs(TMP_DIR, exist_ok=True)

    try:
        for url in hard_drive_data_urls:
            print(f"Processing {url}...")
            # (connect, read) timeouts: fail on a stalled connection instead of hanging forever.
            r = requests.get(url, timeout=(10, 120))
            # Fail fast on HTTP errors; otherwise an error page would be handed to
            # ZipFile and surface as a confusing BadZipFile.
            r.raise_for_status()

            print("Extracting to S3 ...")
            with zipfile.ZipFile(io.BytesIO(r.content)) as zf:
                for member in zf.infolist():
                    # Directory entries have no content to upload; trying to upload
                    # or os.remove() them would only raise.
                    if member.is_dir():
                        continue

                    file_key = f"{ROOT_FOLDER}/{member.filename}"

                    # Skip members whose key (prefix) is already present in the bucket.
                    if s3.list_objects_v2(Bucket=DEST_BUCKET, Prefix=file_key).get('KeyCount', 0) > 0:
                        print(f"File {file_key} already in S3. Skipping upload.")
                        continue

                    local_path = None
                    try:
                        # extract() returns the path it actually wrote (after name
                        # sanitisation), so use that rather than rebuilding it by hand.
                        local_path = zf.extract(member, TMP_DIR)
                        s3.upload_file(
                            Filename=local_path,
                            Bucket=DEST_BUCKET,
                            Key=file_key
                        )
                    except Exception as e:
                        # Best effort: report the failure and move on to the next member.
                        print(f"Error extracting {member.filename}: {e}")
                    else:
                        # Only report success when extraction and upload both succeeded.
                        print(f"{member.filename} uploaded")
                    finally:
                        # Free local disk space whether or not the upload succeeded.
                        if local_path is not None and os.path.exists(local_path):
                            os.remove(local_path)
    finally:
        # Always clean up the scratch directory, even if a download or archive failed.
        shutil.rmtree(TMP_DIR, ignore_errors=True)

## Read sample from S3

In [6]:
# Filesystem-style access to the same S3 endpoint, configured from the environment.
fs = s3fs.S3FileSystem(
    client_kwargs={"endpoint_url": os.getenv("S3_AWS_URL")},
    key=os.getenv("S3_AWS_ACCESS_KEY_ID"),
    secret=os.getenv("S3_AWS_SECRET_ACCESS_KEY"),
)

# Read one daily CSV as a sample. The context manager closes the remote file
# handle once pandas has finished parsing, instead of leaving it open in the kernel.
with fs.open("s3://data-raw/Backblaze-Hard-Drive-Data/data_Q1_2025/2025-01-01.csv") as sample_file:
    df = pd.read_csv(sample_file)

df

Unnamed: 0,date,serial_number,model,capacity_bytes,failure,datacenter,cluster_id,vault_id,pod_id,pod_slot_num,...,smart_250_normalized,smart_250_raw,smart_251_normalized,smart_251_raw,smart_252_normalized,smart_252_raw,smart_254_normalized,smart_254_raw,smart_255_normalized,smart_255_raw
0,2025-01-01,2207E60CC65A,CT250MX500SSD1,250059350016,0,sac0,0,1028,13,,...,,,,,,,,,,
1,2025-01-01,2340E87B92B5,CT250MX500SSD1,250059350016,0,sac0,0,1028,14,,...,,,,,,,,,,
2,2025-01-01,2EGK64VX,HGST HUH728080ALE604,8001563222016,0,sac0,0,1028,4,12.0,...,,,,,,,,,,
3,2025-01-01,2EHZAKAX,HGST HUH728080ALE604,8001563222016,0,sac0,0,1028,12,30.0,...,,,,,,,,,,
4,2025-01-01,2EJ02A1X,HGST HUH728080ALE604,8001563222016,0,sac0,0,1028,10,14.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
304952,2025-01-01,3ZHG6U1Z,WDC WUH722222ALE6L4,22000969973760,0,yyz1,60,1002,4,48.0,...,,,,,,,,,,
304953,2025-01-01,3ZHG6U9Z,WDC WUH722222ALE6L4,22000969973760,0,yyz1,60,1002,2,24.0,...,,,,,,,,,,
304954,2025-01-01,ZGG6LV5B,WDC WUH722222ALE6L4,22000969973760,0,yyz1,60,1002,11,8.0,...,,,,,,,,,,
304955,2025-01-01,ZGG7J1TA,WDC WUH722222ALE6L4,22000969973760,0,yyz1,60,1002,7,0.0,...,,,,,,,,,,


In [7]:
# Connection settings handed to polars for reading from the S3 endpoint;
# all values come from the environment.
storage_options = dict(
    endpoint_url=os.getenv("S3_AWS_URL"),
    aws_access_key_id=os.getenv("S3_AWS_ACCESS_KEY_ID"),
    aws_secret_access_key=os.getenv("S3_AWS_SECRET_ACCESS_KEY"),
)

# Describe the read lazily first ...
sample_scan = pl.scan_csv(
    "s3://data-raw/Backblaze-Hard-Drive-Data/data_Q1_2025/2025-01-01.csv",
    storage_options=storage_options,
)
# ... then materialize it into an in-memory DataFrame.
df = sample_scan.collect()

df

date,serial_number,model,capacity_bytes,failure,datacenter,cluster_id,vault_id,pod_id,pod_slot_num,is_legacy_format,smart_1_normalized,smart_1_raw,smart_2_normalized,smart_2_raw,smart_3_normalized,smart_3_raw,smart_4_normalized,smart_4_raw,smart_5_normalized,smart_5_raw,smart_7_normalized,smart_7_raw,smart_8_normalized,smart_8_raw,smart_9_normalized,smart_9_raw,smart_10_normalized,smart_10_raw,smart_11_normalized,smart_11_raw,smart_12_normalized,smart_12_raw,smart_13_normalized,smart_13_raw,smart_15_normalized,smart_15_raw,…,smart_230_raw,smart_231_normalized,smart_231_raw,smart_232_normalized,smart_232_raw,smart_233_normalized,smart_233_raw,smart_234_normalized,smart_234_raw,smart_235_normalized,smart_235_raw,smart_240_normalized,smart_240_raw,smart_241_normalized,smart_241_raw,smart_242_normalized,smart_242_raw,smart_244_normalized,smart_244_raw,smart_245_normalized,smart_245_raw,smart_246_normalized,smart_246_raw,smart_247_normalized,smart_247_raw,smart_248_normalized,smart_248_raw,smart_250_normalized,smart_250_raw,smart_251_normalized,smart_251_raw,smart_252_normalized,smart_252_raw,smart_254_normalized,smart_254_raw,smart_255_normalized,smart_255_raw
str,str,str,i64,i64,str,i64,i64,i64,i64,bool,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,str,str,str,str,…,str,i64,i64,i64,i64,i64,i64,str,str,i64,i64,i64,i64,i64,i64,i64,i64,str,str,str,str,i64,i64,i64,i64,i64,i64,str,str,str,str,str,str,str,str,str,str
"""2025-01-01""","""2207E60CC65A""","""CT250MX500SSD1""",250059350016,0,"""sac0""",0,1028,13,,false,100,0,,,,,,,100,0,,,,,100,16790,,,,,100,3,,,,,…,,,,,,,,,,,,,,,,,,,,,,100,33742664104,100,959739128,100,1603481389,,,,,,,,,,
"""2025-01-01""","""2340E87B92B5""","""CT250MX500SSD1""",250059350016,0,"""sac0""",0,1028,14,,false,100,0,,,,,,,100,0,,,,,100,3364,,,,,100,5,,,,,…,,,,,,,,,,,,,,,,,,,,,,100,6105396832,100,73589604,100,158733589,,,,,,,,,,
"""2025-01-01""","""2EGK64VX""","""HGST HUH728080ALE604""",8001563222016,0,"""sac0""",0,1028,4,12,false,100,0,134,104,236,113,100,66,100,0,100,0,128,18,96,32538,100,0,,,100,61,,,,,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
"""2025-01-01""","""2EHZAKAX""","""HGST HUH728080ALE604""",8001563222016,0,"""sac0""",0,1028,12,30,false,100,0,134,105,155,418,100,23,100,0,100,0,128,18,90,71139,100,0,,,100,18,,,,,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
"""2025-01-01""","""2EJ02A1X""","""HGST HUH728080ALE604""",8001563222016,0,"""sac0""",0,1028,10,14,false,100,0,133,108,151,409,100,22,100,0,100,0,128,18,90,71183,100,0,,,100,17,,,,,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""2025-01-01""","""3ZHG6U1Z""","""WDC WUH722222ALE6L4""",22000969973760,0,"""yyz1""",60,1002,4,48,false,100,0,148,49,87,275,100,9,100,0,100,0,140,15,100,1196,100,0,,,100,9,,,,,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
"""2025-01-01""","""3ZHG6U9Z""","""WDC WUH722222ALE6L4""",22000969973760,0,"""yyz1""",60,1002,2,24,false,100,0,148,49,87,240,100,9,100,0,100,0,140,15,100,1182,100,0,,,100,9,,,,,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
"""2025-01-01""","""ZGG6LV5B""","""WDC WUH722222ALE6L4""",22000969973760,0,"""yyz1""",60,1002,11,8,false,100,0,148,49,87,267,100,9,100,0,100,0,140,15,100,1193,100,0,,,100,9,,,,,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
"""2025-01-01""","""ZGG7J1TA""","""WDC WUH722222ALE6L4""",22000969973760,0,"""yyz1""",60,1002,7,0,false,100,0,147,52,85,337,100,10,100,0,100,0,140,15,100,1190,100,0,,,100,10,,,,,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
