In [5]:
from pathlib import Path
import psycopg2
import time
from tqdm.auto import tqdm
import os

# CSV file that is streamed to PostgreSQL by the COPY step below.
CSV_PATH = Path("/app/data/landing/accidents/US_Accidents_March23.csv")

# Stat the file *before* opening the database connection: a missing or
# unreadable CSV now fails fast instead of raising with a connection already
# open (which would then never be closed).
# The byte size is the total for the progress bar.
file_size = os.path.getsize(CSV_PATH)

# Connection settings come from the standard libpq environment variables when
# they are set, so credentials do not have to live in the notebook. The
# fallbacks are the values this notebook has always used, so behaviour is
# unchanged when the variables are absent.
conn = psycopg2.connect(
    dbname=os.environ.get("PGDATABASE", "etl_db"),
    user=os.environ.get("PGUSER", "postgres"),
    password=os.environ.get("PGPASSWORD", "postgres"),
    host=os.environ.get("PGHOST", "postgres"),
    port=int(os.environ.get("PGPORT", "5432")),
)

# Wall-clock reference for the elapsed-time report printed after the load.
start = time.time()

class ProgressFile:
    """Minimal readable wrapper that reports bytes read to a progress bar.

    Only ``read`` is provided, which is the method ``copy_expert`` uses to
    pull data for ``COPY ... FROM STDIN``.

    Parameters
    ----------
    file : binary file object
        Underlying file; every ``read`` call is delegated to it.
    pbar : tqdm progress bar
        Advanced by the number of bytes returned by each ``read`` call.
    """

    def __init__(self, file, pbar):
        self.file = file
        self.pbar = pbar

    def read(self, size=-1):
        """Read up to ``size`` bytes (everything when ``size`` is -1),
        advance the progress bar by the amount actually read, and return
        the data."""
        data = self.file.read(size)
        self.pbar.update(len(data))
        return data


# COPY statement: CSV input with a header row, comma-delimited, double-quoted.
COPY_SQL = """
    COPY bronze.us_accidents
    FROM STDIN
    WITH (
        FORMAT CSV,
        HEADER TRUE,
        DELIMITER ',',
        QUOTE '"'
    )
"""

# try/finally guarantees the connection is closed even when a SET statement,
# opening the file, or the COPY itself raises. Closing a connection without
# committing discards the pending transaction, so a failed load leaves no
# partial rows behind and no dangling connection in the kernel.
try:
    with conn.cursor() as cur:
        # Speed knobs (apply to this session only; the connection is closed
        # at the end of the cell).
        cur.execute("SET synchronous_commit = OFF;")
        cur.execute("SET maintenance_work_mem = '1GB';")
        cur.execute("SET work_mem = '256MB';")

        print("Starting COPY...")

        with open(CSV_PATH, "rb") as f, tqdm(
            total=file_size,
            unit="B",
            unit_scale=True,
            desc="COPY Progress",
            colour="green",
        ) as pbar:
            # Stream the file through the wrapper so the bar tracks the
            # bytes handed to the server.
            cur.copy_expert(COPY_SQL, ProgressFile(f, pbar))

    # Only reached when the COPY finished without raising.
    conn.commit()
finally:
    conn.close()

elapsed = time.time() - start
print(f"\n✅ COPY complete in {elapsed:.2f} seconds")


Starting COPY...


COPY Progress:   0%|          | 0.00/3.06G [00:00<?, ?B/s]


✅ COPY complete in 48.09 seconds


In [None]:
# Disabled sanity check: when uncommented, connects through SQLAlchemy and
# prints the number of rows currently in bronze.us_accidents.
# from sqlalchemy import text, create_engine

# engine = create_engine(
#     "postgresql+psycopg2://postgres:postgres@postgres:5432/etl_db"
# )

# with engine.connect() as conn:
#     result = conn.execute(
#         text("SELECT COUNT(*) FROM bronze.us_accidents")
#     )
#     db_count = result.scalar()

# print(f"Rows in bronze.us_accidents: {db_count:,}")

In [None]:
# Disabled reset helper (DESTRUCTIVE): when uncommented, removes every row
# from bronze.us_accidents. Kept commented out so a Run All does not wipe the
# table; COPY appends rows, so run this before re-loading the same CSV to
# avoid duplicates.
# from sqlalchemy import create_engine, text

# engine = create_engine(
#     "postgresql+psycopg2://postgres:postgres@postgres:5432/etl_db"
# )

# with engine.begin() as conn:
#     conn.execute(
#         text("TRUNCATE TABLE bronze.us_accidents")
#     )

# print("bronze.us_accidents truncated")

bronze.us_accidents truncated
