clean

In [1]:
import os

from sqlalchemy import create_engine, text

# Connection URL for the ETL database.
# It is read from the ETL_DATABASE_URL environment variable so real
# credentials never have to be committed with the notebook. The fallback is
# the URL this notebook originally hard-coded, so behaviour is unchanged when
# the variable is not set.
DATABASE_URL = os.environ.get(
    "ETL_DATABASE_URL",
    "postgresql+psycopg2://postgres:postgres@postgres:5432/etl_db",
)

# pool_pre_ping=True asks the pool to test a connection when it is checked
# out, so a stale connection is replaced instead of failing the first query.
engine = create_engine(DATABASE_URL, pool_pre_ping=True)


In [2]:
# Empty the silver table inside a single transaction before it is reloaded.
truncate_stmt = text("TRUNCATE TABLE silver.us_accidents;")
with engine.begin() as connection:
    connection.execute(truncate_stmt)


In [8]:
from sqlalchemy import text

# Single source of truth for translating the dataset's "US/*" zone labels into
# IANA zone names. Any other value of `timezone` (including NULL) falls through
# to 'UTC', i.e. the local timestamp is taken as already being UTC.
# NOTE(review): confirm that treating unknown/NULL zones as UTC is intended.
TZ_NAME_SQL = (
    "CASE timezone"
    " WHEN 'US/Pacific'  THEN 'America/Los_Angeles'"
    " WHEN 'US/Mountain' THEN 'America/Denver'"
    " WHEN 'US/Central'  THEN 'America/Chicago'"
    " WHEN 'US/Eastern'  THEN 'America/New_York'"
    " ELSE 'UTC'"
    " END"
)


def _local_as_timestamptz(column: str) -> str:
    """Return SQL that interprets ``column`` in the row's own time zone.

    ``column`` must be a trusted, hard-coded column name: it is spliced into
    the SQL text verbatim and is never taken from user input.
    Presumably the bronze column is a ``timestamp without time zone`` holding
    local wall-clock time -- verify against the bronze schema.
    """
    return f"({column} AT TIME ZONE {TZ_NAME_SQL})"


def _local_as_utc(column: str) -> str:
    """Return SQL that converts the local timestamp ``column`` to UTC."""
    return f"{_local_as_timestamptz(column)} AT TIME ZONE 'UTC'"


# Bronze -> silver load for US accidents.
#   * converts start/end/weather timestamps to UTC using the row's time zone,
#   * derives duration_minutes from end - start (whole minutes via ::INT,
#     NULL when end_time is NULL),
#   * trims city/county, upper-cases state, defaults NULL road flags to false,
#   * skips rows without a start time or coordinates,
#   * leaves already-present accident_id values untouched (ON CONFLICT DO NOTHING).
# The select list is positional: its order must match the INSERT column list.
insert_sql = f"""
INSERT INTO silver.us_accidents (
    accident_id,
    severity,

    start_time_utc,
    end_time_utc,
    duration_minutes,

    latitude,
    longitude,
    distance_mi,

    city,
    county,
    state,
    zipcode,

    weather_time_utc,
    temperature_f,
    wind_chill_f,
    humidity_pct,
    pressure_in,
    visibility_mi,
    wind_speed_mph,
    precipitation_in,
    weather_condition,

    is_amenity,
    is_bump,
    is_crossing,
    is_junction,
    is_railway,
    is_roundabout,
    is_station,
    is_stop,
    is_traffic_calming,
    is_traffic_signal,

    sunrise_sunset,
    civil_twilight,
    nautical_twilight,
    astronomical_twilight
)
SELECT
    id AS accident_id,
    severity::SMALLINT,

    {_local_as_utc('start_time')} AS start_time_utc,

    {_local_as_utc('end_time')} AS end_time_utc,

    CASE
        WHEN end_time IS NOT NULL
        THEN (
            EXTRACT(
                EPOCH FROM (
                    {_local_as_timestamptz('end_time')}
                    -
                    {_local_as_timestamptz('start_time')}
                )
            ) / 60
        )::INT
        ELSE NULL
    END AS duration_minutes,

    start_lat AS latitude,
    start_lng AS longitude,
    distance_mi,

    TRIM(city),
    TRIM(county),
    UPPER(state),
    zipcode,

    {_local_as_utc('weather_timestamp')} AS weather_time_utc,

    temperature_f,
    wind_chill_f,
    humidity_pct,
    pressure_in,
    visibility_mi,
    wind_speed_mph,
    precipitation_in,
    weather_condition,

    COALESCE(amenity, false),
    COALESCE(bump, false),
    COALESCE(crossing, false),
    COALESCE(junction, false),
    COALESCE(railway, false),
    COALESCE(roundabout, false),
    COALESCE(station, false),
    COALESCE(stop, false),
    COALESCE(traffic_calming, false),
    COALESCE(traffic_signal, false),

    sunrise_sunset,
    civil_twilight,
    nautical_twilight,
    astronomical_twilight
FROM bronze.us_accidents
WHERE
    start_time IS NOT NULL
    AND start_lat IS NOT NULL
    AND start_lng IS NOT NULL
ON CONFLICT (accident_id) DO NOTHING;
"""


In [9]:
# Run the bronze -> silver load as one transaction.
load_stmt = text(insert_sql)
with engine.begin() as connection:
    connection.execute(load_stmt)


In [10]:
# Refresh planner statistics for the freshly loaded silver table.
analyze_stmt = text("ANALYZE silver.us_accidents;")
with engine.begin() as connection:
    connection.execute(analyze_stmt)


In [11]:
# Report how many rows the silver table holds after the load.
count_stmt = text("SELECT COUNT(*) FROM silver.us_accidents;")
with engine.connect() as connection:
    silver_row_count = connection.execute(count_stmt).scalar()
    print(f"Silver rows: {silver_row_count:,}")


Silver rows: 7,728,394
