In [1]:
import duckdb
import pandas as pd

In [3]:
df = pd.read_parquet("crashes.parquet")
df.tail()

Unnamed: 0,CRASH DATE,CRASH TIME,BOROUGH,LATITUDE,LONGITUDE,NUMBER OF PERSONS INJURED,NUMBER OF PERSONS KILLED,CONTRIBUTING FACTOR VEHICLE 1,CONTRIBUTING FACTOR VEHICLE 2,CONTRIBUTING FACTOR VEHICLE 3,CONTRIBUTING FACTOR VEHICLE 4,CONTRIBUTING FACTOR VEHICLE 5,VEHICLE TYPE CODE 1,VEHICLE TYPE CODE 2,VEHICLE TYPE CODE 3,VEHICLE TYPE CODE 4,VEHICLE TYPE CODE 5
2201275,2025-08-25,07:30:00,QUEENS,40.721184,-73.90361,1.0,0.0,Unspecified,,,,,Station Wagon/Sport Utility Vehicle,,,,
2201276,2025-08-25,11:40:00,BRONX,40.859406,-73.841866,0.0,0.0,Other Vehicular,,,,,Station Wagon/Sport Utility Vehicle,,,,
2201277,2025-08-25,03:50:00,BRONX,40.80423,-73.876434,1.0,0.0,Unspecified,,,,,Station Wagon/Sport Utility Vehicle,,,,
2201278,2025-08-25,09:45:00,STATEN ISLAND,40.539875,-74.19287,1.0,0.0,Traffic Control Disregarded,Unspecified,,,,Sedan,Sedan,,,
2201279,2025-08-25,23:00:00,BROOKLYN,40.690456,-73.91175,1.0,0.0,Driver Inexperience,Other Vehicular,Other Vehicular,,,Sedan,Sedan,Sedan,,


In [2]:
con = duckdb.connect("crashes.duckdb")

In [2]:
con = duckdb.connect("crashes.duckdb")

# Create a table inside DuckDB from the Parquet file
con.execute("""
    CREATE OR REPLACE TABLE crashes AS 
    SELECT 
        BOROUGH,
        EXTRACT('HOUR' FROM "CRASH TIME") AS HOUR,
        DAYNAME("CRASH DATE") AS WEEKDAY,
        EXTRACT(YEAR FROM "CRASH DATE") AS YEAR,
        COALESCE("NUMBER OF PERSONS KILLED", 0) AS PERSONS_KILLED,
        COALESCE("NUMBER OF PERSONS INJURED", 0) AS PERSONS_INJURED,
        "CONTRIBUTING FACTOR VEHICLE 1",
        "CONTRIBUTING FACTOR VEHICLE 2",
        "CONTRIBUTING FACTOR VEHICLE 3",
        "CONTRIBUTING FACTOR VEHICLE 4",
        "CONTRIBUTING FACTOR VEHICLE 5",
        "VEHICLE TYPE CODE 1",
        "VEHICLE TYPE CODE 2",
        "VEHICLE TYPE CODE 3",
        "VEHICLE TYPE CODE 4",
        "VEHICLE TYPE CODE 5",
        LATITUDE,
        LONGITUDE
    FROM 'crashes.parquet'
""")

<duckdb.duckdb.DuckDBPyConnection at 0x1fa5062feb0>

In [None]:
print(con.execute("SELECT COUNT(*) FROM crashes").fetchall())

In [None]:
# create a persistent connection to the DuckDB file
# con = duckdb.connect("crashes.duckdb", read_only=True)

In [3]:
df = con.execute("SELECT * FROM crashes").df()
df.head()

Unnamed: 0,BOROUGH,HOUR,WEEKDAY,YEAR,PERSONS_KILLED,PERSONS_INJURED,CONTRIBUTING FACTOR VEHICLE 1,CONTRIBUTING FACTOR VEHICLE 2,CONTRIBUTING FACTOR VEHICLE 3,CONTRIBUTING FACTOR VEHICLE 4,CONTRIBUTING FACTOR VEHICLE 5
0,BROOKLYN,1,Wednesday,2023,0.0,1.0,Unspecified,Unspecified,Unspecified,,
1,BROOKLYN,9,Saturday,2021,0.0,0.0,Unspecified,,,,
2,BROOKLYN,8,Tuesday,2021,0.0,0.0,,,,,
3,BROOKLYN,17,Tuesday,2021,0.0,0.0,Passing Too Closely,Unspecified,,,
4,BRONX,8,Tuesday,2021,0.0,2.0,Unspecified,Unspecified,,,


In [None]:
df.shape

#### vehicles

In [14]:
df = con.execute("""
WITH all_vehicles AS (
    SELECT BOROUGH, Year, "VEHICLE TYPE CODE 1" AS vehicles
    FROM crashes
    WHERE "VEHICLE TYPE CODE 1" IS NOT NULL AND "VEHICLE TYPE CODE 1" <> ''
    UNION ALL
    SELECT BOROUGH, Year, "VEHICLE TYPE CODE 2" AS vehicles
    FROM crashes
    WHERE "VEHICLE TYPE CODE 2" IS NOT NULL AND "VEHICLE TYPE CODE 2" <> ''
    UNION ALL
    SELECT BOROUGH, Year, "VEHICLE TYPE CODE 3" AS vehicles
    FROM crashes
    WHERE "VEHICLE TYPE CODE 3" IS NOT NULL AND "VEHICLE TYPE CODE 3" <> ''
    UNION ALL
    SELECT BOROUGH, Year, "VEHICLE TYPE CODE 4" AS vehicles
    FROM crashes
    WHERE "VEHICLE TYPE CODE 4" IS NOT NULL AND "VEHICLE TYPE CODE 4" <> ''
    UNION ALL
    SELECT BOROUGH, Year, "VEHICLE TYPE CODE 5" AS vehicles
    FROM crashes
    WHERE "VEHICLE TYPE CODE 5" IS NOT NULL AND "VEHICLE TYPE CODE 5" <> ''
)
SELECT
    vehicles,
    COUNT(*) AS count
FROM all_vehicles
GROUP BY vehicles
ORDER BY count DESC
""").df()
df.head()

Unnamed: 0,vehicles,count
0,Sedan,1061179
1,Station Wagon/Sport Utility Vehicle,841007
2,PASSENGER VEHICLE,639622
3,SPORT UTILITY / STATION WAGON,281909
4,Taxi,92033


#### time

In [None]:
df = con.execute("""
    SELECT 
        (HOUR+1) AS HOUR,
        COUNT(*) AS counts
    FROM crashes
    GROUP BY HOUR
    ORDER BY HOUR
""").df()

df.head(25)

Unnamed: 0,HOUR,HOURS,counts
0,1,15,66335
1,2,30,35242
2,3,45,27223
3,4,60,24043
4,5,75,27150
5,6,90,29188
6,7,105,44376
7,8,120,61090
8,9,135,109058
9,10,150,104550


In [None]:
import plotly.express as px

df["theta_deg"] = df["HOUR"] * (360 / 24)
df["label"] = df["HOUR"].apply(lambda h: f"{int(h):02d}:00")

fig = px.bar_polar(
    df,
    r="counts",
    theta="theta_deg",
    color="counts",
    color_continuous_scale=px.colors.sequential.Plasma_r,
    hover_name="label",
    hover_data=["counts"],
)

fig.update_layout(
    polar=dict(
        angularaxis=dict(
            direction="clockwise",
            rotation=90,
            tickmode="array",
            tickvals=[h * 15 for h in range(0, 24, 1)],
            ticktext=[f"{h:02d}:00" for h in range(0, 24, 1)],
        )
    ),
    showlegend=False,
)
fig.show()

In [33]:
import plotly.express as px

fig = px.bar_polar(
    df, r="counts", theta="HOUR", color_discrete_sequence=px.colors.sequential.Plasma_r
)
fig.show()

In [36]:
import plotly.express as px

df = px.data.wind()
df.head()

Unnamed: 0,direction,strength,frequency
0,N,0-1,0.5
1,NNE,0-1,0.6
2,NE,0-1,0.5
3,ENE,0-1,0.4
4,E,0-1,0.4


In [30]:
df.shape

(128, 3)

#### day

In [9]:
df = con.execute("""
    SELECT WEEKDAY, COUNT(WEEKDAY) AS counts
    FROM crashes
    GROUP BY WEEKDAY
    ORDER BY counts DESC;
 """).df()

df.head(7)

Unnamed: 0,WEEKDAY,counts
0,Friday,316505
1,Thursday,296651
2,Tuesday,292430
3,Wednesday,290171
4,Monday,284488
5,Saturday,268874
6,Sunday,239946


#### word

In [None]:
df = con.execute("""
    WITH factors AS (
    SELECT "CONTRIBUTING FACTOR VEHICLE 1" AS reasons
    FROM crashes
    WHERE BOROUGH IN ? AND YEAR BETWEEN ? AND ?
    UNION ALL
    SELECT "CONTRIBUTING FACTOR VEHICLE 2"
    FROM crashes
    WHERE BOROUGH IN ? AND YEAR BETWEEN ? AND ?
    UNION ALL
    SELECT "CONTRIBUTING FACTOR VEHICLE 3"
    FROM crashes
    WHERE BOROUGH IN ? AND YEAR BETWEEN ? AND ?
    UNION ALL
    SELECT "CONTRIBUTING FACTOR VEHICLE 4"
    FROM crashes
    WHERE BOROUGH IN ? AND YEAR BETWEEN ? AND ?
    UNION ALL
    SELECT "CONTRIBUTING FACTOR VEHICLE 5"
    FROM crashes
    WHERE BOROUGH IN ? AND YEAR BETWEEN ? AND ?
)
SELECT reasons, COUNT(*) AS Count
FROM factors
WHERE reasons IS NOT NULL
GROUP BY reasons
ORDER BY Count DESC
LIMIT 25;
 """).df()

In [None]:
df = con.execute("""
    WITH factors AS (
    SELECT Year, "CONTRIBUTING FACTOR VEHICLE 1" AS reasons
    FROM crashes
    UNION ALL
    SELECT Year, "CONTRIBUTING FACTOR VEHICLE 2"
    FROM crashes
    UNION ALL
    SELECT Year, "CONTRIBUTING FACTOR VEHICLE 3"
    FROM crashes
    UNION ALL
    SELECT Year, "CONTRIBUTING FACTOR VEHICLE 4"
    FROM crashes
    UNION ALL
    SELECT Year, "CONTRIBUTING FACTOR VEHICLE 5"
    FROM crashes
),
reasons AS (
    SELECT Year, reasons
    FROM factors
    WHERE YEAR BETWEEN 2020 AND 2021
),
words AS (
    SELECT reasons, COUNT(reasons) AS Count
    FROM reasons
    GROUP BY reasons
    ORDER BY Count DESC
) 
SELECT * 
FROM words
""").df()

In [None]:
df.head(30)

In [None]:
df.shape

#### kpi

In [None]:
kpi_query = con.execute("""
    SELECT 
        COUNT(*) AS total_collisions, 
        SUM(PERSONS_KILLED) AS persons_killed, 
        SUM(PERSONS_INJURED) AS persons_injured
    FROM crashes
""").fetchone()

total_collisions, persons_killed, persons_injured = map(int, kpi_query)