In [4]:
import duckdb

In [5]:
# Open the on-disk DuckDB database file that every later cell queries through `con`.
con = duckdb.connect("crashes.duckdb")

# (Re)build the `crashes` table from the Parquet file, keeping only the borough,
# the crash year, the killed/injured counts (NULL replaced by 0) and the five
# contributing-factor columns.
create_crashes_sql = """
    CREATE OR REPLACE TABLE crashes AS 
    SELECT 
        BOROUGH,
        EXTRACT(YEAR FROM "CRASH DATE") AS YEAR,
        COALESCE("NUMBER OF PERSONS KILLED", 0) AS PERSONS_KILLED,
        COALESCE("NUMBER OF PERSONS INJURED", 0) AS PERSONS_INJURED,
        "CONTRIBUTING FACTOR VEHICLE 1",
        "CONTRIBUTING FACTOR VEHICLE 2",
        "CONTRIBUTING FACTOR VEHICLE 3",
        "CONTRIBUTING FACTOR VEHICLE 4",
        "CONTRIBUTING FACTOR VEHICLE 5"
    FROM 'crashes.parquet'
"""
con.execute(create_crashes_sql)

<duckdb.duckdb.DuckDBPyConnection at 0x23f245b0270>

In [None]:
# Sanity check: how many rows ended up in the crashes table.
row_count_result = con.execute("SELECT COUNT(*) FROM crashes").fetchall()
print(row_count_result)

In [None]:
# Alternative (disabled): reopen the existing DuckDB file read-only instead of reusing `con` from the first cell
# con = duckdb.connect("crashes.duckdb", read_only=True)

In [None]:
# Pull the whole crashes table into a pandas DataFrame and preview the first rows.
crashes_sql = "SELECT * FROM crashes"
df = con.execute(crashes_sql).df()
df.head()

In [None]:
# Dimensions (rows, columns) of the current `df`.
df.shape

#### word — contributing-factor frequencies

In [None]:
# Filter values for the ranking below. By default every borough present in the
# table is selected; narrow the list or change the year bounds to rank factors
# for a subset of crashes.
boroughs = [
    row[0]
    for row in con.execute(
        "SELECT DISTINCT BOROUGH FROM crashes WHERE BOROUGH IS NOT NULL"
    ).fetchall()
]
year_from, year_to = 2020, 2021  # inclusive year window

# Top 25 contributing factors across the five vehicle columns for the selected
# boroughs and years.
#
# The borough/year filter is applied once in `filtered` rather than repeated in
# every UNION ALL branch, so the statement has exactly three placeholders.
# They are bound below in this order: borough list, first year, last year.
# (The previous version had 15 placeholders and passed no values, so it could
# not execute.)
# NOTE(review): the cast assumes BOROUGH is a text column - confirm.
top_factors_sql = """
    WITH filtered AS (
        SELECT
            "CONTRIBUTING FACTOR VEHICLE 1",
            "CONTRIBUTING FACTOR VEHICLE 2",
            "CONTRIBUTING FACTOR VEHICLE 3",
            "CONTRIBUTING FACTOR VEHICLE 4",
            "CONTRIBUTING FACTOR VEHICLE 5"
        FROM crashes
        WHERE list_contains(CAST(? AS VARCHAR[]), BOROUGH)
          AND YEAR BETWEEN ? AND ?
    ),
    factors AS (
        SELECT "CONTRIBUTING FACTOR VEHICLE 1" AS reasons FROM filtered
        UNION ALL
        SELECT "CONTRIBUTING FACTOR VEHICLE 2" FROM filtered
        UNION ALL
        SELECT "CONTRIBUTING FACTOR VEHICLE 3" FROM filtered
        UNION ALL
        SELECT "CONTRIBUTING FACTOR VEHICLE 4" FROM filtered
        UNION ALL
        SELECT "CONTRIBUTING FACTOR VEHICLE 5" FROM filtered
    )
    SELECT reasons, COUNT(*) AS Count
    FROM factors
    WHERE reasons IS NOT NULL
    GROUP BY reasons
    ORDER BY Count DESC
    LIMIT 25
"""
df = con.execute(top_factors_sql, [boroughs, year_from, year_to]).df()

In [11]:
# Inclusive year window for the contributing-factor counts.
year_from, year_to = 2020, 2021

# Count how often each contributing-factor value appears across the five
# vehicle columns for crashes inside the year window.
#
# - The year filter runs once in `in_range`, before the five-way UNION ALL,
#   instead of after every row of the table has been unioned.
# - ORDER BY sits on the outermost SELECT: a sort placed inside a CTE is not
#   guaranteed to survive into the final result.
# - COUNT(reasons) counts non-NULL values only, so a NULL factor group (if one
#   exists) is still returned with a count of 0, as before.
factor_counts_sql = """
    WITH in_range AS (
        SELECT
            "CONTRIBUTING FACTOR VEHICLE 1",
            "CONTRIBUTING FACTOR VEHICLE 2",
            "CONTRIBUTING FACTOR VEHICLE 3",
            "CONTRIBUTING FACTOR VEHICLE 4",
            "CONTRIBUTING FACTOR VEHICLE 5"
        FROM crashes
        WHERE YEAR BETWEEN ? AND ?
    ),
    factors AS (
        SELECT "CONTRIBUTING FACTOR VEHICLE 1" AS reasons FROM in_range
        UNION ALL
        SELECT "CONTRIBUTING FACTOR VEHICLE 2" FROM in_range
        UNION ALL
        SELECT "CONTRIBUTING FACTOR VEHICLE 3" FROM in_range
        UNION ALL
        SELECT "CONTRIBUTING FACTOR VEHICLE 4" FROM in_range
        UNION ALL
        SELECT "CONTRIBUTING FACTOR VEHICLE 5" FROM in_range
    )
    SELECT reasons, COUNT(reasons) AS Count
    FROM factors
    GROUP BY reasons
    ORDER BY Count DESC
"""
df = con.execute(factor_counts_sql, [year_from, year_to]).df()

In [12]:
# Show up to the first 30 rows of the factor counts (query sorts by Count, descending).
df.head(30)

Unnamed: 0,reasons,Count
0,Unspecified,219827
1,Driver Inattention/Distraction,61651
2,Failure to Yield Right-of-Way,15330
3,Following Too Closely,15011
4,Passing or Lane Usage Improper,9868
5,Passing Too Closely,9036
6,Other Vehicular,8982
7,Backing Unsafely,7822
8,Unsafe Speed,7614
9,Traffic Control Disregarded,6216


In [13]:
# (rows, columns) of the factor-count table: one row per distinct `reasons` value.
df.shape

(56, 2)

#### KPI totals

In [None]:
# Headline totals over the whole crashes table: row count plus the summed
# killed / injured columns, fetched as a single result row.
# SUM() over zero rows yields NULL, which would make int(None) below raise a
# TypeError; COALESCE turns that case into 0 so an empty table reports zeros.
kpi_query = con.execute("""
    SELECT 
        COUNT(*) AS total_collisions, 
        COALESCE(SUM(PERSONS_KILLED), 0) AS persons_killed, 
        COALESCE(SUM(PERSONS_INJURED), 0) AS persons_injured
    FROM crashes
""").fetchone()

# Normalise whatever numeric type DuckDB returns to plain Python ints.
total_collisions, persons_killed, persons_injured = map(int, kpi_query)