In [None]:
import pandas
import numpy
import math

Fetch the data from https://catalog.data.gov/dataset/nypd-motor-vehicle-collisions-07420 and save the CSV as `nypd_motor_vehicle.csv` in the notebook's working directory.

In [None]:
# Load the collision export as-is. Later cells work on a copy of this frame,
# and its column list is reused when writing the per-year CSV.
df_raw = pandas.read_csv(filepath_or_buffer="nypd_motor_vehicle.csv")

In [None]:
import io

# Population figures per borough, parsed from an inline CSV.
# The header must not contain spaces after the commas: read_csv keeps them as
# part of the column name. The last column uses the same
# `Population_2016_est` spelling as the copy of this table that is persisted
# to SQLite at the end of the notebook.
borough_pops = pandas.read_csv(io.StringIO(
"""Name,Status,Population_1990,Population_2000,Population_2010,Population_2016_est
BRONX,Borough,1203789,1332244,1385107,1455720
BROOKLYN,Borough,2300664,2465689,2504706,2629150
MANHATTAN,Borough,1487536,1538096,1585874,1643734
QUEENS,Borough,1951598,2229394,2230545,2333054
STATEN ISLAND,Borough,378977,443762,468730,476015
"""))

In [None]:
# Work on a copy so `df_raw` stays untouched and this cell can be re-run.
df = df_raw.copy()

# Split DATE (matched as digits/digits/digits -> MONTH, DAY, YEAR) and TIME
# (digits:digits -> HOUR, MINUTE) into separate columns named after the regex
# groups. Raw string literals keep `\d` and `\/` from being treated as
# (invalid) Python string escapes; the regex itself is unchanged.
df = df.join(df.DATE.str.extract(r'(?P<MONTH>\d+)\/(?P<DAY>\d+)\/(?P<YEAR>\d+)', expand=True))
df = df.join(df.TIME.str.extract(r'(?P<HOUR>\d+):(?P<MINUTE>\d+)', expand=True))

In [None]:
# The extracted date/time parts are regex captures (strings); cast each of
# them to int so they can be filtered, grouped and formatted as numbers.
df = df.astype(dict.fromkeys(["YEAR", "MONTH", "DAY", "HOUR", "MINUTE"], int))

In [None]:
# Preview the derived date/time columns to confirm the extraction and the
# integer cast worked. (A bare `pandas.read_csv()` call has no file argument
# and raises TypeError, which stops a Restart & Run All at this cell.)
df[["YEAR", "MONTH", "DAY", "HOUR", "MINUTE"]].head()

In [None]:
# Keep only rows where at least one "NUMBER OF ..." column is non-zero.
# Note: a missing (NaN) count compares as "not equal to 0", so such rows are
# kept as well.
dfs = df.loc[
    df.loc[:, df.columns.str.startswith("NUMBER OF")].ne(0).any(axis=1)
]

In [None]:
def nan_to_none(v):
    """Normalise a single cell value for export.

    * strings are returned with surrounding whitespace stripped,
    * a float NaN becomes ``None``,
    * every other value is returned unchanged.
    """
    if isinstance(v, str):
        return v.strip()
    if isinstance(v, float) and math.isnan(v):
        return None
    return v

In [None]:
def to_rec(r):
    """Convert one collision row into a nested record ready for serialisation.

    Parameters
    ----------
    r : mapping
        A row (for example the Series yielded by ``DataFrame.iterrows``) that
        provides the raw collision columns read below plus the integer
        ``YEAR``/``MONTH``/``DAY``/``HOUR``/``MINUTE`` fields.

    Returns
    -------
    dict
        Keys ``time``, ``borough``, ``vehicles``, ``injured``, ``killed``,
        ``location``, ``zip`` and ``street``. Every value taken from the row
        is passed through ``nan_to_none``, so missing values are emitted as
        ``None`` instead of NaN (``json.dump`` would write NaN as the bare
        token ``NaN``, which is not valid JSON) and strings are stripped.
    """
    zip_code = nan_to_none(r["ZIP CODE"])

    res = {
        "time": "%(YEAR)4i-%(MONTH)02i-%(DAY)02iT%(HOUR)02i:%(MINUTE)02i" % r,
        "borough": nan_to_none(r["BOROUGH"]),
        "vehicles": [],
        "injured": {
            "persons": nan_to_none(r["NUMBER OF PERSONS INJURED"]),
            "pedestrians": nan_to_none(r["NUMBER OF PEDESTRIANS INJURED"]),
            "cyclists": nan_to_none(r["NUMBER OF CYCLIST INJURED"]),
            "motorists": nan_to_none(r["NUMBER OF MOTORIST INJURED"]),
        },
        "killed": {
            "persons": nan_to_none(r["NUMBER OF PERSONS KILLED"]),
            "pedestrians": nan_to_none(r["NUMBER OF PEDESTRIANS KILLED"]),
            "cyclists": nan_to_none(r["NUMBER OF CYCLIST KILLED"]),
            "motorists": nan_to_none(r["NUMBER OF MOTORIST KILLED"]),
        },
        "location": {
            "lat": nan_to_none(r["LATITUDE"]),
            "lon": nan_to_none(r["LONGITUDE"]),
        },
        # int(float(...)) accepts the zip whether it was parsed as a number
        # (e.g. 10001.0) or as a digit string (e.g. "10001"); "%05i" on a
        # string would raise TypeError.
        "zip": "%05i" % int(float(zip_code)) if zip_code else None,
        "street": {
            "off": nan_to_none(r['OFF STREET NAME']),
            "on": nan_to_none(r['ON STREET NAME']),
            "cross": nan_to_none(r['CROSS STREET NAME']),
        },
    }

    # The row carries up to five numbered vehicle slots; keep a slot only if
    # it has a type or a contributing factor.
    for i in range(1, 6):
        vehicle = {
            "type": nan_to_none(r["VEHICLE TYPE CODE %s" % i]),
            "factor": nan_to_none(r["CONTRIBUTING FACTOR VEHICLE %s" % i]),
        }

        if vehicle["type"] or vehicle["factor"]:
            res["vehicles"].append(vehicle)

    return res

In [None]:
import dicttoxml
import json

In [None]:
from pathlib import Path

# Create the per-year output directories used by the export cells below.
# `exist_ok=True` makes the cell safe to re-run (shell `mkdir` reports an
# error for every directory that already exists), and `parents=True` creates
# `source_data` itself on the first run.
for year_dir in ("2014", "2015", "2016", "2017"):
    Path("source_data", year_dir).mkdir(parents=True, exist_ok=True)

In [None]:
# Group the filtered collisions by their derived YEAR column; the group
# object is reused for the per-year counts and the 2015 CSV export.
year_groups = dfs.groupby(by="YEAR")

In [None]:
# Display the number of rows in each YEAR group.
year_groups.size()

In [None]:
# IPython help lookup (trailing `?`): shows the documentation for
# `dfs.to_sql`. It is an interactive aid only and does not modify `dfs`.
dfs.to_sql?

In [None]:
# Write the 2015 rows as CSV, restricted to the columns of the raw file
# (so the derived date/time columns are left out) and without the index.
year_groups.get_group(2015).loc[:, df_raw.columns].to_csv(
    path_or_buf="source_data/2015/nypd_motor_vehicle.csv",
    index=False,
)

In [None]:
# Export the 2016 rows as one JSON file per month, each holding the list of
# records produced by `to_rec`.
for (year, month), data in dfs.query("YEAR == 2016").groupby(["YEAR", "MONTH"]):
    rows = [to_rec(r) for _, r in data.iterrows()]
    # Open the file in a `with` block so it is flushed and closed as soon as
    # the month is written, instead of leaving the handle open.
    with open("source_data/2016/nypd_motor_vehicle.%04i.%02i.json" % (year, month), "w") as o:
        json.dump(rows, o, indent=2)

In [None]:
# Export the 2017 rows as one XML file per month, with the records produced
# by `to_rec` wrapped in a <motor_vehicle_incidents> root element.
for (year, month), data in dfs.query("YEAR == 2017").groupby(["YEAR", "MONTH"]):
    rows = [to_rec(r) for _, r in data.iterrows()]
    # dicttoxml returns bytes and its XML declaration names UTF-8, so write
    # the decoded text back out as UTF-8 rather than the platform default.
    with open("source_data/2017/nypd_motor_vehicle.%04i.%02i.xml" % (year, month), "w", encoding="utf-8") as o:
        xml = dicttoxml.dicttoxml(rows, custom_root="motor_vehicle_incidents")
        o.write(xml.decode())

In [None]:
# Bind the filtered frame to the name `collisions`, which is the variable
# that `%sql PERSIST collisions` refers to further down.
collisions = dfs

In [None]:
# Load the IPython extension named `sql`, which provides the `%sql` magic
# used in the following cells.
%load_ext sql

In [None]:
# Connect `%sql` to the SQLite database file `nypd_collisions.db`
# (three slashes = path relative to the working directory).
%sql sqlite:///nypd_collisions.db

In [None]:
# Store the `collisions` DataFrame in the connected database.
# NOTE(review): PERSIST presumably names the table after the variable and may
# fail if that table already exists from an earlier run; confirm against the
# installed ipython-sql version.
%sql PERSIST collisions

In [None]:
import io

# Population figures per borough, parsed from an inline CSV. This frame is
# what the following `%sql PERSIST borough_pops` cell stores.
borough_pops = pandas.read_csv(
    io.StringIO(
        "Name,Status,Population_1990,Population_2000,Population_2010,Population_2016_est\n"
        "BRONX,Borough,1203789,1332244,1385107,1455720\n"
        "BROOKLYN,Borough,2300664,2465689,2504706,2629150\n"
        "MANHATTAN,Borough,1487536,1538096,1585874,1643734\n"
        "QUEENS,Borough,1951598,2229394,2230545,2333054\n"
        "STATEN ISLAND,Borough,378977,443762,468730,476015\n"
    )
)

In [None]:
# Store the `borough_pops` DataFrame in the connected database.
# NOTE(review): PERSIST presumably names the table after the variable and may
# fail if that table already exists from an earlier run; confirm against the
# installed ipython-sql version.
%sql PERSIST borough_pops