In [1]:
import pandas
import numpy
import math

In [2]:
df_raw = pandas.read_csv("nypd_motor_vehicle.csv")

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
df = df_raw.copy()
df = df.join(df.DATE.str.extract('(?P<MONTH>\d+)\/(?P<DAY>\d+)\/(?P<YEAR>\d+)', expand=True))
df = df.join(df.TIME.str.extract('(?P<HOUR>\d+):(?P<MINUTE>\d+)', expand=True))

In [4]:
df = df.astype({
    "YEAR" : int,
    "MONTH" : int,
    "DAY" : int,
    "HOUR" : int,
    "MINUTE" : int,
})

In [5]:
dfs = df[(df[[c for c in df.columns if c.startswith("NUMBER OF")]] != 0).any(axis=1)]

In [6]:
def nan_to_none(v):
    if isinstance(v, float) and math.isnan(v):
        return None
    elif isinstance(v, str):
        return v.strip()
    else:
        return v

In [7]:
def to_rec(r):
    res = {
        "time" : "%(YEAR)4i-%(MONTH)02i-%(DAY)02iT%(HOUR)02i:%(MINUTE)02i" % r,
        "borough" : nan_to_none(r["BOROUGH"]),
        "vehicles" : [],
        "injured" : {
            "persons" : r["NUMBER OF PERSONS INJURED"],
            "pedestrians" : r["NUMBER OF PEDESTRIANS INJURED"],
            "cyclists" : r["NUMBER OF CYCLIST INJURED"],
            "motorists" : r["NUMBER OF MOTORIST INJURED"],
        },
        "killed" : {
            "persons" : r["NUMBER OF PERSONS KILLED"],
            "pedestrians" : r["NUMBER OF PEDESTRIANS KILLED"],
            "cyclists" : r["NUMBER OF CYCLIST KILLED"],
            "motorists" : r["NUMBER OF MOTORIST KILLED"],
        },
        "location" : {
            "lat" : r["LATITUDE"],
            "lon" : r["LONGITUDE"],
        },
        "zip" : "%05i" % r["ZIP CODE"] if nan_to_none(r["ZIP CODE"]) else None,
        "street" : {
            "off" : nan_to_none(r['OFF STREET NAME']),
            "on" : nan_to_none(r['ON STREET NAME']),
            "cross" : nan_to_none(r['CROSS STREET NAME']),
        }
    }
    
    for i in range(1, 6):
        v = {
            "type" : r["VEHICLE TYPE CODE %s" % i],
            "factor" : r["CONTRIBUTING FACTOR VEHICLE %s" % i],
        }
        
        if nan_to_none(v["type"]) or nan_to_none(v["factor"]):
            res["vehicles"].append(v)
            
    return res

In [8]:
import dicttoxml
import json

In [9]:
!mkdir source_data
!mkdir source_data/2014
!mkdir source_data/2015
!mkdir source_data/2016
!mkdir source_data/2017

mkdir: source_data: File exists
mkdir: source_data/2014: File exists
mkdir: source_data/2015: File exists
mkdir: source_data/2016: File exists
mkdir: source_data/2017: File exists


In [10]:
year_groups = dfs.groupby('YEAR')

In [12]:
year_groups.size()

YEAR
2012    20145
2013    40268
2014    37692
2015    38026
2016    43105
2017    37326
dtype: int64

In [14]:
dfs.to_sql?

In [57]:
year_groups.get_group(2015)[df_raw.columns].to_csv("source_data/2015/nypd_motor_vehicle.csv", index=False)

In [64]:
for (year, month), data in dfs.query("YEAR == 2016").groupby(["YEAR", "MONTH"]):
    rows = [to_rec(r) for _, r in data.iterrows()]
    json.dump(rows, open("source_data/2016/nypd_motor_vehicle.%04i.%02i.json" % (year, month), "w"), indent=2)

In [72]:
for (year, month), data in dfs.query("YEAR == 2017").groupby(["YEAR", "MONTH"]):
    rows = [to_rec(r) for _, r in data.iterrows()]
    with open("source_data/2017/nypd_motor_vehicle.%04i.%02i.xml" % (year, month), "w") as o:
        xml = dicttoxml.dicttoxml(rows, custom_root="motor_vehicle_incidents")
        o.write(xml.decode())

In [18]:
collisions = dfs

In [16]:
%load_ext sql

In [21]:
%sql sqlite:///nypd_collisions.db

'Connected: None@nypd_collisions.db'

In [22]:
%sql PERSIST collisions

'Persisted collisions'