In [68]:
# Put the parent directory first on the import path, so that `open_cp`
# can be imported without being installed
import sys, os.path
_parent_dir = os.path.abspath("..")
sys.path.insert(0, _parent_dir)

# Resources for address data

The [TIGER/Line](https://www.census.gov/geo/maps-data/data/tiger-line.html) dataset from the US Census bureau provides a great resource for getting block-level address data.
- [Technical documentation](https://www2.census.gov/geo/pdfs/maps-data/data/tiger/tgrshp2016/TGRSHP2016_TechDoc.pdf)
- It is easiest to download the data from the [FTP Site](ftp://ftp2.census.gov/geo/tiger/TIGER2016/ADDRFEAT/)
- Here I link to the ADDRFEAT datasets which provide the best geocoding data, according to the tech documentation.
- The files are named using the [FIPS county code](https://en.wikipedia.org/wiki/FIPS_county_code).  For Chicago (Cook County, Illinois) this is 17031.
- Once downloaded, unzip, and find a standard shape file.  It is interesting to load this, and an extract of the crime data (e.g. by loading into a GeoDataFrame and saving), into QGIS and visualising the result.

It is also possible to extract address details from OpenStreetMap.  In particular, for Chicago, there are a very large number of individual buildings, with addresses, available.  However, I have also noticed that the OSM data is not as complete as the US Census dataset (for example, minor roads missing names, even when those roads appear in the crime dataset).

The Census data from 2013 seems good to use.  The 2016 data seems to be more "aggregated": it is easy to find examples of a single block which is split into, say, 3 paths in the geometry.  In the 2013 data each part will contain a range of addresses, while in the 2016 data, only one part contains all the addresses, the other parts having "None" as the address range.

In [33]:
%matplotlib inline
import matplotlib.pyplot as plt
import geopandas as gpd
import os
import numpy as np

# Directory holding the unzipped TIGER/Line edges shapefile.  Set the
# TIGER_EDGES_DIR environment variable to read the data from somewhere else;
# when it is unset the original hard-coded location is used.
tiger_edges_dir = os.environ.get(
    "TIGER_EDGES_DIR", os.path.join("/media", "disk", "tl_2013_17031_edges"))
filename = os.path.join(tiger_edges_dir, "tl_2013_17031_edges.shp")

In [2]:
# Read the edges shapefile located at `filename` into `edges`
edges = gpd.read_file(filename)

In [157]:
# Show the coordinate reference system of the edge data.  The code is
# slightly curious if you look it up, but the coordinates are lon/lat.
edges.crs

{'init': 'epsg:4269'}

In [66]:
# Inspect a single edge record.  `.loc` does the label-based lookup which
# `.ix` performed here; `.ix` itself no longer exists in current pandas.
edges.loc[12]

ARTPATH                                                       N
COUNTYFP                                                    031
DECKEDROAD                                                    N
DIVROAD                                                       N
EXTTYP                                                        N
FEATCAT                                                       S
FULLNAME                                              N Park Dr
GCSEFLG                                                       N
HYDROFLG                                                      N
LFROMADD                                                   None
LTOADD                                                     None
MTFCC                                                     S1400
OFFSETL                                                       N
OFFSETR                                                       N
OLFFLG                                                        N
PASSFLG                                 

We only care about the columns "geometry", "FULLNAME" (giving the road name), and the address-range columns "LFROMADD", "LTOADD", "RFROMADD" and "RTOADD".

In [158]:
# Keep only the road name, the four address-range columns and the geometry.
# Selecting with an ordered list directly on the existing frame (rather than
# rebuilding a frame from a set of keys) gives a fixed column order and
# leaves the geometry column, and the settings attached to it, in place.
want = ["FULLNAME", "LFROMADD", "LTOADD", "RFROMADD", "RTOADD", "geometry"]
edges = edges[want]
edges.head()

Unnamed: 0,FULLNAME,LFROMADD,LTOADD,RFROMADD,RTOADD,geometry
0,,,,,,"LINESTRING (-87.60164199999998 41.845196, -87...."
1,,,,,,LINESTRING (-87.11116199999998 42.149408999999...
2,,,,,,"LINESTRING (-87.62505699999998 42.071528, -87...."
3,,,,,,LINESTRING (-87.74845399999998 42.152859999999...
4,,,,,,"LINESTRING (-87.63278799999999 41.970942, -87...."


In [93]:
import open_cp.sources.chicago as chicago

# Load the data via `open_cp`, then keep only rows which have a geometry
frame = chicago.load_to_geoDataFrame()
has_geometry = frame.geometry.map(lambda pt : pt is not None)
frame = frame[has_geometry]

# Pick out one block

We'll eventually iterate over each block, but let's just pick one.

In [171]:
# Take (a copy of) every row sharing the first distinct address in the data
first_address = frame.address.unique()[0]
one = frame[frame.address == first_address].copy()
one.head()

Unnamed: 0,address,case,crime,geometry,location,timestamp,type,name
0,010XX N CENTRAL PARK AVE,HZ560767,OTHER OFFENSE,POINT (-87.71645415899999 41.899712716),APARTMENT,2016-12-22T02:55:00,VIOLATE ORDER OF PROTECTION,N Central Park Ave
21374,010XX N CENTRAL PARK AVE,JA210311,THEFT,POINT (-87.716462854 41.899954256),VEHICLE NON-COMMERCIAL,2017-04-01T07:00:00,$500 AND UNDER,N Central Park Ave
23295,010XX N CENTRAL PARK AVE,JA210493,ASSAULT,POINT (-87.71645193800001 41.899561771),STREET,2017-04-03T08:03:00,AGGRAVATED: HANDGUN,N Central Park Ave
24540,010XX N CENTRAL PARK AVE,HZ342785,BATTERY,POINT (-87.716457119 41.899786826),RESIDENTIAL YARD (FRONT/BACK),2016-07-09T22:40:00,AGGRAVATED: HANDGUN,N Central Park Ave
54066,010XX N CENTRAL PARK AVE,HZ259442,NARCOTICS,POINT (-87.716463039 41.899935047),ALLEY,2016-05-10T20:57:00,POSS: HEROIN(WHITE),N Central Park Ave


# Distances

In [187]:
def find_match_via_distance(point):
    """Return the row of `edges` whose geometry is closest to `point`.

    This is a brute-force search: the distance from `point` to every
    geometry in `edges` is computed and the smallest one is selected.

    :param point: The geometry to match against the edges.

    :return: The matching row of `edges`; its `name` attribute is the index
      label of that row.
    """
    dist = edges.geometry.distance(point)
    # `idxmin()` returns the index *label* of the smallest distance, which is
    # what the label-based `.loc` lookup needs.  (Previously the `argmin`
    # method itself, uncalled, was handed to the indexer.)
    return edges.loc[dist.idxmin()]

def via_distance(one):
    """Match every geometry in `one` to an edge by brute-force distance.

    :param one: Frame whose `geometry` entries are to be matched.

    :return: List, in the order of `one.geometry`, of the `name` of the row
      returned by `find_match_via_distance` for each geometry.
    """
    labels = []
    for point in one.geometry:
        match = find_match_via_distance(point)
        labels.append(match.name)
    return labels

# Store the edge found by the brute-force distance search for each row
labels_by_distance = via_distance(one)
one["edge_index"] = labels_by_distance
one.head()

Unnamed: 0,address,case,crime,geometry,location,timestamp,type,name,edge_index,edge_index1
0,010XX N CENTRAL PARK AVE,HZ560767,OTHER OFFENSE,POINT (-87.71645415899999 41.899712716),APARTMENT,2016-12-22T02:55:00,VIOLATE ORDER OF PROTECTION,N Central Park Ave,164302,164302
21374,010XX N CENTRAL PARK AVE,JA210311,THEFT,POINT (-87.716462854 41.899954256),VEHICLE NON-COMMERCIAL,2017-04-01T07:00:00,$500 AND UNDER,N Central Park Ave,164302,164302
23295,010XX N CENTRAL PARK AVE,JA210493,ASSAULT,POINT (-87.71645193800001 41.899561771),STREET,2017-04-03T08:03:00,AGGRAVATED: HANDGUN,N Central Park Ave,164302,164302
24540,010XX N CENTRAL PARK AVE,HZ342785,BATTERY,POINT (-87.716457119 41.899786826),RESIDENTIAL YARD (FRONT/BACK),2016-07-09T22:40:00,AGGRAVATED: HANDGUN,N Central Park Ave,164302,164302
54066,010XX N CENTRAL PARK AVE,HZ259442,NARCOTICS,POINT (-87.716463039 41.899935047),ALLEY,2016-05-10T20:57:00,POSS: HEROIN(WHITE),N Central Park Ave,164302,164302


# Intersections

In [189]:
def find_match_via_intersection(point, radius=0.001):
    """Return the row of `edges` closest to `point`, pre-filtering by a buffer.

    Only the edges which intersect a buffer of size `radius` around `point`
    are considered for the distance computation.  If no edge intersects the
    buffer, every edge is considered instead.

    :param point: The geometry to match against the edges.
    :param radius: Size of the buffer drawn around `point`, in the units of
      the edge coordinates (presumably degrees, as the data is lon/lat).

    :return: The matching row of `edges`; its `name` attribute is the index
      label of that row.
    """
    candidates = edges[edges.geometry.intersects(point.buffer(radius))]
    if candidates.empty:
        # Nothing within `radius`: taking the minimum of an empty selection
        # would raise, so search the whole frame instead.
        candidates = edges
    # `idxmin()` yields an index *label* (valid in `edges` as well), whereas
    # `argmin()` in current pandas yields a position inside `candidates`.
    nearest = candidates.geometry.distance(point).idxmin()
    return edges.loc[nearest]

def via_intersection(one):
    """Match every geometry in `one` to an edge using the buffer pre-filter.

    :param one: Frame whose `geometry` entries are to be matched.

    :return: List, in the order of `one.geometry`, of the `name` of the row
      returned by `find_match_via_intersection` for each geometry.
    """
    labels = []
    for point in one.geometry:
        match = find_match_via_intersection(point)
        labels.append(match.name)
    return labels
    
# Store the edge found via the buffer intersection in its own column
labels_by_intersection = via_intersection(one)
one["edge_index1"] = labels_by_intersection
one.head()

Unnamed: 0,address,case,crime,geometry,location,timestamp,type,name,edge_index,edge_index1
0,010XX N CENTRAL PARK AVE,HZ560767,OTHER OFFENSE,POINT (-87.71645415899999 41.899712716),APARTMENT,2016-12-22T02:55:00,VIOLATE ORDER OF PROTECTION,N Central Park Ave,164302,164302
21374,010XX N CENTRAL PARK AVE,JA210311,THEFT,POINT (-87.716462854 41.899954256),VEHICLE NON-COMMERCIAL,2017-04-01T07:00:00,$500 AND UNDER,N Central Park Ave,164302,164302
23295,010XX N CENTRAL PARK AVE,JA210493,ASSAULT,POINT (-87.71645193800001 41.899561771),STREET,2017-04-03T08:03:00,AGGRAVATED: HANDGUN,N Central Park Ave,164302,164302
24540,010XX N CENTRAL PARK AVE,HZ342785,BATTERY,POINT (-87.716457119 41.899786826),RESIDENTIAL YARD (FRONT/BACK),2016-07-09T22:40:00,AGGRAVATED: HANDGUN,N Central Park Ave,164302,164302
54066,010XX N CENTRAL PARK AVE,HZ259442,NARCOTICS,POINT (-87.716463039 41.899935047),ALLEY,2016-05-10T20:57:00,POSS: HEROIN(WHITE),N Central Park Ave,164302,164302


In [190]:
# Do the distance and intersection approaches agree on every row?
agree = one.edge_index == one.edge_index1
np.all(agree)

True

In [191]:
# Time the brute-force distance approach on the rows in `one`
%timeit(via_distance(one))

1 loop, best of 3: 1min 2s per loop


In [192]:
# Time the buffer-intersection approach on the rows in `one`
%timeit(via_intersection(one))

1 loop, best of 3: 44.5 s per loop


# Via rtree

Need a bit more testing to be sure, but this seems vastly, vastly (vastly) quicker...

In [193]:
import rtree

In [198]:
# Margin by which each edge's bounding box is grown, on every side, before
# being added to the spatial index
gap = 1e-3

def gen():
    """Yield one `(id, bounds, None)` triple per row of `edges`.

    The id is the row's index label and the bounds are the bounds of the
    row's geometry, enlarged by `gap` in each direction.
    """
    for label, row in edges.iterrows():
        box = row.geometry.bounds
        padded = [box[0] - gap, box[1] - gap, box[2] + gap, box[3] + gap]
        yield label, padded, None
        
# Build the rtree spatial index from the items produced by `gen()`
idx = rtree.index.Index(gen())

In [210]:
def find_match_via_rtee(point):
    """Return the row of `edges` closest to `point`, pre-filtering with `idx`.

    The spatial index `idx` is queried with the first coordinate of `point`
    to find candidate edges, and only those candidates have their distance
    to `point` computed.  If the index returns no candidates, every edge is
    considered instead.

    :param point: The geometry to match against the edges.

    :return: The matching row of `edges`; its `name` attribute is the index
      label of that row.
    """
    candidate_labels = list(idx.intersection(point.coords[0]))
    if candidate_labels:
        candidates = edges.loc[candidate_labels]
    else:
        # Taking the minimum of an empty selection would raise, so search
        # the whole frame when the index finds nothing.
        candidates = edges
    # `idxmin()` yields an index *label* (valid in `edges` as well), whereas
    # `argmin()` in current pandas yields a position inside `candidates`.
    nearest = candidates.geometry.distance(point).idxmin()
    return edges.loc[nearest]

def via_rtree(one):
    """Match every geometry in `one` to an edge using the rtree pre-filter.

    :param one: Frame whose `geometry` entries are to be matched.

    :return: List, in the order of `one.geometry`, of the `name` of the row
      returned by `find_match_via_rtee` for each geometry.
    """
    labels = []
    for point in one.geometry:
        match = find_match_via_rtee(point)
        labels.append(match.name)
    return labels
    
# Store the edge found via the rtree index in its own column
labels_by_rtree = via_rtree(one)
one["edge_index2"] = labels_by_rtree
one.head()

Unnamed: 0,address,case,crime,geometry,location,timestamp,type,name,edge_index,edge_index1,edge_index2
0,010XX N CENTRAL PARK AVE,HZ560767,OTHER OFFENSE,POINT (-87.71645415899999 41.899712716),APARTMENT,2016-12-22T02:55:00,VIOLATE ORDER OF PROTECTION,N Central Park Ave,164302,164302,164302
21374,010XX N CENTRAL PARK AVE,JA210311,THEFT,POINT (-87.716462854 41.899954256),VEHICLE NON-COMMERCIAL,2017-04-01T07:00:00,$500 AND UNDER,N Central Park Ave,164302,164302,164302
23295,010XX N CENTRAL PARK AVE,JA210493,ASSAULT,POINT (-87.71645193800001 41.899561771),STREET,2017-04-03T08:03:00,AGGRAVATED: HANDGUN,N Central Park Ave,164302,164302,164302
24540,010XX N CENTRAL PARK AVE,HZ342785,BATTERY,POINT (-87.716457119 41.899786826),RESIDENTIAL YARD (FRONT/BACK),2016-07-09T22:40:00,AGGRAVATED: HANDGUN,N Central Park Ave,164302,164302,164302
54066,010XX N CENTRAL PARK AVE,HZ259442,NARCOTICS,POINT (-87.716463039 41.899935047),ALLEY,2016-05-10T20:57:00,POSS: HEROIN(WHITE),N Central Park Ave,164302,164302,164302


In [211]:
# Do the intersection and rtree approaches agree on every row?
matches = one.edge_index1 == one.edge_index2
all(matches)

True

In [212]:
# Time the rtree approach on the rows in `one`
%timeit(via_rtree(one))

10 loops, best of 3: 19.7 ms per loop
