In [1]:
import warnings
warnings.simplefilter(action = "ignore", category = FutureWarning)

import pandas as pd
pd.set_option('display.max_colwidth', -1)

import numpy as np
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import matplotlib.colors as colors

import hdfs3 as hdfs
import os
import networkx as nx
import osmnx as ox

from tqdm import tqdm

from shapely.geometry import Point, LineString

%matplotlib inline
ox.config(use_cache=True, log_console=False)

In [2]:
hdfsFS = hdfs.HDFileSystem(os.environ["NAMENODE_HOSTNAME"], port=int(os.environ["NAMENODE_PORT"]))

In [3]:
with hdfsFS.open('/user/root/data/incidents_part1_part2.csv') as f:
    data = pd.read_csv(f, low_memory=False)

In [4]:
df = data.drop(['the_geom','the_geom_webmercator','point_x','point_y','objectid'],axis=1)

In [5]:
df['dispatch_date'] = pd.to_datetime(df['dispatch_date'])

In [6]:
df['location_block'] = df['location_block'].str.replace(' S ', ' SOUTH ')
df['location_block'] = df['location_block'].str.replace(' N ', ' NORTH ')
df['location_block'] = df['location_block'].str.replace(' E ', ' EAST ')
df['location_block'] = df['location_block'].str.replace(' W ', ' WEST ')
df['location_block'] = df['location_block'].str.replace('/', ' & ')
df['location_block'] = df['location_block'].str.replace(' ST', ' STREET')
df['location_block'] = df['location_block'].str.replace(' STREETREET', ' STREET')
df['location_block'] = df['location_block'].str.replace(' AV', ' AVENUE')
df['location_block'] = df['location_block'].str.replace(' AVENUEE ', ' AVENUE ')
df['location_block'] = df['location_block'].str.replace(' AVENUEE', ' AVENUE')
df['location_block'] = df['location_block'].str.replace(' BLVD', ' BOULEVARD')

In [7]:
df_2015 = df[df.dispatch_date.dt.year == 2015]

In [8]:
df_2015.head()

Unnamed: 0,lng,dc_dist,psa,dispatch_date_time,dispatch_date,dispatch_time,dc_key,location_block,ucr_general,text_general_code,lat,hour_
0,-75.173172,1,1,2015-12-17 07:26:00,2015-12-17,07:26:00,201501056051,1800 BLOCK SOUTH CHADWICK STREET,1400.0,Vandalism/Criminal Mischief,39.928303,7
1,-75.172603,1,1,2015-12-17 11:04:00,2015-12-17,11:04:00,201501056080,1500 BLOCK MC KEAN STREET,1400.0,Vandalism/Criminal Mischief,39.925975,11
2,-75.175081,1,1,2015-12-18 22:55:00,2015-12-18,22:55:00,201501056360,2200 BLOCK SOUTH 17TH STREET,1400.0,Vandalism/Criminal Mischief,39.922675,22
3,-75.174745,1,1,2015-12-19 00:54:00,2015-12-19,00:54:00,201501056375,2200 BLOCK SOUTH 17TH STREET,1400.0,Vandalism/Criminal Mischief,39.923722,0
4,-75.172683,1,1,2015-12-20 01:07:00,2015-12-20,01:07:00,201501056502,S 16TH STREET & MC KEAN STREET,1400.0,Vandalism/Criminal Mischief,39.926027,1


In [9]:
# # get a graph for some city
G = ox.graph_from_place('Philadelphia, Pennsylvania, USA', network_type='drive')

In [None]:
fig, ax = ox.plot_graph(G,fig_height=20,fig_width=20)

In [10]:
edges_data = nx.to_pandas_edgelist(G)[['name','source','target','geometry']].dropna(subset=['geometry'])

In [11]:
crimesloc_2015 = df_2015[['lng','lat','hour_','location_block']].copy()

In [12]:
crimesloc_2015.reset_index(drop=True,inplace=True)

In [13]:
crimesloc_2015.head()

Unnamed: 0,lng,lat,hour_,location_block
0,-75.173172,39.928303,7,1800 BLOCK SOUTH CHADWICK STREET
1,-75.172603,39.925975,11,1500 BLOCK MC KEAN STREET
2,-75.175081,39.922675,22,2200 BLOCK SOUTH 17TH STREET
3,-75.174745,39.923722,0,2200 BLOCK SOUTH 17TH STREET
4,-75.172683,39.926027,1,S 16TH STREET & MC KEAN STREET


In [14]:
len(crimesloc_2015)

183300

In [79]:
correction = 0

In [76]:
crimesloc_2015.iloc[correction].location_block

'PIA'

In [77]:
crimesloc_2015.at[correction,'location_block'] = 'PIA WAY'
crimesloc_2015.iloc[correction].location_block

'PIA WAY'

In [22]:
def closest_d(df1,df2):
    cols = ['d','name','source','target','geometry']
    clo_d = pd.DataFrame(columns=cols)
    
    with tqdm(total=len(list(df1.loc[correction:].iterrows())),unit=' points') as pbar:
        for index,row in df1.loc[correction:].iterrows():
            distances = []
            s = row['location_block'].split(' ')[-2:]
            df3 = df2[(df2['name'].str.get(0).str.contains('' + s[0] + ' ' + s[1],case=False,na=True)) | \
                    (df2['name'].str.get(1).str.contains('' + s[0] + ' ' + s[1],case=False,na=True))]
            for i,r in df3.iterrows():
                distances.append([r['geometry'].distance(Point(row['lng'],row['lat'])),\
                                r['name'],r['source'],r['target'],r['geometry']])
            d_df = pd.DataFrame(distances,columns=cols)
            clo_d = clo_d.append(d_df.loc[d_df['d'].idxmin()],ignore_index=True)
            pbar.update(1)
        return clo_d

In [80]:
c_d = closest_d(crimesloc_2015,edges_data)

  if __name__ == '__main__':
100%|██████████| 183300/183300 [9:41:44<00:00,  5.10 points/s]  


In [81]:
c_d.head()

Unnamed: 0,d,name,source,target,geometry
0,0.014029,,2705005307,109756384,"LINESTRING (-75.1722845 39.9143027, -75.17244599999999 39.913488, -75.172487 39.913269, -75.17251589999999 39.9131479, -75.17247500000001 39.9130269)"
1,0.011676,,2705005307,109756384,"LINESTRING (-75.1722845 39.9143027, -75.17244599999999 39.913488, -75.172487 39.913269, -75.17251589999999 39.9131479, -75.17247500000001 39.9130269)"
2,0.008827,,2705005307,109756384,"LINESTRING (-75.1722845 39.9143027, -75.17244599999999 39.913488, -75.172487 39.913269, -75.17251589999999 39.9131479, -75.17247500000001 39.9130269)"
3,0.009735,,2705005307,109756384,"LINESTRING (-75.1722845 39.9143027, -75.17244599999999 39.913488, -75.172487 39.913269, -75.17251589999999 39.9131479, -75.17247500000001 39.9130269)"
4,0.011731,,2705005307,109756384,"LINESTRING (-75.1722845 39.9143027, -75.17244599999999 39.913488, -75.172487 39.913269, -75.17251589999999 39.9131479, -75.17247500000001 39.9130269)"


In [82]:
c_d.to_csv('crimesloc_2015.csv')

In [83]:
len(c_d)

183300

## Use groupby that will do a count of crimes for each street which we can then add as an edge attribute to our network.

In [84]:
results_2015 = pd.concat([crimesloc_2015, c_d], axis=1,ignore_index=True)

In [85]:
results_2015.columns = ['lng','lat','hour_','location_block','d','name','source','target','geometry']
results_2015.head(5)

Unnamed: 0,lng,lat,hour_,location_block,d,name,source,target,geometry
0,-75.173172,39.928303,7,1800 BLOCK SOUTH CHADWICK STREET,0.014029,,2705005307,109756384,"LINESTRING (-75.1722845 39.9143027, -75.17244599999999 39.913488, -75.172487 39.913269, -75.17251589999999 39.9131479, -75.17247500000001 39.9130269)"
1,-75.172603,39.925975,11,1500 BLOCK MC KEAN STREET,0.011676,,2705005307,109756384,"LINESTRING (-75.1722845 39.9143027, -75.17244599999999 39.913488, -75.172487 39.913269, -75.17251589999999 39.9131479, -75.17247500000001 39.9130269)"
2,-75.175081,39.922675,22,2200 BLOCK SOUTH 17TH STREET,0.008827,,2705005307,109756384,"LINESTRING (-75.1722845 39.9143027, -75.17244599999999 39.913488, -75.172487 39.913269, -75.17251589999999 39.9131479, -75.17247500000001 39.9130269)"
3,-75.174745,39.923722,0,2200 BLOCK SOUTH 17TH STREET,0.009735,,2705005307,109756384,"LINESTRING (-75.1722845 39.9143027, -75.17244599999999 39.913488, -75.172487 39.913269, -75.17251589999999 39.9131479, -75.17247500000001 39.9130269)"
4,-75.172683,39.926027,1,S 16TH STREET & MC KEAN STREET,0.011731,,2705005307,109756384,"LINESTRING (-75.1722845 39.9143027, -75.17244599999999 39.913488, -75.172487 39.913269, -75.17251589999999 39.9131479, -75.17247500000001 39.9130269)"


In [86]:
results_2015_2 = results_2015.groupby(['source','target'])
r2015_grouped = results_2015_2.location_block.count().reset_index()
r2015_grouped.columns=['source','target','crime_count_2015']
r2015_grouped.head()

Unnamed: 0,source,target,crime_count_2015
0,109727728,110216264,105
1,109727790,109881405,89
2,109727799,109816659,20
3,109727859,110250413,1
4,109729590,109729330,2


In [87]:
len(r2015_grouped)

1206

In [88]:
full_edges_data = nx.to_pandas_edgelist(G)
full_edges_data.head(2)

Unnamed: 0,access,bridge,geometry,highway,junction,lanes,length,maxspeed,name,oneway,osmid,ref,service,source,target,tunnel
0,,,,residential,,,15.587,,South 24th Street,False,12189760,,,109903872,469948693,
1,,,"LINESTRING (-75.18837379999999 39.9196918, -75.18812269999999 39.9194224, -75.1879605 39.9192863, -75.187809 39.9191725, -75.1876494 39.9190694, -75.1874632 39.9189685, -75.18728179999999 39.9188885, -75.1871017 39.9188266, -75.1870328 39.9188061, -75.1869623 39.9187851, -75.1868419 39.9187507, -75.18668460000001 39.9187154, -75.1866031 39.9187069, -75.1860543 39.91865)",primary,,,239.255,,Oregon Avenue,True,"[96161681, 423969707]",,,109903872,2124308758,


In [89]:
len(full_edges_data)

61859

In [90]:
full_merged = pd.merge(left=full_edges_data,right=r2015_grouped,how='left',\
                       left_on=['source','target'],right_on=['source','target'])
full_merged['crime_count_2015'].fillna(0, inplace=True)

In [91]:
full_merged.head(2)

Unnamed: 0,access,bridge,geometry,highway,junction,lanes,length,maxspeed,name,oneway,osmid,ref,service,source,target,tunnel,crime_count_2015
0,,,,residential,,,15.587,,South 24th Street,False,12189760,,,109903872,469948693,,0.0
1,,,"LINESTRING (-75.18837379999999 39.9196918, -75.18812269999999 39.9194224, -75.1879605 39.9192863, -75.187809 39.9191725, -75.1876494 39.9190694, -75.1874632 39.9189685, -75.18728179999999 39.9188885, -75.1871017 39.9188266, -75.1870328 39.9188061, -75.1869623 39.9187851, -75.1868419 39.9187507, -75.18668460000001 39.9187154, -75.1866031 39.9187069, -75.1860543 39.91865)",primary,,,239.255,,Oregon Avenue,True,"[96161681, 423969707]",,,109903872,2124308758,,0.0


In [92]:
full_merged.to_csv('full_edges_2015.csv')

In [93]:
len(full_merged)

61859

In [94]:
full_merged_nx = nx.from_pandas_edgelist(full_merged,'source','target',edge_attr=True)

for n in full_merged_nx.edges.items():
    for k,v in n[1].copy().items():
        if v != v:
            n[1].pop(k)

In [95]:
full_merged_nx.edges.data()

EdgeDataView([(109903872, 469948693, {'highway': 'residential', 'length': 15.587, 'name': 'South 24th Street', 'oneway': False, 'osmid': 12189760, 'crime_count_2015': 0.0}), (109903872, 2124308758, {'geometry': <shapely.geometry.linestring.LineString object at 0x7fcd0dae36a0>, 'highway': 'primary', 'length': 239.255, 'name': 'Oregon Avenue', 'oneway': True, 'osmid': [96161681, 423969707], 'crime_count_2015': 0.0}), (109903872, 2888405205, {'geometry': <shapely.geometry.linestring.LineString object at 0x7fcd0d3c9fd0>, 'highway': 'primary', 'length': 273.331, 'name': ['Oregon Avenue/Vare Avenue', 'West Oregon Vare Avenue'], 'oneway': True, 'osmid': [39228052, 423969693], 'crime_count_2015': 98.0}), (469948693, 110453015, {'geometry': <shapely.geometry.linestring.LineString object at 0x7fcd0dcd85c0>, 'highway': 'residential', 'length': 89.441, 'name': 'South 24th Street', 'oneway': False, 'osmid': 196393963, 'crime_count_2015': 0.0}), (469948693, 2124308764, {'geometry': <shapely.geometry