# Cleaning Greenburgh, NY Police Data Sets

In [1]:
import geocoder
import pandas as pd
from pyproj import Proj, transform

In [2]:
filename_accidents = '~/git/greenburgh-town-supervisor-2019/src/data/police/vehicle_accidents_2018_2019.csv'
filename_larceny = '~/git/greenburgh-town-supervisor-2019/src/data/police/vehicle_larceny_2018_2019.csv'
output_filename_larceny = '~/git/greenburgh-town-supervisor-2019/output/police/vehicle_larceny_2018_2019_cleaned.csv'

In [3]:
vehicle_accidents = pd.read_csv(filename_accidents)
vehicle_larceny = pd.read_csv(filename_larceny)

In [4]:
def convert_scpc_lat_long(x, y, inproj_epsg=2260, outproj_epsg=4326):
    """
    Converting SCPC to Latitude/Longitude
    
    Greenburgh, NY is EPSG 2260: (NY East: https://www.dot.ny.gov/divisions/engineering/design/design-services/land-survey/repository/Chapter%206%20NYSDOT%20Coordinate%20Systems%20and%20Datums.pdf)
    Lookup EPSG Code here: https://spatialreference.org/ref/epsg/2260/
    """
    inProj = Proj(init='epsg:{}'.format(inproj_epsg), preserve_units=True)
    outProj = Proj(init='epsg:{}'.format(outproj_epsg))
    longitude, latitude = transform(inProj, outProj, x, y)
    return (latitude, longitude)

In [5]:
def geocode(address):
    """
    Geocode using Open Street Maps taking in Address as an argument
    """
    g = geocoder.osm(address)
    if g.status == 'ERROR - No results found':
        return None
    else:
        return (g.json['lat'], g.json['lng'])

## Cleaning Vehicle Larceny Data

In [6]:
vehicle_larceny['spcs_x_converted'] = vehicle_larceny.spcs_x / 100.00
vehicle_larceny['spcs_y_converted'] = vehicle_larceny.spcs_y / 100.00

In [7]:
vehicle_larceny.head()

Unnamed: 0,case_number,report_date,offense,location,status,status_time,officer,spcs_x,spcs_y,spcs_x_converted,spcs_y_converted
0,2018000093,1/3/18 16:14,GRAND LARCENY-4TH,480 WHITE PLAINS RD,INVESTIGATION,1/3/18,"BURNETT, G. A.",67314962,81294812,673149.62,812948.12
1,2018000445,1/13/18 0:08,PETIT LARCENY,777 OLD SAW MILL RIVER,INVESTIGATION,1/13/18,"MARZELLA, A.",67965338,81811000,679653.38,818110.0
2,2018000545,1/16/18 14:39,PETIT LARCENY,610 WHITE PLAINS RD,INVESTIGATION,1/16/18,"NECZESNY, M. M.",67546606,81231181,675466.06,812311.81
3,2018000926,1/27/18 12:22,PETIT LARCENY,9 HOLLAND PL,INVESTIGATION,1/27/18,"GARDNER, K. T.",68660119,79619500,686601.19,796195.0
4,2018000936,1/27/18 14:36,PETIT LARCENY,30 HOLLAND PL,INVESTIGATION,1/27/18,"GARDNER, K. T.",68676681,79645225,686766.81,796452.25


In [8]:
vehicle_larceny['lat_long'] = vehicle_larceny.apply(lambda x: convert_scpc_lat_long(
    x.spcs_x_converted, x.spcs_y_converted), axis=1)
vehicle_larceny[['latitude', 'longitude']] = pd.DataFrame(
    vehicle_larceny['lat_long'].tolist(), index=vehicle_larceny.index) 

In [9]:
vehicle_larceny.loc[vehicle_larceny.offense.str.contains('Grand', case=False), 'offense_type'] = 'Grand Larceny'
vehicle_larceny.loc[vehicle_larceny.offense.str.contains('Petit', case=False), 'offense_type'] = 'Petit Larceny'
vehicle_larceny.rename(columns={'status_time': 'status_date'}, inplace=True)

In [10]:
# Pruning final dataset
vehicle_larceny_final = vehicle_larceny[[
    'case_number',
    'report_date',
    'offense_type',
    'location',
    'status',
    'status_date',
    'officer',
    'latitude',
    'longitude',
]]
vehicle_larceny_final.head()

Unnamed: 0,case_number,report_date,offense_type,location,status,status_date,officer,latitude,longitude
0,2018000093,1/3/18 16:14,Grand Larceny,480 WHITE PLAINS RD,INVESTIGATION,1/3/18,"BURNETT, G. A.",41.063325,-73.843501
1,2018000445,1/13/18 0:08,Petit Larceny,777 OLD SAW MILL RIVER,INVESTIGATION,1/13/18,"MARZELLA, A.",41.077356,-73.81977
2,2018000545,1/16/18 14:39,Petit Larceny,610 WHITE PLAINS RD,INVESTIGATION,1/16/18,"NECZESNY, M. M.",41.061531,-73.835118
3,2018000926,1/27/18 12:22,Petit Larceny,9 HOLLAND PL,INVESTIGATION,1/27/18,"GARDNER, K. T.",41.017055,-73.795212
4,2018000936,1/27/18 14:36,Petit Larceny,30 HOLLAND PL,INVESTIGATION,1/27/18,"GARDNER, K. T.",41.017758,-73.794604


In [11]:
vehicle_larceny_final.to_csv(output_filename_larceny, index=False)

## Cleaning Vehicle Accident Data

In [12]:
vehicle_accidents.head()

Unnamed: 0,date,time,date_time,roadway,intersection,report_number
0,1/4/18,855,1/4/18 8:55,HILLCREST RD,,2018000119
1,1/4/18,2038,1/4/18 20:38,25 JEAN LN,,2018000146
2,1/5/18,1711,1/5/18 17:11,PAYNE ST,N HIGH ST,2018000168
3,1/7/18,1432,1/7/18 14:32,280 SECOR RD,,2018000227
4,1/10/18,804,1/10/18 8:04,UNDERHILL RD,,2018000321


In [13]:
vehicle_accidents.loc[pd.isnull(vehicle_accidents.intersection), 'address'] = vehicle_accidents.roadway + ', Greenburgh, NY'
vehicle_accidents.loc[pd.notnull(vehicle_accidents.intersection), 'address'] = \
     vehicle_accidents.roadway + ' AND ' + vehicle_accidents.intersection + ', Greenburgh, NY'

In [14]:
vehicle_accidents.head()

Unnamed: 0,date,time,date_time,roadway,intersection,report_number,address
0,1/4/18,855,1/4/18 8:55,HILLCREST RD,,2018000119,"HILLCREST RD, Greenburgh, NY"
1,1/4/18,2038,1/4/18 20:38,25 JEAN LN,,2018000146,"25 JEAN LN, Greenburgh, NY"
2,1/5/18,1711,1/5/18 17:11,PAYNE ST,N HIGH ST,2018000168,"PAYNE ST AND N HIGH ST, Greenburgh, NY"
3,1/7/18,1432,1/7/18 14:32,280 SECOR RD,,2018000227,"280 SECOR RD, Greenburgh, NY"
4,1/10/18,804,1/10/18 8:04,UNDERHILL RD,,2018000321,"UNDERHILL RD, Greenburgh, NY"


In [15]:
vehicle_accidents['lat_long'] = vehicle_accidents.address.apply(geocode)

Status code 429 from https://nominatim.openstreetmap.org/search: ERROR - 429 Client Error: Too Many Requests for url: https://nominatim.openstreetmap.org/search?q=JACKSON+AVE+AND+SPRAIN+BROOK+PKW%2C+Greenburgh%2C+NY&format=jsonv2&addressdetails=1&limit=1


TypeError: 'NoneType' object is not subscriptable

In [None]:
vehicle_accidents.head()