# Cleaning Greenburgh, NY Police Data Sets

In [31]:
import geocoder
import numpy as np
import pandas as pd
from pyproj import Proj, transform

In [20]:
filename_accidents = '~/git/greenburgh-town-supervisor-2019/src/data/police/vehicle_accidents_2018_2019.csv'
filename_larceny = '~/git/greenburgh-town-supervisor-2019/src/data/police/vehicle_larceny_2018_2019.csv'
output_filename_larceny = '~/git/greenburgh-town-supervisor-2019/output/police/vehicle_larceny_2018_2019_cleaned.csv'
output_filename_accidents = '~/git/greenburgh-town-supervisor-2019/output/police/vehicle_accidents_2018_2019_cleaned.csv'

In [21]:
vehicle_accidents = pd.read_csv(filename_accidents)
vehicle_larceny = pd.read_csv(filename_larceny)

In [22]:
def convert_scpc_lat_long(x, y, inproj_epsg=2260, outproj_epsg=4326):
    """
    Converting SCPC to Latitude/Longitude
    
    Greenburgh, NY is EPSG 2260: (NY East: https://www.dot.ny.gov/divisions/engineering/design/design-services/land-survey/repository/Chapter%206%20NYSDOT%20Coordinate%20Systems%20and%20Datums.pdf)
    Lookup EPSG Code here: https://spatialreference.org/ref/epsg/2260/
    """
    inProj = Proj(init='epsg:{}'.format(inproj_epsg), preserve_units=True)
    outProj = Proj(init='epsg:{}'.format(outproj_epsg))
    longitude, latitude = transform(inProj, outProj, x, y)
    return (latitude, longitude)

In [23]:
def geocode(address):
    """
    Geocode using Open Street Maps taking in Address as an argument
    """
    print(address)
    g = geocoder.osm(address)
    if g.status == 'ERROR - No results found':
        return None
    else:
        return (g.json['lat'], g.json['lng'])

## Cleaning Vehicle Larceny Data

In [24]:
vehicle_larceny['spcs_x_converted'] = vehicle_larceny.spcs_x / 100.00
vehicle_larceny['spcs_y_converted'] = vehicle_larceny.spcs_y / 100.00

In [25]:
vehicle_larceny.head()

Unnamed: 0,case_number,report_date,offense,location,status,status_time,officer,spcs_x,spcs_y,spcs_x_converted,spcs_y_converted
0,2018000093,1/3/18 16:14,GRAND LARCENY-4TH,480 WHITE PLAINS RD,INVESTIGATION,1/3/18,"BURNETT, G. A.",67314962,81294812,673149.62,812948.12
1,2018000445,1/13/18 0:08,PETIT LARCENY,777 OLD SAW MILL RIVER,INVESTIGATION,1/13/18,"MARZELLA, A.",67965338,81811000,679653.38,818110.0
2,2018000545,1/16/18 14:39,PETIT LARCENY,610 WHITE PLAINS RD,INVESTIGATION,1/16/18,"NECZESNY, M. M.",67546606,81231181,675466.06,812311.81
3,2018000926,1/27/18 12:22,PETIT LARCENY,9 HOLLAND PL,INVESTIGATION,1/27/18,"GARDNER, K. T.",68660119,79619500,686601.19,796195.0
4,2018000936,1/27/18 14:36,PETIT LARCENY,30 HOLLAND PL,INVESTIGATION,1/27/18,"GARDNER, K. T.",68676681,79645225,686766.81,796452.25


In [26]:
vehicle_larceny['lat_long'] = vehicle_larceny.apply(lambda x: convert_scpc_lat_long(
    x.spcs_x_converted, x.spcs_y_converted), axis=1)
vehicle_larceny[['latitude', 'longitude']] = pd.DataFrame(
    vehicle_larceny['lat_long'].tolist(), index=vehicle_larceny.index) 

In [27]:
vehicle_larceny.loc[vehicle_larceny.offense.str.contains('Grand', case=False), 'offense_type'] = 'Grand Larceny'
vehicle_larceny.loc[vehicle_larceny.offense.str.contains('Petit', case=False), 'offense_type'] = 'Petit Larceny'
vehicle_larceny.rename(columns={'status_time': 'status_date'}, inplace=True)

In [28]:
vehicle_larceny['street'] = vehicle_larceny['location'].str.replace('\d+', '').str.strip()

In [53]:
random_noise_lat = np.random.choice(a=[-0.0002, -0.0001, 0.0001, 0.0002], size=vehicle_larceny.shape[0])
random_noise_long = np.random.choice(a=[-0.0002, -0.0001, 0.0001, 0.0002], size=vehicle_larceny.shape[0])
vehicle_larceny['Latitude (Randomized)'] = vehicle_larceny.latitude + random_noise_lat
vehicle_larceny['Longitude (Randomized)'] = vehicle_larceny.longitude + random_noise_long

In [56]:
# Pruning final dataset
vehicle_larceny_final = vehicle_larceny[[
    'case_number',
    'report_date',
    'offense_type',
    'street',
    'status',
    'status_date',
    'officer',
    'Latitude (Randomized)',
    'Longitude (Randomized)',
]]

vehicle_larceny_final.rename(columns={
    'case_number': 'Case Number',
    'report_date': 'Report Date',
    'offense_type': 'Offense Type',
    'street': 'Street',
    'status': 'Status',
    'status_date': 'Status Date',
    'officer': 'Officer'}, inplace=True)

vehicle_larceny_final.head()

Unnamed: 0,Case Number,Report Date,Offense Type,Street,Status,Status Date,Officer,Latitude (Randomized),Longitude (Randomized)
0,2018000093,1/3/18 16:14,Grand Larceny,WHITE PLAINS RD,INVESTIGATION,1/3/18,"BURNETT, G. A.",41.063125,-73.843401
1,2018000445,1/13/18 0:08,Petit Larceny,OLD SAW MILL RIVER,INVESTIGATION,1/13/18,"MARZELLA, A.",41.077156,-73.81957
2,2018000545,1/16/18 14:39,Petit Larceny,WHITE PLAINS RD,INVESTIGATION,1/16/18,"NECZESNY, M. M.",41.061731,-73.835018
3,2018000926,1/27/18 12:22,Petit Larceny,HOLLAND PL,INVESTIGATION,1/27/18,"GARDNER, K. T.",41.017255,-73.795312
4,2018000936,1/27/18 14:36,Petit Larceny,HOLLAND PL,INVESTIGATION,1/27/18,"GARDNER, K. T.",41.017658,-73.794804


In [57]:
vehicle_larceny_final.to_csv(output_filename_larceny, index=False)

### Creating de-identified Vehicle Larceny Dataset

In [12]:
vehicle_larceny.head()

Unnamed: 0,case_number,report_date,offense,location,status,status_time,officer,spcs_x,spcs_y
0,2018000093,1/3/18 16:14,GRAND LARCENY-4TH,480 WHITE PLAINS RD,INVESTIGATION,1/3/18,"BURNETT, G. A.",67314962,81294812
1,2018000445,1/13/18 0:08,PETIT LARCENY,777 OLD SAW MILL RIVER,INVESTIGATION,1/13/18,"MARZELLA, A.",67965338,81811000
2,2018000545,1/16/18 14:39,PETIT LARCENY,610 WHITE PLAINS RD,INVESTIGATION,1/16/18,"NECZESNY, M. M.",67546606,81231181
3,2018000926,1/27/18 12:22,PETIT LARCENY,9 HOLLAND PL,INVESTIGATION,1/27/18,"GARDNER, K. T.",68660119,79619500
4,2018000936,1/27/18 14:36,PETIT LARCENY,30 HOLLAND PL,INVESTIGATION,1/27/18,"GARDNER, K. T.",68676681,79645225


In [15]:
vehicle_larceny['street'] = vehicle_larceny['location'].str.replace('\d+', '').str.strip()

In [16]:
vehicle_larceny.head()

Unnamed: 0,case_number,report_date,offense,location,status,status_time,officer,spcs_x,spcs_y,street
0,2018000093,1/3/18 16:14,GRAND LARCENY-4TH,480 WHITE PLAINS RD,INVESTIGATION,1/3/18,"BURNETT, G. A.",67314962,81294812,WHITE PLAINS RD
1,2018000445,1/13/18 0:08,PETIT LARCENY,777 OLD SAW MILL RIVER,INVESTIGATION,1/13/18,"MARZELLA, A.",67965338,81811000,OLD SAW MILL RIVER
2,2018000545,1/16/18 14:39,PETIT LARCENY,610 WHITE PLAINS RD,INVESTIGATION,1/16/18,"NECZESNY, M. M.",67546606,81231181,WHITE PLAINS RD
3,2018000926,1/27/18 12:22,PETIT LARCENY,9 HOLLAND PL,INVESTIGATION,1/27/18,"GARDNER, K. T.",68660119,79619500,HOLLAND PL
4,2018000936,1/27/18 14:36,PETIT LARCENY,30 HOLLAND PL,INVESTIGATION,1/27/18,"GARDNER, K. T.",68676681,79645225,HOLLAND PL


## Cleaning Vehicle Accident Data

In [7]:
def process_vehicle_accident_data(vehicle_accidents_df, vehicle_accidents_geocoded_df=None):
    """
    Process Vehicle Accident Data
    
    If given a data frame that is already geocoded, check against it to make sure we're not duplicating 
    reports that geocode (since there is rate limiting)
    """
    write_header = True
    if vehicle_accidents_geocoded_df is not None:
        vehicle_accidents_df = vehicle_accidents.loc[~vehicle_accidents.report_number.isin(vehicle_accidents_cleaned.report_number)]
        write_header = False
    
    vehicle_accidents_df.loc[pd.isnull(vehicle_accidents_df.intersection), 'address'] = vehicle_accidents_df.roadway + ', Greenburgh, NY'
    vehicle_accidents_df.loc[pd.notnull(vehicle_accidents_df.intersection), 'address'] = \
        vehicle_accidents_df.roadway + ' AND ' + vehicle_accidents_df.intersection + ', Greenburgh, NY'
    
    # Manual address settings based on email sent by Lucas on 2019-08-05 at 6:27 PM 
    vehicle_accidents_df.loc[vehicle_accidents_df.roadway == 'HIGHRIDGE RD', 'address'] = 'HIGHRIDGE RD, Hartsdale, NY'
    vehicle_accidents_df.loc[vehicle_accidents_df.roadway == 'CHESTNUT ST', 'address'] = 'CHESTNUT ST, Ardsley, NY'
    vehicle_accidents_df.loc[vehicle_accidents_df.roadway == '125 N WASHINGTON AVE', 'address'] = '125 N WASHINGTON AVE, Hartsdale, NY'
    
    reports = []
    for index, row in vehicle_accidents_df.iterrows():
        report = row.to_frame().transpose()
        result = geocode(row['address'])
        if result:
            lat, long = result
            report['latitude'] = lat
            report['longitude'] = long
            reports.append(report)
            final_output = pd.concat(reports)
            final_output.to_csv(output_filename_accidents, mode='a', index=False, header=write_header)
    
    return final_output
    

In [8]:
vehicle_accidents_cleaned = pd.read_csv(output_filename_accidents)

In [9]:
vehicle_accidents_cleaned.head()

Unnamed: 0,date,time,date_time,roadway,intersection,report_number,address,latitude,longitude
0,1/4/18,855,1/4/18 8:55,HILLCREST RD,,2018000119,"HILLCREST RD, Greenburgh, NY",41.021352,-73.802
1,1/4/18,2038,1/4/18 20:38,25 JEAN LN,,2018000146,"25 JEAN LN, Greenburgh, NY",41.028349,-73.825272
2,1/7/18,1432,1/7/18 14:32,280 SECOR RD,,2018000227,"280 SECOR RD, Greenburgh, NY",41.026762,-73.827426
3,1/10/18,804,1/10/18 8:04,UNDERHILL RD,,2018000321,"UNDERHILL RD, Greenburgh, NY",41.001765,-73.814421
4,1/10/18,1012,1/10/18 10:12,UNDERHILL RD,,2018000327,"UNDERHILL RD, Greenburgh, NY",41.001765,-73.814421


In [10]:
vehicle_accidents.head()

Unnamed: 0,date,time,date_time,roadway,intersection,report_number
0,1/4/18,855,1/4/18 8:55,HILLCREST RD,,2018000119
1,1/4/18,2038,1/4/18 20:38,25 JEAN LN,,2018000146
2,1/5/18,1711,1/5/18 17:11,PAYNE ST,N HIGH ST,2018000168
3,1/7/18,1432,1/7/18 14:32,280 SECOR RD,,2018000227
4,1/10/18,804,1/10/18 8:04,UNDERHILL RD,,2018000321


In [11]:
final_output = pd.concat([
    vehicle_accidents_cleaned,
    process_vehicle_accident_data(vehicle_accidents, vehicle_accidents_cleaned)])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


PAYNE ST AND N HIGH ST, Greenburgh, NY
SAW MILL RIVER RD AND 410 SAW MILL RIVER, Greenburgh, NY
TARRYTOWN RD AND ROSEMONT BLVD, Greenburgh, NY
GRASSLANDS RD AND CLEARBROOK RD, Greenburgh, NY
S CENTRAL AV AND MT JOY AV, Greenburgh, NY
JACKSON AV AND SPRAIN RD, Greenburgh, NY
OLD JACKSON AV AND SPRAIN VALLEY RD, Greenburgh, NY
W HARTSDALE AV AND SECOR RD, Greenburgh, NY
TARRYTOWN RD AND MANHATTAN AV, Greenburgh, NY
ARDSLEY RD AND EDGEMONT RD, Greenburgh, NY
JACKSON AV AND EDITHS WAY, Greenburgh, NY
TARRYTOWN RD AND FAIR ST, Greenburgh, NY
TAXTER RD AND PINE LN, Greenburgh, NY
SECOR RD AND DUNHAM RD, Greenburgh, NY
7 HOLLAND PL AND CATERSON TER, Greenburgh, NY
HIGHRIDGE RD, Hartsdale, NY
N WASHINGTON AV AND LAUREL ST, Greenburgh, NY
WOODHAMPTON DR, Greenburgh, NY
ARDSLEY RD AND FORT HILL RD, Greenburgh, NY
COLUMBIA AV AND E HARTSDALE AV, Greenburgh, NY
HILLSIDE AV AND MCLEAN AV, Greenburgh, NY
W HARTSDALE AV AND PINEWOOD RD, Greenburgh, NY
SAW MILL RIVER RD AND BEAVER HILL RD, Greenburgh,

ARDSLEY RD AND LYNWOOD RD, Greenburgh, NY
ARDSLEY RD AND LYNWOOD RD, Greenburgh, NY
ARDSLEY RD AND LYNWOOD RD, Greenburgh, NY
SPRAIN RD AND JACKSON AV, Greenburgh, NY
N. CENTRAL AV AND N. WASHINGTON AV, Greenburgh, NY
JACKSON AVE AND ANDREWS WAY, Greenburgh, NY
JACKSON AVE AND GRASSY SPRAIN RD, Greenburgh, NY
E. HARTSDALE AVE AND CLUBWAY, Greenburgh, NY
CENTRAL AVE AND UNDERHILL RD, Greenburgh, NY
S. CENTRAL AVE AND ARDSLEY RD, Greenburgh, NY
HUNTER LN AND SAW MILL RIVER RD, Greenburgh, NY
JACKSON AVE AND SPRAIN RD, Greenburgh, NY
JACKSON AVE AND GRASSY SPRAIN RD, Greenburgh, NY
CENTRAL AVE AND N.WASHINGTON AVE, Greenburgh, NY
CENTRAL AVE AND MT. JOY AVE, Greenburgh, NY
JACKSON AVE AND GRASSY SPRAIN RD, Greenburgh, NY
JACKSON AVE AND GRASSY SPRAIN RD, Greenburgh, NY
HILLSIDE AVE AND MCLEAN AVE, Greenburgh, NY
SAW MILL RIVER RD AND SAW MILL RIVER RD, Greenburgh, NY
TARRYTOWN RD AND DOBBS FERRY RD, Greenburgh, NY
VIRGINIA RD AND LAWRENCE DR, Greenburgh, NY
KNOLLWOOD RD AND GRASSLANDS RD,

WHITE PLAINS RD AND BENEDICT AV, Greenburgh, NY
HILLSIDE AV AND I287, Greenburgh, NY
MANHATTAN AV AND FLORENCE AV, Greenburgh, NY
KNOLLWOOD RD AND OLD TARRYTOWN RD, Greenburgh, NY
PAYNE ST AND WINTHROP AV, Greenburgh, NY
N CENTRAL AV AND ALEXANDER AV, Greenburgh, NY
DOBBS FERRY RD AND OLD RD, Greenburgh, NY
SAW MILL RIVER RD AND BEAVER HILL RD, Greenburgh, NY
JACKSON AV AND SPRAIN RD, Greenburgh, NY
DOBBS FERRY RD AND SPRAIN BROOK PKW, Greenburgh, NY
SAW MILL RIVER RD AND LAMONT ST, Greenburgh, NY
KNOLLWOOD RD AND STADIUM RD, Greenburgh, NY
E HARTSDALE AV AND CATERSON TER, Greenburgh, NY
TARRYTOWN RD AND WYOMING AV, Greenburgh, NY
TARRYTOWN RD AND ROSEMONT BLVD, Greenburgh, NY
DOBBS FERRY RD AND TERRACE ST, Greenburgh, NY
S CENTRAL AV AND MT JOY AV, Greenburgh, NY
SAW MILL RIVER RD AND N PAYNE ST, Greenburgh, NY
KNOLLWOOD RD AND STADIUM RD, Greenburgh, NY
N CENTRAL AVE AND W HARTSDALE AVE, Greenburgh, NY
N CENTRAL AVE AND N WASHINGTON AVE, Greenburgh, NY
DOBBS FERRY RD AND HARTSDALE RD

N CENTRAL AV AND N WASHINGTON AV, Greenburgh, NY
ARDSLEY RD AND WESTMINSTER RD, Greenburgh, NY
SAW MILL RIVER RD AND DONALD DR, Greenburgh, NY
ARDSLEY RD AND SPRAIN RD, Greenburgh, NY
PARK AV W AND WOODLANDS AV N, Greenburgh, NY
HILLSIDE AVE AND OLD TARRYTOWN RD, Greenburgh, NY
UNDERHILL RD AND FORTHILL RD, Greenburgh, NY
JACKSON AVE AND GRASSY SPRAIN RD, Greenburgh, NY
W HARTSDALE AVE AND S WASHINGTON AVE, Greenburgh, NY
KNOLLWOOD RD AND TARYTOWN RD, Greenburgh, NY
WHITE PLAINS RD AND I87, Greenburgh, NY
37 TARRYTOWN RD PKG L, Greenburgh, NY
RIDGE RD AND HILLCREST RD, Greenburgh, NY
ARDSLEY RD AND OLD ARMY RD, Greenburgh, NY
SPRAIN RD AND BOULDER RIDGE RD, Greenburgh, NY
W HARTSDALE AV AND RIDGE RD, Greenburgh, NY
1-BLK SPRAIN RD, Greenburgh, NY
EDGEWOOD RD AND SECOR RD, Greenburgh, NY
JACKSON AV AND SPRAIN BROOK PKW, Greenburgh, NY
1134 DOBBS FERRY RD AND FORREST BLVD, Greenburgh, NY
OLD TARRYTOWN RD AND MAPLE ST, Greenburgh, NY
353 COUNTY CENTER RD AND WINNETOU RD, Greenburgh, NY
CH

OLD SAW MILL RIVER RD AND ENTRANCE TO 710 O, Greenburgh, NY
SOUTH RD AND MADISON PL, Greenburgh, NY
500 WHITE PLAINS RD AND PARKING LOT 500 W, Greenburgh, NY
TARRYTOWN RD AND FAIR STREET, Greenburgh, NY
TARRYTOWN RD AND FAIR STREET, Greenburgh, NY
E HARTSDALE AVE AND LEWIS HILL RD, Greenburgh, NY
JACKSON AVE AND OLD JACKSON AVE, Greenburgh, NY
SAW MILL RIVER RD AND FIELDCREST DR, Greenburgh, NY
BENEDICT AVE AND WHITE PLAINS RD, Greenburgh, NY
300 EXECUTIVE BLVD AND CLEARBROOK RD, Greenburgh, NY
SPRAIN RD AND HEATHERDELL RD, Greenburgh, NY
W HARTSDALE AVE AND S CENTRAL AVE, Greenburgh, NY
FORTHILL RD AND HIGHPOINT RD, Greenburgh, NY
TARRYTOWN RD/ 287 RA AND 287 EXIT 5 RAMP, Greenburgh, NY
TARRYTOWN RD AND WYOMING AVE, Greenburgh, NY
3 GREENWOOD LANE AND CANTERBURY RD., Greenburgh, NY
FORT HILL RD AND JACKSON AVE, Greenburgh, NY
PAYNE ST AND SEARS AVE, Greenburgh, NY
JACKSON AVE AND SAW MILL RIVER RD, Greenburgh, NY
GRASSLANDS RD(EB) AND TACONIC PKWY OFF, Greenburgh, NY
KNOLLWOOD RD AND 

JACKSON AVE AND SPRAIN PKWY N/B EX, Greenburgh, NY
E HARTSDALE AVE AND WILSON ST, Greenburgh, NY
W HARTSDALE AVE AND HILLCREST RD, Greenburgh, NY
DOBBS FERRY RD AND HARTSDALE RD, Greenburgh, NY
SAW MILL RIVER RD AND DRIVEWAY TO 425 S, Greenburgh, NY
W HARTSDALE AVE AND N WASHINGTON AVE, Greenburgh, NY
SPRAIN RD AND BOULDER RIDGE RD, Greenburgh, NY
TARRYTOWN RD AND MANHATTAN AVE, Greenburgh, NY
KNOLLWOOD RD AND OLD TARRYTOWN RD, Greenburgh, NY
SAW MILL RIVER RD AND BEAVER HILL RD, Greenburgh, NY
FORT HILL RD AND HIGH POINT RD, Greenburgh, NY
KNOLLOWOOD RD, Greenburgh, NY
GRASSLANDS RD AND SPRAIN BROOK PKW, Greenburgh, NY
TARRYTOWN RD AND AQUEDUCT RD, Greenburgh, NY
WORTHINGTON RD AND FINMOR DR, Greenburgh, NY
TARRYTOWN RD AND AQUEDUCT RD, Greenburgh, NY
KNOLLWOOD RD AND TARRYTOWN RD., Greenburgh, NY
245 S CENTRAL AVE AND MARION AVE, Greenburgh, NY
CATERSON TERRACE AND FINDLAY AVE, Greenburgh, NY
CENTRAL PARK AV AND HARVARD DR, Greenburgh, NY
