# Toronto Police Service API
- API documentation: https://developers.arcgis.com/rest/services-reference/enterprise/query-feature-service-layer-.htm 
  - While there is a limit to the number of features included in the feature set response, there is no limit to the number of object IDs returned in the ID array response. Clients can exploit this to get all the query conforming object IDs by specifying returnIdsOnly=true and subsequently requesting feature sets for subsets of object IDs.
  - **Maximum number of records per query** = *`200`*
- Open data license https://data.torontopolice.on.ca/pages/licence 
- Open data documentation (pdf saved in /raw_data folder)
    - The location of crime occurrences have been deliberately offset to the nearest road intersection node to protect the privacy of parties involved in the occurrence. All location data must be considered as an approximate location of the occurrence and users are advised not to interpret any of these locations as related to a specific address or individual.

In [4]:
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)
# set column width to 1000
pd.set_option('display.max_colwidth', 100)
import requests
import concurrent.futures
from datetime import datetime
from tqdm import tqdm
import shapely.geometry
import geopandas as gpd

In [5]:
# Projected Coordinate System (PCS): how you project onto a 2D plane
epsg_2d = 26717 
# Toronto Police Service Open Data includes geographic location information provided in
# the projected coordinate system NAD 1927 UTM 17N
# https://epsg.io/26717

# Geographic Coordinate System (GCS): how coordinates are represented in a 3D space
# World Geodetic System 1984 (WGS84)
# https://epsg.io/4326
epsg_3d = 4326 

# Datasets with API explorer
- Crime https://data.torontopolice.on.ca/datasets/TorontoPS::major-crime-indicators-open-data/about
- Traffic Collisions https://data.torontopolice.on.ca/datasets/TorontoPS::traffic-collisions-asr-t-tbl-001/about
- Shootings https://data.torontopolice.on.ca/datasets/TorontoPS::shooting-and-firearm-discharges-open-data/about
- Homicide https://data.torontopolice.on.ca/datasets/TorontoPS::homicides-open-data-asr-rc-tbl-002/about
- Persons in Crisis Calls for Service Attended https://data.torontopolice.on.ca/datasets/TorontoPS::persons-in-crisis-calls-for-service-attended-open-data/about
- Mental Health Act Apprehensions https://data.torontopolice.on.ca/datasets/TorontoPS::mental-health-act-apprehensions-open-data/about

In [19]:
# name of dataset to query from Toronto Police Service API
crimes = 'Major_Crime_Indicators_Open_Data'
traffic_collisions = 'Traffic_Collisions_ASR_T_TBL_001'
shootings = 'Shooting_and_Firearm_Discharges_Open_Data'
homicide = 'Homicides_Open_Data_ASR_RC_TBL_002'
pic_calls = 'Persons_in_Crisis_Calls_for_Service_Attended_Open_Data'
mha_apprehensions = 'Mental_Health_Act_Apprehensions_Open_Data'
neighbourhoods = 'Neighbourhood_Crime_Rates_Open_Data'

In [20]:
def get_objectIDs(data):
    url = f'https://services.arcgis.com/S9th0jAJ7bqgIRjw/arcgis/rest/services/{data}/FeatureServer/0/query?where=1%3D1&outFields=*&' + \
        'returnIdsOnly=true' + \
        '&outSR=4326&f=json'
    r = requests.get(url)
    json_data = r.json()
    objectIDs = sorted(json_data['objectIds'])
    print(f'number of records in {data}: {len(objectIDs):,.0f}')
    return objectIDs

In [21]:
# each record has a unique objectID which we can use as an identifier
crime_objectIDs = get_objectIDs(crimes)
collisions_objectIDs = get_objectIDs(traffic_collisions)
shootings_objectIDs = get_objectIDs(shootings)
homicide_objectIDs = get_objectIDs(homicide)
pic_calls_objectIDs = get_objectIDs(pic_calls)
mha_apprehensions_objectIDs = get_objectIDs(mha_apprehensions)
neighbourhoods_objectIDs = get_objectIDs(neighbourhoods)

number of records in Major_Crime_Indicators_Open_Data: 323,296
number of records in Traffic_Collisions_ASR_T_TBL_001: 553,780
number of records in Shooting_and_Firearm_Discharges_Open_Data: 5,707
number of records in Homicides_Open_Data_ASR_RC_TBL_002: 1,322
number of records in Persons_in_Crisis_Calls_for_Service_Attended_Open_Data: 259,950
number of records in Mental_Health_Act_Apprehensions_Open_Data: 93,945
number of records in Neighbourhood_Crime_Rates_Open_Data: 158


In [22]:
# function to request data from API given the object ID
def parallel_request(objectIDs, data, columns = ['*'], chunk_size = 200):
    columns = ','.join(columns)
    output = None
    # splitting the objectIDs into chunks of 200
    objectIDs_chunks = [objectIDs[i:i+chunk_size] for i in range(0,len(objectIDs),chunk_size)]
    
    # function to get data for each objectID chunk
    def get_data(id_list, data, columns):
        hdr = {'User-Agent': "Mozilla/5.0"}
        bucket = ','.join(map(str, id_list))
        url = 'https://services.arcgis.com/S9th0jAJ7bqgIRjw/arcgis/rest/services/' + \
            f'{data}/' + \
            'FeatureServer/0/query?objectIds=' + bucket + \
            f'&outFields={columns}' + \
            '&outSR=4326&f=json&returnExceededLimitFeatures=true'
        r = requests.get(url, headers=hdr)
        try:
            r.raise_for_status()
            json_data = r.json()
            return pd.json_normalize(json_data['features'])    
        except requests.HTTPError as exception:
            print(f'error with chunk {id_list[0]}')
            print(exception)
            return None
    
    with tqdm(total=len(objectIDs_chunks)) as pbar:
        # We can use a with statement to ensure threads are cleaned up promptly
        with concurrent.futures.ThreadPoolExecutor() as executor:
            # Start the load operations and mark each future with its object_ID
            future_to_object_ID = {executor.submit(get_data, object_IDs, data, columns): chunk_n for chunk_n, object_IDs in enumerate(objectIDs_chunks)}
            for future in concurrent.futures.as_completed(future_to_object_ID):
                chunk_n = future_to_object_ID[future]
                # print(f'{(chunk_n + 1)/len(objectIDs_chunks)*100:0.0f}% completed')
                try:
                    output = pd.concat([output, future.result()], axis=0)
                except Exception as exc:
                    print('Chunk %r generated an exception: %s' % (chunk_n, exc))
                    return
                # else:
                #     print('%r page is %d bytes' % (chunk_n, len(future.result())))
                pbar.update(1)
    return output

In [52]:
# test
testing = parallel_request(mha_apprehensions_objectIDs[:10**4], mha_apprehensions)
testing

100%|██████████| 50/50 [00:11<00:00,  4.36it/s]


Unnamed: 0,attributes.OBJECTID,attributes.EVENT_UNIQUE_ID,attributes.REPORT_DATE,attributes.REPORT_YEAR,attributes.REPORT_MONTH,attributes.REPORT_DOW,attributes.REPORT_DOY,attributes.REPORT_DAY,attributes.REPORT_HOUR,attributes.OCC_DATE,attributes.OCC_YEAR,attributes.OCC_MONTH,attributes.OCC_DOY,attributes.OCC_DAY,attributes.OCC_DOW,attributes.OCC_HOUR,attributes.DIVISION,attributes.PREMISES_TYPE,attributes.APPREHENSION_TYPE,attributes.SEX,attributes.AGE_COHORT,attributes.HOOD_158,attributes.NEIGHBOURHOOD_158,attributes.HOOD_140,attributes.NEIGHBOURHOOD_140,attributes.F_num_removed
0,1601,GO-20141761918,1395633600000,2014,March,Monday,83,24,22,1395633600000,2014,March,83,24,Monday,22,D31,House,Mha Sec 17 (Power Of App),Female,45 to 54,23,Pelmo Park-Humberlea,23,Pelmo Park-Humberlea (23),1
1,1602,GO-20141759666,1395633600000,2014,March,Monday,83,24,15,1395633600000,2014,March,83,24,Monday,15,D53,Other,Mha Sec 15 (Form 1),Male,55 to 64,99,Mount Pleasant East,99,Mount Pleasant East (99),1
2,1603,GO-20141761152,1395633600000,2014,March,Monday,83,24,19,1395633600000,2014,March,83,24,Monday,19,D42,Other,Mha Sec 17 (Power Of App),Female,35 to 44,144,Morningside Heights,131,Rouge (131),1
3,1604,GO-20141760206,1395633600000,2014,March,Monday,83,24,16,1395633600000,2014,March,83,24,Monday,12,D53,Other,Mha Sec 15 (Form 1),Male,45 to 54,95,Annex,95,Annex (95),1
4,1605,GO-20141759312,1395633600000,2014,March,Monday,83,24,14,1395633600000,2014,March,83,24,Monday,14,D12,Outside,Mha Sec 28(1) (Form 9 Elopee),Female,35 to 44,112,Beechborough-Greenbrook,112,Beechborough-Greenbrook (112),1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,9796,GO-2015656829,1429588800000,2015,April,Tuesday,111,21,1,1429588800000,2015,April,111,21,Tuesday,1,D55,House,Mha Sec 17 (Power Of App),Female,18 to 24,69,Blake-Jones,69,Blake-Jones (69),1
196,9797,GO-2015660364,1429588800000,2015,April,Tuesday,111,21,15,1429588800000,2015,April,111,21,Tuesday,15,D43,House,Mha Sec 17 (Power Of App),Female,65+,141,Golfdale-Cedarbrae-Woburn,137,Woburn (137),1
197,9798,GO-2015659874,1429588800000,2015,April,Tuesday,111,21,14,1429588800000,2015,April,111,21,Tuesday,14,D22,Other,Mha Sec 15 (Form 1),Male,65+,158,Islington,14,Islington-City Centre West (14),1
198,9799,GO-2015660310,1429588800000,2015,April,Tuesday,111,21,17,1429588800000,2015,April,111,21,Tuesday,15,D33,Other,Mha Sec 15 (Form 1),Male,35 to 44,42,Banbury-Don Mills,42,Banbury-Don Mills (42),1


In [49]:
testing.columns

Index(['attributes.OBJECTID', 'attributes.EVENT_UNIQUE_ID',
       'attributes.REPORT_DATE', 'attributes.REPORT_YEAR',
       'attributes.REPORT_MONTH', 'attributes.REPORT_DOW',
       'attributes.REPORT_DOY', 'attributes.REPORT_DAY',
       'attributes.REPORT_HOUR', 'attributes.OCC_DATE', 'attributes.OCC_YEAR',
       'attributes.OCC_MONTH', 'attributes.OCC_DOY', 'attributes.OCC_DAY',
       'attributes.OCC_DOW', 'attributes.OCC_HOUR', 'attributes.DIVISION',
       'attributes.PREMISES_TYPE', 'attributes.APPREHENSION_TYPE',
       'attributes.SEX', 'attributes.AGE_COHORT', 'attributes.HOOD_158',
       'attributes.NEIGHBOURHOOD_158', 'attributes.HOOD_140',
       'attributes.NEIGHBOURHOOD_140', 'attributes.F_num_removed'],
      dtype='object')

In [39]:
# # select the columns we need 
# crime_columns = [
#     'attributes.EVENT_UNIQUE_ID',
#     'attributes.LOCATION_TYPE','attributes.PREMISES_TYPE',
#     'attributes.OCC_DATE','attributes.OCC_YEAR','attributes.OCC_MONTH','attributes.OCC_DAY','attributes.OCC_DOY','attributes.OCC_DOW','attributes.OCC_HOUR',
#     'attributes.MCI_CATEGORY',
#     'attributes.DIVISION','attributes.HOOD_158','attributes.NEIGHBOURHOOD_158',
#     'attributes.LONG_WGS84','attributes.LAT_WGS84'
#     ]
# collisions_columns = [
#     'attributes.EventUniqueId',
#     'attributes.EventUniqueId','attributes.Month','attributes.Day_of_Week','attributes.Year','attributes.Hour',
#     'attributes.Division', 'attributes.Atom','attributes.Neighbourhood',
#     'attributes.Fatalities','attributes.Injury_Collisions','attributes.FTR_Collisions','attributes.PD_Collisions',
#     'attributes.Longitude','attributes.Latitude'
#     ]
# pic_calls_columns = [
#     'attributes.EVENT_ID',
#     'attributes.EVENT_DATE','attributes.EVENT_YEAR','attributes.EVENT_MONTH','attributes.EVENT_DOW','attributes.EVENT_HOUR','attributes.EVENT_TYPE',
#     'attributes.OCCURRENCE_CREATED','attributes.APPREHENSION_MADE',
#     'attributes.DIVISION', 'attributes.HOOD_158','attributes.NEIGHBOURHOOD_158'
#     ]
# mha_apprehensions_columns = [
#     'attributes.EVENT_UNIQUE_ID',
#     'attributes.OCC_DATE','attributes.OCC_YEAR','attributes.OCC_MONTH','attributes.OCC_DOW','OccurrenceHour','HoodID','NeighbourhoodName','PremisesType','Sex','AgeGroup']

In [23]:
crime_data = parallel_request(crime_objectIDs, crimes) #, crime_columns)

 18%|█▊        | 290/1617 [01:02<05:32,  3.99it/s]

Chunk 250 generated an exception: 'features'


 18%|█▊        | 291/1617 [05:03<23:02,  1.04s/it]


In [24]:
collisions_data = parallel_request(collisions_objectIDs, traffic_collisions) #, collisions_columns)

100%|██████████| 2769/2769 [10:48<00:00,  4.27it/s]


In [25]:
shootings_data = parallel_request(shootings_objectIDs, shootings)

100%|██████████| 29/29 [00:06<00:00,  4.58it/s]


In [26]:
homicide_data = parallel_request(homicide_objectIDs, homicide)

100%|██████████| 7/7 [00:02<00:00,  2.84it/s]


In [27]:
pic_calls_data = parallel_request(pic_calls_objectIDs, pic_calls) #, pic_calls_columns)

100%|██████████| 1300/1300 [04:34<00:00,  4.74it/s]


In [28]:
mha_apprehensions_data = parallel_request(mha_apprehensions_objectIDs, mha_apprehensions) #, mha_apprehensions_columns)

100%|██████████| 470/470 [01:34<00:00,  4.98it/s]


In [29]:
neighbourhoods_data = parallel_request(neighbourhoods_objectIDs, neighbourhoods)

100%|██████████| 1/1 [00:01<00:00,  1.84s/it]


# Clean column names

In [16]:
def clean_columns(df):    
    
    def convert_timestamp(x):
        if type(x) is pd.Timestamp:
            return x
        else:
            try:
                return datetime.fromtimestamp(x/1000)
            except:
                return np.nan

    df.columns = [x.split('.')[1] if '.' in x else x for x in df.columns]
    df.columns = [x.lower().replace('_', '') for x in df.columns]
    try:
        df['occurrencedate'] = df['occurrencedate'].apply(convert_timestamp)
    except: 
        pass
    
    try:
        df.rename(columns = {
            'agegroup': 'age_group',
            'apprehensionmade': 'apprehension_made',
            'atom': 'hood_id',
            'dayofweek': 'occurrence_dayofweek',
            'eventdate': 'occurrence_date',
            'eventdayofweek': 'occurrence_dayofweek',
            'eventhour': 'occurrence_hour',
            'eventid': 'occurrence_unique_id',
            'eventmonth': 'occurrence_month',
            'eventtype': 'occurrence_type',
            'eventuniqueid': 'occurrence_unique_id',
            'eventyear': 'occurrence_year',
            'ftrcollisions': 'ftr_collisions',
            'homicidetype': 'homicide_type',
            'hoodid': 'hood_id',
            'hour': 'occurrence_hour',
            'injurycollisions': 'injury_collisions',
            'lat': 'latitude',
            'long': 'longitude',
            'mci': 'MCI',
            'month': 'occurrence_month',
            'neighbourhoodname': 'neighbourhood',
            'objectid': 'object_id',
            'occurrencecreated': 'occurrence_created',
            'occurrencedate': 'occurrence_date',
            'occurrenceday': 'occurrence_day',
            'occurrencedayofweek': 'occurrence_dayofweek',
            'occurrencedayofyear': 'occurrence_dayofyear',
            'occurrencehour': 'occurrence_hour',
            'occurrencemonth': 'occurrence_month',
            'occurrencetype': 'occurrence_type',
            'occurrenceuniqueid': 'occurrence_unique_id',
            'occurrenceyear': 'occurrence_year',
            'pdcollisions': 'pd_collisions',
            'premisestype': 'premises_type',
            'timerange': 'time_range',
            'year': 'occurrence_year',
            }, inplace = True)
    except:
        pass        
    
    # some columns have "NULL" instead of NULL
    for x in df.columns:
        df[x] = df[x].replace('NULL', np.nan)
    
    return df

In [20]:
for df in [crime_data, collisions_data, shootings_data, homicide_data, pic_calls_data, mha_apprehensions_data, neighbourhoods_data]:
    df = clean_columns(df)

In [18]:
neighbourhoods_data['geometry'] = neighbourhoods_data.apply(lambda x: shapely.geometry.Polygon(x['rings'][0]), axis = 1)
neighbourhoods_data.drop(columns=['shapearea', 'shapelength', 'rings'], inplace = True)
neighbourhoods_data = gpd.GeoDataFrame(neighbourhoods_data, 
                                crs='epsg:' + str(epsg_3d), 
                                geometry='geometry')
print(type(neighbourhoods_data))

<class 'geopandas.geodataframe.GeoDataFrame'>


# Write to raw_data folder

In [22]:
crime_data.to_csv('../data/raw/Major_Crime_Indicators.csv', index = False)
collisions_data.to_csv('../data/raw/Traffic_Collisions.csv', index = False)
shootings_data.to_csv('../data/raw/Shootings.csv', index = False)
homicide_data.to_csv('../data/raw/Homicide.csv', index = False)
pic_calls_data.to_csv('../data/raw/Persons_in_Crisis_Calls_for_Service_Attended.csv', index = False)
mha_apprehensions_data.to_csv('../data/raw/Mental_Health_Act_Apprehensions.csv', index = False)

In [19]:
neighbourhoods_data.to_file('../data/raw/Neighbourhood_Boundary.geojson', driver='GeoJSON')

  pd.Int64Index,
