# Toronto Police Service API
- API documentation: https://developers.arcgis.com/rest/services-reference/enterprise/query-feature-service-layer-.htm 
  - While there is a limit to the number of features included in the feature set response, there is no limit to the number of object IDs returned in the ID array response. Clients can exploit this to get all the query conforming object IDs by specifying returnIdsOnly=true and subsequently requesting feature sets for subsets of object IDs.
  - **Maximum number of records per query** = *`200`*
- Open data license https://data.torontopolice.on.ca/pages/licence 
- Open data documentation (pdf saved in /raw_data folder)
    - The location of crime occurrences have been deliberately offset to the nearest road intersection node to protect the privacy of parties involved in the occurrence. All location data must be considered as an approximate location of the occurrence and users are advised not to interpret any of these locations as related to a specific address or individual.

In [32]:
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)
import requests
import concurrent.futures
from datetime import datetime
from tqdm import tqdm
import shapely.geometry
import geopandas as gpd

In [33]:
# Projected Coordinate System (PCS): how you project onto a 2D plane
epsg_2d = 26717 
# Toronto Police Service Open Data includes geographic location information provided in
# the projected coordinate system NAD 1927 UTM 17N
# https://epsg.io/26717

# Geographic Coordinate System (GCS): how coordinates are represented in a 3D space
# World Geodetic System 1984 (WGS84)
# https://epsg.io/4326
epsg_3d = 4326 

# Datasets with API explorer
- Crime https://data.torontopolice.on.ca/datasets/TorontoPS::major-crime-indicators-1/about
- Traffic Collisions https://data.torontopolice.on.ca/datasets/TorontoPS::traffic-collisions-asr-t-tbl-001/about
- Shootings https://data.torontopolice.on.ca/datasets/TorontoPS::shootings-and-firearm-discharges/about
- Homicide https://data.torontopolice.on.ca/datasets/TorontoPS::homicide-asr-rc-tbl-002/about
- Persons in Crisis Calls for Service Attended https://data.torontopolice.on.ca/datasets/TorontoPS::persons-in-crisis-calls-for-service-attended/about
- Mental Health Act Apprehensions https://data.torontopolice.on.ca/datasets/TorontoPS::mental-health-act-apprehensions/about

In [34]:
# name of dataset to query from Toronto Police Service API
crimes = 'Major_Crime_Indicators'
traffic_collisions = 'Traffic_Collisions_(ASR-T-TBL-001)'
shootings = 'Shootings_and_Firearm_Discharges'
homicide = 'Homicide_ASR_RC_TBL_002'
pic_calls = 'PIC_Calls_for_Service_Attended'
mha_apprehensions = 'MHA_Apprehensions'
neighbourhoods = 'Neighbourhood_Crime_Rates_2020'

In [35]:
def get_objectIDs(data):
    url = f'https://services.arcgis.com/S9th0jAJ7bqgIRjw/arcgis/rest/services/{data}/FeatureServer/0/query?where=1%3D1&outFields=*&' + \
        'returnIdsOnly=true' + \
        '&outSR=4326&f=json'
    r = requests.get(url)
    json_data = r.json()
    objectIDs = sorted(json_data['objectIds'])
    print(f'number of records in {data}: {len(objectIDs):,.0f}')
    return objectIDs

In [36]:
# each record has a unique objectID which we can use as an identifier
crime_objectIDs = get_objectIDs(crimes)
collisions_objectIDs = get_objectIDs(traffic_collisions)
shootings_objectIDs = get_objectIDs(shootings)
homicide_objectIDs = get_objectIDs(homicide)
pic_calls_objectIDs = get_objectIDs(pic_calls)
mha_apprehensions_objectIDs = get_objectIDs(mha_apprehensions)
neighbourhoods_objectIDs = get_objectIDs(neighbourhoods)

number of records in Major_Crime_Indicators: 281,692
number of records in Traffic_Collisions_(ASR-T-TBL-001): 499,538
number of records in Shootings_and_Firearm_Discharges: 5,328
number of records in Homicide_ASR_RC_TBL_002: 1,252
number of records in PIC_Calls_for_Service_Attended: 191,460
number of records in MHA_Apprehensions: 67,958
number of records in Neighbourhood_Crime_Rates_2020: 140


In [37]:
# function to request data from API given the object ID
def parallel_request(objectIDs, data, columns = '*', chunk_size = 200):
    output = None
    # splitting the objectIDs into chunks of 200
    objectIDs_chunks = [objectIDs[i:i+chunk_size] for i in range(0,len(objectIDs),chunk_size)]
    
    # function to get data for each objectID chunk
    def get_data(id_list, data, columns):
        hdr = {'User-Agent': "Mozilla/5.0"}
        bucket = ','.join(map(str, id_list))
        url = 'https://services.arcgis.com/S9th0jAJ7bqgIRjw/arcgis/rest/services/' + \
            f'{data}/' + \
            'FeatureServer/0/query?objectIds=' + bucket + \
            f'&outFields={columns}' + \
            '&outSR=4326&f=json&returnExceededLimitFeatures=true'
        r = requests.get(url, headers=hdr)
        try:
            r.raise_for_status()
            json_data = r.json()
            return pd.json_normalize(json_data['features'])    
        except requests.HTTPError as exception:
            print(f'error with chunk {id_list[0]}')
            print(exception)
            return None
    
    with tqdm(total=len(objectIDs_chunks)) as pbar:
        # We can use a with statement to ensure threads are cleaned up promptly
        with concurrent.futures.ThreadPoolExecutor() as executor:
            # Start the load operations and mark each future with its object_ID
            future_to_object_ID = {executor.submit(get_data, object_IDs, data, columns): chunk_n for chunk_n, object_IDs in enumerate(objectIDs_chunks)}
            for future in concurrent.futures.as_completed(future_to_object_ID):
                chunk_n = future_to_object_ID[future]
                # print(f'{(chunk_n + 1)/len(objectIDs_chunks)*100:0.0f}% completed')
                try:
                    output = pd.concat([output, future.result()], axis=0)
                except Exception as exc:
                    print('Chunk %r generated an exception: %s' % (chunk_n, exc))
                    return
                # else:
                #     print('%r page is %d bytes' % (chunk_n, len(future.result())))
                pbar.update(1)
    return output

In [38]:
# test
parallel_request(crime_objectIDs[:10**4], crimes)

100%|██████████| 50/50 [00:02<00:00, 20.81it/s]


Unnamed: 0,attributes.Index_,attributes.event_unique_id,attributes.Division,attributes.occurrencedate,attributes.reporteddate,attributes.location_type,attributes.premises_type,attributes.ucr_code,attributes.ucr_ext,attributes.offence,attributes.reportedyear,attributes.reportedmonth,attributes.reportedday,attributes.reporteddayofyear,attributes.reporteddayofweek,attributes.reportedhour,attributes.occurrenceyear,attributes.occurrencemonth,attributes.occurrenceday,attributes.occurrencedayofyear,attributes.occurrencedayofweek,attributes.occurrencehour,attributes.MCI,attributes.Hood_ID,attributes.Neighbourhood,attributes.Long,attributes.Lat,attributes.ObjectId,geometry.x,geometry.y
0,10187,GO-20142103239,D23,1400385600000,1400385600000,"Single Home, House (Attach Garage, Cottage, Mo...",House,1430,100,Assault,2014,May,18,138,Sunday,12,2014,May,18,138,Sunday,12,Assault,1,West Humber-Clairville,-79.588477,43.725321,401,-79.588477,43.725321
1,10303,GO-20142111528,D23,1400472000000,1400472000000,"Parking Lots (Apt., Commercial Or Non-Commercial)",Outside,1430,100,Assault,2014,May,19,139,Monday,21,2014,May,19,139,Monday,20,Assault,1,West Humber-Clairville,-79.600166,43.750187,402,-79.600166,43.750187
2,10305,GO-20142111859,D23,1400472000000,1400472000000,"Apartment (Rooming House, Condo)",Apartment,2120,200,B&E,2014,May,19,139,Monday,22,2014,May,19,139,Monday,22,Break and Enter,1,West Humber-Clairville,-79.603420,43.719158,403,-79.603420,43.719158
3,10350,GO-20142116041,D23,1400558400000,1400558400000,"Parking Lots (Apt., Commercial Or Non-Commercial)",Outside,1420,100,Assault With Weapon,2014,May,20,140,Tuesday,14,2014,May,20,140,Tuesday,14,Assault,1,West Humber-Clairville,-79.590332,43.734013,404,-79.590332,43.734013
4,10351,GO-20142116041,D23,1400558400000,1400558400000,"Parking Lots (Apt., Commercial Or Non-Commercial)",Outside,1420,110,Assault Bodily Harm,2014,May,20,140,Tuesday,14,2014,May,20,140,Tuesday,14,Assault,1,West Humber-Clairville,-79.590332,43.734013,405,-79.590332,43.734013
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,205316,GO-20201483426,D23,1556683200000,1596859200000,"Apartment (Rooming House, Condo)",Apartment,1430,100,Assault,2020,August,8,221,Saturday,18,2019,May,1,121,Wednesday,12,Assault,2,Mount Olive-Silverstone-Jamestown,-79.593609,43.744732,9196,-79.593609,43.744732
196,205513,GO-20201498264,D23,1597032000000,1597032000000,"Single Home, House (Attach Garage, Cottage, Mo...",House,1430,100,Assault,2020,August,10,223,Monday,21,2020,August,10,223,Monday,21,Assault,2,Mount Olive-Silverstone-Jamestown,-79.593332,43.749125,9197,-79.593332,43.749125
197,205556,GO-20201502137,D23,1597118400000,1597118400000,"Apartment (Rooming House, Condo)",Apartment,1420,100,Assault With Weapon,2020,August,11,224,Tuesday,13,2020,August,11,224,Tuesday,13,Assault,2,Mount Olive-Silverstone-Jamestown,-79.588308,43.756235,9198,-79.588308,43.756235
198,205557,GO-20201500842,D23,1597118400000,1597118400000,"Apartment (Rooming House, Condo)",Apartment,1430,100,Assault,2020,August,11,224,Tuesday,9,2020,August,11,224,Tuesday,9,Assault,2,Mount Olive-Silverstone-Jamestown,-79.588308,43.756235,9199,-79.588308,43.756235


In [39]:
# select the columns we need
crime_columns = 'event_unique_id,occurrencedate,premises_type,occurrenceyear,occurrencemonth,occurrenceday,occurrencedayofyear,occurrencedayofweek,occurrencehour,MCI,hood_id,Neighbourhood,Long,Lat'
collisions_columns = 'EventUniqueId,OccurrenceDate,Month,Day_of_Week,Year,Hour,Atom,Neighbourhood,Fatalities,Injury_Collisions,FTR_Collisions,PD_Collisions,Longitude,Latitude'
pic_calls_columns = 'EventID,EventDate,EventYear,EventMonth,EventDayOfWeek,EventHour,EventType,OccurrenceCreated,ApprehensionMade,hood_id,NeighbourhoodName'
mha_apprehensions_columns = 'EventUniqueID,OccurrenceDate,OccurrenceYear,OccurrenceMonth,OccurrenceDayofWeek,OccurrenceHour,HoodID,NeighbourhoodName,PremisesType,Sex,AgeGroup'

In [40]:
crime_data = parallel_request(crime_objectIDs, crimes, crime_columns)

100%|██████████| 1409/1409 [01:30<00:00, 15.60it/s]


In [41]:
collisions_data = parallel_request(collisions_objectIDs, traffic_collisions, collisions_columns)

100%|██████████| 2498/2498 [06:24<00:00,  6.49it/s]


In [42]:
shootings_data = parallel_request(shootings_objectIDs, shootings)

100%|██████████| 27/27 [00:01<00:00, 17.91it/s]


In [43]:
homicide_data = parallel_request(homicide_objectIDs, homicide)

100%|██████████| 7/7 [00:00<00:00, 13.18it/s]


In [44]:
pic_calls_data = parallel_request(pic_calls_objectIDs, pic_calls, pic_calls_columns)

100%|██████████| 958/958 [01:03<00:00, 14.99it/s]


In [45]:
mha_apprehensions_data = parallel_request(mha_apprehensions_objectIDs, mha_apprehensions, mha_apprehensions_columns)

100%|██████████| 340/340 [00:18<00:00, 18.33it/s]


In [46]:
neighbourhoods_data = parallel_request(neighbourhoods_objectIDs, neighbourhoods)

100%|██████████| 1/1 [00:04<00:00,  4.13s/it]


# Clean column names

In [47]:
def clean_columns(df):    
    
    def convert_timestamp(x):
        if type(x) is pd.Timestamp:
            return x
        else:
            try:
                return datetime.fromtimestamp(x/1000)
            except:
                return np.nan

    df.columns = [x.split('.')[1] if '.' in x else x for x in df.columns]
    df.columns = [x.lower().replace('_', '') for x in df.columns]
    try:
        df['occurrencedate'] = df['occurrencedate'].apply(convert_timestamp)
    except: 
        pass
    
    try:
        df.rename(columns = {
            'agegroup': 'age_group',
            'apprehensionmade': 'apprehension_made',
            'atom': 'hood_id',
            'dayofweek': 'occurrence_dayofweek',
            'eventdate': 'occurrence_date',
            'eventdayofweek': 'occurrence_dayofweek',
            'eventhour': 'occurrence_hour',
            'eventid': 'occurrence_unique_id',
            'eventmonth': 'occurrence_month',
            'eventtype': 'occurrence_type',
            'eventuniqueid': 'occurrence_unique_id',
            'eventyear': 'occurrence_year',
            'ftrcollisions': 'ftr_collisions',
            'homicidetype': 'homicide_type',
            'hoodid': 'hood_id',
            'hour': 'occurrence_hour',
            'injurycollisions': 'injury_collisions',
            'lat': 'latitude',
            'long': 'longitude',
            'mci': 'MCI',
            'month': 'occurrence_month',
            'neighbourhoodname': 'neighbourhood',
            'objectid': 'object_id',
            'occurrencecreated': 'occurrence_created',
            'occurrencedate': 'occurrence_date',
            'occurrenceday': 'occurrence_day',
            'occurrencedayofweek': 'occurrence_dayofweek',
            'occurrencedayofyear': 'occurrence_dayofyear',
            'occurrencehour': 'occurrence_hour',
            'occurrencemonth': 'occurrence_month',
            'occurrencetype': 'occurrence_type',
            'occurrenceuniqueid': 'occurrence_unique_id',
            'occurrenceyear': 'occurrence_year',
            'pdcollisions': 'pd_collisions',
            'premisestype': 'premises_type',
            'timerange': 'time_range',
            'year': 'occurrence_year',
            }, inplace = True)
    except:
        pass        
    
    # some columns have "NULL" instead of NULL
    for x in df.columns:
        df[x] = df[x].replace('NULL', np.nan)
    
    return df

In [48]:
for df in [crime_data, collisions_data, shootings_data, homicide_data, pic_calls_data, mha_apprehensions_data, neighbourhoods_data]:
    df = clean_columns(df)

In [49]:
neighbourhoods_data['geometry'] = neighbourhoods_data.apply(lambda x: shapely.geometry.Polygon(x['rings'][0]), axis = 1)
neighbourhoods_data.drop(columns=['shapearea', 'shapelength', 'rings'], inplace = True)
neighbourhoods_data = gpd.GeoDataFrame(neighbourhoods_data, 
                                crs='epsg:' + str(epsg_3d), 
                                geometry='geometry')
print(type(neighbourhoods_data))

<class 'geopandas.geodataframe.GeoDataFrame'>


# Write to raw_data folder

In [50]:
crime_data.to_csv('../data/raw/Major_Crime_Indicators.csv', index = False)
collisions_data.to_csv('../data/raw/Traffic_Collisions.csv', index = False)
shootings_data.to_csv('../data/raw/Shootings.csv', index = False)
homicide_data.to_csv('../data/raw/Homicide.csv', index = False)
pic_calls_data.to_csv('../data/raw/Persons_in_Crisis_Calls_for_Service_Attended.csv', index = False)
mha_apprehensions_data.to_csv('../data/raw/Mental_Health_Act_Apprehensions.csv', index = False)

In [51]:
neighbourhoods_data.to_file('../data/raw/Neighbourhood_Crime_Rates_2020.geojson', driver='GeoJSON')

  pd.Int64Index,


In [52]:
# Extra notes
# import asyncio
# import aiohttp
# import time

# async def get(url, session):
#     try:
#         async with session.get(url=url) as response:
#             resp = await response.read()
#             print("Successfully got url")
#     except Exception as e:
#         print("Unable to get url due to {}.".format(e.__class__))


# async def main(urls):
#     async with aiohttp.ClientSession() as session:
#         ret = await asyncio.gather(*[get(url, session) for url in urls])
#     print("Finalized all. Return is a list of len {} outputs.".format(len(ret)))


# start = time.time()
# asyncio.run(await main([url + ','.join(map(str,chunk)) for chunk in objectIDs_chunks]))
# end = time.time()

# print(f"Took {end - start} seconds to pull {len(objectIDs_chunks)} requests.")