# Toronto Police Service API
- API documentation: https://developers.arcgis.com/rest/services-reference/enterprise/query-feature-service-layer-.htm 
  - While there is a limit to the number of features included in the feature set response, there is no limit to the number of object IDs returned in the ID array response. Clients can exploit this to get all the query conforming object IDs by specifying returnIdsOnly=true and subsequently requesting feature sets for subsets of object IDs.
  - **Maximum number of records per query** = *`200`*
- Open data license https://data.torontopolice.on.ca/pages/licence 
- Open data documentation (pdf saved in /raw_data folder)
    - The location of crime occurrences have been deliberately offset to the nearest road intersection node to protect the privacy of parties involved in the occurrence. All location data must be considered as an approximate location of the occurrence and users are advised not to interpret any of these locations as related to a specific address or individual.

In [154]:
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)
import requests
import concurrent.futures
from datetime import datetime

# Datasets with API explorer
- Crime https://data.torontopolice.on.ca/datasets/TorontoPS::major-crime-indicators-1/about
- Traffic Collisions https://data.torontopolice.on.ca/datasets/TorontoPS::traffic-collisions-asr-t-tbl-001/about
- Shootings https://data.torontopolice.on.ca/datasets/TorontoPS::shootings-and-firearm-discharges/about
- Homicide https://data.torontopolice.on.ca/datasets/TorontoPS::homicide-asr-rc-tbl-002/about

In [82]:
# name of dataset to query from Toronto Police Service API
crimes = 'Major_Crime_Indicators'
traffic_collisions = 'Traffic_Collisions_(ASR-T-TBL-001)'
shootings = 'Shootings_and_Firearm_Discharges'
homicide = 'Homicide_ASR_RC_TBL_002'

In [80]:
def get_objectIDs(data):
    url = f'https://services.arcgis.com/S9th0jAJ7bqgIRjw/arcgis/rest/services/{data}/FeatureServer/0/query?where=1%3D1&outFields=*&' + \
        'returnIdsOnly=true' + \
        '&outSR=4326&f=json'
    r = requests.get(url)
    json_data = r.json()
    objectIDs = sorted(json_data['objectIds'])
    print(f'number of records in {data}: {len(objectIDs):,.0f}')
    return objectIDs

In [83]:
# each record has a unique objectID which we can use as an identifier
crime_objectIDs = get_objectIDs(crimes)
collisions_objectIDs = get_objectIDs(traffic_collisions)
shootings_objectIDs = get_objectIDs(shootings)
homicide_objectIDs = get_objectIDs(homicide)

number of records in Major_Crime_Indicators: 281,692
number of records in Traffic_Collisions_(ASR-T-TBL-001): 499,538
number of records in Shootings_and_Firearm_Discharges: 5,328
number of records in Homicide_ASR_RC_TBL_002: 1,252


In [104]:
# function to request data from API given the object ID
def parallel_request(objectIDs, data, columns = '*', chunk_size = 200):
    output = None
    # splitting the objectIDs into chunks of 200
    objectIDs_chunks = [objectIDs[i:i+chunk_size] for i in range(0,len(objectIDs),chunk_size)]
    
    # function to get data for each objectID chunk
    def get_data(id_list, data, columns):
        hdr = {'User-Agent': "Mozilla/5.0"}
        bucket = ','.join(map(str, id_list))
        url = 'https://services.arcgis.com/S9th0jAJ7bqgIRjw/arcgis/rest/services/' + \
            f'{data}/' + \
            'FeatureServer/0/query?objectIds=' + bucket + \
            f'&outFields={columns}' + \
            '&outSR=4326&f=json&returnExceededLimitFeatures=true'
        r = requests.get(url, headers=hdr)
        try:
            r.raise_for_status()
            json_data = r.json()
            return pd.json_normalize(json_data['features'])    
        except requests.HTTPError as exception:
            print(f'error with chunk {id_list[0]}')
            print(exception)
            return None

    # We can use a with statement to ensure threads are cleaned up promptly
    with concurrent.futures.ThreadPoolExecutor() as executor:
        # Start the load operations and mark each future with its object_ID
        future_to_object_ID = {executor.submit(get_data, object_IDs, data, columns): chunk_n for chunk_n, object_IDs in enumerate(objectIDs_chunks)}
        for future in concurrent.futures.as_completed(future_to_object_ID):
            chunk_n = future_to_object_ID[future]
            print(f'{(chunk_n + 1)/len(objectIDs_chunks)*100:0.0f}% completed')
            try:
                output = pd.concat([output, future.result()], axis=0)
            except Exception as exc:
                print('Chunk %r generated an exception: %s' % (chunk_n, exc))
                return
            # else:
            #     print('%r page is %d bytes' % (chunk_n, len(future.result())))
    return output

In [111]:
# test
parallel_request(crime_objectIDs[:10**4], crimes)

6% completed
12% completed
4% completed
18% completed
16% completed
8% completed
22% completed
24% completed
14% completed
10% completed
20% completed
2% completed
30% completed
28% completed
26% completed
32% completed
36% completed
54% completed
48% completed
44% completed
46% completed
42% completed
50% completed
52% completed
40% completed
58% completed
60% completed
34% completed
38% completed
62% completed
56% completed
66% completed
64% completed
70% completed
82% completed
76% completed
68% completed
74% completed
84% completed
80% completed
88% completed
72% completed
86% completed
78% completed
90% completed
92% completed
96% completed
94% completed
100% completed
98% completed


Unnamed: 0,attributes.Index_,attributes.event_unique_id,attributes.Division,attributes.occurrencedate,attributes.reporteddate,attributes.location_type,attributes.premises_type,attributes.ucr_code,attributes.ucr_ext,attributes.offence,attributes.reportedyear,attributes.reportedmonth,attributes.reportedday,attributes.reporteddayofyear,attributes.reporteddayofweek,attributes.reportedhour,attributes.occurrenceyear,attributes.occurrencemonth,attributes.occurrenceday,attributes.occurrencedayofyear,attributes.occurrencedayofweek,attributes.occurrencehour,attributes.MCI,attributes.Hood_ID,attributes.Neighbourhood,attributes.Long,attributes.Lat,attributes.ObjectId,geometry.x,geometry.y
0,10187,GO-20142103239,D23,1400385600000,1400385600000,"Single Home, House (Attach Garage, Cottage, Mo...",House,1430,100,Assault,2014,May,18,138,Sunday,12,2014,May,18,138,Sunday,12,Assault,1,West Humber-Clairville,-79.588477,43.725321,401,-79.588477,43.725321
1,10303,GO-20142111528,D23,1400472000000,1400472000000,"Parking Lots (Apt., Commercial Or Non-Commercial)",Outside,1430,100,Assault,2014,May,19,139,Monday,21,2014,May,19,139,Monday,20,Assault,1,West Humber-Clairville,-79.600166,43.750187,402,-79.600166,43.750187
2,10305,GO-20142111859,D23,1400472000000,1400472000000,"Apartment (Rooming House, Condo)",Apartment,2120,200,B&E,2014,May,19,139,Monday,22,2014,May,19,139,Monday,22,Break and Enter,1,West Humber-Clairville,-79.603420,43.719158,403,-79.603420,43.719158
3,10350,GO-20142116041,D23,1400558400000,1400558400000,"Parking Lots (Apt., Commercial Or Non-Commercial)",Outside,1420,100,Assault With Weapon,2014,May,20,140,Tuesday,14,2014,May,20,140,Tuesday,14,Assault,1,West Humber-Clairville,-79.590332,43.734013,404,-79.590332,43.734013
4,10351,GO-20142116041,D23,1400558400000,1400558400000,"Parking Lots (Apt., Commercial Or Non-Commercial)",Outside,1420,110,Assault Bodily Harm,2014,May,20,140,Tuesday,14,2014,May,20,140,Tuesday,14,Assault,1,West Humber-Clairville,-79.590332,43.734013,405,-79.590332,43.734013
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,125190,GO-2018389189,D23,1519966800000,1519966800000,"Apartment (Rooming House, Condo)",Apartment,1430,100,Assault,2018,March,2,61,Friday,12,2018,March,2,61,Friday,10,Assault,2,Mount Olive-Silverstone-Jamestown,-79.587060,43.753621,9796,-79.587060,43.753621
196,125323,GO-2018398719,D23,1520053200000,1520053200000,"Streets, Roads, Highways (Bicycle Path, Privat...",Outside,1610,100,Robbery With Weapon,2018,March,3,62,Saturday,20,2018,March,3,62,Saturday,20,Robbery,2,Mount Olive-Silverstone-Jamestown,-79.589668,43.742918,9797,-79.589668,43.742918
197,125324,GO-2018398719,D23,1520053200000,1520053200000,"Streets, Roads, Highways (Bicycle Path, Privat...",Outside,1610,100,Robbery With Weapon,2018,March,3,62,Saturday,20,2018,March,3,62,Saturday,20,Robbery,2,Mount Olive-Silverstone-Jamestown,-79.589668,43.742918,9798,-79.589668,43.742918
198,125332,GO-2018399474,D23,1520053200000,1520053200000,"Streets, Roads, Highways (Bicycle Path, Privat...",Outside,1430,100,Assault,2018,March,3,62,Saturday,22,2018,March,3,62,Saturday,21,Assault,2,Mount Olive-Silverstone-Jamestown,-79.589193,43.741095,9799,-79.589193,43.741095


In [116]:
# select the columns we need
crime_columns = 'event_unique_id,occurrencedate,premises_type,occurrenceyear,occurrencemonth,occurrenceday,occurrencedayofyear,occurrencedayofweek,occurrencehour,MCI,Hood_ID,Neighbourhood,Long,Lat'
collisions_columns = 'EventUniqueId,OccurrenceDate,Month,Day_of_Week,Year,Hour,Atom,Neighbourhood,Fatalities,Injury_Collisions,FTR_Collisions,PD_Collisions,Longitude,Latitude'

In [120]:
# loading time: 9 minutes
crime_data = parallel_request(crime_objectIDs, crimes, crime_columns)
collisions_data = parallel_request(collisions_objectIDs, traffic_collisions, collisions_columns)
shootings_data = parallel_request(shootings_objectIDs, shootings)
homicide_data = parallel_request(homicide_objectIDs, homicide)

0% completed
0% completed
0% completed
0% completed
1% completed
1% completed
1% completed
0% completed
0% completed
1% completed
1% completed
0% completed
1% completed
1% completed
1% completed
1% completed
1% completed
2% completed
2% completed
2% completed
2% completed
2% completed
2% completed
2% completed
2% completed
2% completed
1% completed
1% completed
1% completed
2% completed
3% completed
1% completed
2% completed
2% completed
3% completed
3% completed
3% completed
2% completed
3% completed
3% completed
3% completed
3% completed
3% completed
2% completed
3% completed
3% completed
3% completed
4% completed
3% completed
3% completed
4% completed
4% completed
4% completed
4% completed
4% completed
4% completed
4% completed
4% completed
4% completed
4% completed
4% completed
4% completed
5% completed
5% completed
5% completed
5% completed
5% completed
5% completed
5% completed
5% completed
5% completed
5% completed
5% completed
5% completed
5% completed
5% completed
6% completed

# Clean data

In [188]:
def clean_columns(df):    
    
    def convert_timestamp(x):
        try:
            return datetime.fromtimestamp(x/1000)
        except:
            return np.nan

    df.columns = [x.split('.')[1] if '.' in x else x for x in df.columns]
    df.columns = [x.lower().replace('_', '') for x in df.columns]
    df['occurrencedate'] = df['occurrencedate'].apply(convert_timestamp)
    
    return df

In [190]:
for df in [crime_data, collisions_data, shootings_data, homicide_data]:
    df = clean_columns(df)

# Write to csv

In [197]:
crime_data.to_csv('./raw_data/Major_Crime_Indicators.csv', index = False)
collisions_data.to_csv('./raw_data/Traffic_Collisions.csv', index = False)
shootings_data.to_csv('./raw_data/Shootings.csv', index = False)
homicide_data.to_csv('./raw_data/Homicide.csv', index = False)

In [None]:
# import asyncio
# import aiohttp
# import time

# async def get(url, session):
#     try:
#         async with session.get(url=url) as response:
#             resp = await response.read()
#             print("Successfully got url")
#     except Exception as e:
#         print("Unable to get url due to {}.".format(e.__class__))


# async def main(urls):
#     async with aiohttp.ClientSession() as session:
#         ret = await asyncio.gather(*[get(url, session) for url in urls])
#     print("Finalized all. Return is a list of len {} outputs.".format(len(ret)))


# start = time.time()
# asyncio.run(await main([url + ','.join(map(str,chunk)) for chunk in objectIDs_chunks]))
# end = time.time()

# print(f"Took {end - start} seconds to pull {len(objectIDs_chunks)} requests.")