## Toronto One Address Shapefile Processing ##

#### Description ####

Due to large amount data processing needed for street addresses and the limitation of the Geopy API, I'm trying to use the Toronto address repository to process the street coordinates instead of Geopy API. 

- It converts the shape file into df, and exports for later use. 
- Finds the neighbourhood area with GeoJson file.
- Test for coordinates matching and merging.


Data source: <br>
https://open.toronto.ca/dataset/address-points-municipal-toronto-one-address-repository/ <br>
https://open.toronto.ca/dataset/neighbourhoods/

In [14]:
# Import libraries
import pandas as pd
import numpy as np
import shapefile
import json
from shapely.geometry import Point, Polygon
import timeit

In [4]:
# Reading the shape file from Toronto One Address Repository

def read_shapefile(file_path):
    
    #read file
    sf = shapefile.Reader(file_path)
    fields = [x[0] for x in sf.fields][1:]
    records = sf.records()
    shps = [s.points for s in sf.shapes()]
    
    #write to df
    toronto_addresses_df = pd.DataFrame(columns=fields, data=records)
    #toronto_addresses_df = toronto_addresses_df.assign(coords=shps)
    
    toronto_addresses_df['full_address'] = toronto_addresses_df['ADDRESS'] + ' ' + toronto_addresses_df['LFNAME']
    toronto_addresses_df['long_lat'] = toronto_addresses_df[['LONGITUDE','LATITUDE']].apply(tuple, axis=1)
    
    return toronto_addresses_df

toronto_addresses_file = 'geo_data/municipal-address-points-wgs84-latitude-longitude/ADDRESS_POINT_WGS84.shp'

toronto_addresses_df = read_shapefile(toronto_addresses_file)

In [5]:
# converting full_address to uppercase for easy matching with address in ticket data
toronto_addresses_df['full_address'] = toronto_addresses_df['full_address'].str.upper()
toronto_addresses_df.head()
toronto_addresses_df.dtypes

GEO_ID            int64
LINK              int64
MAINT_STAG       object
ADDRESS          object
LFNAME           object
LONUM             int64
LONUMSUF         object
HINUM             int64
HINUMSUF         object
ARC_SIDE         object
DISTANCE        float64
FCODE             int64
FCODE_DES        object
CLASS            object
NAME             object
X               float64
Y               float64
LONGITUDE       float64
LATITUDE        float64
MUN_NAME         object
WARD_NAME        object
full_address     object
long_lat         object
dtype: object

In [6]:
#toronto_addresses_df.to_csv('geo_data/geodata_toronto_addresses.csv', index=False)

In [7]:
#check if one address is can be found in toronto_addresses df
test_address = '202 DOVERCOURT RD'
#test_address = 'Ford St and St Clair Ave w' #doesn't really work for intersections

for i in range(len(toronto_addresses_df)):
    actual_address = toronto_addresses_df.loc[i, 'full_address']
    if test_address == actual_address:
        print('match found')
        print(toronto_addresses_df.loc[i, 'long_lat'])
        break
    else:
        pass

match found
(-79.4240340808, 43.6469600856)


In [8]:
# test dataframe
test_data = pd.read_csv('test_data.csv')
test_df = pd.DataFrame(test_data)
print('Number of rows and cols: ', str(test_df.shape))
test_df.head()

Number of rows and cols:  (49, 11)


Unnamed: 0,tag_number_masked,date_of_infraction,infraction_code,infraction_description,set_fine_amount,time_of_infraction,location1,location2,location3,location4,province
0,***71720,20180101,5,PARK-SIGNED HWY-PROHIBIT DY/TM,50,12,NR,202 DOVERCOURT RD,,,ON
1,***61115,20180101,5,PARK-SIGNED HWY-PROHIBIT DY/TM,50,19,W/S,GREAT WEST DR,S/O,DE JONG ST,ON
2,***61117,20180101,5,PARK-SIGNED HWY-PROHIBIT DY/TM,50,22,NR,3 DE JONG ST,,,ON
3,***61118,20180101,5,PARK-SIGNED HWY-PROHIBIT DY/TM,50,23,OPP,54 ZEZEL WAY,,,ON
4,***92520,20180101,5,PARK-SIGNED HWY-PROHIBIT DY/TM,50,24,E/S,FORD ST,S/O,ST CLAIR AVE W,ON


In [9]:
# keep the rows with real addresses, does not include intersections
for i in range(len(test_df)):
    test_df.loc[i, 'address_exist'] = test_df.loc[i, 'location2'][0].isdigit()

test_df = test_df[test_df['address_exist']]
test_df = test_df.reset_index(drop=True)
test_df = test_df.drop('address_exist', 1)
test_df.head()       

Unnamed: 0,tag_number_masked,date_of_infraction,infraction_code,infraction_description,set_fine_amount,time_of_infraction,location1,location2,location3,location4,province
0,***71720,20180101,5,PARK-SIGNED HWY-PROHIBIT DY/TM,50,12,NR,202 DOVERCOURT RD,,,ON
1,***61117,20180101,5,PARK-SIGNED HWY-PROHIBIT DY/TM,50,22,NR,3 DE JONG ST,,,ON
2,***61118,20180101,5,PARK-SIGNED HWY-PROHIBIT DY/TM,50,23,OPP,54 ZEZEL WAY,,,ON
3,***92328,20180101,5,PARK-SIGNED HWY-PROHIBIT DY/TM,50,28,NR,104 BOWIE AVE,,,ON
4,***92329,20180101,5,PARK-SIGNED HWY-PROHIBIT DY/TM,50,28,NR,103 BOWIE AVE,,,ON


In [10]:
# Print long_lat if a match is found --- this method is slow as hell
# for i in range(len(test_df)):
#     actual_address = test_df.loc[i, 'full_address']
#     print(actual_address)

#     for j in range(len(toronto_addresses_df)):
#         if actual_address == toronto_addresses_df.loc[j, 'full_address']:
#             #test_df.loc[i, 'long_lat'] = toronto_addresses_df.loc[j, 'long_lat']
#             print('match found')
#             print(toronto_addresses_df.loc[j, 'long_lat'])
#             break
#         else:
#             pass

In [11]:
# Try a table join to get the long_lat info
test_df = pd.merge(test_df, 
                   toronto_addresses_df[['full_address', 'MUN_NAME', 'WARD_NAME', 'long_lat']], 
                   left_on='location2',
                   right_on='full_address',
                   how='left')

In [12]:
test_df.head()

Unnamed: 0,tag_number_masked,date_of_infraction,infraction_code,infraction_description,set_fine_amount,time_of_infraction,location1,location2,location3,location4,province,full_address,MUN_NAME,WARD_NAME,long_lat
0,***71720,20180101,5,PARK-SIGNED HWY-PROHIBIT DY/TM,50,12,NR,202 DOVERCOURT RD,,,ON,202 DOVERCOURT RD,former Toronto,Davenport,"(-79.4240340808, 43.6469600856)"
1,***61117,20180101,5,PARK-SIGNED HWY-PROHIBIT DY/TM,50,22,NR,3 DE JONG ST,,,ON,3 DE JONG ST,Scarborough,Scarborough Centre,"(-79.274909091, 43.7659469)"
2,***61118,20180101,5,PARK-SIGNED HWY-PROHIBIT DY/TM,50,23,OPP,54 ZEZEL WAY,,,ON,54 ZEZEL WAY,Scarborough,Scarborough Centre,"(-79.275425111, 43.766147795)"
3,***92328,20180101,5,PARK-SIGNED HWY-PROHIBIT DY/TM,50,28,NR,104 BOWIE AVE,,,ON,104 BOWIE AVE,York,Eglinton-Lawrence,"(-79.4566773136, 43.696556066)"
4,***92329,20180101,5,PARK-SIGNED HWY-PROHIBIT DY/TM,50,28,NR,103 BOWIE AVE,,,ON,103 BOWIE AVE,York,Eglinton-Lawrence,"(-79.4561666741, 43.6962863485)"


### Add ID and Area to Each Address ###
#### Import Toronto GeoJSON file ####

In [13]:
toronto_areas = json.load(open('geo_data/Neighbourhoods_geojson.json'))

In [15]:
toronto_addresses_df['address_point'] = toronto_addresses_df.apply(lambda x: Point(x['long_lat']), axis=1)

We will use <b>Shapely</b> to help decide whether an address falls into an area polygon bound.
https://pypi.org/project/Shapely/

In [16]:
# convert the coordinate list to tuple, so it could be used in Shapely's within() method
area = toronto_areas['features'][1]['geometry']['coordinates']

flat_list = []
for sublist in area:
    for i in sublist:
        flat_list.append(i)
        
converted = [tuple(i) for i in flat_list]

In [17]:
# Convert the coordinates in a list to tuple for Shapely to consume

for item in toronto_areas['features']:
    area = item['geometry']['coordinates']
    flat_list = []
    for sublist in area:
        for i in sublist:
            flat_list.append(i)

    converted = [tuple(i) for i in flat_list]
    item['geometry']['coordinates'] = converted

#### Method 1: To loop through thetoronto_addresses_df ####

In [35]:
def get_area1():
    for i in range(100):
        ticket_point = toronto_addresses_df.loc[i, 'address_point']
        for j in toronto_areas['features']:
            try:
                area = j['geometry']['coordinates']   
                if ticket_point.within(Polygon(area)):
                    toronto_addresses_df.loc[i, '_id'] = j['properties']['_id']
                    toronto_addresses_df.loc[i, 'area_name'] = j['properties']['AREA_NAME']
                    break
            except:
                pass
            
t = timeit.Timer("get_area1()", globals=globals())
time = t.timeit(1)
print ("took %fs\n" % (time,))

took 4.793628s



In [19]:
toronto_addresses_df.head()

Unnamed: 0,GEO_ID,LINK,MAINT_STAG,ADDRESS,LFNAME,LONUM,LONUMSUF,HINUM,HINUMSUF,ARC_SIDE,...,Y,LONGITUDE,LATITUDE,MUN_NAME,WARD_NAME,full_address,long_lat,address_point,_id,area_name
0,5729533,5729516,REGULAR,404,Lake Promenade,404,,0,,R,...,4827441.25,-79.540536,43.586906,Etobicoke,Etobicoke-Lakeshore,404 LAKE PROMENADE,"(-79.5405358155, 43.5869056491)",POINT (-79.54053581549999 43.5869056491),8412.0,Long Branch (19)
1,5729531,5729516,REGULAR,402,Lake Promenade,402,,0,,R,...,4827450.035,-79.540385,43.586985,Etobicoke,Etobicoke-Lakeshore,402 LAKE PROMENADE,"(-79.5403852582, 43.586984775)",POINT (-79.5403852582 43.586984775),8412.0,Long Branch (19)
2,5729535,5729516,REGULAR,407,Lake Promenade,407,,0,,L,...,4827413.878,-79.539922,43.586659,Etobicoke,Etobicoke-Lakeshore,407 LAKE PROMENADE,"(-79.5399217575, 43.5866594714)",POINT (-79.5399217575 43.5866594714),8412.0,Long Branch (19)
3,5729534,5729516,REGULAR,405,Lake Promenade,405,,0,,L,...,4827422.263,-79.539803,43.586735,Etobicoke,Etobicoke-Lakeshore,405 LAKE PROMENADE,"(-79.5398028653, 43.5867349937)",POINT (-79.5398028653 43.5867349937),8412.0,Long Branch (19)
4,5729532,5729516,REGULAR,403,Lake Promenade,403,,0,,L,...,4827433.953,-79.539714,43.58684,Etobicoke,Etobicoke-Lakeshore,403 LAKE PROMENADE,"(-79.5397142074, 43.5868402558)",POINT (-79.5397142074 43.5868402558),8412.0,Long Branch (19)


#### Method 2: To make the toronto_addresses_df into a dict and loop through the dict ####

In [28]:
toronto_addresses_df_copy = toronto_addresses_df.to_dict('records')

In [31]:
toronto_addresses_df_copy[1]['long_lat']

(-79.5403852582, 43.586984775)

In [32]:
def get_area2():

    for i in range(100):
        ticket_point = toronto_addresses_df_copy[i]['address_point']
        for j in toronto_areas['features']:
            try:
                area = j['geometry']['coordinates']   
                if ticket_point.within(Polygon(area)):
                    toronto_addresses_df_copy[i]['_id'] = j['properties']['_id']
                    toronto_addresses_df_copy[i]['area_name'] = j['properties']['AREA_NAME']
                    break
            except:
                pass
            
t = timeit.Timer("get_area2()", globals=globals())
time = t.timeit(1)
print ("took %fs\n" % (time,))

took 0.332830s



#### Use Method 2 to Add ID and Area ####

In [49]:
toronto_addresses_df_copy = toronto_addresses_df.to_dict('records')

In [50]:
def get_area():
    for i in range(len(toronto_addresses_df_copy)):
        ticket_point = toronto_addresses_df_copy[i]['address_point']
        for j in toronto_areas['features']:
            try:
                area = j['geometry']['coordinates']   
                if ticket_point.within(Polygon(area)):
                    toronto_addresses_df_copy[i]['_id'] = j['properties']['_id']
                    toronto_addresses_df_copy[i]['area_name'] = j['properties']['AREA_NAME']
                    break
            except:
                pass
            
t = timeit.Timer("get_area()", globals=globals())
time = t.timeit(1)
print ("took %fs\n" % (time,))

took 3066.979188s



In [52]:
toronto_addresses_df_copy = pd.DataFrame(toronto_addresses_df_copy)

In [53]:
toronto_addresses_df_copy.tail()

Unnamed: 0,GEO_ID,LINK,MAINT_STAG,ADDRESS,LFNAME,LONUM,LONUMSUF,HINUM,HINUMSUF,ARC_SIDE,...,Y,LONGITUDE,LATITUDE,MUN_NAME,WARD_NAME,full_address,long_lat,address_point,_id,area_name
526785,18632,30073953,REGULAR,213,Browning Ave,213,,0,,R,...,4838097.303,-79.347938,43.682722,East York,Toronto-Danforth,213 BROWNING AVE,"(-79.347937751, 43.682722372)",POINT (-79.347937751 43.682722372),8492,Broadview North (57)
526786,18634,30073953,REGULAR,215,Browning Ave,215,,0,,R,...,4838098.902,-79.347886,43.682737,East York,Toronto-Danforth,215 BROWNING AVE,"(-79.347885631, 43.682736698)",POINT (-79.347885631 43.682736698),8492,Broadview North (57)
526787,31139,30073950,REGULAR,33,Palmer Ave,33,,0,,R,...,4839127.978,-79.300098,43.691926,East York,Beaches-East York,33 PALMER AVE,"(-79.300098156, 43.691926133)",POINT (-79.300098156 43.691926133),8455,Taylor-Massey (61)
526788,31140,30073950,REGULAR,35,Palmer Ave,35,,0,,R,...,4839135.884,-79.299995,43.691997,East York,Beaches-East York,35 PALMER AVE,"(-79.299994863, 43.691997115)",POINT (-79.29999486299999 43.691997115),8455,Taylor-Massey (61)
526789,31141,30073950,REGULAR,37,Palmer Ave,37,,0,,R,...,4839138.713,-79.300086,43.692023,East York,Beaches-East York,37 PALMER AVE,"(-79.300085732, 43.692022744)",POINT (-79.300085732 43.692022744),8455,Taylor-Massey (61)


In [54]:
toronto_addresses_df_copy.to_csv('geo_data/geodata_toronto_addresses_areas.csv', index=False)