# Segmenting and Clustering Neighborhoods in Toronto

In [13]:
import requests 
import lxml.html as lh
import pandas as pd 

In [21]:
wikipedia_link = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
wikipedia_page = requests.get(wikipedia_link)   # handle online content 
wikipedia_page = lh.fromstring(wikipedia_page.content)   # store content under wikipedia_page
tr_elements = wikipedia_page.xpath('//tr')   # parse data that are stored between <tr>..</tr> of HTML
[len(T) for T in tr_elements[:12]]

[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3]

In [22]:
tr_elements = wikipedia_page.xpath('//tr')

col = []   # create empty list 
i = 0 

# for each row, store first element(header) and an empty list 
for t in tr_elements[0]:
    i+=1
    name = t.text_content()
    print(name)
    col.append((name, []))

Postcode
Borough
Neighborhood



In [23]:
col

[('Postcode', []), ('Borough', []), ('Neighborhood\n', [])]

In [30]:
# first row is header, data is stored on second row
for j in range(1, len(tr_elements)):
    
    # T is jth row:
    T = tr_elements[j]
    if len(T)!=3:
        break
    
    # i is index of column
    i = 0
    
    # Iterate through each element of row 
    for t in T.iterchildren():
        data=t.text_content()
        
        # check if row is empty 
        if i>0:
            
            # convert any numerical value to integers
            try:
                data = int(data)
            except:
                pass
            
            # Append data to empty list of i'th columns
            col[i][1].append(data)
            
            # Increment i for next column 
            i+=1

In [31]:
[len(C) for (title, C) in col]

[287, 287, 287]

In [32]:
# Transform dict on a DataFrame
Dict = {title: column for (title, column) in col}
df = pd.DataFrame(Dict)
df

Unnamed: 0,Postcode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned\n
1,M2A,Not assigned,Not assigned\n
2,M3A,North York,Parkwoods\n
3,M4A,North York,Victoria Village\n
4,M5A,Downtown Toronto,Harbourfront\n
5,M6A,North York,Lawrence Heights\n
6,M6A,North York,Lawrence Manor\n
7,M7A,Downtown Toronto,Queen's Park\n
8,M8A,Not assigned,Not assigned\n
9,M9A,Queen's Park,Not assigned\n


In [33]:
# clean break lines 
df = df.replace(r'\n', '', regex = True)
df.columns=['Postcode', 'Borough', 'Neighborhood']
df.head()

Unnamed: 0,Postcode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


In [35]:
df.drop(df[df['Borough']=='Not assigned'].index,inplace=True)
df.index = range(len(df))
df.head()

Unnamed: 0,Postcode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M6A,North York,Lawrence Heights
4,M6A,North York,Lawrence Manor


In [36]:
df.shape

(210, 3)

In [40]:
!conda install -c conda-forge geocoder --yes
import geocoder
import pandas as pd
import numpy as np

Collecting package metadata (current_repodata.json): done
Solving environment: done

## Package Plan ##

  environment location: /Users/grace/anaconda3

  added / updated specs:
    - geocoder


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    geocoder-1.38.1            |             py_1          53 KB  conda-forge
    ratelim-0.1.6              |             py_2           6 KB  conda-forge
    ------------------------------------------------------------
                                           Total:          59 KB

The following NEW packages will be INSTALLED:

  geocoder           conda-forge/noarch::geocoder-1.38.1-py_1
  ratelim            conda-forge/noarch::ratelim-0.1.6-py_2



Downloading and Extracting Packages
geocoder-1.38.1      | 53 KB     | ##################################### | 100% 
ratelim-0.1.6        | 6 KB      | ##################################### | 100% 
Prep

In [42]:
# initialize variable to None 
lat_lng_coords = None

# create extra columns 
df['Latitude'] = pd.Series('', index=df.index)
df['Longitude'] = pd.Series('', index=df.index)
df.columns

Index(['Postcode', 'Borough', 'Neighborhood', 'Latitude', 'Longitude'], dtype='object')

In [53]:
# loop until you get the coordinates
i = 0

sum_latitude = sum(df['Latitude'] == '')

while sum_latitude > 0:
    print('Missing coordinates: ',sum_latitude) 
    if df['Latitude'][i] == '':
        try:
            g = geocoder.google('{}, Toronto, Ontario'.format(df['Neighbourhood'][i]))
            lat_lng_coords = g.latlng
            if g.latlng != None:
                df['Latitude'][i] = lat_lng_coords[0]
                df['Longitude'][i] = lat_lng_coords[1]
        except:
            break
    i = i+1
    sum_latitude = sum(df['Latitude'] == '')


df

Missing coordinates:  210


Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,,
1,M4A,North York,Victoria Village,,
2,M5A,Downtown Toronto,Harbourfront,,
3,M6A,North York,Lawrence Heights,,
4,M6A,North York,Lawrence Manor,,
5,M7A,Downtown Toronto,Queen's Park,,
6,M9A,Queen's Park,Not assigned,,
7,M1B,Scarborough,Rouge,,
8,M1B,Scarborough,Malvern,,
9,M3B,North York,Don Mills North,,


In [54]:
!conda install -c conda-forge geopy --yes 
from geopy.geocoders import Nominatim 

#Use geopy library to get the latitude and longitude values of Toronto
address = 'Toronto,ON'

geolocator = Nominatim()
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of ',address,' are {}, {}.'.format(latitude, longitude))

Collecting package metadata (current_repodata.json): done
Solving environment: done

# All requested packages already installed.



  import sys


The geograpical coordinate of  Toronto,ON  are 43.653963, -79.387207.


In [57]:
!conda install -c conda-forge folium=0.5.0 --yes 
import folium      # map rendering library

Collecting package metadata (current_repodata.json): done
Solving environment: done

# All requested packages already installed.



In [61]:
# create map of Toronto using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(df['Latitude'], df['Longitude'], df['Borough'], df['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
            
map_toronto

TypeError: must be real number, not str