In [2]:
from bs4 import BeautifulSoup as bs
import requests
import pandas as pd
import numpy as np
import matplotlib.cm as cm
import matplotlib.colors as colors
from sklearn.cluster import KMeans


Next step is to web-scrape the table using requests and BeautifulSoup

In [3]:
htmlText = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text
soup = bs(htmlText,'html5lib')
table = soup.find('table').prettify()
print(type(table))

<class 'str'>


In [None]:
postCodes = []
boroughs = []
neighb = []
for tr in soup.find_all('tr')[2:290]:
    tds = tr.find_all('td')
    postCodes.append(tds[0].text)
    boroughs.append(tds[1].text)
    neighb.append(tds[2].text.rstrip())

In [5]:
tuples = list(zip(postCodes,boroughs,neighb))

In [6]:

table_df = pd.DataFrame(tuples, columns=['PostCode','Borough','Neighbourhood'])
table_df.head(10)

Unnamed: 0,PostCode,Borough,Neighbourhood
0,M2A,Not assigned,Not assigned
1,M3A,North York,Parkwoods
2,M4A,North York,Victoria Village
3,M5A,Downtown Toronto,Harbourfront
4,M6A,North York,Lawrence Heights
5,M6A,North York,Lawrence Manor
6,M7A,Queen's Park,Not assigned
7,M8A,Not assigned,Not assigned
8,M9A,Queen's Park,Queen's Park
9,M1B,Scarborough,Rouge


In [10]:
for index, row in table_df.iterrows():
    if row[1]=='Not assigned':
        table_df.drop(index, inplace=True)
    elif row[2]=='Not assigned':
        row[2] = row[1]
table_df.reset_index(drop=True, inplace=True)
table_df.head(10)

Unnamed: 0,PostCode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M6A,North York,Lawrence Heights
4,M6A,North York,Lawrence Manor
5,M7A,Queen's Park,Queen's Park
6,M9A,Queen's Park,Queen's Park
7,M1B,Scarborough,Rouge
8,M1B,Scarborough,Malvern
9,M3B,North York,Don Mills North


In [11]:
combined_df = table_df.groupby('PostCode').agg({'Borough':lambda x: x.max(),'Neighbourhood':lambda x:', '.join(x)})
combined_df.reset_index(inplace=True)
combined_df.head(10)

Unnamed: 0,PostCode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park"
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge"
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


In [None]:

!conda install -c conda-forge geocoder --yes

In [9]:
import geocoder # import geocoder

# initialize your variable to None
lat_lng_coords = None
lat = []
long = []
for postal_code in combined_df.iloc[:,0]:
    # loop until you get the coordinates
    while(lat_lng_coords is None):
      g = geocoder.google('{}, Toronto, Ontario'.format(postal_code))
      lat_lng_coords = g.latlng
    
    lat.append(lat_lng_coords[0])
    long.append(lat_lng_coords[1])
coords_df = pd.DataFrame(zip(lat,long), columns=['Latitude','Longitude'])

KeyboardInterrupt: 

As the package is very unreliable and does not work every time, we use a a link to a csv file that has the geographical coordinates of each postal code.

In [12]:

lat_lng_coords = pd.read_csv('http://cocl.us/Geospatial_data')
lat_lng_coords.head(10)

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476
5,M1J,43.744734,-79.239476
6,M1K,43.727929,-79.262029
7,M1L,43.711112,-79.284577
8,M1M,43.716316,-79.239476
9,M1N,43.692657,-79.264848


Next, we merge the coordinates table with our previously created table by using Postal Code as the key. As the name of two columns is different in the two tales we use left_on and right_on parameters while merging and then drop one of the columns.

In [13]:
lat_lng_coords = pd.read_csv('http://cocl.us/Geospatial_data')
lat_lng_coords.head(10)

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476
5,M1J,43.744734,-79.239476
6,M1K,43.727929,-79.262029
7,M1L,43.711112,-79.284577
8,M1M,43.716316,-79.239476
9,M1N,43.692657,-79.264848


Next, we merge the coordinates table with our previously created table by using Postal Code as the key. As the name of two columns is different in the two tales we use left_on and right_on parameters while merging and then drop one of the columns.

In [14]:
merged_df = pd.merge(combined_df, lat_lng_coords, left_on='PostCode', right_on='Postal Code')
merged_df.drop(columns=['Postal Code'], inplace=True)
merged_df.head(10)

Unnamed: 0,PostCode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park",43.727929,-79.262029
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge",43.711112,-79.284577
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West",43.716316,-79.239476
9,M1N,Scarborough,"Birch Cliff, Cliffside West",43.692657,-79.264848


In [15]:
print('The dataframe has {} boroughs and {} neighborhoods.'.format(
        len(merged_df['Borough'].unique()),
        merged_df.shape[0]
    )
)

The dataframe has 11 boroughs and 103 neighborhoods.


In [16]:
from geopy.geocoders import Nominatim
address = 'Toronto, Ontario'

geolocator = Nominatim(user_agent="toronto_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.653963, -79.387207.


In [17]:

toronto_df = merged_df[merged_df['Borough'].str.find('Toronto')!=-1].reset_index(drop=True)
toronto_df.head(10)

Unnamed: 0,PostCode,Borough,Neighbourhood,Latitude,Longitude
0,M4E,East Toronto,The Beaches,43.676357,-79.293031
1,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188
2,M4L,East Toronto,"The Beaches West, India Bazaar",43.668999,-79.315572
3,M4M,East Toronto,Studio District,43.659526,-79.340923
4,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879
5,M4P,Central Toronto,Davisville North,43.712751,-79.390197
6,M4R,Central Toronto,North Toronto West,43.715383,-79.405678
7,M4S,Central Toronto,Davisville,43.704324,-79.38879
8,M4T,Central Toronto,"Moore Park, Summerhill East",43.689574,-79.38316
9,M4V,Central Toronto,"Deer Park, Forest Hill SE, Rathnelly, South Hi...",43.686412,-79.400049


In [18]:

import folium 
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=12)

# adding markers to map
for lat, lng, borough, neighborhood in zip(toronto_df['Latitude'], toronto_df['Longitude'], toronto_df['Borough'], toronto_df['Neighbourhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=False)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto