In [1]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
import os
from sklearn.cluster import KMeans
import folium 
from geopy.geocoders import Nominatim 
import matplotlib.cm as cm
import matplotlib.colors as colors
print('Libraries imported.')

Libraries imported.


In [2]:
# Wikipedia article whose first table lists the Canadian postal codes starting with "M".
can_postal_url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

# Use a timeout so a stalled connection cannot hang the kernel, and raise on
# HTTP errors so an error page is never parsed as if it were the article.
response = requests.get(can_postal_url, timeout=30)
response.raise_for_status()
source = response.text

# Parse the page and keep only its first <table> element.
# NOTE(review): the 'xml' parser is kept to match the recorded output below;
# confirm whether an HTML parser would be more appropriate for this page.
can_html  = BeautifulSoup(source, 'xml')
can_html = can_html.find('table')

# find() returns None when nothing matches; fail here with a clear message
# instead of with an AttributeError in the cell that walks the table rows.
if can_html is None:
    raise ValueError('No <table> element found at {}'.format(can_postal_url))

'''
The structure of the return result will be like that:

<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE html>
<html class="client-nojs" dir="ltr" lang="en">
<head>
<meta charset="UTF-8"/>
<title>List of postal codes of Canada: M - Wikipedia</title>
.....
<table class="wikitable sortable">
.....
table content
.....
</table>
'''

'\nThe structure of the return result will be like that:\n\n<?xml version="1.0" encoding="utf-8"?>\n<!DOCTYPE html>\n<html class="client-nojs" dir="ltr" lang="en">\n<head>\n<meta charset="UTF-8"/>\n<title>List of postal codes of Canada: M - Wikipedia</title>\n.....\n<table class="wikitable sortable">\n.....\ntable content\n.....\n</table>\n'

In [3]:
# Three columns of the table: PostalCode, Borough, and Neighborhood
col_names = ['Postalcode', 'Borough', 'Neighborhood']

# Walk every <tr> of the parsed table and collect the stripped text of its <td>
# cells. Rows whose cell count differs from the number of columns (for example
# a header row that has no <td> cells) are skipped.
table_rows = []
for tr_cell in can_html.find_all('tr'):
    row_data = [td_cell.text.strip() for td_cell in tr_cell.find_all('td')]
    if len(row_data) == len(col_names):
        table_rows.append(row_data)

# Build the frame once from the collected rows; appending with
# `.loc[len(df)]` inside the loop re-allocates the frame on every row.
can_df = pd.DataFrame(table_rows, columns=col_names)

can_df.head()

Unnamed: 0,Postalcode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


In [4]:
# Clean NA cells and cells whose `Borough` is `Not assigned`.
# The explicit .copy() makes `df` an independent frame, so the assignments
# below neither touch `can_df` nor raise SettingWithCopyWarning.
df = can_df.dropna()
df = df[df['Borough'] != 'Not assigned'].copy()

# If a cell has a borough but a Not assigned neighborhood,
# then the neighborhood will be the same as the borough.
# A single .loc assignment replaces the chained `df[col][mask] = ...` form,
# which may write to a temporary copy and leave `df` unchanged.
not_assigned_ids = df['Neighborhood'] == 'Not assigned'
df.loc[not_assigned_ids, 'Neighborhood'] = df.loc[not_assigned_ids, 'Borough']

# Replace ' /' by ','. regex=False keeps this a literal substitution
# regardless of the pandas version's default for `regex`.
df['Neighborhood'] = df['Neighborhood'].str.replace(' /', ',', regex=False)
df.head()

Unnamed: 0,Postalcode,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [5]:
# For each postal code, join all of its neighborhood names into one
# comma-separated string, exposed as the column `Neighborhood_joined`.
temp = (
    df.groupby('Postalcode')['Neighborhood']
      .apply(lambda names: ', '.join(names))
      .reset_index()
      .rename(columns={'Neighborhood': 'Neighborhood_joined'})
)

temp.head()

Unnamed: 0,Postalcode,Neighborhood_joined
0,M1B,"Malvern, Rouge"
1,M1C,"Rouge Hill, Port Union, Highland Creek"
2,M1E,"Guildwood, Morningside, West Hill"
3,M1G,Woburn
4,M1H,Cedarbrae


In [6]:
# Attach the joined neighborhood string to every row of `df`, discard the
# per-row neighborhood column, drop the duplicate rows this leaves behind,
# and give the joined column the plain `Neighborhood` name.
df_merge = (
    pd.merge(df, temp, on='Postalcode')
      .drop(columns=['Neighborhood'])
      .drop_duplicates()
      .rename(columns={'Neighborhood_joined': 'Neighborhood'})
)
df_merge.head()

Unnamed: 0,Postalcode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [7]:
# Display the (rows, columns) shape of the merged frame.
df_merge.shape

(103, 3)

In [8]:
import geocoder
import folium

In [9]:
# Load the geospatial CSV and rename its 'Postal Code' column so it matches
# the key column name used by `df_merge`.
geo_df = (
    pd.read_csv('http://cocl.us/Geospatial_data')
      .rename(columns={'Postal Code': 'Postalcode'})
)

# Merge on the postal code (pd.merge defaults to an inner join).
geo_merge = pd.merge(geo_df, df_merge, on='Postalcode')

# Keep only boroughs whose name contains "Toronto". na=False treats a missing
# borough as "no match" instead of propagating NaN into the boolean mask.
# reset_index returns a fresh frame rather than mutating the filtered slice.
toronto_df = (
    geo_merge[geo_merge['Borough'].str.contains("Toronto", na=False)]
    .reset_index(drop=True)
)

print(toronto_df.head())

  Postalcode   Latitude  Longitude          Borough  \
0        M4E  43.676357 -79.293031     East Toronto   
1        M4K  43.679557 -79.352188     East Toronto   
2        M4L  43.668999 -79.315572     East Toronto   
3        M4M  43.659526 -79.340923     East Toronto   
4        M4N  43.728020 -79.388790  Central Toronto   

                     Neighborhood  
0                     The Beaches  
1    The Danforth West, Riverdale  
2  India Bazaar, The Beaches West  
3                 Studio District  
4                   Lawrence Park  


In [10]:
# Base map, centred on the fixed coordinates [43.65, -79.38].
map_toronto = folium.Map(location=[43.65, -79.38], zoom_start=10)

# One circle marker per row of `toronto_df`; the popup text is
# "<neighborhood>, <borough>".
marker_columns = ['Latitude', 'Longitude', 'Borough', 'Neighborhood']
for lat, lng, borough, neighborhood in toronto_df[marker_columns].itertuples(index=False, name=None):
    popup = folium.Popup(f'{neighborhood}, {borough}', parse_html=True)
    marker = folium.CircleMarker(
        [lat, lng],
        radius=7,
        popup=popup,
        color='green',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False,
    )
    marker.add_to(map_toronto)

# Last expression of the cell, so the notebook renders the map.
map_toronto