<h4> Import the Libraries </h4>

In [202]:
from io import StringIO

import folium
import matplotlib.cm as cm
import matplotlib.colors as colors
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from geopy.geocoders import Nominatim
from sklearn.cluster import KMeans
from sklearn.preprocessing import LabelEncoder

# Disable truncation when DataFrames are rendered: None removes the cap on
# both the number of columns and the number of rows shown.
pd.options.display.max_columns = None
pd.options.display.max_rows = None

<h4> Fetch the web page and parse it with BeautifulSoup (lxml parser) </h4>

In [166]:
# Download the page that lists the postal codes. A timeout keeps the cell from
# hanging forever on a stalled connection, and raise_for_status() stops the
# notebook on an HTTP error instead of silently parsing an error page.
response = requests.get(
    'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M',
    timeout=30,
)
response.raise_for_status()

# NOTE: despite its name, `website_url` holds the HTML text of the page, not a URL.
website_url = response.text
soup = BeautifulSoup(website_url, 'lxml')

<h4> Get the required table and read it to a dataframe </h4>

In [168]:
# Take the first <table> element found on the page.
table = soup.find_all('table')[0]

# read_html expects a path, URL or file-like object; passing a literal HTML
# string is deprecated, so the markup is wrapped in StringIO.
df_tor = pd.read_html(StringIO(str(table)))[0]
df_tor.columns = ['Postal Code', 'Borough', 'Neighbourhood']

# Drop the first parsed row (presumably the header row parsed as data - confirm
# against the live table). The explicit copy makes df_tor an independent frame,
# so the assignments below and in later cells do not write into a slice.
df_tor = df_tor.iloc[1:].copy()

# Cells holding the literal text 'Not assigned' are treated as missing values.
df_tor[df_tor == 'Not assigned'] = np.nan

<h4> Convert the Postal Code column to string </h4>

In [169]:
# Make sure the postal codes are strings so they can later be used as a join key.
df_tor = df_tor.astype({'Postal Code': str})

<h4> Remove the missing values in Borough column </h4>

In [170]:
# Keep only the rows that have a Borough value.
df_tor = df_tor.dropna(subset=['Borough'])

<h4> Assign missing values in Neighbourhood column as the Borough </h4>

In [171]:
# Where Neighbourhood is missing, fall back to the Borough of the same row
# (fillna aligns the two columns on the row index).
df_tor['Neighbourhood'] = df_tor['Neighbourhood'].fillna(df_tor['Borough'])

<h4> Group the rows with the same Postal Code </h4>

In [172]:
# Collapse to one row per postal code: keep the first Borough seen and join
# all Neighbourhood names into a single comma-separated string.
per_code_aggregations = {'Borough': 'first', 'Neighbourhood': ', '.join}
df_tor = df_tor.groupby('Postal Code', as_index=False).agg(per_code_aggregations)
df_tor.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


<h4> Number of Rows in the DataFrame </h4>

In [173]:
# Number of rows (one per postal code after grouping).
len(df_tor.index)

103

<h4> Import the Locations CSV </h4>

In [174]:
# Load the coordinates file and fix the column dtypes in a single step:
# the postal code as a string join key, the coordinates as floats.
locations = pd.read_csv('Geospatial_Coordinates.csv').astype(
    {'Postal Code': str, 'Latitude': float, 'Longitude': float}
)
locations.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


<h4> Merge the two tables  </h4>

In [175]:
# Left join: every postal-code row is kept and gains Latitude/Longitude
# wherever the coordinates table has a matching 'Postal Code'.
df_tor = pd.merge(df_tor, locations, how='left', on='Postal Code')
df_tor.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


In [165]:
# Look up the coordinates of the city; they are used to centre the maps below.
address = 'Toronto'

geolocator = Nominatim(user_agent="tr_explorer")
location = geolocator.geocode(address)

# geocode() returns None when the service finds no match; fail with a clear
# message instead of an AttributeError on `.latitude`.
if location is None:
    raise ValueError('Could not geocode address: {!r}'.format(address))

latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.653963, -79.387207.


In [178]:
# create map of Toronto using latitude and longitude values
map_tr = folium.Map(location=[latitude, longitude], zoom_start=10)

# add one circle marker per row, with a popup reading "<neighbourhoods>, <borough>"
for lat, lng, borough, neighborhood in zip(df_tor['Latitude'], df_tor['Longitude'], df_tor['Borough'], df_tor['Neighbourhood']):
    popup_text = '{}, {}'.format(neighborhood, borough)
    # parse_html=True escapes the text, so special characters in the names cannot break the popup.
    # (parse_html is a Popup option; it is not passed to CircleMarker, where it is not a style option.)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=folium.Popup(popup_text, parse_html=True),
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
    ).add_to(map_tr)

# display the map as the cell output
map_tr

In [197]:
# NOTE: this binds a second name to the same DataFrame object (no copy is made),
# so any in-place change made through either name is visible through the other.
toronto_data = df_tor
toronto_data.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


In [205]:
# create map of Toronto using latitude and longitude values
map_etr = folium.Map(location=[latitude, longitude], zoom_start=11)

# add one circle marker per row, with the neighbourhood name(s) as popup text
for lat, lng, neighbourhood in zip(toronto_data['Latitude'], toronto_data['Longitude'], toronto_data['Neighbourhood']):
    # parse_html=True escapes the text, so special characters in the names cannot break the popup.
    # (parse_html is a Popup option; it is not passed to CircleMarker, where it is not a style option.)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=folium.Popup(neighbourhood, parse_html=True),
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
    ).add_to(map_etr)

# display the map as the cell output
map_etr

In [204]:
# set number of clusters
kclusters = 5

# Drop the free-text column. `columns=` is used because the positional axis
# argument (`drop('Neighbourhood', 1)`) is no longer accepted by pandas 2.x.
df_tor_cluster = df_tor.drop(columns='Neighbourhood')

# Encode the two remaining text columns as integers, one encoder per column so
# that each keeps its own `classes_` and can be inverse-transformed later.
le = LabelEncoder()
le2 = LabelEncoder()
df_tor_cluster['Postal Code'] = le.fit_transform(df_tor_cluster['Postal Code'])
df_tor_cluster['Borough'] = le2.fit_transform(df_tor_cluster['Borough'])

# NOTE(review): the integer codes are arbitrary ordinals and the features are
# not scaled, so the encoded 'Postal Code' column likely dominates the distance
# computation over Latitude/Longitude - confirm this is the intended feature set.

# run k-means clustering; n_init is pinned so the result does not depend on the
# default of the installed scikit-learn version
kmeans = KMeans(n_clusters=kclusters, n_init=10, random_state=0).fit(df_tor_cluster)

# check cluster labels generated for the first ten rows of the dataframe
kmeans.labels_[0:10]

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1], dtype=int32)