<h4> Import the Libraries </h4>

In [123]:
from io import StringIO

import folium
import matplotlib.cm as cm
import matplotlib.colors as colors
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from geopy.geocoders import Nominatim
from sklearn.cluster import KMeans

# Remove pandas' display caps: None means "no limit", so every column and every row of a
# DataFrame is rendered when it is displayed.
pd.options.display.max_columns = None
pd.options.display.max_rows = None

<h4> Fetch the web page and parse it with the lxml parser </h4>

In [135]:
# Download the "List of postal codes of Canada: M" Wikipedia page.
# The timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() aborts on an HTTP error instead of silently parsing an error page.
response = requests.get(
    'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M',
    timeout=30,
)
response.raise_for_status()

# NOTE: despite its name, website_url holds the page's HTML text, not a URL.
website_url = response.text

# Parse the HTML with the lxml parser.
soup = BeautifulSoup(website_url, 'lxml')

<h4> Get the required table and read it into a DataFrame </h4>

In [136]:
# Take the first <table> element found on the page.
table = soup.find_all('table')[0]

# pd.read_html wants a file-like object; passing a raw HTML string is deprecated in
# recent pandas releases, so the markup is wrapped in StringIO.
df_tor = pd.read_html(StringIO(str(table)))[0]
df_tor.columns = ['Postal Code', 'Borough', 'Neighbourhood']

# Skip the first parsed row (presumably a header row that was read as data -- TODO confirm
# against the live page), then turn the 'Not assigned' placeholder into NaN.
# replace() returns a new frame, so nothing is written into the iloc slice
# (the original boolean-mask assignment could raise SettingWithCopyWarning).
df_tor = df_tor.iloc[1:].replace('Not assigned', np.nan)

<h4> Convert the Postal Code column to string </h4>

In [143]:
# Cast only the 'Postal Code' column to str; every other column keeps its dtype.
df_tor = df_tor.astype({'Postal Code': 'str'})

<h4> Remove rows with a missing value in the Borough column </h4>

In [137]:
# Keep only the rows whose 'Borough' is present (not NaN).
df_tor = df_tor.dropna(subset=['Borough'])

<h4> Fill missing values in the Neighbourhood column with the Borough </h4>

In [138]:
# Where 'Neighbourhood' is missing, fall back to the same row's 'Borough' value
# (fillna aligns the two Series on the row index); rows that already have a
# neighbourhood are left unchanged.
# assign() builds a new frame instead of writing into df_tor through .loc, which avoids
# SettingWithCopyWarning when df_tor is the result of an earlier boolean filter.
df_tor = df_tor.assign(
    Neighbourhood=df_tor['Neighbourhood'].fillna(df_tor['Borough'])
)

<h4> Group the rows with the same Postal Code </h4>

In [139]:
# Collapse rows that share a postal code into one row each: take the first borough and
# join all neighbourhood names with ', '. as_index=False keeps 'Postal Code' as a regular
# column on a fresh 0..n-1 index.
df_tor = df_tor.groupby('Postal Code', as_index=False).agg(
    {'Borough': 'first', 'Neighbourhood': ', '.join}
)
df_tor.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


<h4> Number of Rows in the DataFrame </h4>

In [140]:
# Number of rows (one per postal code after grouping).
len(df_tor)

103

<h4> Import the Locations CSV </h4>

In [153]:
# Read the coordinates file, then pin the dtype of all three columns in a single astype
# call: postal code as str, latitude/longitude as float.
locations = pd.read_csv('Geospatial_Coordinates.csv').astype(
    {'Postal Code': str, 'Latitude': float, 'Longitude': float}
)
locations.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


<h4> Merge the two tables  </h4>

In [158]:
# Attach the coordinate columns to each postal code. The left join keeps every df_tor row,
# leaving NaN coordinates where the locations table has no matching postal code.
#
# Because the result is stored back into df_tor, re-running this cell used to merge into
# the already-merged frame and produce suffixed duplicates (Latitude_x / Latitude_y).
# Dropping any columns that will come from `locations` first makes the cell safe to re-run;
# on the first run nothing is dropped.
incoming_columns = locations.columns.difference(['Postal Code'])
df_tor = (
    df_tor
    .drop(columns=incoming_columns, errors='ignore')
    .merge(right=locations, how='left', on='Postal Code')
)
df_tor.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
