## Importing the libraries

In [149]:
### install libraries if you don't have them (uncomment if needed)
!pip install beautifulsoup4
!pip install lxml
!pip install html5lib

# to work with html pages
import requests
from bs4 import BeautifulSoup
# to work with dataframes
import pandas as pd
# to work with numerical data
import numpy as np
# to plot the map
import folium
from folium.plugins import MarkerCluster
# to perform clustering
from sklearn.cluster import KMeans

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors



## Import the table and transform it into a dataframe

In [150]:
# webpage url
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
# download the page; the timeout stops the cell from hanging forever and
# raise_for_status() stops us from silently parsing an HTTP error page
response = requests.get(url, timeout=30)
response.raise_for_status()
website_url = response.text
# parse the html
soup = BeautifulSoup(website_url,'lxml')

# get a table out of the page
table = soup.find('table',{'class':'wikitable sortable'})
if table is None:
    # fail with a clear message instead of an AttributeError on None below
    raise ValueError("no 'wikitable sortable' table found at " + url)

# extract the rows, which are defined by the tr tag
table_rows = table.find_all('tr')
# collect the non-empty cell texts of every row
l = []
for tr in table_rows:
    td = tr.find_all('td')
    row = [cell.text.strip() for cell in td if cell.text.strip()]
    l.append(row)
# save to the data frame; the first row is skipped because it is the header row
df=pd.DataFrame(l[1:], columns=["Postal Code", "Borough","Neighbourhood"])

## Modify the data of the dataframe

In [151]:
# remove rows where 'Borough' = 'Not assigned' and reset the row labelling
df = df.loc[df['Borough'] != 'Not assigned'].reset_index(drop=True)
# merge the neighbourhoods with the same postcodes
df['Neighbourhood'] = df.groupby('Postal Code')['Neighbourhood'].transform(lambda x: ', '.join(x))
# drop duplicated rows; the explicit copy makes the assignment below act on an
# independent frame rather than on a slice of the previous one
df = df.drop_duplicates().copy()
# replace 'Not assigned' values in 'Neighbourhood' with the values of 'Borough'
# (a single .loc assignment: chained indexing like df[col][rows] = ... may write
# to a temporary copy and leave df unchanged)
not_assigned = df['Neighbourhood'] == 'Not assigned'
df.loc[not_assigned, 'Neighbourhood'] = df.loc[not_assigned, 'Borough']

In [152]:
# display the (rows, columns) shape of the cleaned dataframe
df.shape

(103, 3)

## This block is to download locations

In [153]:
# address of the csv file that holds the location data
url_loc = 'https://cocl.us/Geospatial_data'
# read the location data into its own dataframe
df_loc = pd.read_csv(url_loc)
# index the locations by 'Postal Code' so they can be looked up by that key
locations_by_code = df_loc.set_index('Postal Code')
# attach the location columns to the main dataframe, matching on 'Postal Code'
df = df.join(locations_by_code, on='Postal Code')

## Prepare the data for clustering based on Borough

In [154]:
# create a dictionary mapping each Borough name to an integer code
# (built from scratch here so the cell also works on a fresh kernel)
d = {val: idx for idx, val in enumerate(df['Borough'].unique())}
# build a separate dataframe for clustering: drop 'Postal Code' and 'Neighbourhood'
# and replace 'Borough' with its integer code. drop() and assign() both return new
# frames, so `df` keeps its Borough names and all of its columns, and this cell can
# be re-run safely.
df_grouped = (
    df.drop(columns=['Postal Code', 'Neighbourhood'])
      .assign(Borough=[d[item] for item in df['Borough']])
)

## Perform clustering

In [155]:
# set number of clusters to the number of unique 'Borough' values
kclusters = len(df.Borough.unique())

# run k-means clustering; n_init is pinned because its default differs between
# scikit-learn versions, which would otherwise change the resulting clusters
kmeans = KMeans(n_clusters=kclusters, random_state=0, n_init=10).fit(df_grouped)

# add clustering labels
df['Cluster Labels']=kmeans.labels_

# check cluster labels generated for the first rows of the dataframe
# (kept as the last expression of the cell so that it is actually displayed)
kmeans.labels_[0:10]

## Create the map and mark different clusters with different colors

In [156]:
# create map centred on the mean latitude/longitude of all rows
map_clusters = folium.Map(location=[df['Latitude'].mean(), df['Longitude'].mean()], zoom_start=11)

# set color scheme for the clusters: one evenly spaced rainbow colour per cluster
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add one circle marker per row to the map
for lat, lon, poi, cluster in zip(df['Latitude'], df['Longitude'], df['Borough'], df['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    # cluster labels are 0-based, so they index the palette directly
    # (cluster-1 only worked for cluster 0 through negative-index wraparound)
    marker_color = rainbow[cluster]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=marker_color,
        fill=True,
        fill_color=marker_color,
        fill_opacity=0.7).add_to(map_clusters)

# display the map
map_clusters