# We will extract the table from the wikipedia page and filter the rows with the Not assigned borough

In [1]:
from bs4 import BeautifulSoup

In [2]:
wiki_url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

#### Getting the response from the wiki page using the requests package

In [3]:
import requests
response = requests.get(wiki_url)

In [4]:
data = response.text

## Initialize the Beautiful Soup instance to query the page

In [5]:
soup = BeautifulSoup(data)

#### Getting the first table body

In [6]:
table = soup.find("tbody")

#### Here we will iterate over the rows and extracting the postal code, borough and neighborhood
#### We will do the following in order to get the desired dataframe:
1. Ignore the rows that are Not assigned to any borough
2. Assign each value of the row to the correct list

In [7]:
postal_codes = []
boroughs = []
neigborhoods = []

for row in table:
    if row in ["\n", None]:
        continue
        
    heads = row.find_all("td")
    if not heads:
        continue
        
    borough = heads[1].text.strip()
    if borough == "Not assigned":
        continue

    postal_code = heads[0].text.strip()
    postal_codes.append(postal_code)
    boroughs.append(borough)
    _neigborhoods = heads[2].text.strip().replace("/ ", ",")
    neigborhoods.append(_neigborhoods)
    
    print("postal_code: {}, borough: {}, neigborhood: {}".format(postal_code, borough, _neigborhoods))
    print("-"*20)

postal_code: M3A, borough: North York, neigborhood: Parkwoods
--------------------
postal_code: M4A, borough: North York, neigborhood: Victoria Village
--------------------
postal_code: M5A, borough: Downtown Toronto, neigborhood: Regent Park ,Harbourfront
--------------------
postal_code: M6A, borough: North York, neigborhood: Lawrence Manor ,Lawrence Heights
--------------------
postal_code: M7A, borough: Downtown Toronto, neigborhood: Queen's Park ,Ontario Provincial Government
--------------------
postal_code: M9A, borough: Etobicoke, neigborhood: Islington Avenue
--------------------
postal_code: M1B, borough: Scarborough, neigborhood: Malvern ,Rouge
--------------------
postal_code: M3B, borough: North York, neigborhood: Don Mills
--------------------
postal_code: M4B, borough: East York, neigborhood: Parkview Hill ,Woodbine Gardens
--------------------
postal_code: M5B, borough: Downtown Toronto, neigborhood: Garden District ,Ryerson
--------------------
postal_code: M6B, boroug

### Construct the dataframe with the lists

In [8]:
import pandas as pd
frame = {
    "PostalCode": postal_codes,
    "Borough": boroughs,
    "Neighborhood": neigborhoods
}

In [9]:
df = pd.DataFrame(frame)

In [10]:
df

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park ,Harbourfront"
3,M6A,North York,"Lawrence Manor ,Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park ,Ontario Provincial Government"
5,M9A,Etobicoke,Islington Avenue
6,M1B,Scarborough,"Malvern ,Rouge"
7,M3B,North York,Don Mills
8,M4B,East York,"Parkview Hill ,Woodbine Gardens"
9,M5B,Downtown Toronto,"Garden District ,Ryerson"


In [11]:
df.shape

(103, 3)

#### We will use the CSV file to get the latitude and longitude for every postal code

In [12]:
df_csv = pd.read_csv("Geospatial_Coordinates.csv")

##### renaming the Postal Code into PostalCode in order to join the dataframes on this axis

In [13]:
df_csv.rename(columns={"Postal Code": "PostalCode"}, inplace=True)

In [14]:
merged_df = pd.merge(df, df_csv, on="PostalCode")

In [15]:
merged_df

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park ,Harbourfront",43.654260,-79.360636
3,M6A,North York,"Lawrence Manor ,Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park ,Ontario Provincial Government",43.662301,-79.389494
5,M9A,Etobicoke,Islington Avenue,43.667856,-79.532242
6,M1B,Scarborough,"Malvern ,Rouge",43.806686,-79.194353
7,M3B,North York,Don Mills,43.745906,-79.352188
8,M4B,East York,"Parkview Hill ,Woodbine Gardens",43.706397,-79.309937
9,M5B,Downtown Toronto,"Garden District ,Ryerson",43.657162,-79.378937


In [16]:
merged_df.shape

(103, 5)

# We will filter the rows with the Toronto surfix

In [40]:
_merged_df = merged_df.query('Borough.str.contains("Toronto")', engine='python')
_merged_df

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
2,M5A,Downtown Toronto,"Regent Park ,Harbourfront",43.65426,-79.360636
4,M7A,Downtown Toronto,"Queen's Park ,Ontario Provincial Government",43.662301,-79.389494
9,M5B,Downtown Toronto,"Garden District ,Ryerson",43.657162,-79.378937
15,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
19,M4E,East Toronto,The Beaches,43.676357,-79.293031
20,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306
24,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383
25,M6G,Downtown Toronto,Christie,43.669542,-79.422564
30,M5H,Downtown Toronto,"Richmond ,Adelaide ,King",43.650571,-79.384568
31,M6H,West Toronto,"Dufferin ,Dovercourt Village",43.669005,-79.442259


In [41]:
_merged_df.shape

(39, 5)

### setting the number of clusters and dropping the columns for fitting the kmeans model

In [84]:
# set number of clusters
kclusters = len(_merged_df['Borough'].unique())
print("We got {} clusters".format(kclusters))

_merged_df_dropped = _merged_df.drop(['PostalCode', "Neighborhood"], 1)
_merged_df_dropped.head()

We got 4 clusters


Unnamed: 0,Borough,Latitude,Longitude
2,Downtown Toronto,43.65426,-79.360636
4,Downtown Toronto,43.662301,-79.389494
9,Downtown Toronto,43.657162,-79.378937
15,Downtown Toronto,43.651494,-79.375418
19,East Toronto,43.676357,-79.293031


In [47]:
latitude = 43.657162
longitude = -79.378937

In [45]:
#!conda install -c conda-forge folium=0.5.0 --yes
import folium

Collecting package metadata (current_repodata.json): done
Solving environment: done

# All requested packages already installed.



### getting the categorical values for the Borough Column

In [67]:
#_merged_df['Borough'].value_counts()
_merged_df_dropped_dummies = pd.get_dummies(_merged_df_dropped[['Borough']], prefix="", prefix_sep="")
_merged_df_dropped_dummies

Unnamed: 0,Central Toronto,Downtown Toronto,East Toronto,West Toronto
2,0,1,0,0
4,0,1,0,0
9,0,1,0,0
15,0,1,0,0
19,0,0,1,0
20,0,1,0,0
24,0,1,0,0
25,0,1,0,0
30,0,1,0,0
31,0,0,0,1


fitting the Kmeans model

In [68]:
from sklearn.cluster import KMeans

# run k-means clustering
kmeans = KMeans(n_clusters=4, random_state=0).fit(_merged_df_dropped_dummies)

In [69]:
# check cluster labels generated for each row in the dataframe
kmeans.labels_

array([0, 0, 0, 0, 3, 0, 0, 0, 0, 2, 0, 2, 3, 0, 2, 3, 0, 3, 1, 1, 1, 1,
       2, 1, 1, 2, 1, 0, 2, 1, 0, 1, 0, 0, 0, 0, 0, 0, 3], dtype=int32)

#### insert the Borough column back for the map

In [70]:
_merged_df_dropped_dummies.insert(loc=0, column='Borough', value=_merged_df_dropped['Borough'])

#### insert the Latitude and Longitude also

In [72]:
_merged_df_dropped_dummies.insert(loc=5, column='Latitude', value=_merged_df_dropped['Latitude'])
#Longitude

In [73]:
_merged_df_dropped_dummies.insert(loc=5, column='Longitude', value=_merged_df_dropped['Longitude'])

In [74]:
_merged_df_dropped_dummies

Unnamed: 0,Borough,Central Toronto,Downtown Toronto,East Toronto,West Toronto,Longitude,Latitude
2,Downtown Toronto,0,1,0,0,-79.360636,43.65426
4,Downtown Toronto,0,1,0,0,-79.389494,43.662301
9,Downtown Toronto,0,1,0,0,-79.378937,43.657162
15,Downtown Toronto,0,1,0,0,-79.375418,43.651494
19,East Toronto,0,0,1,0,-79.293031,43.676357
20,Downtown Toronto,0,1,0,0,-79.373306,43.644771
24,Downtown Toronto,0,1,0,0,-79.387383,43.657952
25,Downtown Toronto,0,1,0,0,-79.422564,43.669542
30,Downtown Toronto,0,1,0,0,-79.384568,43.650571
31,West Toronto,0,0,0,1,-79.442259,43.669005


#### make the map with the clusters

In [80]:
import matplotlib.cm as cm
import matplotlib.colors as colors
import numpy as np
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=12)

rainbow = ['red', 'blue', 'green', 'purple']
def get_color_index(bor):
    return {
        "Central Toronto": 0,
        "Downtown Toronto": 1,
        "East Toronto": 2,
        "West Toronto": 3
    }.get(bor, "")

index = 0
for lat, lon, cluster in zip(_merged_df_dropped_dummies['Latitude'], _merged_df_dropped_dummies['Longitude'], _merged_df_dropped_dummies['Borough']):
    label = folium.Popup(str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[get_color_index(cluster)],
        fill=True,
        fill_color=rainbow[get_color_index(cluster)],
        fill_opacity=0.7).add_to(map_clusters)
    index += 1
map_clusters