## Libraries

In [3]:
import folium
import geocoder
import matplotlib.cm as cm
import matplotlib.colors as colors
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

## Preprocess

In [4]:
# Download the Wikipedia list of Canadian postal codes starting with "M" and
# parse the HTML so the postal-code table can be extracted in the next cell.
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
# A timeout keeps a stalled connection from hanging the kernel indefinitely.
page = requests.get(url, timeout=30)
# Fail here on an HTTP error instead of parsing an error page further down.
page.raise_for_status()
soup = BeautifulSoup(page.text, 'html.parser')

In [5]:
# Locate the postal-code table by its CSS class. The filter must be a dict
# mapping attribute name to value; the previous set literal
# {'class', 'wikitable sortable'} only worked because bs4 treats a non-dict
# `attrs` argument as a collection of candidate class values.
table = soup.find('table', {'class': 'wikitable sortable'})

In [6]:
# Split the table into three parallel lists: postal code, borough, neighborhood.
# Rows are walked one <tr> at a time so that a row without exactly three <td>
# cells (e.g. the header row, which has no <td> cells) is skipped on its own
# instead of shifting every later value into the wrong column.
Postal = []
Borough = []
Neighborhood = []
for row in table.find_all('tr'):
    cells = row.find_all('td')
    if len(cells) != 3:
        continue
    # strip() removes the trailing newline (and any other surrounding
    # whitespace) from all three columns, not just the last one.
    code, borough, neighborhood = (cell.text.strip() for cell in cells)
    Postal.append(code)
    Borough.append(borough)
    Neighborhood.append(neighborhood)

In [7]:
# Assemble the scraped columns into one frame, one row per table entry.
column_names = ['PostalCode', 'Borough', 'Neighborhood']
FSA = pd.DataFrame(list(zip(Postal, Borough, Neighborhood)), columns=column_names)
# Keep only the rows whose borough has been assigned.
FSA = FSA.loc[FSA['Borough'] != 'Not assigned']
# Show a bounded preview rather than the whole frame.
FSA.head(10)

Unnamed: 0,PostalCode,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor
7,M7A,Downtown Toronto,Queen's Park
9,M9A,Etobicoke,Islington Avenue
10,M1B,Scarborough,Rouge
11,M1B,Scarborough,Malvern
13,M3B,North York,Don Mills North


In [8]:
# Collapse FSA to one entry per postal code, joining the neighborhoods that
# share a code into a single comma-separated string.
postal_update = []
concateNeigh = []
borough_update = []
# sort=False keeps the codes in order of first appearance, matching the order
# FSA['PostalCode'].unique() would give, while visiting each row only once.
for code, group in FSA.groupby('PostalCode', sort=False):
    borough = group['Borough'].iloc[0]
    joined = ', '.join(group['Neighborhood'])

    postal_update.append(code)
    borough_update.append(borough)
    # A code whose only neighborhood is 'Not assigned' falls back to its
    # borough name.
    concateNeigh.append(borough if joined == 'Not assigned' else joined)
    

In [9]:
# Rebuild FSA from the per-postal-code lists (one row per postal code).
column_names = ['PostalCode', 'Borough', 'Neighborhood']
FSA = pd.DataFrame(list(zip(postal_update, borough_update, concateNeigh)), columns=column_names)
# Show a bounded preview rather than the whole frame.
FSA.head(10)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M6A,North York,"Lawrence Heights, Lawrence Manor"
4,M7A,Downtown Toronto,Queen's Park
5,M9A,Etobicoke,Islington Avenue
6,M1B,Scarborough,"Rouge, Malvern"
7,M3B,North York,Don Mills North
8,M4B,East York,"Woodbine Gardens, Parkview Hill"
9,M5B,Downtown Toronto,"Ryerson, Garden District"


In [10]:
# Load the postal-code coordinate table and attach Latitude/Longitude to FSA.
urlgeo = 'http://cocl.us/Geospatial_data'
geo = pd.read_csv(urlgeo)

# Left join keeps every FSA row; the key column is named differently on each side.
merged = pd.merge(FSA, geo, how='left', left_on='PostalCode', right_on='Postal Code')

# Keep only the columns used downstream (this drops the duplicate 'Postal Code' key).
keep_cols = ['PostalCode', 'Borough', 'Neighborhood', 'Latitude', 'Longitude']
newFSA = merged[keep_cols]

In [11]:
# Restrict to boroughs whose name contains 'North York'. The explicit copy makes
# YorkFSA an independent frame, so the in-place column insert performed after
# clustering does not operate on a filtered selection of newFSA.
YorkFSA = newFSA[newFSA['Borough'].str.contains('North York')].copy()

In [12]:
# Preview the filtered rows instead of dumping the whole frame.
YorkFSA.head(10)

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
3,M6A,North York,"Lawrence Heights, Lawrence Manor",43.718518,-79.464763
7,M3B,North York,Don Mills North,43.745906,-79.352188
10,M6B,North York,Glencairn,43.709577,-79.445073
13,M3C,North York,"Flemingdon Park, Don Mills South",43.7259,-79.340923
27,M2H,North York,Hillcrest Village,43.803762,-79.363452
28,M3H,North York,"Bathurst Manor, Downsview North, Wilson Heights",43.754328,-79.442259
33,M2J,North York,"Fairview, Henry Farm, Oriole",43.778517,-79.346556
34,M3J,North York,"Northwood Park, York University",43.76798,-79.487262


In [13]:
# Hand-entered reference points (full postal code plus coordinates) that are
# drawn as a separate overlay on the map; every row is labelled 'North York'
# with a placeholder neighborhood of 'n/a'.
subPost = ['M3L 1A3','M2N 6L8','M2R 3W9','M2M 3Y7','M6L 1A4','M2N 7K1','M9L 2Y8','M3K 1G7','M3H 1S8','M3J 2C7','M2H 3N3','M4A 1W7','M3N 2V3','M2M 3X4','M3K 1E1','M3M 2H7','M3J 3N3','M2M 3W2','M9M 2W8','M2J 3C1']
subLon = [-79.510197,-79.413067,-79.44475,-79.385072,-79.475477,-79.409223,-79.548895,-79.480413,-79.435917,-79.469812,-79.3542,-79.303433,-79.51813,-79.422633,-79.462732,-79.486748,-79.473435,-79.417175,-79.542757,-79.351894]
subLat = [43.721237,43.768858,43.792464,43.805237,43.708619,43.762113,43.769354,43.727021,43.737186,43.767584,43.794258,43.722729,43.756279,43.796381,43.731799,43.744199,43.786262,43.789245,43.751821,43.785936]

# The three lists are parallel; catch a mistyped entry here rather than letting
# rows be silently dropped when the columns are combined.
assert len(subPost) == len(subLon) == len(subLat), 'subway lists must have equal length'

# Use a dedicated name for the borough column so the scraped `Borough` list
# built during preprocessing is not overwritten by this cell.
subBorough = ['North York'] * len(subPost)
subNeighborhood = ['n/a'] * len(subPost)
subways = pd.DataFrame({
    'Borough': subBorough,
    'Latitude': subLat,
    'Longitude': subLon,
    'Neighborhood': subNeighborhood,
    'PostalCode': subPost,
})


In [14]:
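# Left disabled: the `subways` rows are not appended to YorkFSA, so they take no
# part in the clustering below and are only drawn as an overlay on the map.
# Note: DataFrame.append was removed in pandas 2.0; re-enabling this would need
# pd.concat([YorkFSA, subways]) instead.
# YorkFSA = YorkFSA.append(subways)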

In [15]:
# Preview only; YorkFSA is unchanged here because the append above is disabled.
YorkFSA.head(10)

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
3,M6A,North York,"Lawrence Heights, Lawrence Manor",43.718518,-79.464763
7,M3B,North York,Don Mills North,43.745906,-79.352188
10,M6B,North York,Glencairn,43.709577,-79.445073
13,M3C,North York,"Flemingdon Park, Don Mills South",43.7259,-79.340923
27,M2H,North York,Hillcrest Village,43.803762,-79.363452
28,M3H,North York,"Bathurst Manor, Downsview North, Wilson Heights",43.754328,-79.442259
33,M2J,North York,"Fairview, Henry Farm, Oriole",43.778517,-79.346556
34,M3J,North York,"Northwood Park, York University",43.76798,-79.487262


## Clustering

In [16]:
# Standardize latitude and longitude (zero mean, unit variance per column)
# before clustering. StandardScaler is imported with the other libraries in the
# import cell at the top of the notebook.
YorkFSA_norm = StandardScaler().fit_transform(YorkFSA[['Latitude', 'Longitude']])
YorkFSA_norm

array([[ 0.09984809,  1.45436612],
       [-0.98007378,  1.65986598],
       [-1.27057573, -0.51685333],
       [-0.19020022,  1.12562966],
       [-1.62327429, -0.22956438],
       [-0.9793874 ,  1.28998695],
       [ 2.09208013,  0.96129134],
       [ 0.1420449 , -0.1885181 ],
       [ 1.09624419,  1.20780538],
       [ 0.68057981, -0.84510978],
       [ 1.42877727,  0.63267452],
       [-0.52284375, -0.51685333],
       [ 0.2667734 ,  0.79697199],
       [-0.46203964, -1.13226742],
       [-1.45841602, -0.88613563],
       [ 0.21995337, -1.99336975],
       [ 1.5118415 ,  0.30413794],
       [-0.87695452, -0.96818442],
       [-0.68815567,  0.13989884],
       [-1.02411278, -1.50137901],
       [ 0.76498131,  0.30413794],
       [ 0.43012873, -1.33734271],
       [ 0.08011259,  0.42732931],
       [ 1.26266852, -0.1885181 ]])

In [17]:
# Number of clusters to fit on the standardized coordinates.
kclusters = 4
# n_init is pinned explicitly: scikit-learn's default changed from 10 to 'auto'
# in newer releases, so leaving it implicit can give different cluster labels
# depending on the installed version. random_state fixes the initialization.
kmeans = KMeans(n_clusters=kclusters, n_init=10, random_state=0).fit(YorkFSA_norm)

In [18]:
# Attach the fitted cluster label of each row as the first column.
# DataFrame.insert raises ValueError when the column already exists, so any
# label column left from a previous run of this cell is dropped first; drop()
# also returns a new frame, so the insert does not write into the frame that
# was produced by filtering newFSA.
YorkFSA = YorkFSA.drop(columns='Cluster Labels', errors='ignore')
YorkFSA.insert(0, 'Cluster Labels', kmeans.labels_)

In [19]:
# Preview the frame with the new 'Cluster Labels' column.
YorkFSA.head(10)

Unnamed: 0,Cluster Labels,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,0,M3A,North York,Parkwoods,43.753259,-79.329656
1,0,M4A,North York,Victoria Village,43.725882,-79.315572
3,1,M6A,North York,"Lawrence Heights, Lawrence Manor",43.718518,-79.464763
7,0,M3B,North York,Don Mills North,43.745906,-79.352188
10,1,M6B,North York,Glencairn,43.709577,-79.445073
13,0,M3C,North York,"Flemingdon Park, Don Mills South",43.7259,-79.340923
27,2,M2H,North York,Hillcrest Village,43.803762,-79.363452
28,3,M3H,North York,"Bathurst Manor, Downsview North, Wilson Heights",43.754328,-79.442259
33,2,M2J,North York,"Fairview, Henry Farm, Oriole",43.778517,-79.346556
34,3,M3J,North York,"Northwood Park, York University",43.76798,-79.487262


## Create Map

In [26]:
# Build the map, centered on the mean coordinate of the plotted neighborhoods
# rather than on a fixed point.
map_center = [float(YorkFSA['Latitude'].mean()), float(YorkFSA['Longitude'].mean())]
map_clusters = folium.Map(location=map_center, zoom_start=11)

# Color scheme: one color per cluster label (0 .. kclusters-1) plus one extra
# entry at index kclusters reserved for the subway overlay, so the overlay never
# shares a color with a cluster even if kclusters is changed.
colors_array = cm.rainbow(np.linspace(0, 1, kclusters + 1))
rainbow = [colors.rgb2hex(rgba) for rgba in colors_array]
subway_color = rainbow[kclusters]

# Clustered neighborhoods: one circle marker each, colored by cluster label,
# with a popup showing the neighborhood name and its cluster.
for lat, lon, poi, cluster in zip(YorkFSA['Latitude'], YorkFSA['Longitude'], YorkFSA['Neighborhood'], YorkFSA['Cluster Labels']):
    # Index directly by label; the earlier `cluster - 1` relied on index -1
    # wrapping around to the last palette entry for cluster 0.
    cluster_color = rainbow[int(cluster)]
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=cluster_color,
        fill=True,
        fill_color=cluster_color,
        fill_opacity=0.7).add_to(map_clusters)

# Subway overlay: small circles in the reserved color. Note that folium.Circle
# takes its radius in meters, whereas CircleMarker above takes pixels.
for lat, lon in zip(subways['Latitude'], subways['Longitude']):
    folium.Circle(
        [lat, lon],
        radius=3,
        color=subway_color,
        fill=True,
        fill_color=subway_color,
        fill_opacity=0.7).add_to(map_clusters)

map_clusters