In [209]:
import pandas as pd
import numpy as np
import geocoder
import requests
from bs4 import BeautifulSoup

# Pre-Processing

Gettign the data

In [210]:
res = requests.get("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M")
soup = BeautifulSoup(res.content,'lxml')
table = soup.find_all('table')[0] 
df = pd.read_html(str(table))[0]

 

In [211]:
df.head(15)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor
7,M7A,Downtown Toronto,Queen's Park
8,M8A,Not assigned,Not assigned
9,M9A,Etobicoke,Islington Avenue


In [212]:
df.shape

(287, 3)

Checking if some postcodes do appear multiple times as described

In [213]:
df['Postcode'].value_counts()

M8Y    8
M9V    8
M5V    7
M8Z    5
M4V    5
      ..
M1G    1
M2R    1
M6G    1
M8B    1
M2W    1
Name: Postcode, Length: 180, dtype: int64

We see that it is indeed the case

So let's group the neighbourhoods by Postcode and Borough

In [214]:
df_grouped=df.groupby(['Postcode','Borough'])['Neighbourhood'].agg(', '.join)
df_grouped.head()

Postcode  Borough     
M1A       Not assigned                              Not assigned
M1B       Scarborough                             Rouge, Malvern
M1C       Scarborough     Highland Creek, Rouge Hill, Port Union
M1E       Scarborough          Guildwood, Morningside, West Hill
M1G       Scarborough                                     Woburn
Name: Neighbourhood, dtype: object

The groupby method **converts** the dataframe into a *Pandas Series*, **so let's convert it back** to a *Pandas DataFrame*.

(There is probably a better way to do this transformation)

In [215]:
df=pd.DataFrame(df_grouped)
df.reset_index(inplace=True)
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M1B,Scarborough,"Rouge, Malvern"
2,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
3,M1E,Scarborough,"Guildwood, Morningside, West Hill"
4,M1G,Scarborough,Woburn


We want to remove the lines wit no assigned Borough nor Neighbourhood. Let's create a concatenation of those two columns to better identify those lines

In [216]:
df['Borough-Neigh']=df['Borough']+', '+df['Neighbourhood']
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Borough-Neigh
0,M1A,Not assigned,Not assigned,"Not assigned, Not assigned"
1,M1B,Scarborough,"Rouge, Malvern","Scarborough, Rouge, Malvern"
2,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union","Scarborough, Highland Creek, Rouge Hill, Port ..."
3,M1E,Scarborough,"Guildwood, Morningside, West Hill","Scarborough, Guildwood, Morningside, West Hill"
4,M1G,Scarborough,Woburn,"Scarborough, Woburn"


Now we can remove the Postcodes with no Boroughs nor Neighbourhood

In [217]:
df=df[df['Borough-Neigh']!= 'Not assigned, Not assigned']
df.head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood,Borough-Neigh
1,M1B,Scarborough,"Rouge, Malvern","Scarborough, Rouge, Malvern"
2,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union","Scarborough, Highland Creek, Rouge Hill, Port ..."
3,M1E,Scarborough,"Guildwood, Morningside, West Hill","Scarborough, Guildwood, Morningside, West Hill"
4,M1G,Scarborough,Woburn,"Scarborough, Woburn"
5,M1H,Scarborough,Cedarbrae,"Scarborough, Cedarbrae"
6,M1J,Scarborough,Scarborough Village,"Scarborough, Scarborough Village"
7,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park","Scarborough, East Birchmount Park, Ionview, Ke..."
8,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge","Scarborough, Clairlea, Golden Mile, Oakridge"
9,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West","Scarborough, Cliffcrest, Cliffside, Scarboroug..."
10,M1N,Scarborough,"Birch Cliff, Cliffside West","Scarborough, Birch Cliff, Cliffside West"


Now, let's drop the artificial column

In [218]:
df.drop(columns=['Borough-Neigh'],inplace=True)
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
1,M1B,Scarborough,"Rouge, Malvern"
2,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
3,M1E,Scarborough,"Guildwood, Morningside, West Hill"
4,M1G,Scarborough,Woburn
5,M1H,Scarborough,Cedarbrae


In [219]:
df.shape

(103, 3)

In [220]:
df_coordinates=pd.read_csv('http://cocl.us/Geospatial_data')
df_coordinates.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [221]:
df=df.merge(right=df_coordinates, how='left', left_on='Postcode', right_on='Postal Code')
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Postal Code,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",M1B,43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",M1C,43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",M1E,43.763573,-79.188711
3,M1G,Scarborough,Woburn,M1G,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,M1H,43.773136,-79.239476


In [222]:
import folium
import random
from random import randint
from colormap import rgb2hex
from sklearn.cluster import KMeans 

We will build 10 clusters based on the distance between neighbourhoods. 

In [223]:
k_means = KMeans(init = "random", n_clusters = 10, n_init = 20)
X=df[['Latitude','Longitude']]
k_means.fit(X)

KMeans(algorithm='auto', copy_x=True, init='random', max_iter=300,
       n_clusters=10, n_init=20, n_jobs=None, precompute_distances='auto',
       random_state=None, tol=0.0001, verbose=0)

In [224]:
k_means_labels = k_means.labels_
k_means_labels

array([0, 0, 0, 0, 0, 7, 7, 7, 7, 7, 7, 9, 0, 9, 9, 9, 0, 9, 9, 8, 8, 8,
       8, 8, 8, 9, 9, 2, 8, 1, 5, 1, 1, 1, 2, 2, 2, 2, 6, 2, 2, 2, 2, 2,
       6, 6, 6, 6, 6, 6, 6, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 8, 6, 6, 3,
       3, 3, 3, 3, 3, 5, 5, 5, 5, 3, 5, 3, 3, 5, 5, 5, 5, 5, 5, 3, 4, 2,
       4, 4, 4, 4, 4, 4, 4, 4, 1, 1, 1, 1, 1, 1, 1])

In [225]:
type(k_means_labels[0])

numpy.int32

In [226]:
k_means_cluster_centers = k_means.cluster_centers_
k_means_cluster_centers

array([[ 43.78989244, -79.20966014],
       [ 43.72876881, -79.53606859],
       [ 43.69012922, -79.32687005],
       [ 43.65310797, -79.38912142],
       [ 43.63623247, -79.53902706],
       [ 43.68813134, -79.46312076],
       [ 43.70337314, -79.39258892],
       [ 43.72502628, -79.26061848],
       [ 43.76583949, -79.41024904],
       [ 43.77849131, -79.32437112]])

Let's prepare some variables to make our plots more readable

First, let's compute the cluster sizes to reflect it on the Map

In [227]:
k_means_cluster_size=[0]*10
k_means_cluster_size

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

In [228]:
k_means_cluster_size=[0]*10
for cluster in range (0,10):
    for label in k_means_labels :
        if label==cluster:
            k_means_cluster_size[cluster]+=1
k_means_cluster_size

[7, 11, 11, 21, 9, 12, 10, 6, 8, 8]

Now let's generate random colors, one for each cluster

In [229]:
colors=list()
for i in range(10):
    rgb=[0,0,0]
    for j in range(3):
        rgb[j]=randint(1,255)
    colors.append(rgb2hex(*tuple(rgb)))
colors

['#38999A',
 '#FB5983',
 '#EEFB8F',
 '#079187',
 '#CBE323',
 '#8ECC8D',
 '#EC088F',
 '#A82538',
 '#7AB532',
 '#A4AD9F']

Generating the map of Toronto

In [232]:
toronto_coordinates=[43.6532,-79.3832]
m=folium.Map(toronto_coordinates)


Adding the cluster centers into the Map, the clusters are represented by circles whose radius are proportional to their sizes

In [231]:
for i,centroid in enumerate(k_means_cluster_centers) :
    random.seed()
    folium.CircleMarker(location=centroid, radius=k_means_cluster_size[i]*1.2, color=colors[i], fill_color=colors[i], popup='Cluster '+str(i)).add_to(m)
m