In [157]:
import pandas as pd
import numpy as np
import folium
from sklearn.cluster import KMeans
from sklearn import preprocessing
import matplotlib.cm as cm
import matplotlib.colors as colors

scrape the data from wikipedia using `pd.read_html`

In [128]:
table = pd.read_html("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M")
df = table[0]

Extract the data and assign it to the dataframe `df`

Rename the columns of the dataframe, drop the first row as it is the label of the columns, not actual data entries

In [129]:
df.rename(columns = df.iloc[0], inplace = True)
df.rename(columns = {'Postal Code':'PostalCode'}, inplace = True)
df.drop(df.index[0], inplace = True)


Only keep the rows with Boroughs assigned

In [130]:
df = df.loc[df['Borough'] != 'Not assigned']

Combine rows with same postal code using `groupby`

In [131]:
df.groupby(['PostalCode']).agg({'Borough' :'first', 'Neighborhood':','.join}).reset_index()
df.head(10)

Unnamed: 0,PostalCode,Borough,Neighborhood
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,"Regent Park, Harbourfront"
6,M6A,North York,"Lawrence Manor, Lawrence Heights"
7,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
9,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
10,M1B,Scarborough,"Malvern, Rouge"
12,M3B,North York,Don Mills
13,M4B,East York,"Parkview Hill, Woodbine Gardens"
14,M5B,Downtown Toronto,"Garden District, Ryerson"


Check the number of rows in the newly formed dataframe

In [132]:
df.shape[0]

103

Download geodata using wget

In [None]:
import wget
wget.download('https://cocl.us/Geospatial_data', 'geo_data.csv')

In [133]:
geo_data = pd.read_csv('geo_data.csv')
geo_data.rename(columns = {'Postal Code' :'PostalCode'}, inplace = True)

Merge geodata with the previous dataframe on the key postal code

In [135]:
df = pd.merge(df, geo_data, on = 'PostalCode')

In [136]:
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494


Use folium to show the coordinates of neighorhoods on a map of Toronto

In [143]:
latitude = 43.654
longitude = -79.361
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)
for lat, lng, borough, neighborhood in zip(df['Latitude'], df['Longitude'], df['Borough'], df['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=3,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

Prepare dataset for clustering based on geographic coordinates, here I use k = 5 clusters

In [147]:
x = df[['Latitude', 'Longitude']].values #returns a numpy array
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(x)

In [149]:
KNN_model = KMeans(n_clusters = 5, random_state = 0).fit(x_scaled)

In [153]:
df['Cluster'] = KNN_model.labels_

Create a clour scheme for clusters

In [160]:
x = np.arange(5)
ys = [i + x + (i*x)**2 for i in range(5)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

Show clusters on the map of toronto

In [163]:
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)
for lat, lng, borough, neighborhood, cluster in zip(df['Latitude'], df['Longitude'], df['Borough'], df['Neighborhood'], df['Cluster']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=3,
        popup=label,
        color= rainbow[cluster - 1],
        fill=True,
        fill_color= rainbow[cluster - 1],
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto