In [2]:
#import basic libraries
import pandas as pd
import numpy as np

#install plugin for web scraping
!pip install lxml html5lib beautifulsoup4
from bs4 import BeautifulSoup

#installing the geopy packages
!conda install -c conda-forge geopy --yes
from geopy.geocoders import Nominatim

#importing k-means clustering package
from sklearn.cluster import KMeans

#importing matplotlib plotting packages
import matplotlib.cm as cm
import matplotlib.colors as colors

#importing folium package
!conda install -c conda-forge folium=0.5.0 --yes
import folium

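#pinning pandas to version 1.0.3 (pandas is already imported above, so a kernel restart may be needed for the pin to take effect)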
!pip install pandas==1.0.3

Collecting lxml
[?25l  Downloading https://files.pythonhosted.org/packages/79/37/d420b7fdc9a550bd29b8cfeacff3b38502d9600b09d7dfae9a69e623b891/lxml-4.5.2-cp36-cp36m-manylinux1_x86_64.whl (5.5MB)
[K     |████████████████████████████████| 5.5MB 5.1MB/s eta 0:00:01
Collecting beautifulsoup4
[?25l  Downloading https://files.pythonhosted.org/packages/66/25/ff030e2437265616a1e9b25ccc864e0371a0bc3adb7c5a404fd661c6f4f6/beautifulsoup4-4.9.1-py3-none-any.whl (115kB)
[K     |████████████████████████████████| 122kB 33.5MB/s eta 0:00:01
Collecting soupsieve>1.2 (from beautifulsoup4)
  Downloading https://files.pythonhosted.org/packages/6f/8f/457f4a5390eeae1cc3aeab89deb7724c965be841ffca6cfca9197482e470/soupsieve-2.0.1-py3-none-any.whl
Installing collected packages: lxml, soupsieve, beautifulsoup4
Successfully installed beautifulsoup4-4.9.1 lxml-4.5.2 soupsieve-2.0.1
Collecting package metadata (current_repodata.json): done
Solving environment: done


  current version: 4.8.3
  latest version: 4.8

In [3]:
#scraping wikipedia table: read_html returns a list with one dataframe per HTML table on the page
url = 'http://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
data = pd.read_html(url)

#the first table on the page is the postal code / borough / neighbourhood listing
df = data[0]

df

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
...,...,...,...
175,M5Z,Not assigned,Not assigned
176,M6Z,Not assigned,Not assigned
177,M7Z,Not assigned,Not assigned
178,M8Z,Etobicoke,"Mimico NW, The Queensway West, South of Bloor,..."


In [24]:
#print a summary of df: row count, column names, non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 103 entries, 0 to 102
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Postal Code    103 non-null    object
 1   Borough        103 non-null    object
 2   Neighbourhood  103 non-null    object
dtypes: object(3)
memory usage: 2.5+ KB


In [5]:
#keep only the rows whose Borough value is something other than 'Not assigned'
df = df.loc[df['Borough'].ne('Not assigned')]
df

Unnamed: 0,Postal Code,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
...,...,...,...
160,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
165,M4Y,Downtown Toronto,Church and Wellesley
168,M7Y,East Toronto,"Business reply mail Processing Centre, South C..."
169,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu..."


In [6]:
#collapse to one row per (Postal Code, Borough) pair, joining the remaining string
#columns with ', '; sort=False keeps groups in order of first appearance, and
#reset_index() turns the group keys back into ordinary columns
df = (
    df.groupby(['Postal Code', 'Borough'], sort=False)
      .agg(', '.join)
      .reset_index()
)
df

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
99,M4Y,Downtown Toronto,Church and Wellesley
100,M7Y,East Toronto,"Business reply mail Processing Centre, South C..."
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu..."


In [7]:
#(number of rows, number of columns) of df after grouping
df.shape

(103, 3)

In [8]:
#load the postal code -> latitude/longitude lookup table from a CSV file
#(path is relative to the notebook's working directory) into lat_lng
lat_lng = pd.read_csv('Geospatial_Coordinates.csv')
lat_lng

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476
...,...,...,...
98,M9N,43.706876,-79.518188
99,M9P,43.696319,-79.532242
100,M9R,43.688905,-79.554724
101,M9V,43.739416,-79.588437


In [9]:
#attach coordinates to every postal code row: default (inner) join of df and
#lat_lng on their shared 'Postal Code' column
neighborhoods = pd.merge(df, lat_lng, on='Postal Code')
neighborhoods

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.654260,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
...,...,...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North",43.653654,-79.506944
99,M4Y,Downtown Toronto,Church and Wellesley,43.665860,-79.383160
100,M7Y,East Toronto,"Business reply mail Processing Centre, South C...",43.662744,-79.321558
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu...",43.636258,-79.498509


In [10]:
#summary statistics for every column: count/unique/top/freq for the text columns,
#mean/std/quantiles for the numeric coordinate columns
neighborhoods.describe(include = 'all')

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
count,103,103,103,103.0,103.0
unique,103,10,99,,
top,M4G,North York,Downsview,,
freq,1,24,4,,
mean,,,,43.704608,-79.397153
std,,,,0.052463,0.097146
min,,,,43.602414,-79.615819
25%,,,,43.660567,-79.464763
50%,,,,43.696948,-79.38879
75%,,,,43.74532,-79.340923


We will now examine Boroughs that DO NOT contain 'Toronto' in their name

In [11]:
#build the Greater Toronto Area dataframe: rows whose Borough name does not contain
#'Toronto'; reset_index() renumbers the rows and keeps the old labels in an 'index' column
df_GTA = (
    neighborhoods
    .loc[~neighborhoods['Borough'].str.contains('Toronto')]
    .reset_index()
)
df_GTA

Unnamed: 0,index,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,0,M3A,North York,Parkwoods,43.753259,-79.329656
1,1,M4A,North York,Victoria Village,43.725882,-79.315572
2,3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
3,5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village",43.667856,-79.532242
4,6,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
...,...,...,...,...,...,...
59,94,M9W,Etobicoke,"Northwest, West Humber - Clairville",43.706748,-79.594054
60,95,M1X,Scarborough,Upper Rouge,43.836125,-79.205636
61,98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North",43.653654,-79.506944
62,101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu...",43.636258,-79.498509


In [12]:
#remove the leftover 'index' column created by reset_index().
#columns= replaces the positional axis argument, which newer pandas releases reject,
#and errors='ignore' keeps the cell safe to re-run once the column is already gone
df_GTA = df_GTA.drop(columns='index', errors='ignore')
df_GTA.head(8)

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
3,M9A,Etobicoke,"Islington Avenue, Humber Valley Village",43.667856,-79.532242
4,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
5,M3B,North York,Don Mills,43.745906,-79.352188
6,M4B,East York,"Parkview Hill, Woodbine Gardens",43.706397,-79.309937
7,M6B,North York,Glencairn,43.709577,-79.445073


In [13]:
#coordinate-only feature table used for clustering and for centring the map.
#The two columns are selected explicitly (rather than dropping the others by name)
#so that columns added to df_GTA later, such as 'Cluster', cannot leak into the
#features if this cell is re-run; .copy() keeps it independent of df_GTA
lat_lng_to = df_GTA[['Latitude', 'Longitude']].copy()

lat_lng_to

Unnamed: 0,Latitude,Longitude
0,43.753259,-79.329656
1,43.725882,-79.315572
2,43.718518,-79.464763
3,43.667856,-79.532242
4,43.806686,-79.194353
...,...,...
59,43.706748,-79.594054
60,43.836125,-79.205636
61,43.653654,-79.506944
62,43.636258,-79.498509


Next, we compute the average latitude and longitude of these neighbourhoods to use as the centre of the map.

In [18]:
#mean latitude over all rows of lat_lng_to; used below as the map centre's latitude
GTA_lat = lat_lng_to['Latitude'].mean()
GTA_lat

43.7274426890625

In [19]:
#mean longitude over all rows of lat_lng_to; used below as the map centre's longitude
GTA_lon = lat_lng_to['Longitude'].mean()
GTA_lon

-79.40158895625

In [16]:
#preview the first ten rows of df_GTA
df_GTA.head(10)

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
3,M9A,Etobicoke,"Islington Avenue, Humber Valley Village",43.667856,-79.532242
4,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
5,M3B,North York,Don Mills,43.745906,-79.352188
6,M4B,East York,"Parkview Hill, Woodbine Gardens",43.706397,-79.309937
7,M6B,North York,Glencairn,43.709577,-79.445073
8,M9B,Etobicoke,"West Deane Park, Princess Gardens, Martin Grov...",43.650943,-79.554724
9,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497


In [20]:
#create map of GTA using averaged coordinates
map_GTA = folium.Map(location = [GTA_lat, GTA_lon], zoom_start = 11)

#Add one circle marker per row of df_GTA (boroughs not containing 'Toronto').
#The loop variables are listed in the same order as the zipped columns so that
#the popup text reads "<Neighbourhood>, <Borough>"
for lat, lng, Neighbourhood, Borough in zip(df_GTA['Latitude'], df_GTA['Longitude'], df_GTA['Neighbourhood'], df_GTA['Borough']):
    label = '{}, {}'.format(Neighbourhood, Borough)
    label = folium.Popup(label, parse_html = True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        #outline and fill must be valid CSS colours: a light red outline and a
        #yellow-orange fill, given as hex codes
        color='#ff6666',
        fill=True,
        fill_color='#fe9929',
        fill_opacity=0.7).add_to(map_GTA)

map_GTA

Clustering Neighbourhoods in the GTA based on proximity

In [21]:
#setting up kmeans: number of clusters to form
k = 5

#cluster the neighbourhoods on their (Latitude, Longitude) coordinates;
#random_state fixes the random initialisation so the labels are identical on every run
k_means = KMeans(n_clusters = k, init = 'k-means++', n_init = 10, random_state = 0)
k_means.fit(lat_lng_to)

#one integer cluster label (0 .. k-1) per row of lat_lng_to, in row order
labels = k_means.labels_

print(labels)

[2 2 1 3 4 0 2 1 3 4 2 2 1 3 4 1 4 0 4 0 0 2 4 0 1 2 2 0 1 2 0 1 1 1 2 0 1
 0 1 1 2 0 1 1 1 2 0 3 2 0 3 3 4 2 4 3 3 2 3 3 4 3 3 3]


In [22]:
#adding cluster labels back to df_GTA as a new 'Cluster' column
#(labels must contain exactly one entry per row of df_GTA, in the same row order)
df_GTA['Cluster'] = labels

df_GTA.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude,Cluster
0,M3A,North York,Parkwoods,43.753259,-79.329656,2
1,M4A,North York,Victoria Village,43.725882,-79.315572,2
2,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763,1
3,M9A,Etobicoke,"Islington Avenue, Humber Valley Village",43.667856,-79.532242,3
4,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353,4


In [23]:
# create map centred on the averaged GTA coordinates
map_GTA_clusters = folium.Map(location=[GTA_lat, GTA_lon], zoom_start=11)

# set color scheme for the clusters: k hex colours evenly spaced along
# matplotlib's rainbow colormap, one per cluster
colors_array = cm.rainbow(np.linspace(0, 1, k))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map; cluster labels run from 0 to k-1, so each label
# indexes the palette directly
for lat, lon, poi, cluster in zip(df_GTA['Latitude'], df_GTA['Longitude'], df_GTA['Neighbourhood'], df_GTA['Cluster']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster],
        fill=True,
        fill_color=rainbow[cluster],
        fill_opacity=0.7).add_to(map_GTA_clusters)

map_GTA_clusters