# Segmenting and Clustering Neighborhoods in Toronto

`Author` : Stéphane Degeye  `Last Update` : 2020-02-18

### For practical reasons, this Notebook contains all parts of the assignment

In [1]:
pip install --upgrade pip

Requirement already up-to-date: pip in c:\users\admin\documents\steph\100_personal_research\ml\ml_env\lib\site-packages (20.0.2)
Note: you may need to restart the kernel to use updated packages.


In [2]:
pip install beautifulsoup4

Note: you may need to restart the kernel to use updated packages.


In [3]:
pip install lxml

Note: you may need to restart the kernel to use updated packages.


In [4]:
pip install folium

Note: you may need to restart the kernel to use updated packages.


In [5]:
pip install geopy

Note: you may need to restart the kernel to use updated packages.


In [6]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
from IPython.display import Image 
from IPython.core.display import HTML 
from IPython.display import display_html
from IPython.display import display
import folium
from sklearn.cluster import KMeans
from geopy.geocoders import Nominatim 
from pandas.io.json import json_normalize 
import matplotlib.cm as cm
import matplotlib.colors as colors

print('Libraries imported.')

Libraries imported.


## Part I - Web Scraping and Data Wrangling

### *Web Scraping*

In [7]:
# Get HTML Page from Wikipedia website
# The timeout keeps the cell from waiting indefinitely on a stalled connection;
# raise_for_status() stops the notebook here on a 4xx/5xx answer instead of
# letting the following cells parse an error page.
page = requests.get("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M", timeout=30)
page.raise_for_status()

# <Response [200]> expected
page

<Response [200]>

In [8]:
# Build the parse tree of the downloaded page with the lxml parser;
# `soup` is the handle used by the following cells to query the document
soup = BeautifulSoup(markup=page.content, features='lxml')

In [9]:
# Sanity check: display the <title> element of the parsed page
soup.title

<title>List of postal codes of Canada: M - Wikipedia</title>

In [10]:
# Show the first <table> element of the page as rendered HTML
tab = str(soup.find('table'))
display_html(tab, raw=True)

Postcode,Borough,Neighbourhood
M1A,Not assigned,Not assigned
M2A,Not assigned,Not assigned
M3A,North York,Parkwoods
M4A,North York,Victoria Village
M5A,Downtown Toronto,Harbourfront
M6A,North York,Lawrence Heights
M6A,North York,Lawrence Manor
M7A,Downtown Toronto,Queen's Park
M8A,Not assigned,Not assigned
M9A,Queen's Park,Not assigned


In [11]:
# Locate the postal-code table through its CSS class attribute
PostCodeTable = soup.find('table', class_='wikitable sortable')

In [12]:
# Function to extract header row and data rows from table
def GetRows(table):
    """Extract the header row and the data rows of an HTML table.

    Parameters
    ----------
    table : object with a ``find_all`` method
        The table element to read. Its ``find_all('tr')`` result is iterated,
        and each row is queried with ``find_all('th')`` / ``find_all('td')``.

    Returns
    -------
    list of list of str
        One list per ``<tr>``. When the first row contains ``<th>`` cells, their
        stripped texts form the first entry; every remaining row contributes the
        stripped texts of its ``<td>`` cells. An empty list is returned when the
        table has no ``<tr>`` at all.
    """
    trs = table.find_all('tr')

    # Without any row there is nothing to extract (and trs[0] would raise IndexError)
    if not trs:
        return []

    rows = []

    # header row: only consumed when the first row actually holds <th> cells
    headerow = [th.get_text(strip=True) for th in trs[0].find_all('th')]
    if headerow:
        rows.append(headerow)
        trs = trs[1:]

    # data rows
    rows.extend([td.get_text(strip=True) for td in tr.find_all('td')] for tr in trs)

    return rows

In [13]:
# Turn the scraped table into a list of rows (lists of cell texts)
PostCodeTableRows = GetRows(PostCodeTable)

In [14]:
# The first scraped row supplies the column names, the remaining rows the data;
# two columns are then renamed to the spelling used in the rest of the notebook
dfPostCodeTable = (
    pd.DataFrame(data=PostCodeTableRows[1:], columns=PostCodeTableRows[0])
    .rename(columns={"Neighbourhood": "Neighborhood", "Postcode": "PostalCode"})
)
dfPostCodeTable.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


### *Data Wrangling*

In [15]:
# Mark every "Not assigned" cell as missing (NumPy NaN)
dfPostCodeTable = dfPostCodeTable.replace(to_replace="Not assigned", value=np.nan)
dfPostCodeTable.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,,
1,M2A,,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


In [16]:
# Discard the rows whose "Borough" is missing, then renumber the index from 0
dfPostCodeTable = (
    dfPostCodeTable
    .dropna(subset=["Borough"], axis=0)
    .reset_index(drop=True)
)

In [17]:
# Init all unassigned "Neighborhood" with the "Borough" value
# fillna aligns on the index, so each missing Neighborhood takes the Borough of
# its own row. The result is assigned back to the column: an in-place call on
# the selected column is not guaranteed to update the DataFrame itself.
dfPostCodeTable["Neighborhood"] = dfPostCodeTable["Neighborhood"].fillna(dfPostCodeTable["Borough"])

In [18]:
# Group by Postcode and generate a list of all existing "Neighborhood" separated by a comma
# agg() on the selected column returns a Series that keeps the name "Neighborhood";
# reset_index() then turns the (PostalCode, Borough) group keys back into columns.
dfGroup = (
    dfPostCodeTable
    .groupby(['PostalCode', 'Borough'])['Neighborhood']
    .agg(', '.join)
    .reset_index()
)
dfGroup

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
...,...,...,...
98,M9N,York,Weston
99,M9P,Etobicoke,Westmount
100,M9R,Etobicoke,"Kingsview Village, Martin Grove Gardens, Richv..."
101,M9V,Etobicoke,"Albion Gardens, Beaumond Heights, Humbergate, ..."


In [19]:
# Report (rows, columns) of the grouped table
print(f'The dataframe shape is {dfGroup.shape}')

The dataframe shape is (103, 3)


## Part II - Data Merging with Coordinates (Latitude & Longitude)

In [20]:
# Read the CSV file holding the latitude and longitude of each postal code
# (the path is relative to the notebook's working directory)
dfCoord = pd.read_csv("Geospatial_Coordinates.csv")
dfCoord.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


### *Data Merging*

In [21]:
# Give the coordinates table the same key column name as dfGroup,
# then join both tables on that key (default inner join)
dfNewCoord = dfCoord.rename(columns={"Postal Code": "PostalCode"})
dfResult = dfGroup.merge(dfNewCoord, on='PostalCode')
dfResult

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
...,...,...,...,...,...
98,M9N,York,Weston,43.706876,-79.518188
99,M9P,Etobicoke,Westmount,43.696319,-79.532242
100,M9R,Etobicoke,"Kingsview Village, Martin Grove Gardens, Richv...",43.688905,-79.554724
101,M9V,Etobicoke,"Albion Gardens, Beaumond Heights, Humbergate, ...",43.739416,-79.588437


## Part III - Data Exploring, Clustering and Visualization

In [22]:
# Keep only the rows whose Borough contains the literal text "Toronto"
is_toronto = dfResult['Borough'].str.contains('Toronto', regex=False)
dfToronto = dfResult.loc[is_toronto].reset_index(drop=True)
dfToronto.head(150)

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M4E,East Toronto,The Beaches,43.676357,-79.293031
1,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188
2,M4L,East Toronto,"The Beaches West, India Bazaar",43.668999,-79.315572
3,M4M,East Toronto,Studio District,43.659526,-79.340923
4,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879
5,M4P,Central Toronto,Davisville North,43.712751,-79.390197
6,M4R,Central Toronto,North Toronto West,43.715383,-79.405678
7,M4S,Central Toronto,Davisville,43.704324,-79.38879
8,M4T,Central Toronto,"Moore Park, Summerhill East",43.689574,-79.38316
9,M4V,Central Toronto,"Deer Park, Forest Hill SE, Rathnelly, South Hi...",43.686412,-79.400049


### *Visualization of Toronto's neighborhoods*

In [23]:
# Let's get the geographical coordinates of Toronto
address = 'Toronto'
geolocator = Nominatim(user_agent="toronto_explorer")
location = geolocator.geocode(address)

# geocode() yields None when the service finds no match; fail with a clear
# message instead of an AttributeError on the attribute access below
if location is None:
    raise ValueError('No geocoding result for address {!r}'.format(address))

latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.653963, -79.387207.


In [35]:
# create map of Toronto using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add one circle marker per row of dfToronto, labelled "<neighborhood>, <borough>"
for lat, lng, borough, neighborhood in zip(dfToronto['Latitude'], dfToronto['Longitude'], dfToronto['Borough'], dfToronto['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    # parse_html is an option of folium.Popup (set above), not of CircleMarker,
    # so it is not repeated here
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7).add_to(map_toronto)

map_toronto

### *Explore First Neighborhood*

In [25]:
# Name of the neighborhood stored in the first row (index label 0)
dfToronto.loc[0, 'Neighborhood']

'The Beaches'

In [26]:
# Name and coordinates of the first row (index label 0) of dfToronto
neighborhood_name = dfToronto.loc[0, 'Neighborhood']
neighborhood_latitude = dfToronto.loc[0, 'Latitude']
neighborhood_longitude = dfToronto.loc[0, 'Longitude']

print(f'Latitude and longitude values of {neighborhood_name} are {neighborhood_latitude}, {neighborhood_longitude}.')

Latitude and longitude values of The Beaches are 43.67635739999999, -79.2930312.


### *Clustering*

In [27]:
# set number of clusters
kclusters = 5

# Cluster on the coordinates only. The two columns are selected by name rather
# than by dropping the others, so the features stay the same even after a
# 'Cluster Labels' column has been added to dfToronto (safe to re-run), and the
# positional `axis` argument of DataFrame.drop is not needed.
toronto_clustering = dfToronto[['Latitude', 'Longitude']]

# run k-means clustering
# n_init is set explicitly so the number of initialisations does not depend on
# the default of the installed scikit-learn version
kmeans = KMeans(n_clusters=kclusters, n_init=10, random_state=0).fit(toronto_clustering)

# check cluster labels generated for the first ten rows of the dataframe
kmeans.labels_[0:10]

array([0, 0, 0, 0, 1, 1, 1, 1, 1, 1])

In [28]:
# add clustering labels
# DataFrame.insert raises ValueError when the column already exists, so a
# 'Cluster Labels' column left by a previous run is removed first; this keeps
# the cell safe to execute more than once.
if 'Cluster Labels' in dfToronto.columns:
    dfToronto = dfToronto.drop(columns='Cluster Labels')
dfToronto.insert(0, 'Cluster Labels', kmeans.labels_)

dfToronto.head()

Unnamed: 0,Cluster Labels,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,0,M4E,East Toronto,The Beaches,43.676357,-79.293031
1,0,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188
2,0,M4L,East Toronto,"The Beaches West, India Bazaar",43.668999,-79.315572
3,0,M4M,East Toronto,Studio District,43.659526,-79.340923
4,1,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879


In [29]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters: kclusters evenly spaced colors of the
# rainbow colormap, converted to hex strings
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
for lat, lon, poi, cluster in zip(dfToronto['Latitude'], dfToronto['Longitude'], dfToronto['Neighborhood'], dfToronto['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    # label 0 picks the last color through the negative index; labels 1..k-1
    # pick colors 0..k-2, so every cluster still gets its own color
    marker_color = rainbow[cluster - 1]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=marker_color,
        fill=True,
        fill_color=marker_color,
        fill_opacity=0.7).add_to(map_clusters)

map_clusters

### *Examine Clusters*

#### Cluster 1

In [30]:
# Rows with k-means label 0 (the headings count clusters from 1, the labels from 0)
dfToronto.loc[dfToronto['Cluster Labels'] == 0]

Unnamed: 0,Cluster Labels,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,0,M4E,East Toronto,The Beaches,43.676357,-79.293031
1,0,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188
2,0,M4L,East Toronto,"The Beaches West, India Bazaar",43.668999,-79.315572
3,0,M4M,East Toronto,Studio District,43.659526,-79.340923
38,0,M7Y,East Toronto,Business Reply Mail Processing Centre 969 Eastern,43.662744,-79.321558


#### Cluster 2

In [31]:
# Rows with k-means label 1 (the headings count clusters from 1, the labels from 0)
dfToronto.loc[dfToronto['Cluster Labels'] == 1]

Unnamed: 0,Cluster Labels,PostalCode,Borough,Neighborhood,Latitude,Longitude
4,1,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879
5,1,M4P,Central Toronto,Davisville North,43.712751,-79.390197
6,1,M4R,Central Toronto,North Toronto West,43.715383,-79.405678
7,1,M4S,Central Toronto,Davisville,43.704324,-79.38879
8,1,M4T,Central Toronto,"Moore Park, Summerhill East",43.689574,-79.38316
9,1,M4V,Central Toronto,"Deer Park, Forest Hill SE, Rathnelly, South Hi...",43.686412,-79.400049
22,1,M5N,Central Toronto,Roselawn,43.711695,-79.416936
23,1,M5P,Central Toronto,"Forest Hill North, Forest Hill West",43.696948,-79.411307


#### Cluster 3

In [32]:
# Rows with k-means label 2 (the headings count clusters from 1, the labels from 0)
dfToronto.loc[dfToronto['Cluster Labels'] == 2]

Unnamed: 0,Cluster Labels,PostalCode,Borough,Neighborhood,Latitude,Longitude
31,2,M6H,West Toronto,"Dovercourt Village, Dufferin",43.669005,-79.442259
34,2,M6P,West Toronto,"High Park, The Junction South",43.661608,-79.464763
35,2,M6R,West Toronto,"Parkdale, Roncesvalles",43.64896,-79.456325
36,2,M6S,West Toronto,"Runnymede, Swansea",43.651571,-79.48445


#### Cluster 4

In [33]:
# Rows with k-means label 3 (the headings count clusters from 1, the labels from 0)
dfToronto.loc[dfToronto['Cluster Labels'] == 3]

Unnamed: 0,Cluster Labels,PostalCode,Borough,Neighborhood,Latitude,Longitude
24,3,M5R,Central Toronto,"The Annex, North Midtown, Yorkville",43.67271,-79.405678
25,3,M5S,Downtown Toronto,"Harbord, University of Toronto",43.662696,-79.400049
26,3,M5T,Downtown Toronto,"Chinatown, Grange Park, Kensington Market",43.653206,-79.400049
30,3,M6G,Downtown Toronto,Christie,43.669542,-79.422564
32,3,M6J,West Toronto,"Little Portugal, Trinity",43.647927,-79.41975
33,3,M6K,West Toronto,"Brockton, Exhibition Place, Parkdale Village",43.636847,-79.428191


#### Cluster 5

In [34]:
# Rows with k-means label 4 (the headings count clusters from 1, the labels from 0)
dfToronto.loc[dfToronto['Cluster Labels'] == 4]

Unnamed: 0,Cluster Labels,PostalCode,Borough,Neighborhood,Latitude,Longitude
10,4,M4W,Downtown Toronto,Rosedale,43.679563,-79.377529
11,4,M4X,Downtown Toronto,"Cabbagetown, St. James Town",43.667967,-79.367675
12,4,M4Y,Downtown Toronto,Church and Wellesley,43.66586,-79.38316
13,4,M5A,Downtown Toronto,Harbourfront,43.65426,-79.360636
14,4,M5B,Downtown Toronto,"Ryerson, Garden District",43.657162,-79.378937
15,4,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
16,4,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306
17,4,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383
18,4,M5H,Downtown Toronto,"Adelaide, King, Richmond",43.650571,-79.384568
19,4,M5J,Downtown Toronto,"Harbourfront East, Toronto Islands, Union Station",43.640816,-79.381752
