# Segmenting and Clustering Neighborhood in Toronto

### Part 1: Prepare dataframe fetching data from webpage table

In [36]:
# import libraries
import requests
import lxml.html as lh
import pandas as pd

In [37]:
# setting url for the source data
url='https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
# Create a handle, page, to handle the contents of the website
page = requests.get(url)
# Store the contents of the website under doc
doc = lh.fromstring(page.content)
# Parse data that are stored between <tr>..</tr> of HTML
tr_elements = doc.xpath('//tr')

##### For sanity check, ensure that all the rows have the same width. If not, we probably got something more than just the table.

In [38]:
# Check the length of the first 12 rows (Sanity Check)
[len(T) for T in tr_elements[:12]]

[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3]

In [39]:
# Create empty list
col=[]
i=0
# For each row, store each first element (header) and an empty list
for t in tr_elements[0]:
    i+=1
    name=t.text_content()
    print('%d:"%s"' %(i,name))
    col.append((name,[]))

1:"Postal Code
"
2:"Borough
"
3:"Neighbourhood
"


In [40]:
# Since out first row is the header, data is stored on the second row onwards
for j in range(1,len(tr_elements)):
    #T  is our j'th row
    T=tr_elements[j]
    
    # If row is not of size 3, the //tr data is not from our table 
    if len(T)!=3:
        break
    
    # i is the index of our column
    i=0
    
    # Iterate through each element of the row
    for t in T.iterchildren():
        data=t.text_content() 
        # Check if row is empty
        if i>0:
        # Convert any numerical value to integers
            try:
                data=int(data)
            except:
                pass
        # Append the data to the empty list of the i'th column
        col[i][1].append(data)
        # Increment i for the next column
        i+=1

##### Just to be sure, let’s check the length of each column. Ideally, they should all be the same.

In [41]:
[len(C) for (title,C) in col]

[181, 181, 181]

##### Create the DataFrame

In [42]:
Dict={title:column for (title,column) in col}
df=pd.DataFrame(Dict)

##### Looking at the top 5 cells on the DataFrame

In [43]:
df.head()

Unnamed: 0,Postal Code\n,Borough\n,Neighbourhood\n
0,M1A\n,Not assigned\n,Not assigned\n
1,M2A\n,Not assigned\n,Not assigned\n
2,M3A\n,North York\n,Parkwoods\n
3,M4A\n,North York\n,Victoria Village\n
4,M5A\n,Downtown Toronto\n,"Regent Park, Harbourfront\n"


##### Renaming column headings to remove \n and to phrase them as needed

In [44]:
df.rename(columns={"Postal Code\n": "Postal Code", "Borough\n": "Borough", "Neighbourhood\n": "Neighborhood"}, inplace=True)


In [45]:
df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1A\n,Not assigned\n,Not assigned\n
1,M2A\n,Not assigned\n,Not assigned\n
2,M3A\n,North York\n,Parkwoods\n
3,M4A\n,North York\n,Victoria Village\n
4,M5A\n,Downtown Toronto\n,"Regent Park, Harbourfront\n"


##### Ignore cells with a borough that is 'Not assigned'

In [46]:
df = df[~df['Borough'].str.contains('Not assigned')]

In [47]:
df

Unnamed: 0,Postal Code,Borough,Neighborhood
2,M3A\n,North York\n,Parkwoods\n
3,M4A\n,North York\n,Victoria Village\n
4,M5A\n,Downtown Toronto\n,"Regent Park, Harbourfront\n"
5,M6A\n,North York\n,"Lawrence Manor, Lawrence Heights\n"
6,M7A\n,Downtown Toronto\n,"Queen's Park, Ontario Provincial Government\n"
8,M9A\n,Etobicoke\n,"Islington Avenue, Humber Valley Village\n"
9,M1B\n,Scarborough\n,"Malvern, Rouge\n"
11,M3B\n,North York\n,Don Mills\n
12,M4B\n,East York\n,"Parkview Hill, Woodbine Gardens\n"
13,M5B\n,Downtown Toronto\n,"Garden District, Ryerson\n"


##### Remove last undesired row

In [48]:
df = df[:-1]

In [49]:
df.tail()

Unnamed: 0,Postal Code,Borough,Neighborhood
160,M8X\n,Etobicoke\n,"The Kingsway, Montgomery Road, Old Mill North\n"
165,M4Y\n,Downtown Toronto\n,Church and Wellesley\n
168,M7Y\n,East Toronto\n,"Business reply mail Processing Centre, South C..."
169,M8Y\n,Etobicoke\n,"Old Mill South, King's Mill Park, Sunnylea, Hu..."
178,M8Z\n,Etobicoke\n,"Mimico NW, The Queensway West, South of Bloor,..."


##### Remove \n from all rows

In [50]:
df_final = df.replace("\n", "", regex=True)

In [51]:
df_final.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


##### Restting the index to modified dataframe

In [52]:
df_final.reset_index(inplace=True, drop=True)

In [53]:
df_final.head(10)

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
6,M1B,Scarborough,"Malvern, Rouge"
7,M3B,North York,Don Mills
8,M4B,East York,"Parkview Hill, Woodbine Gardens"
9,M5B,Downtown Toronto,"Garden District, Ryerson"


##### Checking shape and value counts to ensure no duplicates of Postal Codes

In [90]:
df_final['Postal Code'].value_counts()

M4K    1
M4M    1
M8Y    1
M6P    1
M5C    1
M4S    1
M6A    1
M1H    1
M2N    1
M5V    1
M5T    1
M6C    1
M5X    1
M8V    1
M9C    1
M1W    1
M4X    1
M6J    1
M2L    1
M1K    1
M5L    1
M2J    1
M4W    1
M4E    1
M6H    1
M3L    1
M2K    1
M9A    1
M3H    1
M5E    1
M9L    1
M6N    1
M6E    1
M3B    1
M9M    1
M4J    1
M3K    1
M2M    1
M4G    1
M5P    1
M4V    1
M7A    1
M1B    1
M1G    1
M3N    1
M9B    1
M1T    1
M4Y    1
M4L    1
M7R    1
M9W    1
M6L    1
M5R    1
M3J    1
M5J    1
M4H    1
M5H    1
M4N    1
M1J    1
M1E    1
M8W    1
M4B    1
M5K    1
M5A    1
M6S    1
M1S    1
M4P    1
M5G    1
M1X    1
M3C    1
M1M    1
M3A    1
M5B    1
M1C    1
M9R    1
M8Z    1
M4C    1
M9N    1
M1L    1
M9V    1
M7Y    1
M5M    1
M9P    1
M2R    1
M1V    1
M8X    1
M1N    1
M6B    1
M4R    1
M6K    1
M5N    1
M5W    1
M2P    1
M6M    1
M5S    1
M2H    1
M4A    1
M1P    1
M6G    1
M4T    1
M3M    1
M1R    1
M6R    1
Name: Postal Code, dtype: int64

In [55]:
df_final.shape

(103, 3)

#### Since Value Counts are equal to number of rows (103), there are no duplicate Postal Codes, for which 'Neighborhood' needs to be combined.

### ----- End of Part-1 -----

### Part -2 : Create Dataframe of Geospacial Data

In [56]:
import pandas as pd

In [57]:
# Create a dataframe from csv file
df_gsc = pd.read_csv("Geospatial_Coordinates.csv")

In [58]:
# display top 5 rows
df_gsc.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [59]:
# dimensions of the dataframe
df_gsc.shape

(103, 3)

In [60]:
# set common index on both the dataframes
df_final.set_index('Postal Code')

Unnamed: 0_level_0,Borough,Neighborhood
Postal Code,Unnamed: 1_level_1,Unnamed: 2_level_1
M3A,North York,Parkwoods
M4A,North York,Victoria Village
M5A,Downtown Toronto,"Regent Park, Harbourfront"
M6A,North York,"Lawrence Manor, Lawrence Heights"
M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
M1B,Scarborough,"Malvern, Rouge"
M3B,North York,Don Mills
M4B,East York,"Parkview Hill, Woodbine Gardens"
M5B,Downtown Toronto,"Garden District, Ryerson"


In [61]:
# set common index on both the dataframes
df_gsc.set_index('Postal Code')

Unnamed: 0_level_0,Latitude,Longitude
Postal Code,Unnamed: 1_level_1,Unnamed: 2_level_1
M1B,43.806686,-79.194353
M1C,43.784535,-79.160497
M1E,43.763573,-79.188711
M1G,43.770992,-79.216917
M1H,43.773136,-79.239476
M1J,43.744734,-79.239476
M1K,43.727929,-79.262029
M1L,43.711112,-79.284577
M1M,43.716316,-79.239476
M1N,43.692657,-79.264848


In [62]:
# merge both the dataframes in a third dataframe
df_merged = df_final.merge(df_gsc, on="Postal Code", left_index=True)

In [63]:
# display top 12 rows
df_merged.head(12)

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
25,M3A,North York,Parkwoods,43.753259,-79.329656
34,M4A,North York,Victoria Village,43.725882,-79.315572
53,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
71,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
85,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
93,M9A,Etobicoke,"Islington Avenue, Humber Valley Village",43.667856,-79.532242
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
26,M3B,North York,Don Mills,43.745906,-79.352188
35,M4B,East York,"Parkview Hill, Woodbine Gardens",43.706397,-79.309937
54,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937


### ----- End of Part-2 -----

### Part-3: Exploring and clustering neighborhoods in Toronto

## Create a map of Toronto with neighborhoods superimposed on top.

### Work with only boroughs that contain the word "Toronto"

In [64]:
df_merged = df_merged[df_merged['Borough'].str.contains('Toronto')]

In [65]:
df_merged.head(20)

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
53,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
85,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
54,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
55,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
37,M4E,East Toronto,The Beaches,43.676357,-79.293031
56,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306
57,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383
75,M6G,Downtown Toronto,Christie,43.669542,-79.422564
58,M5H,Downtown Toronto,"Richmond, Adelaide, King",43.650571,-79.384568
76,M6H,West Toronto,"Dufferin, Dovercourt Village",43.669005,-79.442259


In [66]:
df_merged.shape

(39, 5)

#### Folium and Geopy are downloaded separately from Anaconda cloud

In [67]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analsysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

import folium # map rendering library

print('Libraries imported.')

Libraries imported.


In [68]:
print('The dataframe has {} boroughs and {} neighborhoods.'.format(
        len(df_merged['Borough'].unique()),
        df_merged.shape[0]
    )
)

The dataframe has 4 boroughs and 39 neighborhoods.


In [69]:
address = 'Toronto'

geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto City are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto City are 43.6534817, -79.3839347.


In [70]:
# create map of Toronto using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(df_merged['Latitude'], df_merged['Longitude'], df_merged['Borough'], df_merged['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

In [71]:
df_merged.reset_index(inplace=True, drop=True)

In [72]:
df_merged.head(39)

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
1,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
2,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
3,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
4,M4E,East Toronto,The Beaches,43.676357,-79.293031
5,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306
6,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383
7,M6G,Downtown Toronto,Christie,43.669542,-79.422564
8,M5H,Downtown Toronto,"Richmond, Adelaide, King",43.650571,-79.384568
9,M6H,West Toronto,"Dufferin, Dovercourt Village",43.669005,-79.442259


In [73]:
df_merged.dtypes

Postal Code      object
Borough          object
Neighborhood     object
Latitude        float64
Longitude       float64
dtype: object

In [74]:
df_merged.loc[0, 'Neighborhood']

'Regent Park, Harbourfront'

In [75]:
neighborhood_latitude = df_merged.loc[0, 'Latitude'] # neighborhood latitude value
neighborhood_longitude = df_merged.loc[0, 'Longitude'] # neighborhood longitude value

neighborhood_name = df_merged.loc[0, 'Neighborhood'] # neighborhood name

print('Latitude and longitude values of {} are {}, {}.'.format(neighborhood_name, 
                                                               neighborhood_latitude, 
                                                               neighborhood_longitude))

Latitude and longitude values of Regent Park, Harbourfront are 43.6542599, -79.3606359.


##### dropped 'Neighborhood' column just to reduce number of records to be processed, 'Postal Code' can be used instead.

In [76]:
df_merged_wo = df_merged.drop('Neighborhood', 1)

In [77]:
df_merged_wo.head()

Unnamed: 0,Postal Code,Borough,Latitude,Longitude
0,M5A,Downtown Toronto,43.65426,-79.360636
1,M7A,Downtown Toronto,43.662301,-79.389494
2,M5B,Downtown Toronto,43.657162,-79.378937
3,M5C,Downtown Toronto,43.651494,-79.375418
4,M4E,East Toronto,43.676357,-79.293031


In [78]:
# one hot encoding
df_onehot = pd.get_dummies(df_merged, columns=["Borough", "Postal Code"], prefix="")

In [79]:
df_onehot.shape

(39, 46)

In [80]:
df_onehot.head()

Unnamed: 0,Neighborhood,Latitude,Longitude,_Central Toronto,_Downtown Toronto,_East Toronto,_West Toronto,_M4E,_M4K,_M4L,_M4M,_M4N,_M4P,_M4R,_M4S,_M4T,_M4V,_M4W,_M4X,_M4Y,_M5A,_M5B,_M5C,_M5E,_M5G,_M5H,_M5J,_M5K,_M5L,_M5N,_M5P,_M5R,_M5S,_M5T,_M5V,_M5W,_M5X,_M6G,_M6H,_M6J,_M6K,_M6P,_M6R,_M6S,_M7A,_M7Y
0,"Regent Park, Harbourfront",43.65426,-79.360636,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
2,"Garden District, Ryerson",43.657162,-79.378937,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,St. James Town,43.651494,-79.375418,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,The Beaches,43.676357,-79.293031,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [83]:
df_onehot.drop('Neighborhood', 1, inplace=True)

In [84]:
# set number of clusters
kclusters = 4


# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(df_onehot)
# k_means = KMeans(init="k-means++", n_clusters=5, n_init=12)
# k_means.fit(df_merged_clustering)
# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10]
# k_means_labels = k_means.labels_
# k_means_labels

array([0, 0, 0, 0, 2, 0, 0, 0, 0, 1])

In [85]:
# add clustering labels
df_onehot.insert(0, 'Cluster Labels', kmeans.labels_)

In [86]:
df_onehot.head()

Unnamed: 0,Cluster Labels,Latitude,Longitude,_Central Toronto,_Downtown Toronto,_East Toronto,_West Toronto,_M4E,_M4K,_M4L,_M4M,_M4N,_M4P,_M4R,_M4S,_M4T,_M4V,_M4W,_M4X,_M4Y,_M5A,_M5B,_M5C,_M5E,_M5G,_M5H,_M5J,_M5K,_M5L,_M5N,_M5P,_M5R,_M5S,_M5T,_M5V,_M5W,_M5X,_M6G,_M6H,_M6J,_M6K,_M6P,_M6R,_M6S,_M7A,_M7Y
0,0,43.65426,-79.360636,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,43.662301,-79.389494,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
2,0,43.657162,-79.378937,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,43.651494,-79.375418,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,2,43.676357,-79.293031,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [87]:
df_onehot.shape

(39, 46)

In [88]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, cluster in zip(df_onehot['Latitude'], df_onehot['Longitude'], df_onehot['Cluster Labels']):
    label = folium.Popup(' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

### ---- End of Part-3 -----