# Clustering Toronto Neighborhood using K-Means Algorithm

Importing Essential Libraries

In [1]:
import numpy as np
import pandas as pd
import json 
import requests
from pandas.io.json import json_normalize
import matplotlib.cm as cm
import matplotlib.colors as colors
from sklearn.cluster import KMeans
!pip install folium
import folium
from bs4 import BeautifulSoup
print('Libraries are imported')

  from cryptography.utils import int_from_bytes
  from cryptography.utils import int_from_bytes
Libraries are imported


# Part 1

Importing Data(Wikipedia Table)

In [2]:
data=requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text
data_soup = BeautifulSoup(data,'html5lib')

Creating Dataframe

In [3]:
table_contents=[]
table=data_soup.find('table')
for row in table.findAll('td'):
    cell = {}
    if row.span.text=='Not assigned':
        pass
    else:
        cell['PostalCode'] = row.p.text[:3]
        cell['Borough'] = (row.span.text).split('(')[0]
        cell['Neighborhood'] = (((((row.span.text).split('(')[1]).strip(')')).replace(' /',',')).replace(')',' ')).strip(' ')
        table_contents.append(cell)


df=pd.DataFrame(table_contents)
df['Borough']=df['Borough'].replace({'Downtown TorontoStn A PO Boxes25 The Esplanade':'Downtown Toronto Stn A',
                                             'East TorontoBusiness reply mail Processing Centre969 Eastern':'East Toronto Business',
                                             'EtobicokeNorthwest':'Etobicoke Northwest','East YorkEast Toronto':'East York/East Toronto',
                                             'MississaugaCanada Post Gateway Processing Centre':'Mississauga'})

In [4]:
df.style

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Queen's Park,Ontario Provincial Government
5,M9A,Etobicoke,Islington Avenue
6,M1B,Scarborough,"Malvern, Rouge"
7,M3B,North York,Don Mills North
8,M4B,East York,"Parkview Hill, Woodbine Gardens"
9,M5B,Downtown Toronto,"Garden District, Ryerson"


Shape of the dataframe

In [5]:
df.shape

(103, 3)

# Part 2

Importing Data(Longitude-Latitude)

In [6]:
df_lat_long = pd.read_csv('https://cocl.us/Geospatial_data')
df_lat_long.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


Merging dataframe

In [7]:
df_lat_long.rename(columns={'Postal Code':'PostalCode'},inplace=True)
df_toronto = pd.merge(df,df_lat_long,on='PostalCode')
df_toronto.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Queen's Park,Ontario Provincial Government,43.662301,-79.389494


# Part 3

In [8]:
df_toronto = df_toronto[df_toronto['Borough'].str.contains(pat='Toronto')]
df_toronto.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
15,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
19,M4E,East Toronto,The Beaches,43.676357,-79.293031
20,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306


Map of Toronto

In [9]:
map_toronto = folium.Map(location=[43.6532, -79.3832], zoom_start=12)

# add markers to map
for lat, long, borough, neighborhood in zip(df_toronto['Latitude'], df_toronto['Longitude'], df_toronto['Borough'], df_toronto['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, long],
        radius=5,
        popup=label,
        color='green',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

Applying K-Means Algorithm

In [10]:
kclusters = 5
toronto_grouped_clustering = df_toronto.drop(['PostalCode','Borough','Neighborhood'], 1)
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)
kmeans.labels_[0:10] 

array([0, 0, 0, 3, 0, 0, 2, 0, 4, 3], dtype=int32)

Final Dataframe

In [11]:
df_toronto.insert(0, 'Cluster Labels', kmeans.labels_)
df_toronto

Unnamed: 0,Cluster Labels,PostalCode,Borough,Neighborhood,Latitude,Longitude
2,0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
9,0,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
15,0,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
19,3,M4E,East Toronto,The Beaches,43.676357,-79.293031
20,0,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306
24,0,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383
25,2,M6G,Downtown Toronto,Christie,43.669542,-79.422564
30,0,M5H,Downtown Toronto,"Richmond, Adelaide, King",43.650571,-79.384568
31,4,M6H,West Toronto,"Dufferin, Dovercourt Village",43.669005,-79.442259
35,3,M4J,East York/East Toronto,The Danforth East,43.685347,-79.338106


Map of Toronto after Clustering

In [12]:
map_clusters = folium.Map(location=[43.6532, -79.3832], zoom_start=12)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(df_toronto['Latitude'], df_toronto['Longitude'], df_toronto['Neighborhood'], df_toronto['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

In [13]:
df_toronto.loc[df_toronto['Cluster Labels'] == 0, df_toronto.columns[[1] + list(range(1, df_toronto.shape[1]))]]

Unnamed: 0,PostalCode,PostalCode.1,Borough,Neighborhood,Latitude,Longitude
2,M5A,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
9,M5B,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
15,M5C,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
20,M5E,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306
24,M5G,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383
30,M5H,M5H,Downtown Toronto,"Richmond, Adelaide, King",43.650571,-79.384568
36,M5J,M5J,Downtown Toronto,"Harbourfront East, Union Station, Toronto Islands",43.640816,-79.381752
42,M5K,M5K,Downtown Toronto,"Toronto Dominion Centre, Design Exchange",43.647177,-79.381576
48,M5L,M5L,Downtown Toronto,"Commerce Court, Victoria Hotel",43.648198,-79.379817
87,M5V,M5V,Downtown Toronto,"CN Tower, King and Spadina, Railway Lands, Har...",43.628947,-79.39442


In [14]:
df_toronto.loc[df_toronto['Cluster Labels'] == 1, df_toronto.columns[[1] + list(range(1, df_toronto.shape[1]))]]

Unnamed: 0,PostalCode,PostalCode.1,Borough,Neighborhood,Latitude,Longitude
61,M4N,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879
62,M5N,M5N,Central Toronto,Roselawn,43.711695,-79.416936
67,M4P,M4P,Central Toronto,Davisville North,43.712751,-79.390197
68,M5P,M5P,Central Toronto,Forest Hill North & West,43.696948,-79.411307
73,M4R,M4R,Central Toronto,North Toronto West,43.715383,-79.405678
79,M4S,M4S,Central Toronto,Davisville,43.704324,-79.38879
83,M4T,M4T,Central Toronto,"Moore Park, Summerhill East",43.689574,-79.38316
86,M4V,M4V,Central Toronto,"Summerhill West, Rathnelly, South Hill, Forest...",43.686412,-79.400049


In [15]:
df_toronto.loc[df_toronto['Cluster Labels'] == 2, df_toronto.columns[[1] + list(range(1, df_toronto.shape[1]))]]

Unnamed: 0,PostalCode,PostalCode.1,Borough,Neighborhood,Latitude,Longitude
25,M6G,M6G,Downtown Toronto,Christie,43.669542,-79.422564
37,M6J,M6J,West Toronto,"Little Portugal, Trinity",43.647927,-79.41975
43,M6K,M6K,West Toronto,"Brockton, Parkdale Village, Exhibition Place",43.636847,-79.428191
74,M5R,M5R,Central Toronto,"The Annex, North Midtown, Yorkville",43.67271,-79.405678
80,M5S,M5S,Downtown Toronto,"University of Toronto, Harbord",43.662696,-79.400049
84,M5T,M5T,Downtown Toronto,"Kensington Market, Chinatown, Grange Park",43.653206,-79.400049


In [16]:
df_toronto.loc[df_toronto['Cluster Labels'] == 3, df_toronto.columns[[1] + list(range(1, df_toronto.shape[1]))]]

Unnamed: 0,PostalCode,PostalCode.1,Borough,Neighborhood,Latitude,Longitude
19,M4E,M4E,East Toronto,The Beaches,43.676357,-79.293031
35,M4J,M4J,East York/East Toronto,The Danforth East,43.685347,-79.338106
41,M4K,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188
47,M4L,M4L,East Toronto,"India Bazaar, The Beaches West",43.668999,-79.315572
54,M4M,M4M,East Toronto,Studio District,43.659526,-79.340923
100,M7Y,M7Y,East Toronto Business,Enclave of M4L,43.662744,-79.321558


In [17]:
df_toronto.loc[df_toronto['Cluster Labels'] == 4, df_toronto.columns[[1] + list(range(1,df_toronto.shape[1]))]]

Unnamed: 0,PostalCode,PostalCode.1,Borough,Neighborhood,Latitude,Longitude
31,M6H,M6H,West Toronto,"Dufferin, Dovercourt Village",43.669005,-79.442259
69,M6P,M6P,West Toronto,"High Park, The Junction South",43.661608,-79.464763
75,M6R,M6R,West Toronto,"Parkdale, Roncesvalles",43.64896,-79.456325
81,M6S,M6S,West Toronto,"Runnymede, Swansea",43.651571,-79.48445
