# Segmenting and Clustering Neighborhoods in Toronto

First we import pandas and request
Then we get the data from URL and we remove the 'Not assigned' rows

In [2]:
import pandas as pd
import requests

url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
response = requests.get(url)

dfr = pd.read_html(response.content, header=0)[0]
dfx = dfr[dfr.Borough != 'Not assigned']

dfx.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights


Then we search for rows with the same postcode and we concatenate the strings of Neighbourhood.
Then we create a new table starting from the old one and we drop duplicates.

In [3]:

df2 = dfx
label = ''
df3 = pd.DataFrame({'Postcode': [], 'Borough': [], 'Neighbourhood': []})
for index, row in df2.iterrows():
    label = ''
    ct = df2.loc[df2['Postcode'] == row["Postcode"]]
    

    #print(ct)
    i = 0
    for index, row in ct.iterrows():
        i += 1
        if (i > 1): label = label + ', '
        label = label +row['Neighbourhood']
#Postcode:  ct.iloc[0]['Postcode']
#Borough:  ct.iloc[0]['Borough']
#Neighbourhood: label
    df3 = df3.append({'Postcode': ct.iloc[0]['Postcode'], 'Borough': ct.iloc[0]['Borough'], 'Neighbourhood': label}, ignore_index=True)

df3.drop_duplicates(subset=None, keep='first', inplace=True)


df3.head()

Unnamed: 0,Borough,Neighbourhood,Postcode
0,North York,Parkwoods,M3A
1,North York,Victoria Village,M4A
2,Downtown Toronto,"Harbourfront, Regent Park",M5A
4,North York,"Lawrence Heights, Lawrence Manor",M6A
6,Queen's Park,Not assigned,M7A


If we have Neighbourhood Not assigned, we change it with the value of Borough

In [4]:
import numpy as np
df3['Neighbourhood'] = np.where(df3['Neighbourhood']== 'Not assigned', df3['Borough'], df3['Neighbourhood'])
df3.head()


Unnamed: 0,Borough,Neighbourhood,Postcode
0,North York,Parkwoods,M3A
1,North York,Victoria Village,M4A
2,Downtown Toronto,"Harbourfront, Regent Park",M5A
4,North York,"Lawrence Heights, Lawrence Manor",M6A
6,Queen's Park,Queen's Park,M7A


We use the shape function to display rows and columns numbers

In [5]:

print("We have %s columns" % (df3.shape[1]))
print("We have %s rows" % (df3.shape[0]))

We have 3 columns
We have 103 rows


We read the coordinates and then we inner join our dataframe with the coordinates

In [6]:
dfcoordinates = pd.read_csv("https://cocl.us/Geospatial_data")
df3 = df3.join(dfcoordinates.set_index('Postal Code'), on='Postcode')
df3.head()

Unnamed: 0,Borough,Neighbourhood,Postcode,Latitude,Longitude
0,North York,Parkwoods,M3A,43.753259,-79.329656
1,North York,Victoria Village,M4A,43.725882,-79.315572
2,Downtown Toronto,"Harbourfront, Regent Park",M5A,43.65426,-79.360636
4,North York,"Lawrence Heights, Lawrence Manor",M6A,43.718518,-79.464763
6,Queen's Park,Queen's Park,M7A,43.662301,-79.389494


We import libraires

In [7]:
import json # library to handle JSON files

#!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

print('Libraries imported.')

Fetching package metadata .............
Solving package specifications: .

Package plan for installation in environment /opt/conda/envs/DSX-Python35:

The following NEW packages will be INSTALLED:

    altair:  2.2.2-py35_1 conda-forge
    branca:  0.3.1-py_0   conda-forge
    folium:  0.5.0-py_0   conda-forge
    vincent: 0.4.4-py_1   conda-forge

altair-2.2.2-p 100% |################################| Time: 0:00:00  42.34 MB/s
branca-0.3.1-p 100% |################################| Time: 0:00:00  34.99 MB/s
vincent-0.4.4- 100% |################################| Time: 0:00:00  38.71 MB/s
folium-0.5.0-p 100% |################################| Time: 0:00:00  51.29 MB/s
Libraries imported.


In order to define an instance of the geocoder, we need to define a user_agent. We will name our agent ny_explorer, as shown below.

In [8]:
address = 'Toronto, Canada'

geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.653963, -79.387207.


Create a map of New York with neighborhoods superimposed on top

In [9]:
# create map of New York using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)
neighborhoods = df3

# add markers to map
for lat, lng, borough, neighborhood in zip(neighborhoods['Latitude'], neighborhoods['Longitude'], neighborhoods['Borough'], neighborhoods['Neighbourhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto
