# Segmenting and Clustering Neighborhoods in Toronto

In [338]:
from bs4 import BeautifulSoup
import requests
import pandas as pd

##  Obtain and Preprocess Postal Codes 

To get the table of postal codes into a pandas dataframe, I first scraped the following Wikipedia page: https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M.

In [346]:
# parse html
source = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# A timeout keeps a stalled connection from hanging the kernel, and
# raise_for_status() stops an HTTP error page from being parsed as if it
# were the article.
response = requests.get(source, timeout=30)
response.raise_for_status()
wiki_text = response.text

# Name the parser explicitly: without it BeautifulSoup picks whichever parser
# happens to be installed and emits a warning about the guess.
soup = BeautifulSoup(wiki_text, 'html.parser')

# Show only the head of the document as a sanity check instead of dumping the
# entire page into the cell output.
print(soup.prettify()[:1500])

<!DOCTYPE html>
<html class="client-nojs" dir="ltr" lang="en">
 <head>
  <meta charset="utf-8"/>
  <title>
   List of postal codes of Canada: M - Wikipedia
  </title>
  <script>
   document.documentElement.className = document.documentElement.className.replace( /(^|\s)client-nojs(\s|$)/, "$1client-js$2" );
  </script>
  <script>
   (window.RLQ=window.RLQ||[]).push(function(){mw.config.set({"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":false,"wgNamespaceNumber":0,"wgPageName":"List_of_postal_codes_of_Canada:_M","wgTitle":"List of postal codes of Canada: M","wgCurRevisionId":890001695,"wgRevisionId":890001695,"wgArticleId":539066,"wgIsArticle":true,"wgIsRedirect":false,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["Communications in Ontario","Postal codes in Canada","Toronto","Ontario-related lists"],"wgBreakFrames":false,"wgPageContentLanguage":"en","wgPageContentModel":"wikitext","wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wg

In [504]:
# get the table data for dataframe
table = soup.find('table', class_='wikitable sortable')
if table is None:
    # Fail with a clear message instead of an AttributeError on None below.
    raise ValueError("No 'wikitable sortable' table found on the page")

# Each kept row becomes [postal code, borough, neighborhood].
d = []

# The first <tr> is skipped as the header row.
for tr in table.find_all('tr')[1:]:
    # Read the cells themselves rather than splitting the row text on
    # newlines, so a cell that contains a line break still yields one value.
    cells = [td.get_text().strip() for td in tr.find_all('td')]
    if len(cells) >= 3:
        d.append(cells[:3])

288

In [None]:
# Column labels, in the same order as the three values collected per row in `d`.
column_names = ['PostalCode', 'Borough', 'Neighborhood']

# Raw (uncleaned) postal-code table built directly from the scraped rows.
ori_zip = pd.DataFrame(d, columns=column_names)

Rows whose borough is "Not assigned" were dropped and excluded from the analysis. More than one neighborhood can exist in one postal code area; such rows were combined into a single row with the neighborhoods separated by commas. If a row has a borough but its neighborhood is "Not assigned", the neighborhood was given the same name as the borough.

In [382]:
# preprocessing
# 1. Keep only rows that have a real borough.
has_borough = ori_zip.Borough != 'Not assigned'

# 2. Where the neighborhood is the 'Not assigned' placeholder, fall back to
#    the borough name. This is done BEFORE grouping so the placeholder can
#    never end up embedded in a comma-joined neighborhood string.
assigned_b = (
    ori_zip[has_borough]
    .reset_index(drop=True)
    .assign(
        Neighborhood=lambda df: df['Neighborhood'].where(
            df['Neighborhood'] != 'Not assigned', df['Borough']
        )
    )
)

# 3. Collapse rows that share a postal code and borough into one row whose
#    neighborhoods are joined with ', '.
grouped = assigned_b.groupby(['PostalCode', 'Borough'])
cleaned_zip = grouped['Neighborhood'].apply(', '.join).reset_index()

In [475]:
# Preview the first rows of the cleaned postal-code table.
cleaned_zip.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [387]:
# (rows, columns) of the cleaned table.
cleaned_zip.shape

(103, 3)

 ## Get the Latitude and the Longitude Coordinates of Each Neighborhood

The problem with the geocoder package is that you sometimes have to be persistent to get the geographical coordinates of a given postal code: a call may return None, and the same call made again may return the coordinates. To make sure coordinates are obtained for every neighborhood, you can run a while loop for each postal code that retries until a result comes back. In this notebook the coordinates are instead read from a pre-computed CSV file of geospatial data.

In [395]:
import geocoder

In [472]:
# get latitude and longitude from csv
# Coordinates are read from a pre-geocoded CSV rather than fetched live.
# (An earlier, disabled draft called geocoder.arcgis(f'{pc}, Toronto, Ontario')
# for each postal code; it was removed because it was dead code and did not
# actually retry a failed lookup per postal code.)
geocode = (
    pd.read_csv('http://cocl.us/Geospatial_data')
    .rename(columns={'Postal Code': 'PostalCode'})
)

# Inner join on the postal code. validate='many_to_one' makes the merge raise
# if the CSV lists a postal code more than once, which would otherwise
# silently duplicate rows in the result.
gc_zip = pd.merge(
    cleaned_zip,
    geocode,
    on='PostalCode',
    how='inner',
    validate='many_to_one',
)

In [474]:
# Preview the postal-code table with the merged Latitude/Longitude columns.
gc_zip.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


## Explore and Cluster the Neighborhoods in Toronto

In [483]:
from geopy.geocoders import Nominatim
import folium

In [481]:
# Look up the coordinates used to centre the maps below.
address = 'Toronto, Ontario'
geolocator = Nominatim(user_agent="to_explorer")
location = geolocator.geocode(address)

# geocode() returns None when nothing matches; fail with a clear message
# instead of an AttributeError on the attribute access below.
if location is None:
    raise ValueError(f'Nominatim returned no result for {address!r}')

latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.653963, -79.387207.


In [532]:
# Base map centred on the Toronto coordinates looked up earlier.
map_to = folium.Map(
    location=[latitude, longitude],
    tiles='CartoDB dark_matter',
    zoom_start=10,
)

# One circle marker per row of gc_zip; the popup shows the row's neighborhood
# and borough.
marker_color = '#f9ba00'
marker_fields = gc_zip[['Latitude', 'Longitude', 'Borough', 'Neighborhood']]
for lat, lng, borough, neighborhood in marker_fields.itertuples(index=False, name=None):
    popup = folium.Popup(
        f'neighborhood: {neighborhood}\n borough: {borough}',
        parse_html=True,
    )
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=popup,
        color=marker_color,
        fill=True,
        fill_color=marker_color,
        fill_opacity=0.7,
        parse_html=False,
    )
    marker.add_to(map_to)

# Last expression of the cell: renders the map.
map_to

In [531]:
import os
from getpass import getpass

# Foursquare credentials are taken from the environment, with an interactive
# hidden prompt as fallback, so they are never stored in the notebook source.
# NOTE(review): the key pair that was previously hardcoded in this cell has
# been exposed and should be regenerated in the Foursquare developer console.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID') or getpass('Foursquare CLIENT_ID: ') # Foursquare ID
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET') or getpass('Foursquare CLIENT_SECRET: ') # Foursquare Secret
VERSION = '20190420' # Foursquare API version

# Report only whether each credential is present; printing the values would
# save them in the notebook's output.
print('Your credentails:')
print('CLIENT_ID: ' + ('set' if CLIENT_ID else 'MISSING'))
print('CLIENT_SECRET:' + ('set' if CLIENT_SECRET else 'MISSING'))

Your credentails:
CLIENT_ID: [REDACTED]
CLIENT_SECRET:[REDACTED]


In [None]:
def getNearbyVenues(names, latitudes, longitudes, radius=500, limit=None):
    """Query the Foursquare ``venues/explore`` endpoint once per location and
    return all returned venues as one table.

    Parameters
    ----------
    names, latitudes, longitudes : iterables
        Parallel sequences describing each location. They are consumed with
        ``zip``, so iteration stops at the shortest one.
    radius : int, default 500
        Forwarded as the ``radius`` query parameter.
    limit : int, optional
        Forwarded as the ``limit`` query parameter. When omitted, the
        module-level ``LIMIT`` is used (the original behaviour).

    Returns
    -------
    pandas.DataFrame
        One row per venue with the columns ``Neighborhood``,
        ``Neighborhood Latitude``, ``Neighborhood Longitude``, ``Venue``,
        ``Venue Latitude``, ``Venue Longitude`` and ``Venue Category``.
        The frame is empty, but still has these columns, when no venue is
        returned. ``Venue Category`` is ``None`` for a venue that lists no
        categories.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status.
    NameError
        If ``limit`` is omitted and no module-level ``LIMIT`` exists.
    """
    columns = ['Neighborhood',
               'Neighborhood Latitude',
               'Neighborhood Longitude',
               'Venue',
               'Venue Latitude',
               'Venue Longitude',
               'Venue Category']

    if limit is None:
        limit = LIMIT

    explore_url = 'https://api.foursquare.com/v2/venues/explore'

    rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)

        # Let requests build and escape the query string instead of
        # formatting credentials and coordinates into the URL by hand.
        params = {
            'client_id': CLIENT_ID,
            'client_secret': CLIENT_SECRET,
            'v': VERSION,
            'll': f'{lat},{lng}',
            'radius': radius,
            'limit': limit,
        }

        # make the GET request; the timeout keeps one stalled call from
        # hanging the whole loop, and an error status is surfaced as an
        # HTTPError rather than as a KeyError on the payload below
        response = requests.get(explore_url, params=params, timeout=30)
        response.raise_for_status()
        results = response.json()["response"]['groups'][0]['items']

        # keep only the relevant information for each nearby venue
        for item in results:
            venue = item['venue']
            # Some venues carry an empty category list; record None for them
            # instead of failing on categories[0].
            categories = venue.get('categories') or []
            rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                categories[0]['name'] if categories else None))

    # Passing the column names to the constructor keeps an empty result valid;
    # assigning .columns afterwards raises when there are no rows.
    return pd.DataFrame(rows, columns=columns)

In [None]:
# create map
# NOTE(review): none of the cells visible in this review define `np`, `cm`,
# `colors`, `kclusters` or `manhattan_merged`. Confirm they are created
# earlier in the notebook. In particular `manhattan_merged` looks carried
# over from a Manhattan analysis -- verify it holds the Toronto data.
map_clusters = folium.Map(location=[latitude, longitude], tiles='CartoDB dark_matter', zoom_start=11)

# set color scheme for the clusters: one evenly spaced rainbow color per
# cluster (the previous intermediate arrays only served to supply this count)
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
for lat, lon, poi, cluster in zip(manhattan_merged['Latitude'], manhattan_merged['Longitude'], manhattan_merged['Neighborhood'], manhattan_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    # `cluster-1` shifts the palette by one; presumably labels are integers
    # starting at 0, so label 0 picks the last color via negative indexing
    # -- TODO confirm.
    marker_color = rainbow[cluster-1]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=marker_color,
        fill=True,
        fill_color=marker_color,
        fill_opacity=0.7).add_to(map_clusters)

map_clusters