<h1>Segmenting and clustering neighborhoods in the city of Toronto Canada

<h4>This notebook includes 1,2 and 3 parts of week 3 assignment

In [46]:
#import Libraries

import numpy as np # library to handle data in a vectorized manner
import pandas as pd # library for data analsysis
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 50)
import os
import requests # library to handle requests
# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors
# import k-means from clustering stage
from sklearn.cluster import KMeans
from geopy.geocoders import Nominatim 



print('Libraries imported.')

Libraries imported.


<h2>Part 1

In [47]:
#Importing Beautifulsoup to scrape wikipedia webpage

from bs4 import BeautifulSoup 
import csv 

wikiurl = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
wikicontent = requests.get(wikiurl) #requesting the content of the wikipedia web page
print(wikicontent.content[:500]) #print a small part of the page

b'<!DOCTYPE html>\n<html class="client-nojs" lang="en" dir="ltr">\n<head>\n<meta charset="UTF-8"/>\n<title>List of postal codes of Canada: M - Wikipedia</title>\n<script>document.documentElement.className="client-js";RLCONF={"wgBreakFrames":!1,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June","July","August","September","October","November","December"],"wgRequestId":"ff1dbaab-0815-40f9-9a0'


In [48]:
#parse the html into a beautifulsoup object
wikisoup = BeautifulSoup(wikicontent.content, 'html.parser') 


In [49]:
# Create lists to store the scraped data in
PostalCodeList = []
BoroughList = []
NeighborhoodList = []

In [50]:
# store the data into the lists
for row in wikisoup.find('table').find_all('tr'):
    cells = row.find_all('td')
    if(len(cells) > 0):
        PostalCodeList.append(cells[0].text)
        BoroughList.append(cells[1].text)
        NeighborhoodList.append(cells[2].text) 

In [51]:
#create a new DataFrame from the three lists
toronto_dfn = pd.DataFrame({"PostalCode": PostalCodeList,
                           "Borough": BoroughList,
                           "Neighborhood": NeighborhoodList})

toronto_df = toronto_dfn.replace('\n',' ', regex=True) #remove the newlines '\n' from the cells 

toronto_df.head(10)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
7,M8A,Not assigned,Not assigned
8,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
9,M1B,Scarborough,"Malvern, Rouge"


In [52]:
#drop not assigned cells in Borough

toronto_newdf = toronto_df.drop(toronto_df[toronto_df.Borough == "Not assigned "].index)
toronto_newdf.head(10)

Unnamed: 0,PostalCode,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
8,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
9,M1B,Scarborough,"Malvern, Rouge"
11,M3B,North York,Don Mills
12,M4B,East York,"Parkview Hill, Woodbine Gardens"
13,M5B,Downtown Toronto,"Garden District, Ryerson"


In [53]:
# group neighborhoods in the same borough
toronto_df_grp = toronto_newdf.groupby(["PostalCode", "Borough"], as_index=False).agg(lambda x: ", ".join(x))
toronto_df_grp.head(10)


Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"Kennedy Park, Ionview, East Birchmount Park"
7,M1L,Scarborough,"Golden Mile, Clairlea, Oakridge"
8,M1M,Scarborough,"Cliffside, Cliffcrest, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


In [54]:
# if Neighborhood="Not assigned", make the value the same as Borough
for index, row in toronto_df_grp.iterrows():
    if row["Neighborhood"] == "Not assigned ":
        row["Neighborhood"] = row["Borough"]
        
toronto_df_grp.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [55]:
# print the number of rows of the cleaned dataframe
toronto_df_grp.shape

(103, 3)

<h2> Part 2

In [56]:
#import geographical coordinates from csv into the dataframe dfgeo

geocsv='https://cocl.us/Geospatial_data'
dfgeo = pd.read_csv(geocsv)

dfgeo.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [57]:
#Rename the Postal Code column to PostalCode

dfgeonew = dfgeo.rename(columns={'Postal Code': 'PostalCode'})
dfgeonew.head()

Unnamed: 0,PostalCode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [59]:
#Trim PostalCode column and merge the tables

toronto_df_grp['PostalCode'] = toronto_df_grp['PostalCode'].str.strip() #remove whitespace from PostalCode column

toronto_ndf=pd.merge(toronto_df_grp, dfgeonew, on='PostalCode', how="right")

toronto_ndf.head()


Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


<h2>Part 3

In [22]:
#install folium 

!conda install -c conda-forge folium=0.5.0 --yes 
import folium # map rendering library


Solving environment: done

## Package Plan ##

  environment location: /opt/conda/envs/Python36

  added / updated specs: 
    - folium=0.5.0


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    vincent-0.4.4              |             py_1          28 KB  conda-forge
    certifi-2020.6.20          |   py36h9f0ad1d_0         151 KB  conda-forge
    python_abi-3.6             |          1_cp36m           4 KB  conda-forge
    folium-0.5.0               |             py_0          45 KB  conda-forge
    branca-0.4.1               |             py_0          26 KB  conda-forge
    ca-certificates-2020.6.20  |       hecda079_0         145 KB  conda-forge
    openssl-1.1.1g             |       h516909a_1         2.1 MB  conda-forge
    altair-4.1.0               |             py_1         614 KB  conda-forge
    ------------------------------------------------------------
                       

In [60]:
#install geopy
!conda install -c conda-forge geopy --yes 
from geopy.geocoders import Nominatim # convert an address into latitude and longtitude values


In [61]:
#working with boroughs of toronto

toronto_bor=toronto_ndf[toronto_ndf['Borough'].str.contains("Toronto")]
toronto_bor.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
37,M4E,East Toronto,The Beaches,43.676357,-79.293031
41,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188
42,M4L,East Toronto,"India Bazaar, The Beaches West",43.668999,-79.315572
43,M4M,East Toronto,Studio District,43.659526,-79.340923
44,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879


In [62]:
#Use geopy library to get the latitude and longitude values of Toronto.

address = ['Toronto Canada']
geolocator = Nominatim(user_agent="toronto_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.6534817, -79.3839347.


In [63]:
#Create a map of Toronto with neighborhoods superimposed on top.

map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(toronto_bor['Latitude'], toronto_bor['Longitude'], toronto_bor['Borough'], toronto_bor['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

Utilizing the Foursquare API to explore the neighborhoods and segment them.

In [74]:
#Define Foursquare Credentials and Version
CLIENT_ID = '05H45WXNVDOCSKSJ0KIBLKAK4EZJ25BRNCJZLS4TT2CI4Z5N' # your Foursquare ID
CLIENT_SECRET = 'XHGLXP23BMRY3ONDZJB2OUIHHZDYM4IUBDDNCWJ0ZZ2RFC4E' # your Foursquare Secret
VERSION = '20200814' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)



Your credentails:
CLIENT_ID: 05H45WXNVDOCSKSJ0KIBLKAK4EZJ25BRNCJZLS4TT2CI4Z5N
CLIENT_SECRET:XHGLXP23BMRY3ONDZJB2OUIHHZDYM4IUBDDNCWJ0ZZ2RFC4E


In [65]:
#Get the first neighborhood's name.

print(toronto_bor.head(1))



   PostalCode        Borough  Neighborhood   Latitude  Longitude
37        M4E  East Toronto   The Beaches   43.676357 -79.293031


In [66]:
#Let's explore the first neighborhood "The Beaches" in our dataframe.

neighborhood_latitude = toronto_bor.loc[37, 'Latitude'] # neighborhood latitude value
neighborhood_longitude = toronto_bor.loc[37, 'Longitude'] # neighborhood longitude value

neighborhood_name = toronto_bor.loc[37, 'Neighborhood'] # neighborhood name

print('Latitude and longitude values of {} are {}, {}.'.format(neighborhood_name, 
                                                               neighborhood_latitude, 
                                                               neighborhood_longitude))

Latitude and longitude values of The Beaches  are 43.67635739999999, -79.2930312.


In [67]:
#get the top recommended places that are in "The Beaches" within a radius of 500 meters.

#create the GET request URL
LIMIT = 50
radius = 500

url = 'https://api.foursquare.com/v2/venues/explore?&client_id=05H45WXNVDOCSKSJ0KIBLKAK4EZJ25BRNCJZLS4TT2CI4Z5N&client_secret=XHGLXP23BMRY3ONDZJB2OUIHHZDYM4IUBDDNCWJ0ZZ2RFC4E&v=20200814&ll=43.67635739999999,-79.2930312&radius=500&limit=50'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    neighborhood_latitude, 
    neighborhood_longitude, 
    radius, 
    LIMIT)
url # display URL




'https://api.foursquare.com/v2/venues/explore?&client_id=05H45WXNVDOCSKSJ0KIBLKAK4EZJ25BRNCJZLS4TT2CI4Z5N&client_secret=XHGLXP23BMRY3ONDZJB2OUIHHZDYM4IUBDDNCWJ0ZZ2RFC4E&v=20200814&ll=43.67635739999999,-79.2930312&radius=500&limit=50'

In [68]:
#Send the GET request and examine the resutls

results = requests.get(url).json()
results

{'meta': {'code': 200, 'requestId': '5f37a3ecc84c6d03985e49fd'},
 'response': {'headerLocation': 'The Beaches',
  'headerFullLocation': 'The Beaches, Toronto',
  'headerLocationGranularity': 'neighborhood',
  'totalResults': 4,
  'suggestedBounds': {'ne': {'lat': 43.680857404499996,
    'lng': -79.28682091449052},
   'sw': {'lat': 43.67185739549999, 'lng': -79.29924148550948}},
  'groups': [{'type': 'Recommended Places',
    'name': 'recommended',
    'items': [{'reasons': {'count': 0,
       'items': [{'summary': 'This spot is popular',
         'type': 'general',
         'reasonName': 'globalInteractionReason'}]},
      'venue': {'id': '4bd461bc77b29c74a07d9282',
       'name': 'Glen Manor Ravine',
       'location': {'address': 'Glen Manor',
        'crossStreet': 'Queen St.',
        'lat': 43.67682094413784,
        'lng': -79.29394208780985,
        'labeledLatLngs': [{'label': 'display',
          'lat': 43.67682094413784,
          'lng': -79.29394208780985}],
        'distanc

In [69]:
# function that extracts the category of the recommended places
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

In [70]:
import json # library to handle JSON files
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

#clean the json and structure it into a *pandas* dataframe.

venues = results['response']['groups'][0]['items']
    
nearby_venues = json_normalize(venues) # flatten JSON

# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues =nearby_venues.loc[:, filtered_columns]

# filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues.head()

Unnamed: 0,name,categories,lat,lng
0,Glen Manor Ravine,Trail,43.676821,-79.293942
1,The Big Carrot Natural Food Market,Health Food Store,43.678879,-79.297734
2,Grover Pub and Grub,Pub,43.679181,-79.297215
3,Upper Beaches,Neighborhood,43.680563,-79.292869


In [71]:
#And how many recommended places were returned by Foursquare?

print('{} venues were returned by Foursquare.'.format(nearby_venues.shape[0]))

4 venues were returned by Foursquare.


<h4>Locations of the Venues on the map

In [72]:
map_thebeaches = folium.Map(location=[latitude, longitude], zoom_start=11)

# add markers to map
for lat, lng, name, categories in zip(nearby_venues['lat'], nearby_venues['lng'], nearby_venues['name'], nearby_venues['categories']):
    label = '{}, {}'.format(name, categories)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='red',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_thebeaches)  
    
map_thebeaches