In [93]:
#Code for "segmenting and CLustering Neighborhoods in Toronoto" Assignment: 
#Importing Packages

!pip install folium
import folium
import pandas as pd
import bs4 as bs
import requests
import numpy as np
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values
from pandas.io.json import json_normalize  # tranform JSON file into a pandas dataframe


# import k-means from clustering stage
from sklearn.cluster import KMeans

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors



In [94]:
#Scraping Wiki Text

url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
res = requests.get(url)
soup = bs.BeautifulSoup(res.content,'lxml')
table = soup.find_all('table')[0]
df = pd.read_html(str(table))
data = pd.read_json(df[0].to_json(orient='records'))


In [95]:
data.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


In [96]:
#excluding the Boroughs that don't have an assigned value
rawdata=data[data['Borough'] != 'Not assigned']


In [97]:
rawdata=rawdata.groupby(['Borough', 'Postal Code'], as_index=False).agg(','.join)

rawdata.head()

Unnamed: 0,Borough,Postal Code,Neighbourhood
0,Central Toronto,M4N,Lawrence Park
1,Central Toronto,M4P,Davisville North
2,Central Toronto,M4R,"North Toronto West, Lawrence Park"
3,Central Toronto,M4S,Davisville
4,Central Toronto,M4T,"Moore Park, Summerhill East"


In [98]:
rawdata['Neighbourhood'] = np.where(rawdata['Neighbourhood'] == 'Not assigned', rawdata['Borough'], rawdata['Neighbourhood'])
rawdata.shape
#End of Part 1 (points 1-3)

(103, 3)

In [99]:
# Getting Lat/Long of Each Hood

geospa_url = 'http://cocl.us/Geospatial_data'
geospa_data = pd.read_csv(geospa_url)

In [100]:
geospa_data.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [101]:
geospa_data.columns = ['Postal Code','Latitude','Longitude']
geospa_data.columns

Index(['Postal Code', 'Latitude', 'Longitude'], dtype='object')

In [102]:
combo_data=pd.merge(rawdata,geospa_data, on= 'Postal Code')
combo_data.head()

Unnamed: 0,Borough,Postal Code,Neighbourhood,Latitude,Longitude
0,Central Toronto,M4N,Lawrence Park,43.72802,-79.38879
1,Central Toronto,M4P,Davisville North,43.712751,-79.390197
2,Central Toronto,M4R,"North Toronto West, Lawrence Park",43.715383,-79.405678
3,Central Toronto,M4S,Davisville,43.704324,-79.38879
4,Central Toronto,M4T,"Moore Park, Summerhill East",43.689574,-79.38316


In [121]:
#Explore Toronto

address = "Toronto, ON"

geolocator = Nominatim(user_agent="toronto_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto city are {}, {}.'.format(latitude, longitude))


The geograpical coordinate of Toronto city are 43.6534817, -79.3839347.


In [122]:
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)
map_toronto

In [123]:
for lat, lng, borough, neighborhood in zip(
        combo_data['Latitude'], 
        combo_data['Longitude'], 
        combo_data['Borough'], 
        combo_data['Neighbourhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  

map_toronto

In [144]:
combo_data_tor = combo_data[combo_data['Borough'].str.contains("Toronto")].reset_index(drop=True)
combo_data_tor.head()

Unnamed: 0,Borough,Postal Code,Neighbourhood,Latitude,Longitude
0,Central Toronto,M4N,Lawrence Park,43.72802,-79.38879
1,Central Toronto,M4P,Davisville North,43.712751,-79.390197
2,Central Toronto,M4R,"North Toronto West, Lawrence Park",43.715383,-79.405678
3,Central Toronto,M4S,Davisville,43.704324,-79.38879
4,Central Toronto,M4T,"Moore Park, Summerhill East",43.689574,-79.38316


In [143]:
combo_data_tor = folium.Map(location=[latitude, longitude], zoom_start=12)
for lat, lng, borough, neighborhood in zip(
        combo_data_tor['Latitude'], 
        combo_data_tor['Longitude'], 
        combo_data_tor['Borough'], 
        combo_data_tor['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(combo_data_tor)  

combo_data_tor

In [163]:
# Define Foursquare Credentials and Version

CLIENT_ID = 'SPWJYEA3TPHEFKBZGQ2YDUE3AGFZKE21XZNA1HT215CWMAVJ' # your Foursquare ID
CLIENT_SECRET = 'OT3JSNAAREBM2H2520LYEDEW1JUNVEMDIAFQGHGMCFFCB1AE' # your Foursquare Secret
VERSION = '20161225' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: SPWJYEA3TPHEFKBZGQ2YDUE3AGFZKE21XZNA1HT215CWMAVJ
CLIENT_SECRET:OT3JSNAAREBM2H2520LYEDEW1JUNVEMDIAFQGHGMCFFCB1AE


In [165]:

LIMIT = 10
radius = 500

location_list = [] # initiate a list to store data from Foursquare API requests

for neighbourhood, latitude, longitude in zip(combo_data_tor.Neighbourhood, combo_data_tor.Latitude, combo_data_tor.Longitude):
    
    url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    latitude, 
    longitude, 
    radius, 
    LIMIT)
    
    data = requests.get(url).json()
    
    # use len() to check if any data within "items" (len = 0 if nothing)
    length = len(data['response']['groups'][0]['items'])
    if length == 0:
        continue # skip the row if nothing is found
    else:
        venue = data['response']['groups'][0]['items'][0]['venue']
        
        # extract info within 'venue'
        name = venue['name']
        lat = venue['location']['lat']
        lon = venue['location']['lng']
        cat = venue['categories'][0]['name']
        
        location_list.append([(neighbourhood, latitude, longitude, name, lat, lon, cat)])

In [171]:
temp = pd.DataFrame(x for row in location_list for x in row)
temp.columns = ['Neighbourhood','N_Latitude','N_Longitude','Venue','V_Latitude','V_Longitude','category']
temp.head()

Unnamed: 0,Neighbourhood,N_Latitude,N_Longitude,Venue,V_Latitude,V_Longitude,category
0,Lawrence Park,43.72802,-79.38879,Lawrence Park Ravine,43.726963,-79.394382,Park
1,Davisville North,43.712751,-79.390197,Homeway Restaurant & Brunch,43.712641,-79.391557,Breakfast Spot
2,"North Toronto West, Lawrence Park",43.715383,-79.405678,Barreworks,43.71407,-79.400109,Yoga Studio
3,Davisville,43.704324,-79.38879,Jules Cafe Patisserie,43.704138,-79.388413,Dessert Shop
4,"Moore Park, Summerhill East",43.689574,-79.38316,Ravine,43.690356,-79.386841,Trail


In [170]:
print("{} nearby locations downloaded for {} neighbourhood.".format(len(temp.Venue), len(combo_data_tor.Neighbourhood)))


39 nearby locations downloaded for 39 neighbourhood.


In [178]:
cat = pd.get_dummies(temp.category) # one hot encoding with get_dummies()
df_01 = pd.concat([temp[['Neighbourhood']], cat], axis=1) # combine neighbourhood & category tables
df_01.head()

Unnamed: 0,Neighbourhood,Airport,Bakery,Bar,Breakfast Spot,Brewery,Café,Clothing Store,Coffee Shop,Dessert Shop,...,Japanese Restaurant,Liquor Store,Mexican Restaurant,Neighborhood,Park,Playground,Restaurant,Theme Restaurant,Trail,Yoga Studio
0,Lawrence Park,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
1,Davisville North,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,"North Toronto West, Lawrence Park",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,Davisville,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,"Moore Park, Summerhill East",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [173]:
df_02 = df_01.drop('Neighbourhood', axis=1)

n_group = 6 # we will group neighbourhoods into 6 clusters

# run k-means clustering
kmeans = KMeans(n_clusters=n_group, random_state=0).fit(df_02)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10]

array([3, 0, 0, 0, 5, 0, 0, 5, 2, 0], dtype=int32)

In [180]:
# add clustering labels
df_01.insert(1, 'Lwhabel', kmeans.labels_)
df_01.head()

Unnamed: 0,Neighbourhood,label,Airport,Bakery,Bar,Breakfast Spot,Brewery,Café,Clothing Store,Coffee Shop,...,Japanese Restaurant,Liquor Store,Mexican Restaurant,Neighborhood,Park,Playground,Restaurant,Theme Restaurant,Trail,Yoga Studio
0,Lawrence Park,3,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
1,Davisville North,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,"North Toronto West, Lawrence Park",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,Davisville,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,"Moore Park, Summerhill East",5,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [181]:
combo_data_tor_merged = pd.merge(combo_data_tor, df_01, on='Neighbourhood', how='right')
combo_data_tor_merged.head()

Unnamed: 0,Borough,Postal Code,Neighbourhood,Latitude,Longitude,label,Airport,Bakery,Bar,Breakfast Spot,...,Japanese Restaurant,Liquor Store,Mexican Restaurant,Neighborhood,Park,Playground,Restaurant,Theme Restaurant,Trail,Yoga Studio
0,Central Toronto,M4N,Lawrence Park,43.72802,-79.38879,3,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
1,Central Toronto,M4P,Davisville North,43.712751,-79.390197,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,Central Toronto,M4R,"North Toronto West, Lawrence Park",43.715383,-79.405678,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,Central Toronto,M4S,Davisville,43.704324,-79.38879,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Central Toronto,M4T,"Moore Park, Summerhill East",43.689574,-79.38316,5,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [186]:
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=10)

# set color scheme for the clusters
x = np.arange(n_group)
ys = [i + x + (i*x)**2 for i in range(n_group)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(combo_data_tor_merged['Latitude'], combo_data_tor_merged['Longitude'], combo_data_tor_merged['Neighborhood'], combo_data_tor_merged['label']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters