## RZ's segmenting and clustering neighborhoods in Toronto 

## Part1. web page table scraping

In [149]:
!pip install beautifulsoup4



In [150]:
import requests

# Wikipedia page listing the Toronto ("M") postal codes with their boroughs and neighbourhoods.
WIKI_URL = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# The timeout keeps the notebook from hanging when the site is unreachable, and
# raise_for_status() stops here on an HTTP error instead of parsing an error page as data.
req = requests.get(WIKI_URL, timeout=30)
req.raise_for_status()
page = req.text

In [151]:
from bs4 import BeautifulSoup

# Parse the downloaded HTML with Python's built-in 'html.parser' backend.
soup = BeautifulSoup(markup=page, features='html.parser')

In [152]:
# Sanity check: re-read the response body and display the type of the parsed document.
page = req.text
type(soup)

bs4.BeautifulSoup

In [153]:
#print(soup.prettify())

In [154]:
#len(soup.find_all('table'))
#soup.find_all('table')[2]
#soup.table['class']

In [155]:
#soup.find_all('table')#[1]#['class']

In [156]:
# List the CSS classes of every <table> on the page that declares a class,
# to find out which class identifies the postal-code table.
[table['class'] for table in soup.find_all('table') if table.get('class')]

[['wikitable', 'sortable'], ['navbox']]

In [157]:
#eyeball the class property of the target table:
#table_html = str(soup.find('table','wikitable'))
#from IPython.core.display import HTML
#HTML(table_html)

In [158]:
# Locate the first table carrying the 'wikitable' class and collect its <tr> rows.
target_table = soup.find('table', 'wikitable')
rows = list(target_table.find_all('tr'))

In [159]:
#rows[2].get_text()

In [160]:
# Helper used to clean the text of each scraped row or cell.
def rem_nl(s):
    """Return ``s`` with every newline character removed (other whitespace is kept)."""
    return s.replace('\n', '')

### The dataframe will consist of three columns: PostalCode, Borough, and Neighborhood: done

In [161]:
# Read the header cells (<th>) of the first table row to get the column names.
columns = [rem_nl(col.get_text()) for col in rows[0].find_all("th") if col.get_text()]

# The first header was changed from 'Postal Code' to 'Post Code' on 07/19/2020.
# Normalise whichever spelling is present; the previous remove()/insert() pair raised
# ValueError whenever the page already used 'Postal Code'.
columns = ['Postal Code' if name in ('Post Code', 'Postal Code') else name for name in columns]
columns

['Postal Code', 'Borough', 'Neighborhood']

In [162]:
#[rem_nl(col.get_text()) for col in rows[1].find_all("td") if col.get_text()]

In [163]:
# Flatten every <td> cell of the data rows (all rows after the header) into one
# list of cleaned strings, in reading order.
item = [
    rem_nl(cell.get_text())
    for table_row in rows[1:]
    for cell in table_row.find_all("td")
]

In [164]:
# Regroup the flat cell list into one tuple per table row.
# The stride is the number of columns (previously hard-coded as 3), so the slicing
# always stays in step with `columns`. list() materialises the result: a bare zip
# iterator is empty after its first use, which broke re-running the DataFrame cell.
n_columns = len(columns)
stacked_item = list(zip(*[item[i::n_columns] for i in range(n_columns)]))

In [165]:
import pandas as pd

# Build the raw postal-code table: one DataFrame row per scraped table row.
df_html = pd.DataFrame(data=stacked_item, columns=columns)

In [166]:
#df_html.head()
#df_html.columns

### Ignore cells with a borough that is Not assigned. -- done

In [167]:
# Keep only the postal codes that have an assigned borough.
# reset_index() returns a new frame, so its result has to be assigned for the
# renumbering to take effect (the earlier standalone call was discarded).
df = df_html[df_html['Borough'] != 'Not assigned'].reset_index(drop=True)

# Confirm that no 'Not assigned' borough is left (expected: False).
'Not assigned' in df['Borough'].unique()

False

### More than one neighborhood can exist in one postal code area. For example, in the table on the Wikipedia page, you will notice that M5A is listed twice and has two neighborhoods: _Harbourfront_ and _Regent Park_. These two rows will be combined into one row, with the neighborhoods separated by a comma, as shown in row 11 in the above table. -- no action needed

In [168]:
# Count the rows per postal code; a maximum of 1 means no postal code is listed twice.
df_count = df['Postal Code'].value_counts()
df_count.max()

1

### If a cell has a borough but a Not assigned neighborhood, then the neighborhood will be the same as the borough. -- no action needed

In [169]:
# Check both text columns for leftover 'Not assigned' placeholders.
has_unassigned_neighborhood = 'Not assigned' in df['Neighborhood'].unique()
has_unassigned_borough = 'Not assigned' in df['Borough'].unique()
print("Does Column 'Neighborhood' have 'Not assigned'?", has_unassigned_neighborhood)
print("Does Column 'Borough' have 'Not assigned'?", has_unassigned_borough)

Does Column 'Neighborhood' have 'Not assigned'? False
Does Column 'Borough' have 'Not assigned'? False


### In the last cell of your notebook, use the .shape method to print the number of rows of your dataframe. -- done

In [170]:
# Display the (rows, columns) size of the cleaned postal-code table.
df.shape

(103, 3)

## Part 2. adding the latitude and longitude for each neighborhood

*******************************************************************************************************************************
Acquiring latitude and longitude for each postal code by reading a <mark>.csv</mark> file, because:
1. the <mark>geocoder</mark> request to Google was denied,
2. <mark>Nominatim</mark> from <mark>Geopy.geocoders</mark> only returns location coordinates for the city instead of for each postal code.

These 2 unsuccessful methods are listed below for reference.
*******************************************************************************************************************************

In [171]:
# geocoder method
# Kept for reference only: this Google lookup was denied (see the printed result below),
# so the coordinates used later come from the CSV file instead.
!pip install geocoder
import geocoder
# Ask Google's geocoder for postal code M1A and display its [lat, lng] pair.
g = geocoder.google('{}, Toronto, Ontario'.format('M1A'))
g.latlng



In [172]:
# Print the geocoder result object to show the request status.
print(g)

<[REQUEST_DENIED] Google - Geocode [empty]>


In [173]:
# Nominatim method
from geopy.geocoders import Nominatim

#address = '{}, Toronto, Ontario'.format('M1B')
address = 'Harbourfront West, Toronto, Ontario'
geolocator = Nominatim(user_agent="toronto_explorer")
location = geolocator.geocode(address)

# geocode() returns None when nothing matches, so guard before reading the coordinates.
if location is None:
    latitude = longitude = None
    print('No coordinates found for {}.'.format(address))
else:
    latitude = location.latitude
    longitude = location.longitude
    # Report the address that was actually queried; the earlier message named 'M1B'
    # even though the lookup was for Harbourfront West.
    print('The geographical coordinates of {} are {}, {}.'.format(address, latitude, longitude))

The geograpical coordinate of M1B Toronto are 43.6400801, -79.3801495.


Read the file and merge with dataframe file from Part1

In [174]:
import pandas as pd

# Load the latitude/longitude table from the local CSV file.
df_loc = pd.read_csv(filepath_or_buffer='Geospatial_Coordinates.csv')

In [175]:
# Spot-check the coordinates stored for postal code M1B.
is_m1b = df_loc['Postal Code'] == 'M1B'
df_loc.loc[is_m1b]

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353


In [178]:
# Attach coordinates to each postal code; the left join keeps every row of the scraped table.
# Selecting the three scraped columns first makes this cell safe to re-run: merging an
# already-merged `df` a second time would otherwise yield Latitude_x / Latitude_y columns.
df = pd.merge(
    df[['Postal Code', 'Borough', 'Neighborhood']],
    df_loc,
    how='left',
    on=['Postal Code'],
)

# Preview the first rows rather than rendering the whole table.
df.head(10)

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.654260,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village",43.667856,-79.532242
6,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
7,M3B,North York,Don Mills,43.745906,-79.352188
8,M4B,East York,"Parkview Hill, Woodbine Gardens",43.706397,-79.309937
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937


In [116]:
# Borough of the first row whose neighbourhood text mentions 'Regent Park'.
mentions_regent_park = df['Neighborhood'].str.contains('Regent Park', na=False)
df.loc[mentions_regent_park, 'Borough'].iloc[0]

'Downtown Toronto'

## Part 3: explore and cluster the neighborhoods:

### Step 1: create a list of neighborhoods that are located only in boroughs whose name contains the word Toronto.

In [117]:
# Neighbourhood strings of every borough whose name contains "Toronto".
in_toronto = df['Borough'].str.contains("Toronto", na=False)
table_list = df.loc[in_toronto, 'Neighborhood'].tolist()
table_string = ','.join(table_list)

# Split the comma-separated names, trim surrounding whitespace and de-duplicate with a set.
neighborhood_list = {name.strip() for name in table_string.split(',')}
print(len(neighborhood_list))

76


{'Adelaide',
 'Bathurst Quay',
 'Berczy Park',
 'Brockton',
 'Business reply mail Processing Centre',
 'CN Tower',
 'Cabbagetown',
 'Central Bay Street',
 'Chinatown',
 'Christie',
 'Church and Wellesley',
 'Commerce Court',
 'Davisville',
 'Davisville North',
 'Deer Park',
 'Design Exchange',
 'Dovercourt Village',
 'Dufferin',
 'Exhibition Place',
 'First Canadian Place',
 'Forest Hill North & West',
 'Forest Hill Road Park',
 'Forest Hill SE',
 'Garden District',
 'Grange Park',
 'Harbord',
 'Harbourfront',
 'Harbourfront East',
 'Harbourfront West',
 'High Park',
 'India Bazaar',
 'Island airport',
 'Kensington Market',
 'King',
 'King and Spadina',
 'Lawrence Park',
 'Little Portugal',
 'Moore Park',
 'North Midtown',
 'North Toronto West',
 'Ontario Provincial Government',
 'Parkdale',
 'Parkdale Village',
 "Queen's Park",
 'Railway Lands',
 'Rathnelly',
 'Regent Park',
 'Richmond',
 'Riverdale',
 'Roncesvalles',
 'Rosedale',
 'Roselawn',
 'Runnymede',
 'Ryerson',
 'South Central

In [118]:
# Rebuild the joined neighbourhood string using ', ' (comma plus space) as the separator.
table_string = ', '.join(table_list)

### Step 2: create a dataframe having all toronto neighborhoods and their latitude and longitude

In [119]:
neighborhoods_column_names = ['Borough', 'Neighborhood', 'Latitude', 'Longitude']

# Map each neighbourhood to its borough by exact name, looking only at the Toronto boroughs.
# The earlier str.contains() lookup treated the name as a regex and matched substrings
# across every borough, so a short name could resolve to the row of a longer, unrelated one.
toronto_rows = df[df['Borough'].str.contains('Toronto', na=False)]
borough_by_neighborhood = {}
for borough_name, neighborhood_text in zip(toronto_rows['Borough'], toronto_rows['Neighborhood']):
    for part in str(neighborhood_text).split(','):
        # setdefault keeps the first borough seen for a name, like the old .iloc[0] did.
        borough_by_neighborhood.setdefault(part.strip(), borough_name)

# One geocoder client is enough for the whole loop.
geolocator = Nominatim(user_agent = 'toronto_explore')

# Collect plain records and build the frame once: DataFrame.append was removed in
# pandas 2.0 and growing a frame row by row is quadratic.
neighborhood_records = []
for neighborhood in neighborhood_list:
    borough = borough_by_neighborhood.get(neighborhood)
    if borough is None:
        # Fallback for names not produced by the split above: literal substring search.
        matches = df['Borough'][df['Neighborhood'].str.contains(neighborhood, na=False, regex=False)]
        if matches.empty:
            continue
        borough = matches.iloc[0]

    # Look up latitude and longitude for the neighbourhood.
    address = neighborhood + ', Toronto, Canada'
    location = geolocator.geocode(address)
    if location is None:
        # No geocoding result: leave the neighbourhood out (previously it was added
        # with 'NA' placeholders and dropped afterwards).
        continue

    neighborhood_records.append({'Borough': borough,
                                 'Neighborhood': neighborhood,
                                 'Latitude': location.latitude,
                                 'Longitude': location.longitude})

# Coordinates are now numeric columns; the row index is consecutive.
neighborhoods = pd.DataFrame(neighborhood_records, columns=neighborhoods_column_names)

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude
0,Downtown Toronto,Harbourfront West,43.6401,-79.3801
1,Downtown Toronto,Richmond,43.8126,-79.2634
2,Central Toronto,North Midtown,43.7057,-79.3976
3,Downtown Toronto,King and Spadina,43.6455,-79.395
4,Downtown Toronto,Toronto Dominion Centre,43.6474,-79.3814
5,West Toronto,Brockton,43.6509,-79.44
6,Downtown Toronto,Harbourfront,43.6401,-79.3801
7,Central Toronto,Forest Hill North & West,43.701,-79.4256
8,East Toronto,India Bazaar,43.6722,-79.3235
9,Central Toronto,Summerhill East,43.6817,-79.3905


In [120]:
#neighborhoods['Neighborhood'].value_counts

<bound method IndexOpsMixin.value_counts of 0            Harbourfront West
1                     Richmond
2                North Midtown
3             King and Spadina
4      Toronto Dominion Centre
5                     Brockton
6                 Harbourfront
7     Forest Hill North & West
8                 India Bazaar
9              Summerhill East
10                   Runnymede
11           Harbourfront East
12                     Harbord
13                 Regent Park
14             Design Exchange
15                     Swansea
16                 Cabbagetown
17                    Rosedale
18                  South Hill
20            Parkdale Village
21              Commerce Court
24                        King
25                  Davisville
26             Summerhill West
27              Forest Hill SE
28                    Roselawn
29                    CN Tower
30                   The Annex
31                   High Park
32                   Rathnelly
                ...       

### Step 3: query Foursquare for each Toronto neighborhood, and build a venue dataframe that includes at least the neighborhood, its latitude and longitude, the venue name and its category type

Prepare Foursquare request:

In [121]:
import os

# Read the Foursquare credentials from the environment so they are never stored in the notebook.
# Set FOURSQUARE_CLIENT_ID and FOURSQUARE_CLIENT_SECRET before starting the kernel.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '') # your Foursquare ID
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', '') # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

# Report only whether each credential is set; printing the values would save them
# in the cell output of the shared notebook.
print('Your credentials:')
print('CLIENT_ID: ' + ('set' if CLIENT_ID else 'missing'))
print('CLIENT_SECRET: ' + ('set' if CLIENT_SECRET else 'missing'))

Your credentails:
CLIENT_ID: [REDACTED]
CLIENT_SECRET:[REDACTED]


LIMIT = 100 # limit of number of venues returned by Foursquare API
radius = 500 # define radius

# Example request URL for a single neighbourhood. neighborhood_latitude and
# neighborhood_longitude were never defined in this notebook (NameError), so take them
# from the first row of the `neighborhoods` table built in Step 2.
neighborhood_latitude = neighborhoods['Latitude'].iloc[0]
neighborhood_longitude = neighborhoods['Longitude'].iloc[0]

url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    neighborhood_latitude, 
    neighborhood_longitude, 
    radius, 
    LIMIT)
# The URL embeds the client secret, so it is deliberately not echoed as cell output.

In [122]:
# create a function to repeat the process acquiring venues:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    """Collect the Foursquare venues found around each neighbourhood.

    Parameters
    ----------
    names, latitudes, longitudes : iterables of equal length
        Neighbourhood name and the coordinates to search around, consumed in parallel.
    radius : int, default 500
        Value sent as the ``radius`` query parameter of the explore request.

    Returns
    -------
    pandas.DataFrame
        One row per venue with the columns 'Neighborhood', 'Neighborhood Latitude',
        'Neighborhood Longitude', 'Venue', 'Venue Latitude', 'Venue Longitude' and
        'Venue Category'. The frame is empty, but keeps these columns, when no venue
        is returned.

    Raises
    ------
    requests.HTTPError
        If Foursquare answers with an error status.

    Notes
    -----
    Uses the notebook-level ``CLIENT_ID``, ``CLIENT_SECRET``, ``VERSION`` and ``LIMIT``.
    """
    venue_columns = ['Neighborhood',
                     'Neighborhood Latitude',
                     'Neighborhood Longitude',
                     'Venue',
                     'Venue Latitude',
                     'Venue Longitude',
                     'Venue Category']
    endpoint = 'https://api.foursquare.com/v2/venues/explore'

    venue_rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        # Pass the query as params so requests handles the URL encoding.
        query = {'client_id': CLIENT_ID,
                 'client_secret': CLIENT_SECRET,
                 'v': VERSION,
                 'll': '{},{}'.format(lat, lng),
                 'radius': radius,
                 'limit': LIMIT}

        # make the GET request; fail on HTTP errors instead of on a missing JSON key
        response = requests.get(endpoint, params=query, timeout=30)
        response.raise_for_status()
        groups = response.json().get('response', {}).get('groups', [])
        results = groups[0]['items'] if groups else []

        # keep only the relevant information for each nearby venue
        for v in results:
            venue = v['venue']
            # A venue can come back without any category; record None instead of
            # raising IndexError on categories[0].
            categories = venue.get('categories') or []
            category = categories[0]['name'] if categories else None
            venue_rows.append((name,
                               lat,
                               lng,
                               venue['name'],
                               venue['location']['lat'],
                               venue['location']['lng'],
                               category))

    # Naming the columns in the constructor also works for zero rows; assigning
    # .columns afterwards raised ValueError on an empty result.
    nearby_venues = pd.DataFrame(venue_rows, columns=venue_columns)

    return(nearby_venues)

In [123]:
# Query Foursquare once per geocoded neighbourhood (default radius of 500) and
# gather all venues into a single table.
toronto_venues = getNearbyVenues(
    neighborhoods['Neighborhood'],
    neighborhoods['Latitude'],
    neighborhoods['Longitude'],
)

In [124]:
# Print the (rows, columns) size of the venue table, then preview its first five rows.
print(toronto_venues.shape)
toronto_venues.head()

(3672, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Harbourfront West,43.64008,-79.38015,Harbour Square Park,43.639253,-79.378395,Park
1,Harbourfront West,43.64008,-79.38015,Lake Ontario,43.638945,-79.379665,Lake
2,Harbourfront West,43.64008,-79.38015,Harbourfront,43.639526,-79.380688,Neighborhood
3,Harbourfront West,43.64008,-79.38015,BeaverTails,43.639736,-79.380068,Dessert Shop
4,Harbourfront West,43.64008,-79.38015,Miku,43.641374,-79.377531,Japanese Restaurant


In [125]:
# Number of distinct venue categories found across all neighbourhoods.
toronto_venues['Venue Category'].unique().size

286

In [126]:
#toronto_venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Adelaide,100,100,100,100,100,100
Bathurst Quay,23,23,23,23,23,23
Berczy Park,100,100,100,100,100,100
Brockton,17,17,17,17,17,17
CN Tower,57,57,57,57,57,57
Cabbagetown,48,48,48,48,48,48
Central Bay Street,100,100,100,100,100,100
Chinatown,68,68,68,68,68,68
Christie,57,57,57,57,57,57
Church and Wellesley,72,72,72,72,72,72


### Step 4: <mark>onehot</mark> venue type, and group by neighborhood 

In [127]:
# One-hot encoding: one indicator column per venue category, with no column prefix.
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix='', prefix_sep='')

# Re-attach the neighbourhood under the name 'Neighborhoodname', because
# 'Neighborhood' is itself one of the venue categories.
toronto_onehot['Neighborhoodname'] = toronto_venues['Neighborhood']

# Rotate the newly appended last column to the front.
onehot_columns = list(toronto_onehot.columns)
fixed_columns = onehot_columns[-1:] + onehot_columns[:-1]
toronto_onehot = toronto_onehot[fixed_columns]

toronto_onehot.head()

Unnamed: 0,Neighborhoodname,Accessories Store,African Restaurant,Airport,Airport Service,American Restaurant,Animal Shelter,Antique Shop,Aquarium,Art Gallery,...,Tunnel,University,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Wine Bar,Wings Joint,Women's Store,Yoga Studio
0,Harbourfront West,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Harbourfront West,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Harbourfront West,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Harbourfront West,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Harbourfront West,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [128]:
# Average the one-hot indicators within each neighbourhood, which gives the share of
# that neighbourhood's venues falling into each category.
category_means = toronto_onehot.groupby('Neighborhoodname').mean()
toronto_grouped = category_means.reset_index()

Unnamed: 0,Neighborhoodname,Accessories Store,African Restaurant,Airport,Airport Service,American Restaurant,Animal Shelter,Antique Shop,Aquarium,Art Gallery,...,Tunnel,University,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Wine Bar,Wings Joint,Women's Store,Yoga Studio
0,Adelaide,0.000000,0.000000,0.000000,0.000000,0.040000,0.000000,0.000000,0.000000,0.010000,...,0.000000,0.00,0.010000,0.000000,0.000000,0.000000,0.010000,0.000000,0.000000,0.000000
1,Bathurst Quay,0.000000,0.000000,0.043478,0.043478,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.043478,0.00,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
2,Berczy Park,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.010000,0.000000,0.020000,...,0.000000,0.00,0.010000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.010000
3,Brockton,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.00,0.000000,0.000000,0.000000,0.117647,0.000000,0.000000,0.000000,0.000000
4,CN Tower,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.017544,0.000000,...,0.000000,0.00,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.017544
5,Cabbagetown,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.00,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
6,Central Bay Street,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.010000,...,0.000000,0.00,0.010000,0.000000,0.010000,0.000000,0.000000,0.000000,0.020000,0.010000
7,Chinatown,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.014706,...,0.000000,0.00,0.029412,0.014706,0.000000,0.029412,0.014706,0.000000,0.000000,0.000000
8,Christie,0.000000,0.000000,0.000000,0.000000,0.017544,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.00,0.000000,0.000000,0.017544,0.017544,0.017544,0.000000,0.000000,0.000000
9,Church and Wellesley,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.00,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.027778


In [129]:
# Display the (rows, columns) size of the per-neighbourhood frequency table.
toronto_grouped.shape

(70, 287)

### Step 5: select a certain number of the most popular venue categories for each neighborhood

In [130]:
# first, write a function that ranks the venue categories of one row in
# descending order and returns the corresponding category names
def return_most_common_venues(row, num_top_venues):
    """Return the labels of the ``num_top_venues`` largest entries of ``row``.

    The first element of ``row`` is skipped; the remaining values are sorted from
    highest to lowest and the index labels of the leading entries are returned as an
    array. Fewer labels are returned when the row has fewer entries.
    """
    ranked = row.iloc[1:].sort_values(ascending=False)
    return ranked.index.values[:num_top_venues]

In [131]:
import numpy as np
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhoodname']
for ind in np.arange(num_top_venues):
    # Ranks 1-3 take 'st'/'nd'/'rd' and every later rank takes 'th'. An explicit bounds
    # test replaces the bare `except:`, which would also have hidden unrelated errors.
    suffix = indicators[ind] if ind < len(indicators) else 'th'
    columns.append('{}{} Most Common Venue'.format(ind+1, suffix))

# create a new dataframe with one row per neighbourhood
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhoodname'] = toronto_grouped['Neighborhoodname']

# fill the ranking columns of each row with that neighbourhood's top categories
for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhoodname,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Adelaide,Restaurant,Café,Gym,Coffee Shop,American Restaurant,Cosmetics Shop,Clothing Store,Italian Restaurant,Seafood Restaurant,Japanese Restaurant
1,Bathurst Quay,Coffee Shop,Café,Park,Sushi Restaurant,Japanese Restaurant,Ramen Restaurant,Caribbean Restaurant,Diner,Harbor / Marina,Sculpture Garden
2,Berczy Park,Coffee Shop,Café,Italian Restaurant,Restaurant,Hotel,Japanese Restaurant,Beer Bar,Bakery,Seafood Restaurant,Gastropub
3,Brockton,Bar,Vietnamese Restaurant,Park,Jazz Club,Gastropub,Korean Restaurant,Portuguese Restaurant,Café,French Restaurant,Bakery
4,CN Tower,Hotel,Coffee Shop,Bar,Pizza Place,Baseball Stadium,Gym,Ice Cream Shop,Scenic Lookout,Yoga Studio,Sandwich Place


### Step 6: run _k_-means to cluster the neighborhood into 5 clusters(dropping 'Neighborhood' for clustering), and then insert cluster labels

In [132]:
from sklearn.cluster import KMeans
# set number of clusters
kclusters = 5

# Features only: drop the name column by keyword. The positional form
# drop('Neighborhoodname', 1) is rejected by pandas 2.x.
toronto_grouped_clustering = toronto_grouped.drop(columns='Neighborhoodname')

# run k-means clustering (fixed random_state so the labels are reproducible)
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for the first ten rows of the dataframe
kmeans.labels_[0:10]

array([1, 0, 0, 1, 1, 0, 1, 1, 1, 0])

In [133]:
# Put the k-means label of each neighbourhood in the first column.
# insert() raises ValueError when the column already exists, so drop any label column
# left by an earlier run first; this keeps the cell re-runnable.
if 'Cluster Labels' in neighborhoods_venues_sorted.columns:
    neighborhoods_venues_sorted = neighborhoods_venues_sorted.drop(columns='Cluster Labels')
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

In [134]:
# Start the merged table from the geocoded neighbourhoods, renaming the key column to
# 'Neighborhoodname' so it matches the key used in neighborhoods_venues_sorted.
toronto_merged = neighborhoods.rename(columns={"Neighborhood": "Neighborhoodname"})

Unnamed: 0,Borough,Neighborhoodname,Latitude,Longitude
0,Downtown Toronto,Harbourfront West,43.6401,-79.3801
1,Downtown Toronto,Richmond,43.8126,-79.2634
2,Central Toronto,North Midtown,43.7057,-79.3976
3,Downtown Toronto,King and Spadina,43.6455,-79.395
4,Downtown Toronto,Toronto Dominion Centre,43.6474,-79.3814
5,West Toronto,Brockton,43.6509,-79.44
6,Downtown Toronto,Harbourfront,43.6401,-79.3801
7,Central Toronto,Forest Hill North & West,43.701,-79.4256
8,East Toronto,India Bazaar,43.6722,-79.3235
9,Central Toronto,Summerhill East,43.6817,-79.3905


In [135]:
# Combine each neighbourhood's borough and coordinates with its cluster label and
# top-venue ranking, matching rows on 'Neighborhoodname'.
venue_ranking_by_name = neighborhoods_venues_sorted.set_index('Neighborhoodname')
toronto_merged = toronto_merged.join(venue_ranking_by_name, on='Neighborhoodname')

toronto_merged.head()

Unnamed: 0,Borough,Neighborhoodname,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Downtown Toronto,Harbourfront West,43.6401,-79.3801,0,Coffee Shop,Café,Restaurant,Hotel,Italian Restaurant,Bank,Brewery,Sandwich Place,Chinese Restaurant,Plaza
1,Downtown Toronto,Richmond,43.8126,-79.2634,4,Beer Store,Ice Cream Shop,Park,Yoga Studio,Egyptian Restaurant,Electronics Store,Ethiopian Restaurant,Event Space,Falafel Restaurant,Farm
2,Central Toronto,North Midtown,43.7057,-79.3976,0,Coffee Shop,Italian Restaurant,Sushi Restaurant,Pub,Fast Food Restaurant,Mexican Restaurant,Restaurant,Deli / Bodega,Gym,Bookstore
3,Downtown Toronto,King and Spadina,43.6455,-79.395,1,Restaurant,Coffee Shop,Italian Restaurant,Hotel,Beer Bar,Taco Place,Bar,French Restaurant,Café,Pizza Place
4,Downtown Toronto,Toronto Dominion Centre,43.6474,-79.3814,0,Coffee Shop,Hotel,Café,American Restaurant,Restaurant,Japanese Restaurant,Seafood Restaurant,Gym,Steakhouse,Italian Restaurant


### Step 7: visualize the resulting clusters

In [136]:
# Geocode the city itself; these coordinates are used to centre the map.
toronto_address = 'Toronto, Ontario'
geolocator = Nominatim(user_agent="toronto_explorer")
location = geolocator.geocode(toronto_address)
tor_latitude, tor_longitude = location.latitude, location.longitude
print('The geograpical coordinate Toronto are {}, {}.'.format(tor_latitude, tor_longitude))

The geograpical coordinate Toronto are 43.6534817, -79.3839347.


In [137]:
import folium # map rendering library
# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# create map centred on Toronto
map_clusters = folium.Map(location=[tor_latitude, tor_longitude], zoom_start=11)

# set color scheme for the clusters: one evenly spaced rainbow colour per cluster
x = np.arange(kclusters)
colors_array = cm.rainbow(np.linspace(0, 1, len(x)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# A neighbourhood with no row in the venue ranking gets a NaN label from the join,
# which also turns the whole label column into floats. Skip unlabeled rows and cast
# the label back to int so it can index the colour list.
labelled = toronto_merged.dropna(subset=['Cluster Labels'])

# add markers to the map
for lat, lon, poi, cluster in zip(labelled['Latitude'], labelled['Longitude'], labelled['Neighborhoodname'], labelled['Cluster Labels']):
    cluster = int(cluster)
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    # Labels run from 0 to kclusters-1, so they index the colour list directly
    # (the earlier `cluster-1` sent cluster 0 to the last colour via index -1).
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster],
        fill=True,
        fill_color=rainbow[cluster],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

### Step 8: analyze the cluster by listing the most popular venues for each cluster

For each cluster, the 10 most popular venue categories of each of its neighborhoods were pooled together. The 10 most frequent categories in that pooled list characterize the cluster, and from them we can probably tell the clusters apart.

- Cluster 0: business area, with popular venues including banks, hotels, and restaurants;
- Cluster 1: shopping area, with clothing stores, bars, and restaurants;
- Cluster 2: suburban and residential area, with farms, bike trails, and event spaces;
- Cluster 3: recreation and residential area, with tennis courts and gyms;
- Cluster 4: residential area similar to Cluster 2, also with parks, farms, and event spaces;

In [138]:
import collections

# Venue-ranking columns (position 5 onward) of the neighbourhoods labelled cluster 0.
in_cluster0 = toronto_merged['Cluster Labels'] == 0
df_cluster0 = toronto_merged.loc[in_cluster0, toronto_merged.columns[5:]]

# Flatten the rankings row by row into one list of category names.
cluster0_cate_list = [
    category
    for _, ranking in df_cluster0.iterrows()
    for category in ranking.to_list()
]

# Show the ten categories that occur most often in the cluster.
counter = collections.Counter(cluster0_cate_list)
print(counter.most_common(10))

[('Coffee Shop', 27), ('Café', 19), ('Restaurant', 17), ('Italian Restaurant', 16), ('Hotel', 12), ('Japanese Restaurant', 12), ('Sandwich Place', 9), ('Sushi Restaurant', 9), ('Bank', 8), ('Deli / Bodega', 7)]


Cluster 0: has business and restaurants

In [139]:
# Venue-ranking columns (position 5 onward) of the neighbourhoods labelled cluster 1.
in_cluster1 = toronto_merged['Cluster Labels'] == 1
df_cluster1 = toronto_merged.loc[in_cluster1, toronto_merged.columns[5:]]

# Flatten the rankings row by row into one list of category names.
cluster1_cate_list = [
    category
    for _, ranking in df_cluster1.iterrows()
    for category in ranking.to_list()
]

# Show the ten categories that occur most often in the cluster.
counter = collections.Counter(cluster1_cate_list)
print(counter.most_common(10))

[('Coffee Shop', 27), ('Café', 26), ('Bar', 20), ('Pizza Place', 14), ('Restaurant', 13), ('Park', 13), ('Bakery', 11), ('Italian Restaurant', 8), ('Sandwich Place', 8), ('Clothing Store', 8)]


Cluster 1: with park and clothing store, this cluster is residence cluster

In [140]:
# Venue-ranking columns (position 5 onward) of the neighbourhoods labelled cluster 2.
in_cluster2 = toronto_merged['Cluster Labels'] == 2
df_cluster2 = toronto_merged.loc[in_cluster2, toronto_merged.columns[5:]]

# Flatten the rankings row by row into one list of category names.
cluster2_cate_list = [
    category
    for _, ranking in df_cluster2.iterrows()
    for category in ranking.to_list()
]

# Show the ten categories that occur most often in the cluster.
counter = collections.Counter(cluster2_cate_list)
print(counter.most_common(10))

[('Playground', 2), ('Park', 2), ('Falafel Restaurant', 2), ('Farm', 2), ('Bike Trail', 1), ('Eastern European Restaurant', 1), ('Egyptian Restaurant', 1), ('Electronics Store', 1), ('Ethiopian Restaurant', 1), ('Event Space', 1)]


In [141]:
# Venue-ranking columns (position 5 onward) of the neighbourhoods labelled cluster 3.
in_cluster3 = toronto_merged['Cluster Labels'] == 3
df_cluster3 = toronto_merged.loc[in_cluster3, toronto_merged.columns[5:]]

# Flatten the rankings row by row into one list of category names.
cluster3_cate_list = [
    category
    for _, ranking in df_cluster3.iterrows()
    for category in ranking.to_list()
]

# Show the ten categories that occur most often in the cluster.
counter = collections.Counter(cluster3_cate_list)
print(counter.most_common(10))

[('Tennis Court', 1), ('Gym', 1), ('Trail', 1), ('Park', 1), ('Doner Restaurant', 1), ('Donut Shop', 1), ('Dumpling Restaurant', 1), ('Eastern European Restaurant', 1), ('Egyptian Restaurant', 1), ('Electronics Store', 1)]


In [142]:
# Venue-ranking columns (position 5 onward) of the neighbourhoods labelled cluster 4.
in_cluster4 = toronto_merged['Cluster Labels'] == 4
df_cluster4 = toronto_merged.loc[in_cluster4, toronto_merged.columns[5:]]

# Flatten the rankings row by row into one list of category names.
cluster4_cate_list = [
    category
    for _, ranking in df_cluster4.iterrows()
    for category in ranking.to_list()
]

# Show the ten categories that occur most often in the cluster.
counter = collections.Counter(cluster4_cate_list)
print(counter.most_common(10))

[('Park', 2), ('Egyptian Restaurant', 2), ('Electronics Store', 2), ('Ethiopian Restaurant', 2), ('Event Space', 2), ('Falafel Restaurant', 2), ('Farm', 2), ('Beer Store', 1), ('Ice Cream Shop', 1), ('Yoga Studio', 1)]
