Segmenting and Clustering Neighborhoods in Toronto

Part 1: Web scraping for Toronto neighborhood and build a clean dataframe

Used the Notebook to build the code to scrape the following Wikipedia page, https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M, in order to obtain the data that is in the table of postal codes and to transform the data into a pandas dataframe.


In [58]:
import pandas as pd
import numpy as np
import json
import matplotlib.pyplot as plt
%matplotlib inline

In [59]:
!conda install -c conda-forge beautifulsoup4 --yes

Collecting package metadata (repodata.json): done
Solving environment: done

# All requested packages already installed.



In [232]:
import requests
from pandas.io.json import json_normalize
from bs4 import BeautifulSoup


In [233]:
column_names = ['PostalCode', 'Borough', 'Neighborhood']
df = pd.DataFrame(columns=column_names)
df

Unnamed: 0,PostalCode,Borough,Neighborhood


In [234]:
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

In [235]:
#Scrape the HTML
data = requests.get(url).text
soup = BeautifulSoup(data,'html.parser')

In [236]:
PostalCodeList = []
BoroughList = []
Neighborhood = []

for row in soup.find('table').find_all('tr'):
    cells = row.find_all('td')
    if(len(cells)>0):
        PostalCodeList.append(cells[0].text)
        BoroughList.append(cells[1].text)
        Neighborhood.append(cells[2].text.rstrip('\n'))


In [237]:
#INSTRUCT THE DATAFRAME
toronto_neighborhood = [('PostalCode', PostalCodeList),
                       ('Borough', BoroughList),
                       ('Neighborhood', Neighborhood)]
toronto_df = pd.DataFrame.from_dict(dict(toronto_neighborhood))
toronto_df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


In [238]:
#CLEAN THE NOT-ASSIGNED DATA
toronto_neighborhood = toronto_df[toronto_df.Borough != 'Not assigned'].reset_index(drop=True)
toronto_neighborhood.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights


In [239]:
#GROUP BY POSTALCODE
toronto_neighborhood = toronto_neighborhood.groupby(['PostalCode','Borough'], as_index=False).agg(lambda x:','.join(x))

In [240]:
toronto_neighborhood.head(10)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park,Ionview,Kennedy Park"
7,M1L,Scarborough,"Clairlea,Golden Mile,Oakridge"
8,M1M,Scarborough,"Cliffcrest,Cliffside,Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff,Cliffside West"


In [241]:
toronto_neighborhood.shape

(103, 3)

Part 2
Getting Postal Codes Lat/Lng

In [242]:
locations = pd.read_csv('https://cocl.us/Geospatial_data')
locations.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [243]:
toronto_df = pd.merge(left = toronto_neighborhood, right = locations, left_on='PostalCode', right_on='Postal Code')
toronto_df.drop(columns=['Postal Code'], inplace=True)
toronto_df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge,Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood,Morningside,West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


In [244]:
toronto_df.describe(include='all')

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
count,103,103,103,103.0,103.0
unique,103,11,103,,
top,M6A,North York,"The Kingsway,Montgomery Road,Old Mill North",,
freq,1,24,1,,
mean,,,,43.704608,-79.397153
std,,,,0.052463,0.097146
min,,,,43.602414,-79.615819
25%,,,,43.660567,-79.464763
50%,,,,43.696948,-79.38879
75%,,,,43.74532,-79.340923


Part 3
Explore and cluster the neighborhood in Toronto

In [245]:
!conda install -c conda-forge geopy --yes

from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

!conda install -c conda-forge folium=0.5.0 --yes 
import folium # map rendering library

print('Libraries imported.')

Collecting package metadata (repodata.json): done
Solving environment: done

# All requested packages already installed.

Collecting package metadata (repodata.json): done
Solving environment: done

# All requested packages already installed.

Libraries imported.


In [246]:
print('The dataframe has {} boroughs and {} neighborhoods.'.format(
        len(toronto_df['Borough'].unique()),
        toronto_df.shape[0]
    )
)

The dataframe has 11 boroughs and 103 neighborhoods.


In [247]:
address = 'Toronto, Ontario'

geolocator = Nominatim(user_agent="tl-toronto-neigh")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.653963, -79.387207.


In [248]:
#USE FOLIUM TO CREATE A MAP OF TORONTO WITH BOROUGHS IMPOSED ON MAP
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

for lat, lng, borough, neighborhood in zip(toronto_df['Latitude'], toronto_df['Longitude'], toronto_df['Borough'], toronto_df['Neighborhood']):
    label = '{},{}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng], radius = 5, popup = label, color = 'blue', fill=True, fill_color = '#3186cc',fill_opacity=0.7, parse_html=False).add_to(map_toronto)

map_toronto

Reduce the number of boroughs to explore deeper

In [249]:
toronto_df['Borough']

0           Scarborough
1           Scarborough
2           Scarborough
3           Scarborough
4           Scarborough
5           Scarborough
6           Scarborough
7           Scarborough
8           Scarborough
9           Scarborough
10          Scarborough
11          Scarborough
12          Scarborough
13          Scarborough
14          Scarborough
15          Scarborough
16          Scarborough
17           North York
18           North York
19           North York
20           North York
21           North York
22           North York
23           North York
24           North York
25           North York
26           North York
27           North York
28           North York
29           North York
             ...       
73                 York
74                 York
75     Downtown Toronto
76         West Toronto
77         West Toronto
78         West Toronto
79           North York
80                 York
81                 York
82         West Toronto
83         West 

In [250]:
boroughs = ['North York','Scargorough', 'Richmond', 'Downtown Toronto','East Toronto']
toronto_borough = toronto_df[toronto_df['Borough'].isin(boroughs).reset_index(drop=True)]
print(toronto_borough.shape)

(47, 5)


In [251]:
toronto_borough.index = range(len(toronto_borough))
toronto_borough.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M2H,North York,Hillcrest Village,43.803762,-79.363452
1,M2J,North York,"Fairview,Henry Farm,Oriole",43.778517,-79.346556
2,M2K,North York,Bayview Village,43.786947,-79.385975
3,M2L,North York,"Silver Hills,York Mills",43.75749,-79.374714
4,M2M,North York,"Newtonbrook,Willowdale",43.789053,-79.408493


In [252]:
map_toronto_borough = folium.Map(location=[latitude, longitude], zoom_start=10)

for lat, lng, borough, neighborhood in zip(toronto_borough['Latitude'], toronto_borough['Longitude'], toronto_borough['Borough'], toronto_borough['Neighborhood']):
    label = '{},{}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng], radius = 5, popup = label, color = 'blue', fill=True, fill_color = '#3186cc',fill_opacity=0.7, parse_html=False).add_to(map_toronto_borough)

map_toronto_borough

Part 4
Utilizing FourSquare API to explore the Boroughs

In [None]:
#Removed CLIENT ID and CLIENT SECRET for privacy

In [254]:
toronto_borough.loc[0,'Neighborhood']

'Hillcrest Village'

In [255]:
neigh_lat = toronto_borough.loc[0,'Latitude']
neigh_lng = toronto_borough.loc[0, 'Longitude']
neigh_name = toronto_borough.loc[0,'Neighborhood']


In [257]:
results = requests.get(url).json()
results

{'meta': {'code': 200, 'requestId': '5d8ff4b2c58ed7002cbf6396'},
 'response': {'headerLocation': 'Toronto',
  'headerFullLocation': 'Toronto',
  'headerLocationGranularity': 'city',
  'totalResults': 4,
  'suggestedBounds': {'ne': {'lat': 43.808262204500004,
    'lng': -79.3572281853783},
   'sw': {'lat': 43.7992621955, 'lng': -79.3696752146217}},
  'groups': [{'type': 'Recommended Places',
    'name': 'recommended',
    'items': [{'reasons': {'count': 0,
       'items': [{'summary': 'This spot is popular',
         'type': 'general',
         'reasonName': 'globalInteractionReason'}]},
      'venue': {'id': '4ad9dce6f964a520651b21e3',
       'name': "Eagle's Nest Golf Club",
       'location': {'address': '10000 Dufferin Rd',
        'lat': 43.805454826002794,
        'lng': -79.36418592243415,
        'labeledLatLngs': [{'label': 'display',
          'lat': 43.805454826002794,
          'lng': -79.36418592243415}],
        'distance': 197,
        'cc': 'CA',
        'city': 'Toronto

Part 5
Analyze Venues in Each Neighborhood

In [None]:
#Explore All Neighborhoods in Toronto

In [258]:
#ALL THE INFO WE NEED IS IN THE 'ITEMS'
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

In [259]:
radius = 500
LIMIT = 100

venues = []

for lat, long, post, borough, neighborhood in zip(toronto_borough['Latitude'], toronto_borough['Longitude'], toronto_borough['PostalCode'], toronto_borough['Borough'], toronto_borough['Neighborhood']):
    url = "https://api.foursquare.com/v2/venues/explore?client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}".format(
        CLIENT_ID,
        CLIENT_SECRET,
        VERSION,
        lat,
        long,
        radius, 
        LIMIT)
    
    results = requests.get(url).json()["response"]['groups'][0]['items']
    
    for venue in results:
        venues.append((
            post, 
            borough,
            neighborhood,
            lat, 
            long, 
            venue['venue']['name'], 
            venue['venue']['location']['lat'], 
            venue['venue']['location']['lng'],  
            venue['venue']['categories'][0]['name']))

In [260]:
venues_df = pd.DataFrame(venues)
venues_df.columns = ['PostalCode', 'Borough', 'Neighborhood', 'BoroughLatitude', 'BoroughLongitude', 'VenueName', 'VenueLatitude', 'VenueLongitude', 'VenueCategory']
print(venues_df.shape)
venues_df.head()

(1672, 9)


Unnamed: 0,PostalCode,Borough,Neighborhood,BoroughLatitude,BoroughLongitude,VenueName,VenueLatitude,VenueLongitude,VenueCategory
0,M2H,North York,Hillcrest Village,43.803762,-79.363452,Eagle's Nest Golf Club,43.805455,-79.364186,Golf Course
1,M2H,North York,Hillcrest Village,43.803762,-79.363452,AY Jackson Pool,43.804515,-79.366138,Pool
2,M2H,North York,Hillcrest Village,43.803762,-79.363452,Villa Madina,43.801685,-79.363938,Mediterranean Restaurant
3,M2H,North York,Hillcrest Village,43.803762,-79.363452,Duncan Creek Park,43.805539,-79.360695,Dog Run
4,M2J,North York,"Fairview,Henry Farm,Oriole",43.778517,-79.346556,The LEGO Store,43.778207,-79.343483,Toy / Game Store


In [261]:
venues_df.groupby(['PostalCode','Borough','Neighborhood'])['VenueName'].count()

PostalCode  Borough           Neighborhood                                                                                        
M2H         North York        Hillcrest Village                                                                                         4
M2J         North York        Fairview,Henry Farm,Oriole                                                                               66
M2K         North York        Bayview Village                                                                                           4
M2M         North York        Newtonbrook,Willowdale                                                                                    1
M2N         North York        Willowdale South                                                                                         36
M2P         North York        York Mills West                                                                                           3
M2R         North York        Willowdale 

In [262]:
venues_df['VenueCategory'].nunique()

240

In [266]:
venues_df.loc[0,'Neighborhood']

'Hillcrest Village'

In [267]:
toronto_onehot = pd.get_dummies(venues_df[['VenueCategory']], prefix="", prefix_sep="")

# add postal, borough and neighborhood column back to dataframe
toronto_onehot['PostalCode'] = venues_df['PostalCode'] 
toronto_onehot['Borough'] = venues_df['Borough'] 
toronto_onehot['Neighborhoods'] = venues_df['Neighborhood'] 

# move postal, borough and neighborhood column to the first column
fixed_columns = list(toronto_onehot.columns[-3:]) + list(toronto_onehot.columns[:-3])
toronto_onehot = toronto_onehot[fixed_columns]

print(toronto_onehot.shape)
toronto_onehot.head()

(1672, 243)


Unnamed: 0,PostalCode,Borough,Neighborhoods,Accessories Store,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,...,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Wine Bar,Wings Joint,Women's Store,Yoga Studio
0,M2H,North York,Hillcrest Village,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,M2H,North York,Hillcrest Village,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,M2H,North York,Hillcrest Village,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,M2H,North York,Hillcrest Village,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,M2J,North York,"Fairview,Henry Farm,Oriole",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [300]:
toronto_onehot_freq = toronto_onehot.groupby(['PostalCode','Borough','Neighborhood']).mean().reset_index()
print(toronto_onehot_freq.shape)
toronto_onehot_freq.head()

(50, 242)


Unnamed: 0,PostalCode,Borough,Neighborhood,Accessories Store,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,...,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Wine Bar,Wings Joint,Women's Store,Yoga Studio
0,M2H,North York,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,M2J,North York,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.015152,0.0,0.0,0.0,0.015152,0.030303,0.0
2,M2K,North York,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,M2M,North York,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,M2N,North York,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.027778,0.0,0.0,0.0,0.0


In [301]:
#Get the top 10 venues with highest frequency
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
areaColumns = ['PostalCode', 'Borough', 'Neighborhood']
freqColumns = []
for ind in np.arange(num_top_venues):
    try:
        freqColumns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        freqColumns.append('{}th Most Common Venue'.format(ind+1))
columns = areaColumns+freqColumns
# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['PostalCode'] = toronto_onehot_freq['PostalCode']
neighborhoods_venues_sorted['Borough'] = toronto_onehot_freq['Borough']
neighborhoods_venues_sorted['Neighborhood'] = toronto_onehot_freq['Neighborhood']

for ind in np.arange(toronto_onehot_freq.shape[0]):
    row_categories = toronto_onehot_freq.iloc[ind, :].iloc[3:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    neighborhoods_venues_sorted.iloc[ind, 3:] = row_categories_sorted.index.values[0:num_top_venues]

neighborhoods_venues_sorted.sort_values(freqColumns, inplace=True)
neighborhoods_venues_sorted

Unnamed: 0,PostalCode,Borough,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
40,M5V,Downtown Toronto,0,Airport Lounge,Airport Service,Airport Terminal,Sculpture Garden,Harbor / Marina,Boat or Ferry,Plane,Coffee Shop,Boutique,Bar
49,M9M,North York,0,Baseball Field,Furniture / Home Store,Yoga Studio,Deli / Bodega,Ethiopian Restaurant,Electronics Store,Eastern European Restaurant,Dumpling Restaurant,Donut Shop,Doner Restaurant
14,M3M,North York,0,Baseball Field,Home Service,Food Truck,Yoga Studio,Department Store,Ethiopian Restaurant,Electronics Store,Eastern European Restaurant,Dumpling Restaurant,Donut Shop
9,M3C,North York,0,Beer Store,Coffee Shop,Gym,Asian Restaurant,Sandwich Place,Japanese Restaurant,Sporting Goods Shop,Fast Food Restaurant,Smoke Shop,Discount Store
38,M5S,Downtown Toronto,0,Café,Bar,Restaurant,Japanese Restaurant,Sandwich Place,Bookstore,Bakery,Italian Restaurant,Beer Bar,Beer Store
21,M4M,East Toronto,0,Café,Coffee Shop,American Restaurant,Bakery,Italian Restaurant,Yoga Studio,Coworking Space,Park,Middle Eastern Restaurant,Latin American Restaurant
39,M5T,Downtown Toronto,0,Café,Vegetarian / Vegan Restaurant,Bar,Dumpling Restaurant,Mexican Restaurant,Vietnamese Restaurant,Chinese Restaurant,Bakery,Coffee Shop,Burger Joint
2,M2K,North York,0,Chinese Restaurant,Japanese Restaurant,Bank,Café,Yoga Studio,Dessert Shop,Ethiopian Restaurant,Electronics Store,Eastern European Restaurant,Dumpling Restaurant
27,M5B,Downtown Toronto,0,Clothing Store,Coffee Shop,Middle Eastern Restaurant,Café,Cosmetics Shop,Bubble Tea Shop,Italian Restaurant,Plaza,Pizza Place,Bookstore
1,M2J,North York,0,Clothing Store,Fast Food Restaurant,Coffee Shop,Food Court,Bakery,Asian Restaurant,Women's Store,Japanese Restaurant,Toy / Game Store,Jewelry Store


Part 6
Cluster the Neighborhoods

In [302]:
toronto_grouped = toronto_onehot.groupby('Neighborhoods').mean().reset_index()
toronto_grouped

Unnamed: 0,Neighborhoods,Accessories Store,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,...,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Wine Bar,Wings Joint,Women's Store,Yoga Studio
0,"Adelaide,King,Richmond",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.03,...,0.0,0.0,0.02,0.0,0.0,0.0,0.01,0.0,0.0,0.0
1,"Bathurst Manor,Downsview North,Wilson Heights",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.052632,0.0,0.0,0.0,0.0,0.0
2,Bayview Village,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,"Bedford Park,Lawrence Manor East",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.043478,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Berczy Park,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.017544,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,Business Reply Mail Processing Centre 969 Eastern,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.052632
6,"CFB Toronto,Downsview East",0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,"CN Tower,Bathurst Quay,Island airport,Harbourf...",0.0,0.0,0.0625,0.0625,0.0625,0.125,0.125,0.125,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,"Cabbagetown,St. James Town",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,Central Bay Street,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.011628,...,0.0,0.0,0.011628,0.0,0.0,0.0,0.011628,0.0,0.0,0.011628


In [271]:
#Print each neighborhood along with the top 5 most common venues
num_top_venues = 5

for hood in toronto_grouped['Neighborhoods']:
    print("----"+hood+"----")
    temp = toronto_grouped[toronto_grouped['Neighborhoods'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Adelaide,King,Richmond----
          venue  freq
0   Coffee Shop  0.08
1          Café  0.05
2           Bar  0.04
3    Steakhouse  0.04
4  Burger Joint  0.03


----Bathurst Manor,Downsview North,Wilson Heights----
              venue  freq
0       Coffee Shop  0.11
1       Pizza Place  0.05
2       Supermarket  0.05
3              Bank  0.05
4  Sushi Restaurant  0.05


----Bayview Village----
                 venue  freq
0   Chinese Restaurant  0.25
1                 Bank  0.25
2                 Café  0.25
3  Japanese Restaurant  0.25
4        Metro Station  0.00


----Bedford Park,Lawrence Manor East----
                  venue  freq
0    Italian Restaurant  0.09
1           Coffee Shop  0.09
2  Fast Food Restaurant  0.04
3        Sandwich Place  0.04
4          Liquor Store  0.04


----Berczy Park----
                venue  freq
0         Coffee Shop  0.07
1        Cocktail Bar  0.05
2          Steakhouse  0.04
3  Italian Restaurant  0.04
4      Farmers Market  0.04


----Busine

                venue  freq
0    Greek Restaurant  0.20
1         Coffee Shop  0.09
2  Italian Restaurant  0.07
3           Bookstore  0.05
4      Ice Cream Shop  0.05


----Victoria Village----
                        venue  freq
0                Intersection  0.25
1                 Coffee Shop  0.25
2                Hockey Arena  0.25
3       Portuguese Restaurant  0.25
4  Modern European Restaurant  0.00


----Willowdale South----
                 venue  freq
0     Ramen Restaurant  0.08
1          Coffee Shop  0.08
2          Pizza Place  0.06
3  Japanese Restaurant  0.06
4       Sandwich Place  0.06


----Willowdale West----
            venue  freq
0        Pharmacy  0.14
1   Grocery Store  0.14
2  Discount Store  0.14
3    Home Service  0.14
4         Butcher  0.14


----York Mills West----
               venue  freq
0               Park  0.33
1  Convenience Store  0.33
2               Bank  0.33
3  Accessories Store  0.00
4      Movie Theater  0.00




In [303]:
#To sort the venues in descending order
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [304]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = toronto_grouped['Neighborhoods']

for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,"Adelaide,King,Richmond",Coffee Shop,Café,Bar,Steakhouse,Hotel,Cosmetics Shop,Thai Restaurant,American Restaurant,Burger Joint,Restaurant
1,"Bathurst Manor,Downsview North,Wilson Heights",Coffee Shop,Sushi Restaurant,Bridal Shop,Supermarket,Bank,Deli / Bodega,Middle Eastern Restaurant,Fast Food Restaurant,Restaurant,Fried Chicken Joint
2,Bayview Village,Chinese Restaurant,Bank,Café,Japanese Restaurant,Yoga Studio,Dessert Shop,Ethiopian Restaurant,Electronics Store,Eastern European Restaurant,Dumpling Restaurant
3,"Bedford Park,Lawrence Manor East",Coffee Shop,Italian Restaurant,Sushi Restaurant,Pizza Place,Boutique,Breakfast Spot,Juice Bar,Restaurant,Fast Food Restaurant,Pub
4,Berczy Park,Coffee Shop,Cocktail Bar,Cheese Shop,Seafood Restaurant,Steakhouse,Italian Restaurant,Café,Farmers Market,Bakery,Beer Bar


In [305]:
toronto_grouped_clustering = toronto_grouped
toronto_grouped_clustering.head()
toronto_grouped_clustering.drop('Neighborhoods',axis=1, inplace=True)

In [306]:
toronto_grouped_clustering.head()

Unnamed: 0,Accessories Store,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,...,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Wine Bar,Wings Joint,Women's Store,Yoga Studio
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.03,0.0,...,0.0,0.0,0.02,0.0,0.0,0.0,0.01,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.052632,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.043478,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.017544,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [307]:
kclusters = 5
# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10]

array([0, 0, 0, 0, 0, 0, 3, 0, 0, 0], dtype=int32)

In [308]:
#add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)
toronto_merged = toronto_df


In [313]:
toronto_merged.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge,Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood,Morningside,West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


In [314]:
toronto_merged = toronto_merged[['PostalCode','Borough','Neighborhood','Latitude','Longitude']]
toronto_merged = neighborhoods_venues_sorted.join(toronto_merged.set_index('Neighborhood'), on='Neighborhood')
toronto_merged

Unnamed: 0,Cluster Labels,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue,PostalCode,Borough,Latitude,Longitude
0,0,"Adelaide,King,Richmond",Coffee Shop,Café,Bar,Steakhouse,Hotel,Cosmetics Shop,Thai Restaurant,American Restaurant,Burger Joint,Restaurant,M5H,Downtown Toronto,43.650571,-79.384568
1,0,"Bathurst Manor,Downsview North,Wilson Heights",Coffee Shop,Sushi Restaurant,Bridal Shop,Supermarket,Bank,Deli / Bodega,Middle Eastern Restaurant,Fast Food Restaurant,Restaurant,Fried Chicken Joint,M3H,North York,43.754328,-79.442259
2,0,Bayview Village,Chinese Restaurant,Bank,Café,Japanese Restaurant,Yoga Studio,Dessert Shop,Ethiopian Restaurant,Electronics Store,Eastern European Restaurant,Dumpling Restaurant,M2K,North York,43.786947,-79.385975
3,0,"Bedford Park,Lawrence Manor East",Coffee Shop,Italian Restaurant,Sushi Restaurant,Pizza Place,Boutique,Breakfast Spot,Juice Bar,Restaurant,Fast Food Restaurant,Pub,M5M,North York,43.733283,-79.41975
4,0,Berczy Park,Coffee Shop,Cocktail Bar,Cheese Shop,Seafood Restaurant,Steakhouse,Italian Restaurant,Café,Farmers Market,Bakery,Beer Bar,M5E,Downtown Toronto,43.644771,-79.373306
5,0,Business Reply Mail Processing Centre 969 Eastern,Light Rail Station,Yoga Studio,Garden,Skate Park,Restaurant,Pizza Place,Park,Garden Center,Fast Food Restaurant,Spa,M7Y,East Toronto,43.662744,-79.321558
6,3,"CFB Toronto,Downsview East",Park,Airport,Yoga Studio,Deli / Bodega,Ethiopian Restaurant,Electronics Store,Eastern European Restaurant,Dumpling Restaurant,Donut Shop,Doner Restaurant,M3K,North York,43.737473,-79.464763
7,0,"CN Tower,Bathurst Quay,Island airport,Harbourf...",Airport Terminal,Airport Lounge,Airport Service,Sculpture Garden,Boat or Ferry,Plane,Coffee Shop,Boutique,Bar,Airport Gate,M5V,Downtown Toronto,43.628947,-79.39442
8,0,"Cabbagetown,St. James Town",Coffee Shop,Café,Pub,Bakery,Italian Restaurant,Pizza Place,Restaurant,General Entertainment,Indian Restaurant,Beer Store,M4X,Downtown Toronto,43.667967,-79.367675
9,0,Central Bay Street,Coffee Shop,Middle Eastern Restaurant,Italian Restaurant,Café,Sandwich Place,Ice Cream Shop,Burger Joint,Gym / Fitness Center,Japanese Restaurant,Spa,M5G,Downtown Toronto,43.657952,-79.387383


In [None]:
#Lastly, let's visualize the resulting clusters

In [317]:
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0,1,len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

#add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'],toronto_merged['Longitude'],toronto_merged['Neighborhood'],toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + 'Cluster Label' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lng], radius = 5, popup = label, color = rainbow[cluster-1], fill=True, fill_color = rainbow[cluster-1],fill_opacity=0.7).add_to(map_clusters)

map_clusters