## Importing relevant functionalities

In [35]:
import numpy as np 
import pandas as pd 
import json

#--- library to handle requests
import requests 

#--- tranform JSON file into a pandas dataframe
from pandas.io.json import json_normalize 

#--- import k-means from clustering stage
from sklearn.cluster import KMeans

#--- map rendering library
import folium

import matplotlib.cm as cm
import matplotlib.colors as colors


In [2]:
neighborhoods = pd.read_csv('Toronto.csv')
neighborhoods.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494


### How many boroughs and Neighborhoods does Toronto have?

In [3]:
print('Toronto has {} boroughs and {} neighborhoods.'.format(
        len(neighborhoods['Borough'].unique()),
        neighborhoods.shape[0]
    )
)

Toronto has 10 boroughs and 103 neighborhoods.


### Create a map of Toronto with neighborhoods superimposed on top.

In [7]:
# create map of Toronto using latitude and longitude values
map_toronto = folium.Map(location=[43.6532, -79.3832], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(neighborhoods['Latitude'], neighborhoods['Longitude'], neighborhoods['Borough'], neighborhoods['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

### Let's group the neighborhoods dataframe by boroughs and analyze each separately

In [4]:
unique_boroughs = neighborhoods['Borough'].unique()
print('List of Boroughs in Toronto : ', unique_boroughs)
borough_groups = neighborhoods.groupby('Borough')

List of Boroughs in Toronto :  ['North York' 'Downtown Toronto' 'Etobicoke' 'Scarborough' 'East York'
 'York' 'East Toronto' 'West Toronto' 'Central Toronto' 'Mississauga']


#### How to access a borough group?
Let's get the group for 'East York'

In [5]:
EY = borough_groups.get_group('East York').reset_index(drop=True)
EY

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M4B,East York,"Parkview Hill, Woodbine Gardens",43.706397,-79.309937
1,M4C,East York,Woodbine Heights,43.695344,-79.318389
2,M4G,East York,Leaside,43.70906,-79.363452
3,M4H,East York,Thorncliffe Park,43.705369,-79.349372
4,M4J,East York,East Toronto,43.685347,-79.338106


### Define Foursquare Credentials and Version


In [6]:
CLIENT_ID = 'CQTF5U0CCI2SPBLT5N3TO2HYTFJFA0UOL2SNNXW2SVCDZXMK' 
CLIENT_SECRET = 'FPQAH1O2DPPX0LA4A01GS5DGWHZAL1L4KHVKAZ3M3XJQBEGM' 
VERSION = '20180605' 

### Let's get the top 100 venues that are in each neighborhood of East York within a radius of 500 m.

In [7]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            100)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([[
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']] for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [9]:
bor_venues = getNearbyVenues(names=EY['Neighborhood'],
                                   latitudes=EY['Latitude'],
                                   longitudes=EY['Longitude']
                                  )

Parkview Hill, Woodbine Gardens
Woodbine Heights
Leaside
Thorncliffe Park
East Toronto


In [10]:
print(bor_venues.shape)
bor_venues.head()

(77, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,"Parkview Hill, Woodbine Gardens",43.706397,-79.309937,Jawny Bakers,43.705783,-79.312913,Gastropub
1,"Parkview Hill, Woodbine Gardens",43.706397,-79.309937,East York Gymnastics,43.710654,-79.309279,Gym / Fitness Center
2,"Parkview Hill, Woodbine Gardens",43.706397,-79.309937,Shoppers Drug Mart,43.705933,-79.312825,Pharmacy
3,"Parkview Hill, Woodbine Gardens",43.706397,-79.309937,TD Canada Trust,43.70574,-79.31227,Bank
4,"Parkview Hill, Woodbine Gardens",43.706397,-79.309937,Pizza Pizza,43.705159,-79.31313,Pizza Place


#### Let's check how many venues were returned for each neighborhood

In [11]:
bor_venues[['Neighborhood','Venue']].groupby('Neighborhood').count()

Unnamed: 0_level_0,Venue
Neighborhood,Unnamed: 1_level_1
East Toronto,4
Leaside,34
"Parkview Hill, Woodbine Gardens",12
Thorncliffe Park,18
Woodbine Heights,9


In [12]:
print('There are {} uniques categories'.format(len(bor_venues['Venue Category'].unique())))

There are 45 uniques categories.


#### let's group rows by neighborhood and by taking the mean of the frequency of occurrence of each category

In [21]:
# one hot encoding
toronto_onehot = pd.get_dummies(bor_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
toronto_onehot['Neighborhood'] = bor_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]

toronto_onehot.head()

Unnamed: 0,Neighborhood,Asian Restaurant,Athletics & Sports,Bagel Shop,Bank,Beer Store,Bike Shop,Breakfast Spot,Brewery,Burger Joint,...,Restaurant,Sandwich Place,Shopping Mall,Skating Rink,Sporting Goods Shop,Sports Bar,Supermarket,Sushi Restaurant,Warehouse Store,Yoga Studio
0,"Parkview Hill, Woodbine Gardens",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,"Parkview Hill, Woodbine Gardens",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,"Parkview Hill, Woodbine Gardens",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,"Parkview Hill, Woodbine Gardens",0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,"Parkview Hill, Woodbine Gardens",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [22]:
toronto_onehot.shape

(77, 46)

In [23]:
toronto_grouped = toronto_onehot.groupby('Neighborhood').mean().reset_index()
toronto_grouped

Unnamed: 0,Neighborhood,Asian Restaurant,Athletics & Sports,Bagel Shop,Bank,Beer Store,Bike Shop,Breakfast Spot,Brewery,Burger Joint,...,Restaurant,Sandwich Place,Shopping Mall,Skating Rink,Sporting Goods Shop,Sports Bar,Supermarket,Sushi Restaurant,Warehouse Store,Yoga Studio
0,East Toronto,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Leaside,0.0,0.0,0.029412,0.058824,0.029412,0.029412,0.029412,0.029412,0.058824,...,0.029412,0.029412,0.058824,0.0,0.088235,0.029412,0.029412,0.029412,0.0,0.0
2,"Parkview Hill, Woodbine Gardens",0.0,0.083333,0.0,0.083333,0.0,0.0,0.083333,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Thorncliffe Park,0.0,0.0,0.0,0.055556,0.0,0.0,0.0,0.0,0.055556,...,0.055556,0.111111,0.0,0.0,0.0,0.0,0.055556,0.0,0.055556,0.055556
4,Woodbine Heights,0.111111,0.111111,0.0,0.0,0.111111,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.222222,0.0,0.0,0.0,0.0,0.0,0.0


#### let's create the new dataframe and display the top 10 venues for each neighborhood.

In [24]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = toronto_grouped['Neighborhood']

for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,East Toronto,Coffee Shop,Convenience Store,Metro Station,Park,Fish & Chips Shop,Fast Food Restaurant,Electronics Store,Diner,Dessert Shop,Department Store
1,Leaside,Sporting Goods Shop,Coffee Shop,Furniture / Home Store,Shopping Mall,Burger Joint,Bank,Grocery Store,Dessert Shop,Electronics Store,Fish & Chips Shop
2,"Parkview Hill, Woodbine Gardens",Pizza Place,Pet Store,Gastropub,Gym / Fitness Center,Intersection,Café,Pharmacy,Fast Food Restaurant,Breakfast Spot,Athletics & Sports
3,Thorncliffe Park,Sandwich Place,Indian Restaurant,Yoga Studio,Park,Bank,Burger Joint,Coffee Shop,Fast Food Restaurant,Gas Station,Warehouse Store
4,Woodbine Heights,Skating Rink,Asian Restaurant,Athletics & Sports,Beer Store,Diner,Pharmacy,Curling Ice,Park,Convenience Store,Fish & Chips Shop


## 4. Cluster Neighborhoods
Run *k*-means to cluster the neighborhood into 5 clusters.
Let's create a new dataframe that includes the cluster as well as the top 10 venues for each neighborhood.

In [25]:
# set number of clusters
kclusters = 5

toronto_grouped_clustering = toronto_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([2, 1, 3, 4, 0])

In [29]:
# add clustering labels
#neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

toronto_merged = bor_venues

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
toronto_merged = toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

toronto_merged.head() # check the last columns!

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,"Parkview Hill, Woodbine Gardens",43.706397,-79.309937,Jawny Bakers,43.705783,-79.312913,Gastropub,3,Pizza Place,Pet Store,Gastropub,Gym / Fitness Center,Intersection,Café,Pharmacy,Fast Food Restaurant,Breakfast Spot,Athletics & Sports
1,"Parkview Hill, Woodbine Gardens",43.706397,-79.309937,East York Gymnastics,43.710654,-79.309279,Gym / Fitness Center,3,Pizza Place,Pet Store,Gastropub,Gym / Fitness Center,Intersection,Café,Pharmacy,Fast Food Restaurant,Breakfast Spot,Athletics & Sports
2,"Parkview Hill, Woodbine Gardens",43.706397,-79.309937,Shoppers Drug Mart,43.705933,-79.312825,Pharmacy,3,Pizza Place,Pet Store,Gastropub,Gym / Fitness Center,Intersection,Café,Pharmacy,Fast Food Restaurant,Breakfast Spot,Athletics & Sports
3,"Parkview Hill, Woodbine Gardens",43.706397,-79.309937,TD Canada Trust,43.70574,-79.31227,Bank,3,Pizza Place,Pet Store,Gastropub,Gym / Fitness Center,Intersection,Café,Pharmacy,Fast Food Restaurant,Breakfast Spot,Athletics & Sports
4,"Parkview Hill, Woodbine Gardens",43.706397,-79.309937,Pizza Pizza,43.705159,-79.31313,Pizza Place,3,Pizza Place,Pet Store,Gastropub,Gym / Fitness Center,Intersection,Café,Pharmacy,Fast Food Restaurant,Breakfast Spot,Athletics & Sports


### Visulaizing on Folium Map


In [34]:
# create map
map_clusters = folium.Map(location=[43.6532, -79.3832], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Venue Latitude'], toronto_merged['Venue Longitude'], toronto_merged['Neighborhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

# Lets perform the same analysis on all boroughs by looping over unique boroughs

In [47]:
#--- iterate over unique boroughs in Toronto
maps = []
for bor in unique_boroughs:
    
    #--- get the a slice of postal codes for borough 'bor'
    bf = borough_groups.get_group(bor).reset_index(drop=True)
    
    #--- get the top 100 venues that are in each neighborhood of borough 'bor' within a 0.5 Km radius 
    bor_venues = getNearbyVenues(names=bf['Neighborhood'],latitudes=bf['Latitude'],longitudes=bf['Longitude'])
    
    print('There are {} uniques categories in borough {}'.format(len(bor_venues['Venue Category'].unique()),bor))
    
    # one hot encoding
    toronto_onehot = pd.get_dummies(bor_venues[['Venue Category']], prefix="", prefix_sep="")

    # add neighborhood column back to dataframe
    toronto_onehot['Neighborhood'] = bor_venues['Neighborhood'] 

    # move neighborhood column to the first column
    fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
    toronto_onehot = toronto_onehot[fixed_columns]
    
    toronto_grouped = toronto_onehot.groupby('Neighborhood').mean().reset_index()
    
    num_top_venues = 10

    indicators = ['st', 'nd', 'rd']

    # create columns according to number of top venues
    columns = ['Neighborhood']
    for ind in np.arange(num_top_venues):
        try:
            columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
        except:
            columns.append('{}th Most Common Venue'.format(ind+1))

    # create a new dataframe
    neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
    neighborhoods_venues_sorted['Neighborhood'] = toronto_grouped['Neighborhood']

for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

    # set number of clusters
    kclusters = 5

    toronto_grouped_clustering = toronto_grouped.drop('Neighborhood', 1)

    # run k-means clustering
    kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

    # check cluster labels generated for each row in the dataframe
    kmeans.labels_[0:10] 
    
    # add clustering labels
    #neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

    toronto_merged = bor_venues

    # merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
    toronto_merged = toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')
    print(toronto_merged.head())
    
    # create map
    map_clusters = folium.Map(location=[43.6532, -79.3832], zoom_start=11)

    # set color scheme for the clusters
    x = np.arange(kclusters)
    ys = [i + x + (i*x)**2 for i in range(kclusters)]
    colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
    rainbow = [colors.rgb2hex(i) for i in colors_array]

    # add markers to the map
    markers_colors = []
    for lat, lon, poi, cluster in zip(toronto_merged['Venue Latitude'], toronto_merged['Venue Longitude'], toronto_merged['Neighborhood'], toronto_merged['Cluster Labels']):
        label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
        folium.CircleMarker(
            [lat, lon],
            radius=5,
            popup=label,
            color=rainbow[int(cluster)-1],
            fill=True,
            fill_color=rainbow[int(cluster)-1],
            fill_opacity=0.7).add_to(map_clusters)

    maps.append(map_clusters)

Parkwoods
Victoria Village
Lawrence Manor, Lawrence Heights
Don Mills
Glencairn
Don Mills
Hillcrest Village
Bathurst Manor, Wilson Heights, Downsview North
Fairview, Henry Farm, Oriole
Northwood Park, York University
Bayview Village
Downsview
York Mills, Silver Hills
Downsview
North Park, Maple Leaf Park, Upwood Park
Humber Summit
Willowdale, Newtonbrook
Downsview
Bedford Park, Lawrence Manor East
Humberlea, Emery
Willowdale
Downsview
York Mills West
Willowdale
There are 98 uniques categories in borough North York
Regent Park, Harbourfront
Queen's Park, Ontario Provincial Government
Garden District, Ryerson
St. James Town
Berczy Park
Central Bay Street
Christie
Richmond, Adelaide, King
Harbourfront East, Union Station, Toronto Islands
Toronto Dominion Centre, Design Exchange
Commerce Court, Victoria Hotel
University of Toronto, Harbord
Kensington Market, Chinatown, Grange Park
CN Tower, King and Spadina, Railway Lands, Harbourfront West, Bathurst Quay, South Niagara, Island airport
Ros

ValueError: n_samples=1 should be >= n_clusters=5