In [3]:
# The code was removed by Watson Studio for sharing.

# Clustering of Neighborhoods in Toronto
This notebook uses the Foursquare API to explore neighborhoods in Toronto.

### Import libraries

In [4]:
import numpy as np # library to handle data in a vectorized manner
import pandas as pd # library for data analsysis
import json # library to handle JSON files
import requests # library to handle requests

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

!conda install -c conda-forge folium=0.5.0 --yes # install folium library
import folium # map rendering library

#!conda install -c conda-forge geopy --yes # install geopy library
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

Fetching package metadata .............
Solving package specifications: .

Package plan for installation in environment /opt/conda/envs/DSX-Python35:

The following NEW packages will be INSTALLED:

    altair:  2.2.2-py35_1 conda-forge
    branca:  0.3.1-py_0   conda-forge
    folium:  0.5.0-py_0   conda-forge
    vincent: 0.4.4-py_1   conda-forge

altair-2.2.2-p 100% |################################| Time: 0:00:00  52.45 MB/s
branca-0.3.1-p 100% |################################| Time: 0:00:00  33.86 MB/s
vincent-0.4.4- 100% |################################| Time: 0:00:00  40.03 MB/s
folium-0.5.0-p 100% |################################| Time: 0:00:00  48.17 MB/s


### Load and check data
Load data that has been saved in project

In [72]:
# Read the neighborhood/coordinate table that was saved as a project asset
neigh_csv = project.get_file('neigh and coord.csv')
neigh = pd.read_csv(neigh_csv)
neigh.shape

(103, 5)

Check the number of boroughs and neighborhoods

In [6]:
# Count the distinct boroughs and neighborhoods, then report both
borough_count = len(neigh['Borough'].unique())
neighborhood_count = len(neigh['Neighborhood'].unique())
print('The dataframe neigh has {} boroughs and {} neighborhoods.'.format(borough_count, neighborhood_count))

The dataframe neigh has 11 boroughs and 103 neighborhoods.


Get the coordinates of Toronto

In [7]:
address = 'Toronto, ON'

# Nominatim's usage policy requires an identifying user agent; geopy >= 2.0
# raises ConfigurationError when none is supplied.
geolocator = Nominatim(user_agent='toronto_neighborhood_explorer')
location = geolocator.geocode(address)
# geocode() returns None when the service finds no match; fail here with a
# clear message instead of an AttributeError on the next line.
if location is None:
    raise ValueError('Could not geocode address: {!r}'.format(address))
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.653963, -79.387207.


Create a map of Toronto with neighborhoods superimposed on top

In [8]:
# Base map centred on the geocoded Toronto coordinates
map_trt = folium.Map(location=[latitude, longitude], zoom_start=10)

# One blue circle per neighborhood, with a "<neighborhood>, <borough>" popup
marker_fields = zip(neigh['Latitude'], neigh['Longitude'], neigh['Borough'], neigh['Neighborhood'])
for lat, lng, borough, neighborhood in marker_fields:
    popup_text = '{}, {}'.format(neighborhood, borough)
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=folium.Popup(popup_text, parse_html=True),
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False)
    marker.add_to(map_trt)

map_trt

### Explore neighborhoods using Foursquare API
Define Foursquare credentials and version

In [9]:
# The code was removed by Watson Studio for sharing.

Create a function getNearbyVenues() to get the top 100 venues within a radius of 1.5 km of each neighborhood in Toronto

In [110]:
def getNearbyVenues(names, latitudes, longitudes, radius=1500, LIMIT=100, timeout=30):
    """Collect Foursquare "explore" venues around each neighborhood.

    Parameters
    ----------
    names, latitudes, longitudes : iterables
        Neighborhood name and its coordinates; consumed in lockstep via zip().
    radius : int
        Search radius sent to the API (1500 corresponds to the 1.5 km used
        in this notebook).
    LIMIT : int
        Maximum number of venues requested per neighborhood.
    timeout : float
        Seconds to wait for each HTTP response before giving up.

    Returns
    -------
    pandas.DataFrame
        One row per venue with the columns 'Neighborhood',
        'Neighborhood Latitude', 'Neighborhood Longitude', 'Venue',
        'Venue Latitude', 'Venue Longitude' and 'Venue Category'.
        The frame is empty (but still has these columns) when no venue
        is returned.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status.
    """
    columns = ['Neighborhood',
               'Neighborhood Latitude',
               'Neighborhood Longitude',
               'Venue',
               'Venue Latitude',
               'Venue Longitude',
               'Venue Category']
    endpoint = 'https://api.foursquare.com/v2/venues/explore'

    rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        # Let requests encode the query string instead of hand-formatting it
        params = {
            'client_id': CLIENT_ID,
            'client_secret': CLIENT_SECRET,
            'v': VERSION,
            'll': '{},{}'.format(lat, lng),
            'radius': radius,
            'limit': LIMIT,
        }

        response = requests.get(endpoint, params=params, timeout=timeout)
        # Surface quota/credential problems as an HTTP error rather than a
        # KeyError while digging through the JSON payload.
        response.raise_for_status()

        groups = response.json()['response'].get('groups') or []
        items = groups[0]['items'] if groups else []

        # keep only the relevant information for each nearby venue
        for item in items:
            venue = item['venue']
            # Some venues carry no category; record None instead of crashing
            categories = venue.get('categories') or []
            category_name = categories[0]['name'] if categories else None
            rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                category_name))

    # Passing the column names here keeps the schema even when rows is empty
    return pd.DataFrame(rows, columns=columns)

Run the above function on each neighborhood

In [111]:
# Fetch venues for every neighborhood using the default radius and limit
venues = getNearbyVenues(
    neigh['Neighborhood'],
    neigh['Latitude'],
    neigh['Longitude']
)

Check the venues dataframe 

In [112]:
# Report the size of the venue table and how many distinct categories it holds
print(venues.shape)
category_count = len(venues['Venue Category'].unique())
print('There are {} uniques categories in total.'.format(category_count))
venues.head()

(6739, 7)
There are 341 uniques categories in total.


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,"Rouge, Malvern",43.806686,-79.194353,Images Salon & Spa,43.802283,-79.198565,Spa
1,"Rouge, Malvern",43.806686,-79.194353,Canadiana exhibit,43.817962,-79.193374,Zoo Exhibit
2,"Rouge, Malvern",43.806686,-79.194353,Caribbean Wave,43.798558,-79.195777,Caribbean Restaurant
3,"Rouge, Malvern",43.806686,-79.194353,LCBO,43.796671,-79.204586,Liquor Store
4,"Rouge, Malvern",43.806686,-79.194353,Harvey's,43.800106,-79.198258,Fast Food Restaurant


### Analyze venue category in each neighborhood
One hot encoding

In [113]:
# one hot encoding
onehot = pd.get_dummies(venues[['Venue Category']], prefix='', prefix_sep='')

# move neighborhood column to front
cols = onehot.columns.tolist()
cols.insert(0, cols.pop(cols.index('Neighborhood')))
onehot = onehot.reindex(columns= cols)

# bring back neighborhood value
onehot['Neighborhood'] = venues['Neighborhood']
onehot.head(3)

Unnamed: 0,Neighborhood,Accessories Store,Adult Boutique,Afghan Restaurant,African Restaurant,Airport,Airport Lounge,American Restaurant,Amphitheater,Animal Shelter,...,Vietnamese Restaurant,Volleyball Court,Warehouse Store,Whisky Bar,Wine Bar,Wings Joint,Women's Store,Yoga Studio,Zoo,Zoo Exhibit
0,"Rouge, Malvern",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,"Rouge, Malvern",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,"Rouge, Malvern",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Occurrence frequency of each venue category in each neighborhood

In [114]:
# Mean of each indicator column per neighborhood = share of that
# neighborhood's venues falling in the category
category_means = onehot.groupby('Neighborhood').mean()
venues_f = category_means.reset_index()
print(venues_f.shape)
venues_f.head()

(103, 341)


Unnamed: 0,Neighborhood,Accessories Store,Adult Boutique,Afghan Restaurant,African Restaurant,Airport,Airport Lounge,American Restaurant,Amphitheater,Animal Shelter,...,Vietnamese Restaurant,Volleyball Court,Warehouse Store,Whisky Bar,Wine Bar,Wings Joint,Women's Store,Yoga Studio,Zoo,Zoo Exhibit
0,"Adelaide, King, Richmond",0.0,0.0,0.0,0.0,0.0,0.0,0.02,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Agincourt,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.017857,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"Agincourt North, L'Amoreaux East, Milliken, St...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.025641,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,"Albion Gardens, Beaumond Heights, Humbergate, ...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,"Alderwood, Long Branch",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.022222,0.0,0.0,0.0,0.0


Define a function that returns the top venue categories in terms of occurrence frequency

In [115]:
def return_most_common_venues(row, num_top_venues):
    """Return the labels of the largest entries of `row`.

    The first element of `row` is skipped; the remaining values are ranked
    in descending order and the index labels of the first `num_top_venues`
    of them are returned as an array.
    """
    ranked = row.iloc[1:].sort_values(ascending=False)
    return ranked.index.values[:num_top_venues]

Find the top 10 venue categories for each neighborhood

In [116]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# Column headers: '1st', '2nd', '3rd', then '<n>th' for every later rank.
# The suffix is chosen with an explicit bounds check rather than a bare
# except, so unrelated errors are not silently swallowed.
columns = ['Neighborhood']
for rank in range(1, num_top_venues + 1):
    suffix = indicators[rank - 1] if rank <= len(indicators) else 'th'
    columns.append('{}{} Most Common Venue'.format(rank, suffix))

# Build every record first and create the dataframe once, instead of
# assigning into an empty frame one row at a time.
records = [
    [row['Neighborhood']] + list(return_most_common_venues(row, num_top_venues))
    for _, row in venues_f.iterrows()
]
neighborhoods_venues_sorted = pd.DataFrame(records, columns=columns, index=venues_f.index)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,"Adelaide, King, Richmond",Coffee Shop,Hotel,Café,Theater,Pizza Place,Japanese Restaurant,Thai Restaurant,Gastropub,Restaurant,Concert Hall
1,Agincourt,Chinese Restaurant,Coffee Shop,Japanese Restaurant,Shopping Mall,Gym / Fitness Center,Hong Kong Restaurant,Breakfast Spot,Bakery,Caribbean Restaurant,Park
2,"Agincourt North, L'Amoreaux East, Milliken, St...",Chinese Restaurant,Pharmacy,BBQ Joint,Dessert Shop,Bubble Tea Shop,Coffee Shop,Pizza Place,Tea Room,Korean Restaurant,Bakery
3,"Albion Gardens, Beaumond Heights, Humbergate, ...",Coffee Shop,Fast Food Restaurant,Pizza Place,Grocery Store,Pharmacy,Bus Line,Electronics Store,Steakhouse,Fried Chicken Joint,Beer Store
4,"Alderwood, Long Branch",Pizza Place,Café,Burger Joint,Pharmacy,Park,Toy / Game Store,Coffee Shop,Grocery Store,Beer Store,Pub


### Cluster neighborhoods
Run k-means to cluster the neighborhoods into 5 clusters

In [145]:
# set number of clusters
kclusters = 5

clustering_trt = venues_f.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(clustering_trt)

# Bring neighborhood back and check
label = pd.DataFrame({'Neighborhood':venues_f['Neighborhood'],'Label':kmeans.labels_})
label = label[['Neighborhood','Label']]
label.head()

Unnamed: 0,Neighborhood,Label
0,"Adelaide, King, Richmond",0
1,Agincourt,1
2,"Agincourt North, L'Amoreaux East, Milliken, St...",1
3,"Albion Gardens, Beaumond Heights, Humbergate, ...",1
4,"Alderwood, Long Branch",3


Create a new dataframe that includes the cluster label as well as the top 10 venue categories for each neighborhood.

In [146]:
# Merge coordinate, venues and label
trt_merged = pd.merge(neigh,pd.merge(neighborhoods_venues_sorted, label, on='Neighborhood'),on='Neighborhood')
print(trt_merged.shape)
trt_merged.head()

(103, 16)


Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue,Label
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353,Zoo Exhibit,Fast Food Restaurant,Coffee Shop,Pizza Place,Spa,Greek Restaurant,Fruit & Vegetable Store,Caribbean Restaurant,Women's Store,Liquor Store,1
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497,Hotel,Italian Restaurant,Breakfast Spot,Pizza Place,Grocery Store,Gym,Burger Joint,Park,Dumpling Restaurant,Eastern European Restaurant,3
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711,Fast Food Restaurant,Pizza Place,Breakfast Spot,Coffee Shop,Bus Station,Smoothie Shop,Food & Drink Shop,Sandwich Place,Liquor Store,Asian Restaurant,1
3,M1G,Scarborough,Woburn,43.770992,-79.216917,Coffee Shop,Pharmacy,Pizza Place,Sandwich Place,Indian Restaurant,Filipino Restaurant,Bank,Thrift / Vintage Store,Music Store,Juice Bar,1
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476,Coffee Shop,Clothing Store,Sandwich Place,Indian Restaurant,Food Court,Fast Food Restaurant,Wings Joint,Bakery,Sporting Goods Shop,Bus Station,1


Visualize clusters

In [147]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters: one evenly spaced rainbow colour per label
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(rgba) for rgba in colors_array]

# add markers to the map
for lat, lon, poi, cluster in zip(trt_merged['Latitude'], trt_merged['Longitude'], trt_merged['Neighborhood'], trt_merged['Label']):
    # Use a dedicated name for the popup: rebinding `label` here would replace
    # the cluster-label dataframe that the merge cell relies on when re-run.
    popup = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    # Labels run from 0 to kclusters - 1, so they index the palette directly
    cluster_color = rainbow[int(cluster)]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=popup,
        color=cluster_color,
        fill=True,
        fill_color=cluster_color,
        fill_opacity=0.7).add_to(map_clusters)

map_clusters

### Examine clusters
Examine each cluster and determine the discriminating venue categories that distinguish each cluster
#### Cluster 0

In [148]:
# Cluster 0: neighborhood name (column 2) plus every column from position 5 onward
shown_columns = trt_merged.columns[[2] + list(range(5, trt_merged.shape[1]))]
in_cluster = trt_merged['Label'] == 0
trt_merged.loc[in_cluster, shown_columns]

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue,Label
11,"Maryvale, Wexford",Middle Eastern Restaurant,Coffee Shop,Grocery Store,Restaurant,Mediterranean Restaurant,Pizza Place,Asian Restaurant,Breakfast Spot,Plaza,Bar,0
21,"Newtonbrook, Willowdale",Korean Restaurant,Coffee Shop,Bubble Tea Shop,Shopping Mall,Japanese Restaurant,Café,Fast Food Restaurant,Dessert Shop,Bank,Grocery Store,0
22,Willowdale South,Korean Restaurant,Bubble Tea Shop,Japanese Restaurant,Coffee Shop,Pizza Place,Ramen Restaurant,Café,Grocery Store,Dessert Shop,Fried Chicken Joint,0
26,Don Mills North,Coffee Shop,Japanese Restaurant,Bank,Burger Joint,Restaurant,Pizza Place,Park,Asian Restaurant,Middle Eastern Restaurant,Café,0
27,"Flemingdon Park, Don Mills South",Coffee Shop,Japanese Restaurant,Gym,Middle Eastern Restaurant,Restaurant,Sandwich Place,Movie Theater,Asian Restaurant,Electronics Store,Fast Food Restaurant,0
37,The Beaches,Coffee Shop,Pub,Beach,Japanese Restaurant,Bakery,Breakfast Spot,Bar,Ice Cream Shop,Sandwich Place,Grocery Store,0
38,Leaside,Coffee Shop,Indian Restaurant,Grocery Store,Bakery,Restaurant,Burger Joint,Supermarket,Sporting Goods Shop,Thai Restaurant,Sushi Restaurant,0
39,Thorncliffe Park,Coffee Shop,Grocery Store,Sandwich Place,Restaurant,Sporting Goods Shop,Convenience Store,Indian Restaurant,Burger Joint,Electronics Store,Beer Store,0
40,East Toronto,Greek Restaurant,Pizza Place,Café,Coffee Shop,Bakery,Ice Cream Shop,Dessert Shop,Thai Restaurant,Brewery,Italian Restaurant,0
41,"The Danforth West, Riverdale",Greek Restaurant,Café,Park,Pizza Place,Coffee Shop,Bakery,Pub,Ice Cream Shop,Vietnamese Restaurant,Italian Restaurant,0


#### Cluster 1

In [149]:
# Cluster 1: neighborhood name (column 2) plus every column from position 5 onward
shown_columns = trt_merged.columns[[2] + list(range(5, trt_merged.shape[1]))]
in_cluster = trt_merged['Label'] == 1
trt_merged.loc[in_cluster, shown_columns]

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue,Label
0,"Rouge, Malvern",Zoo Exhibit,Fast Food Restaurant,Coffee Shop,Pizza Place,Spa,Greek Restaurant,Fruit & Vegetable Store,Caribbean Restaurant,Women's Store,Liquor Store,1
2,"Guildwood, Morningside, West Hill",Fast Food Restaurant,Pizza Place,Breakfast Spot,Coffee Shop,Bus Station,Smoothie Shop,Food & Drink Shop,Sandwich Place,Liquor Store,Asian Restaurant,1
3,Woburn,Coffee Shop,Pharmacy,Pizza Place,Sandwich Place,Indian Restaurant,Filipino Restaurant,Bank,Thrift / Vintage Store,Music Store,Juice Bar,1
4,Cedarbrae,Coffee Shop,Clothing Store,Sandwich Place,Indian Restaurant,Food Court,Fast Food Restaurant,Wings Joint,Bakery,Sporting Goods Shop,Bus Station,1
5,Scarborough Village,Fast Food Restaurant,Sandwich Place,Pharmacy,Pizza Place,Breakfast Spot,Chinese Restaurant,Coffee Shop,Bank,Restaurant,Bookstore,1
6,"East Birchmount Park, Ionview, Kennedy Park",Coffee Shop,Fast Food Restaurant,Pizza Place,Sandwich Place,Chinese Restaurant,Train Station,Pharmacy,Grocery Store,Bank,Cafeteria,1
7,"Clairlea, Golden Mile, Oakridge",Coffee Shop,Pizza Place,Grocery Store,Diner,Fast Food Restaurant,Sandwich Place,Convenience Store,Park,Department Store,Dessert Shop,1
10,"Dorset Park, Scarborough Town Centre, Wexford ...",Fast Food Restaurant,Coffee Shop,Electronics Store,Chinese Restaurant,Grocery Store,Pizza Place,Burger Joint,Intersection,Wings Joint,Pet Store,1
12,Agincourt,Chinese Restaurant,Coffee Shop,Japanese Restaurant,Shopping Mall,Gym / Fitness Center,Hong Kong Restaurant,Breakfast Spot,Bakery,Caribbean Restaurant,Park,1
13,"Clarks Corners, Sullivan, Tam O'Shanter",Fast Food Restaurant,Korean Restaurant,Vietnamese Restaurant,Chinese Restaurant,Bakery,Coffee Shop,Park,Sandwich Place,Pizza Place,Falafel Restaurant,1


#### Cluster 2

In [150]:
# Cluster 2: neighborhood name (column 2) plus every column from position 5 onward
shown_columns = trt_merged.columns[[2] + list(range(5, trt_merged.shape[1]))]
in_cluster = trt_merged['Label'] == 2
trt_merged.loc[in_cluster, shown_columns]

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue,Label
16,Upper Rouge,National Park,Donut Shop,Farm,Eye Doctor,Drugstore,Dumpling Restaurant,Eastern European Restaurant,Egyptian Restaurant,Electronics Store,Ethiopian Restaurant,2


#### Cluster 3

In [137]:
# Cluster 3: neighborhood name (column 2) plus every column from position 5 onward
shown_columns = trt_merged.columns[[2] + list(range(5, trt_merged.shape[1]))]
in_cluster = trt_merged['Label'] == 3
trt_merged.loc[in_cluster, shown_columns]

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue,Label
1,"Highland Creek, Rouge Hill, Port Union",Hotel,Italian Restaurant,Breakfast Spot,Pizza Place,Grocery Store,Gym,Burger Joint,Park,Dumpling Restaurant,Eastern European Restaurant,3
8,"Cliffcrest, Cliffside, Scarborough Village West",Harbor / Marina,Park,Fast Food Restaurant,Pharmacy,Pizza Place,Sandwich Place,Grocery Store,Beach,Discount Store,Sushi Restaurant,3
9,"Birch Cliff, Cliffside West",Park,College Stadium,Thai Restaurant,Gym,Golf Course,General Entertainment,Filipino Restaurant,Fast Food Restaurant,Diner,Restaurant,3
19,Bayview Village,Trail,Bank,Intersection,Chinese Restaurant,Park,Shopping Mall,Grocery Store,Skating Rink,Japanese Restaurant,Café,3
28,"Bathurst Manor, Downsview North, Wilson Heights",Park,Coffee Shop,Pizza Place,Bank,Shopping Mall,Gas Station,Baseball Field,Restaurant,Sushi Restaurant,Frozen Yogurt Shop,3
30,"CFB Toronto, Downsview East",Athletics & Sports,Gym / Fitness Center,Turkish Restaurant,Park,Beer Store,Jewelry Store,Liquor Store,Racetrack,Department Store,Gym,3
35,"Woodbine Gardens, Parkview Hill",Pizza Place,Fast Food Restaurant,Pharmacy,Athletics & Sports,Brewery,Convenience Store,Park,Bakery,BBQ Joint,Martial Arts Dojo,3
36,Woodbine Heights,Grocery Store,Coffee Shop,Pizza Place,Gastropub,Thai Restaurant,Sandwich Place,Pharmacy,Park,Bank,Bakery,3
68,"CN Tower, Bathurst Quay, Island airport, Harbo...",Park,Coffee Shop,Café,Gym,Scenic Lookout,Historic Site,Boat or Ferry,Harbor / Marina,Hotel,Track,3
88,"Humber Bay Shores, Mimico South, New Toronto",Park,Café,Pharmacy,Bakery,Grocery Store,Indian Restaurant,Coffee Shop,Supermarket,General Entertainment,Mexican Restaurant,3


#### Cluster 4

In [151]:
# Cluster 4: neighborhood name (column 2) plus every column from position 5 onward
shown_columns = trt_merged.columns[[2] + list(range(5, trt_merged.shape[1]))]
in_cluster = trt_merged['Label'] == 4
trt_merged.loc[in_cluster, shown_columns]

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue,Label
31,Downsview West,Tea Room,Plaza,Pizza Place,Bank,Vietnamese Restaurant,Moving Target,Park,Ethiopian Restaurant,Donut Shop,Drugstore,4
