This notebook is the final project of Applied Data Capstone named battle of neighbourhood.

Import required libraries.

In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np
import json
from geopy.geocoders import Nominatim
from pandas.io.json import json_normalize
import matplotlib.cm as cm
import matplotlib.colors as colors
from sklearn.cluster import KMeans
import folium

Get the source from wikipedia

In [2]:
source = requests.get("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M").text
soup = BeautifulSoup(source,"lxml")

Scrape the table from wikipedia. Create a couple of lists first, then transform into dictionary, then dataframe. 

In [3]:
tables = soup.find('table', class_="wikitable sortable")
key = []
for heads in tables.find_all("th"):
    key.append(heads.text.strip("\n"))

value = []
for x in tables.find_all("td"):
    value.append(x.text.strip("\n"))

v1 = []
v2 = []
v3 = []
for i in range(0,len(value)):
    if i%3 == 0:
        v1.append(value[i])
    elif i%3 == 1:
        v2.append(value[i])
    else:
        v3.append(value[i])
 
d = {key[0]:v1,key[1]:v2,key[2]:v3}
df = pd.DataFrame(d)

Drop cells with a borough that is Not assigned.

In [4]:
df = df[-df["Borough"].isin(["Not assigned"])]
df.reset_index(drop=True, inplace=True)

Set the "not assigned" neighbourhood as the same to it's borough

In [5]:
for i in range(0,len(df)):
    if df["Neighbourhood"][i] == "Not assigned":
        df["Neighbourhood"][i] = df["Borough"][i]

Combine rows with same Postcode. The neighborhoods are separated by comma.

In [6]:
df_grp = df.groupby(["Postcode","Borough"], as_index=False).agg(','.join)
df_grp

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park,Ionview,Kennedy Park"
7,M1L,Scarborough,"Clairlea,Golden Mile,Oakridge"
8,M1M,Scarborough,"Cliffcrest,Cliffside,Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff,Cliffside West"


Get dataframe of location from URL.

In [7]:
URL = "http://cocl.us/Geospatial_data"
df_gps = pd.read_csv(URL)

Append latitude and longitude to the postcode.

In [8]:
df_gps.columns=['Postcode','Latitude','Longitude']
df_toronto = pd.merge(df_grp, df_gps[['Postcode','Latitude', 'Longitude']], on='Postcode')
df_toronto

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge,Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood,Morningside,West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,"East Birchmount Park,Ionview,Kennedy Park",43.727929,-79.262029
7,M1L,Scarborough,"Clairlea,Golden Mile,Oakridge",43.711112,-79.284577
8,M1M,Scarborough,"Cliffcrest,Cliffside,Scarborough Village West",43.716316,-79.239476
9,M1N,Scarborough,"Birch Cliff,Cliffside West",43.692657,-79.264848


In [9]:
address = 'Toronto, ON'
geolocator = Nominatim(user_agent="to_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.653963, -79.387207.


In [10]:
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(df_toronto['Latitude'], df_toronto['Longitude'], df_toronto['Borough'], df_toronto['Neighbourhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

Define Foursquare Credentials and Version

In [11]:
CLIENT_ID = '102ANYEA4PQ4E5QH5BMJPRCCCKYCTEH3RNGTJBVFJIRJX2IJ' # your Foursquare ID
CLIENT_SECRET = 'AGPZLMZWH2JFJGVL5XLHGYUWDSQBD2WHXEYBLV0HKYVBE5A1' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: 102ANYEA4PQ4E5QH5BMJPRCCCKYCTEH3RNGTJBVFJIRJX2IJ
CLIENT_SECRET:AGPZLMZWH2JFJGVL5XLHGYUWDSQBD2WHXEYBLV0HKYVBE5A1


In [12]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [13]:
LIMIT = 100
radius = 500
toronto_venues = getNearbyVenues(names=df_toronto['Neighbourhood'],
                                   latitudes=df_toronto['Latitude'],
                                   longitudes=df_toronto['Longitude']
                                  )
toronto_venues.groupby('Neighborhood').count()
print('There are {} uniques categories.'.format(len(toronto_venues['Venue Category'].unique())))


Rouge,Malvern
Highland Creek,Rouge Hill,Port Union
Guildwood,Morningside,West Hill
Woburn
Cedarbrae
Scarborough Village
East Birchmount Park,Ionview,Kennedy Park
Clairlea,Golden Mile,Oakridge
Cliffcrest,Cliffside,Scarborough Village West
Birch Cliff,Cliffside West
Dorset Park,Scarborough Town Centre,Wexford Heights
Maryvale,Wexford
Agincourt
Clarks Corners,Sullivan,Tam O'Shanter
Agincourt North,L'Amoreaux East,Milliken,Steeles East
L'Amoreaux West
Upper Rouge
Hillcrest Village
Fairview,Henry Farm,Oriole
Bayview Village
Silver Hills,York Mills
Newtonbrook,Willowdale
Willowdale South
York Mills West
Willowdale West
Parkwoods
Don Mills North
Flemingdon Park,Don Mills South
Bathurst Manor,Downsview North,Wilson Heights
Northwood Park,York University
CFB Toronto,Downsview East
Downsview West
Downsview Central
Downsview Northwest
Victoria Village
Woodbine Gardens,Parkview Hill
Woodbine Heights
The Beaches
Leaside
Thorncliffe Park
East Toronto
The Danforth West,Riverdale
The Beaches West,Indi

In [14]:
toronto_venues.groupby('Neighborhood').count()
print('There are {} uniques categories.'.format(len(toronto_venues['Venue Category'].unique())))

There are 280 uniques categories.


Analyze Each Neighborhood

In [15]:
# one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
toronto_onehot['Neighborhood'] = toronto_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]

In [16]:
toronto_grouped = toronto_onehot.groupby('Neighborhood').mean().reset_index()

top 5 most common venues

In [17]:
num_top_venues = 5

for hood in toronto_grouped['Neighborhood']:
    print("----"+hood+"----")
    temp = toronto_grouped[toronto_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Adelaide,King,Richmond----
                 venue  freq
0                 Café  0.05
1          Coffee Shop  0.05
2                  Bar  0.04
3           Steakhouse  0.04
4  American Restaurant  0.04


----Agincourt----
                venue  freq
0      Sandwich Place  0.25
1              Lounge  0.25
2      Breakfast Spot  0.25
3  Chinese Restaurant  0.25
4         Yoga Studio  0.00


----Agincourt North,L'Amoreaux East,Milliken,Steeles East----
                       venue  freq
0                 Playground  0.33
1           Asian Restaurant  0.33
2                       Park  0.33
3                Yoga Studio  0.00
4  Middle Eastern Restaurant  0.00


----Albion Gardens,Beaumond Heights,Humbergate,Jamestown,Mount Olive,Silverstone,South Steeles,Thistletown----
                  venue  freq
0         Grocery Store  0.18
1           Pizza Place  0.09
2           Coffee Shop  0.09
3        Sandwich Place  0.09
4  Fast Food Restaurant  0.09


----Alderwood,Long Branch----
        

In [18]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [19]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = toronto_grouped['Neighborhood']

for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,"Adelaide,King,Richmond",Coffee Shop,Café,Steakhouse,Bar,American Restaurant,Gym,Cosmetics Shop,Hotel,Burger Joint,Restaurant
1,Agincourt,Chinese Restaurant,Lounge,Sandwich Place,Breakfast Spot,Women's Store,Discount Store,Dog Run,Doner Restaurant,Donut Shop,Drugstore
2,"Agincourt North,L'Amoreaux East,Milliken,Steel...",Park,Asian Restaurant,Playground,Women's Store,Donut Shop,Dim Sum Restaurant,Diner,Discount Store,Dog Run,Doner Restaurant
3,"Albion Gardens,Beaumond Heights,Humbergate,Jam...",Grocery Store,Liquor Store,Sandwich Place,Fried Chicken Joint,Video Store,Coffee Shop,Pharmacy,Pizza Place,Beer Store,Fast Food Restaurant
4,"Alderwood,Long Branch",Pizza Place,Coffee Shop,Gym,Skating Rink,Pharmacy,Pub,Dance Studio,Pool,Sandwich Place,Women's Store


##Cluster Neighborhoods

Run *k*-means to cluster the neighborhood into 5 clusters.

In [20]:
# set number of clusters
kclusters = 5

toronto_grouped_clustering = toronto_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([0, 0, 1, 0, 0, 0, 3, 0, 0, 0], dtype=int32)

Let's create a new dataframe that includes the cluster as well as the top 10 venues for each neighborhood.

In [21]:
# add clustering labels
if 'Cluster Labels' not in neighborhoods_venues_sorted:
    neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

toronto_merged = df_toronto

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
toronto_merged = toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighbourhood')
toronto_merged = toronto_merged[-toronto_merged["Cluster Labels"].isin(["NaN"])]
toronto_merged.reset_index(drop=True, inplace=True)
toronto_merged.head() # check the last columns!

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M1B,Scarborough,"Rouge,Malvern",43.806686,-79.194353,0.0,Fast Food Restaurant,Print Shop,Department Store,Dim Sum Restaurant,Diner,Discount Store,Dog Run,Doner Restaurant,Donut Shop,Drugstore
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union",43.784535,-79.160497,0.0,Bar,Construction & Landscaping,Women's Store,Diner,Discount Store,Dog Run,Doner Restaurant,Donut Shop,Drugstore,Dumpling Restaurant
2,M1E,Scarborough,"Guildwood,Morningside,West Hill",43.763573,-79.188711,0.0,Medical Center,Pizza Place,Intersection,Tech Startup,Mexican Restaurant,Breakfast Spot,Rental Car Location,Spa,Electronics Store,Dumpling Restaurant
3,M1G,Scarborough,Woburn,43.770992,-79.216917,0.0,Coffee Shop,Korean Restaurant,Convenience Store,Eastern European Restaurant,Discount Store,Dog Run,Doner Restaurant,Donut Shop,Drugstore,Dumpling Restaurant
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476,0.0,Athletics & Sports,Thai Restaurant,Bank,Bakery,Fried Chicken Joint,Caribbean Restaurant,Lounge,Hakka Restaurant,Cosmetics Shop,Costume Shop


Finally, let's visualize the resulting clusters

In [22]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighbourhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[int(cluster-1)],
        fill=True,
        fill_color=rainbow[int(cluster-1)],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

In [23]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 0, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Scarborough,0.0,Fast Food Restaurant,Print Shop,Department Store,Dim Sum Restaurant,Diner,Discount Store,Dog Run,Doner Restaurant,Donut Shop,Drugstore
1,Scarborough,0.0,Bar,Construction & Landscaping,Women's Store,Diner,Discount Store,Dog Run,Doner Restaurant,Donut Shop,Drugstore,Dumpling Restaurant
2,Scarborough,0.0,Medical Center,Pizza Place,Intersection,Tech Startup,Mexican Restaurant,Breakfast Spot,Rental Car Location,Spa,Electronics Store,Dumpling Restaurant
3,Scarborough,0.0,Coffee Shop,Korean Restaurant,Convenience Store,Eastern European Restaurant,Discount Store,Dog Run,Doner Restaurant,Donut Shop,Drugstore,Dumpling Restaurant
4,Scarborough,0.0,Athletics & Sports,Thai Restaurant,Bank,Bakery,Fried Chicken Joint,Caribbean Restaurant,Lounge,Hakka Restaurant,Cosmetics Shop,Costume Shop
5,Scarborough,0.0,Playground,Women's Store,Drugstore,Dim Sum Restaurant,Diner,Discount Store,Dog Run,Doner Restaurant,Donut Shop,Dumpling Restaurant
6,Scarborough,0.0,Coffee Shop,Playground,Discount Store,Department Store,Women's Store,Donut Shop,Dim Sum Restaurant,Diner,Dog Run,Doner Restaurant
7,Scarborough,0.0,Bakery,Bus Line,Soccer Field,Metro Station,Bus Station,Intersection,Fast Food Restaurant,Park,Electronics Store,Eastern European Restaurant
8,Scarborough,0.0,Motel,American Restaurant,Women's Store,Dim Sum Restaurant,Diner,Discount Store,Dog Run,Doner Restaurant,Donut Shop,Drugstore
9,Scarborough,0.0,College Stadium,General Entertainment,Skating Rink,Café,Comic Shop,Concert Hall,Event Space,Ethiopian Restaurant,Empanada Restaurant,Colombian Restaurant


In [24]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 1, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
14,Scarborough,1.0,Park,Asian Restaurant,Playground,Women's Store,Donut Shop,Dim Sum Restaurant,Diner,Discount Store,Dog Run,Doner Restaurant
21,North York,1.0,Park,Convenience Store,Bank,Bar,Women's Store,Drugstore,Discount Store,Dog Run,Doner Restaurant,Donut Shop
23,North York,1.0,Fast Food Restaurant,Park,Food & Drink Shop,Event Space,Ethiopian Restaurant,Empanada Restaurant,Electronics Store,Eastern European Restaurant,Dumpling Restaurant,Department Store
28,North York,1.0,Park,Other Repair Shop,Airport,Women's Store,Drugstore,Dim Sum Restaurant,Diner,Discount Store,Dog Run,Doner Restaurant
38,East York,1.0,Park,Coffee Shop,Convenience Store,Women's Store,Dumpling Restaurant,Diner,Discount Store,Dog Run,Doner Restaurant,Donut Shop
42,Central Toronto,1.0,Park,Bus Line,Swim School,Women's Store,Donut Shop,Diner,Discount Store,Dog Run,Doner Restaurant,Drugstore
48,Downtown Toronto,1.0,Park,Playground,Trail,Ethiopian Restaurant,Empanada Restaurant,Event Space,Electronics Store,Eastern European Restaurant,Dumpling Restaurant,Deli / Bodega
62,Central Toronto,1.0,Park,Jewelry Store,Sushi Restaurant,Trail,Empanada Restaurant,Electronics Store,Eastern European Restaurant,Dumpling Restaurant,Drugstore,Department Store
70,North York,1.0,Park,Pizza Place,Japanese Restaurant,Pub,Doner Restaurant,Dessert Shop,Dim Sum Restaurant,Diner,Discount Store,Dog Run
72,York,1.0,Park,Women's Store,Pharmacy,Fast Food Restaurant,Market,Donut Shop,Dim Sum Restaurant,Diner,Discount Store,Dog Run


In [25]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 2, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
89,Etobicoke,2.0,Baseball Field,Dumpling Restaurant,Diner,Discount Store,Dog Run,Doner Restaurant,Donut Shop,Drugstore,Women's Store,Dessert Shop
94,North York,2.0,Baseball Field,Dumpling Restaurant,Diner,Discount Store,Dog Run,Doner Restaurant,Donut Shop,Drugstore,Women's Store,Dessert Shop


In [26]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 3, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
18,North York,3.0,Café,Japanese Restaurant,Bank,Chinese Restaurant,Dim Sum Restaurant,Discount Store,Dog Run,Doner Restaurant,Donut Shop,Drugstore
91,Etobicoke,3.0,Bank,Women's Store,Drugstore,Diner,Discount Store,Dog Run,Doner Restaurant,Donut Shop,Dumpling Restaurant,Dessert Shop


In [27]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 4, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
19,North York,4.0,Cafeteria,Women's Store,Drugstore,Dim Sum Restaurant,Diner,Discount Store,Dog Run,Doner Restaurant,Donut Shop,Dumpling Restaurant


Refine *k*-means to cluster the neighborhood into 5 clusters. #1

In [28]:
for j in range(1,5):
    d1 = toronto_merged[toronto_merged["Cluster Labels"].isin([j])]
    d1.reset_index(drop=True, inplace=True)
    for i in range(0,len(d1)):
        df_toronto=df_toronto[-df_toronto['Neighbourhood'].isin([d1['Neighbourhood'][i]])]
df_toronto.reset_index(drop=True, inplace=True)
print(df_toronto)

   Postcode           Borough  \
0       M1B       Scarborough   
1       M1C       Scarborough   
2       M1E       Scarborough   
3       M1G       Scarborough   
4       M1H       Scarborough   
5       M1J       Scarborough   
6       M1K       Scarborough   
7       M1L       Scarborough   
8       M1M       Scarborough   
9       M1N       Scarborough   
10      M1P       Scarborough   
11      M1R       Scarborough   
12      M1S       Scarborough   
13      M1T       Scarborough   
14      M1W       Scarborough   
15      M1X       Scarborough   
16      M2H        North York   
17      M2J        North York   
18      M2M        North York   
19      M2N        North York   
20      M2R        North York   
21      M3B        North York   
22      M3C        North York   
23      M3H        North York   
24      M3J        North York   
25      M3L        North York   
26      M3M        North York   
27      M3N        North York   
28      M4A        North York   
29      M4

In [29]:
LIMIT = 100
radius = 500
toronto_venues = getNearbyVenues(names=df_toronto['Neighbourhood'],
                                   latitudes=df_toronto['Latitude'],
                                   longitudes=df_toronto['Longitude']
                                  )
toronto_venues.groupby('Neighborhood').count()
print('There are {} uniques categories.'.format(len(toronto_venues['Venue Category'].unique())))


Rouge,Malvern
Highland Creek,Rouge Hill,Port Union
Guildwood,Morningside,West Hill
Woburn
Cedarbrae
Scarborough Village
East Birchmount Park,Ionview,Kennedy Park
Clairlea,Golden Mile,Oakridge
Cliffcrest,Cliffside,Scarborough Village West
Birch Cliff,Cliffside West
Dorset Park,Scarborough Town Centre,Wexford Heights
Maryvale,Wexford
Agincourt
Clarks Corners,Sullivan,Tam O'Shanter
L'Amoreaux West
Upper Rouge
Hillcrest Village
Fairview,Henry Farm,Oriole
Newtonbrook,Willowdale
Willowdale South
Willowdale West
Don Mills North
Flemingdon Park,Don Mills South
Bathurst Manor,Downsview North,Wilson Heights
Northwood Park,York University
Downsview West
Downsview Central
Downsview Northwest
Victoria Village
Woodbine Gardens,Parkview Hill
Woodbine Heights
The Beaches
Leaside
Thorncliffe Park
The Danforth West,Riverdale
The Beaches West,India Bazaar
Studio District
Davisville North
North Toronto West
Davisville
Moore Park,Summerhill East
Deer Park,Forest Hill SE,Rathnelly,South Hill,Summerhill West

In [30]:
# one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
toronto_onehot['Neighborhood'] = toronto_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]

In [31]:
toronto_grouped = toronto_onehot.groupby('Neighborhood').mean().reset_index()

top 5 most common venues

In [32]:
num_top_venues = 5

for hood in toronto_grouped['Neighborhood']:
    print("----"+hood+"----")
    temp = toronto_grouped[toronto_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Adelaide,King,Richmond----
                 venue  freq
0          Coffee Shop  0.06
1                 Café  0.05
2           Steakhouse  0.04
3  American Restaurant  0.04
4                  Bar  0.04


----Agincourt----
                venue  freq
0              Lounge  0.25
1      Sandwich Place  0.25
2      Breakfast Spot  0.25
3  Chinese Restaurant  0.25
4         Yoga Studio  0.00


----Albion Gardens,Beaumond Heights,Humbergate,Jamestown,Mount Olive,Silverstone,South Steeles,Thistletown----
                  venue  freq
0         Grocery Store  0.18
1           Pizza Place  0.09
2  Fast Food Restaurant  0.09
3   Fried Chicken Joint  0.09
4        Sandwich Place  0.09


----Alderwood,Long Branch----
            venue  freq
0     Pizza Place   0.2
1        Pharmacy   0.1
2             Gym   0.1
3    Skating Rink   0.1
4  Sandwich Place   0.1


----Bathurst Manor,Downsview North,Wilson Heights----
                       venue  freq
0                Coffee Shop  0.11
1           

In [33]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [34]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = toronto_grouped['Neighborhood']

for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,"Adelaide,King,Richmond",Coffee Shop,Café,Bar,American Restaurant,Steakhouse,Thai Restaurant,Gym,Burger Joint,Hotel,Restaurant
1,Agincourt,Lounge,Chinese Restaurant,Sandwich Place,Breakfast Spot,Diner,Discount Store,Dog Run,Doner Restaurant,Donut Shop,Drugstore
2,"Albion Gardens,Beaumond Heights,Humbergate,Jam...",Grocery Store,Coffee Shop,Beer Store,Video Store,Pharmacy,Pizza Place,Liquor Store,Fried Chicken Joint,Fast Food Restaurant,Sandwich Place
3,"Alderwood,Long Branch",Pizza Place,Gym,Coffee Shop,Skating Rink,Pub,Pharmacy,Dance Studio,Sandwich Place,Pool,Diner
4,"Bathurst Manor,Downsview North,Wilson Heights",Coffee Shop,Middle Eastern Restaurant,Pizza Place,Deli / Bodega,Shopping Mall,Bank,Fried Chicken Joint,Frozen Yogurt Shop,Restaurant,Diner


##Cluster Neighborhoods

Run *k*-means to cluster the neighborhood into 5 clusters.

In [35]:
# set number of clusters
kclusters = 5

toronto_grouped_clustering = toronto_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([2, 2, 0, 0, 2, 2, 2, 2, 0, 2], dtype=int32)

Let's create a new dataframe that includes the cluster as well as the top 10 venues for each neighborhood.

In [36]:
# add clustering labels
if 'Cluster Labels' not in neighborhoods_venues_sorted:
    neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

toronto_merged = df_toronto

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
toronto_merged = toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighbourhood')
toronto_merged = toronto_merged[-toronto_merged["Cluster Labels"].isin(["NaN"])]
toronto_merged.reset_index(drop=True, inplace=True)
toronto_merged.head() # check the last columns!

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M1B,Scarborough,"Rouge,Malvern",43.806686,-79.194353,0.0,Fast Food Restaurant,Print Shop,Women's Store,Doner Restaurant,Dessert Shop,Dim Sum Restaurant,Diner,Discount Store,Dog Run,Donut Shop
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union",43.784535,-79.160497,2.0,Bar,Construction & Landscaping,Women's Store,Diner,Discount Store,Dog Run,Doner Restaurant,Donut Shop,Drugstore,Dumpling Restaurant
2,M1E,Scarborough,"Guildwood,Morningside,West Hill",43.763573,-79.188711,2.0,Electronics Store,Breakfast Spot,Tech Startup,Intersection,Medical Center,Mexican Restaurant,Rental Car Location,Pizza Place,Spa,Donut Shop
3,M1G,Scarborough,Woburn,43.770992,-79.216917,1.0,Coffee Shop,Korean Restaurant,Convenience Store,Women's Store,Drugstore,Diner,Discount Store,Dog Run,Doner Restaurant,Donut Shop
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476,2.0,Hakka Restaurant,Thai Restaurant,Lounge,Bank,Bakery,Fried Chicken Joint,Caribbean Restaurant,Athletics & Sports,Coworking Space,Creperie


Finally, let's visualize the resulting clusters

In [37]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighbourhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[int(cluster-1)],
        fill=True,
        fill_color=rainbow[int(cluster-1)],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

In [38]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 0, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Scarborough,0.0,Fast Food Restaurant,Print Shop,Women's Store,Doner Restaurant,Dessert Shop,Dim Sum Restaurant,Diner,Discount Store,Dog Run,Donut Shop
13,Scarborough,0.0,Pizza Place,Thai Restaurant,Italian Restaurant,Fast Food Restaurant,Pharmacy,Noodle House,Fried Chicken Joint,Bank,Chinese Restaurant,Women's Store
14,Scarborough,0.0,Fast Food Restaurant,Chinese Restaurant,Coffee Shop,Thrift / Vintage Store,Pharmacy,Pizza Place,Sandwich Place,Breakfast Spot,American Restaurant,Grocery Store
18,North York,0.0,Grocery Store,Pharmacy,Pizza Place,Coffee Shop,Discount Store,Women's Store,Doner Restaurant,Dessert Shop,Dim Sum Restaurant,Diner
25,North York,0.0,Gym / Fitness Center,Discount Store,Athletics & Sports,Liquor Store,Grocery Store,Dessert Shop,Falafel Restaurant,Event Space,Ethiopian Restaurant,Empanada Restaurant
27,East York,0.0,Pizza Place,Fast Food Restaurant,Gym / Fitness Center,Café,Bank,Athletics & Sports,Gastropub,Pharmacy,Pet Store,Breakfast Spot
67,York,0.0,Convenience Store,Grocery Store,Pizza Place,Bus Line,Donut Shop,Dim Sum Restaurant,Diner,Discount Store,Dog Run,Doner Restaurant
75,Etobicoke,0.0,Pizza Place,Gym,Coffee Shop,Skating Rink,Pub,Pharmacy,Dance Studio,Sandwich Place,Pool,Diner
77,Etobicoke,0.0,Café,Beer Store,Liquor Store,Pharmacy,Pizza Place,Convenience Store,Donut Shop,Dim Sum Restaurant,Diner,Discount Store
78,North York,0.0,Pharmacy,Pizza Place,Empanada Restaurant,Women's Store,Doner Restaurant,Dessert Shop,Dim Sum Restaurant,Diner,Discount Store,Dog Run


In [39]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 1, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
3,Scarborough,1.0,Coffee Shop,Korean Restaurant,Convenience Store,Women's Store,Drugstore,Diner,Discount Store,Dog Run,Doner Restaurant,Donut Shop
72,Mississauga,1.0,Hotel,Coffee Shop,Gym / Fitness Center,American Restaurant,Burrito Place,Sandwich Place,Mediterranean Restaurant,Fried Chicken Joint,Dumpling Restaurant,Drugstore


In [40]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 2, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
1,Scarborough,2.0,Bar,Construction & Landscaping,Women's Store,Diner,Discount Store,Dog Run,Doner Restaurant,Donut Shop,Drugstore,Dumpling Restaurant
2,Scarborough,2.0,Electronics Store,Breakfast Spot,Tech Startup,Intersection,Medical Center,Mexican Restaurant,Rental Car Location,Pizza Place,Spa,Donut Shop
4,Scarborough,2.0,Hakka Restaurant,Thai Restaurant,Lounge,Bank,Bakery,Fried Chicken Joint,Caribbean Restaurant,Athletics & Sports,Coworking Space,Creperie
6,Scarborough,2.0,Department Store,Coffee Shop,Playground,Discount Store,Donut Shop,Dessert Shop,Dim Sum Restaurant,Diner,Dog Run,Doner Restaurant
7,Scarborough,2.0,Bus Line,Bakery,Soccer Field,Intersection,Metro Station,Bus Station,Fast Food Restaurant,Park,General Travel,General Entertainment
8,Scarborough,2.0,Motel,American Restaurant,Deli / Bodega,Dessert Shop,Dim Sum Restaurant,Diner,Discount Store,Dog Run,Doner Restaurant,Donut Shop
9,Scarborough,2.0,College Stadium,General Entertainment,Skating Rink,Café,Comic Shop,Dessert Shop,Ethiopian Restaurant,Empanada Restaurant,Electronics Store,Colombian Restaurant
10,Scarborough,2.0,Indian Restaurant,Chinese Restaurant,Pet Store,Vietnamese Restaurant,Latin American Restaurant,Women's Store,Dessert Shop,Dim Sum Restaurant,Diner,Discount Store
11,Scarborough,2.0,Middle Eastern Restaurant,Shopping Mall,Bakery,Sandwich Place,Breakfast Spot,Auto Garage,Eastern European Restaurant,Dumpling Restaurant,Drugstore,Event Space
12,Scarborough,2.0,Lounge,Chinese Restaurant,Sandwich Place,Breakfast Spot,Diner,Discount Store,Dog Run,Doner Restaurant,Donut Shop,Drugstore


In [41]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 3, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
52,Central Toronto,3.0,Ice Cream Shop,Garden,Women's Store,Department Store,Dim Sum Restaurant,Diner,Discount Store,Dog Run,Doner Restaurant,Donut Shop


In [42]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 4, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
5,Scarborough,4.0,Playground,Women's Store,Donut Shop,Dessert Shop,Dim Sum Restaurant,Diner,Discount Store,Dog Run,Doner Restaurant,Drugstore
38,Central Toronto,4.0,Tennis Court,Playground,Women's Store,Doner Restaurant,Dessert Shop,Dim Sum Restaurant,Diner,Discount Store,Dog Run,Donut Shop


Refine *k*-means to cluster the neighborhood into 5 clusters. #2

In [43]:
for j in [0,1,3,4]:
    d1 = toronto_merged[toronto_merged["Cluster Labels"].isin([j])]
    d1.reset_index(drop=True, inplace=True)
    for i in range(0,len(d1)):
        df_toronto=df_toronto[-df_toronto['Neighbourhood'].isin([d1['Neighbourhood'][i]])]
df_toronto.reset_index(drop=True, inplace=True)

In [44]:
LIMIT = 100
radius = 500
toronto_venues = getNearbyVenues(names=df_toronto['Neighbourhood'],
                                   latitudes=df_toronto['Latitude'],
                                   longitudes=df_toronto['Longitude']
                                  )
toronto_venues.groupby('Neighborhood').count()
print('There are {} uniques categories.'.format(len(toronto_venues['Venue Category'].unique())))


Highland Creek,Rouge Hill,Port Union
Guildwood,Morningside,West Hill
Cedarbrae
East Birchmount Park,Ionview,Kennedy Park
Clairlea,Golden Mile,Oakridge
Cliffcrest,Cliffside,Scarborough Village West
Birch Cliff,Cliffside West
Dorset Park,Scarborough Town Centre,Wexford Heights
Maryvale,Wexford
Agincourt
Upper Rouge
Hillcrest Village
Fairview,Henry Farm,Oriole
Newtonbrook,Willowdale
Willowdale South
Don Mills North
Flemingdon Park,Don Mills South
Bathurst Manor,Downsview North,Wilson Heights
Northwood Park,York University
Downsview West
Downsview Central
Victoria Village
Woodbine Heights
The Beaches
Leaside
Thorncliffe Park
The Danforth West,Riverdale
The Beaches West,India Bazaar
Studio District
Davisville North
North Toronto West
Davisville
Deer Park,Forest Hill SE,Rathnelly,South Hill,Summerhill West
Cabbagetown,St. James Town
Church and Wellesley
Harbourfront,Regent Park
Ryerson,Garden District
St. James Town
Berczy Park
Central Bay Street
Adelaide,King,Richmond
Harbourfront East,Toro

In [45]:
# one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
toronto_onehot['Neighborhood'] = toronto_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]

In [46]:
toronto_grouped = toronto_onehot.groupby('Neighborhood').mean().reset_index()

top 5 most common venues

In [47]:
num_top_venues = 5

for hood in toronto_grouped['Neighborhood']:
    print("----"+hood+"----")
    temp = toronto_grouped[toronto_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})


----Adelaide,King,Richmond----
----Agincourt----
----Bathurst Manor,Downsview North,Wilson Heights----
----Bedford Park,Lawrence Manor East----
----Berczy Park----
----Birch Cliff,Cliffside West----
----Brockton,Exhibition Place,Parkdale Village----
----Business Reply Mail Processing Centre 969 Eastern----
----CN Tower,Bathurst Quay,Island airport,Harbourfront West,King and Spadina,Railway Lands,South Niagara----
----Cabbagetown,St. James Town----
----Cedarbrae----
----Central Bay Street----
----Chinatown,Grange Park,Kensington Market----
----Christie----
----Church and Wellesley----
----Clairlea,Golden Mile,Oakridge----
----Cliffcrest,Cliffside,Scarborough Village West----
----Commerce Court,Victoria Hotel----
----Davisville----
----Davisville North----
----Deer Park,Forest Hill SE,Rathnelly,South Hill,Summerhill West----
----Del Ray,Keelesdale,Mount Dennis,Silverthorn----
----Design Exchange,Toronto Dominion Centre----
----Don Mills North----
----Dorset Park,Scarborough Town Centre,W

In [48]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [49]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = toronto_grouped['Neighborhood']

for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,"Adelaide,King,Richmond",Café,Coffee Shop,American Restaurant,Steakhouse,Bar,Hotel,Gym,Cosmetics Shop,Bakery,Thai Restaurant
1,Agincourt,Lounge,Chinese Restaurant,Sandwich Place,Breakfast Spot,Dim Sum Restaurant,Diner,Discount Store,Dog Run,Doner Restaurant,Donut Shop
2,"Bathurst Manor,Downsview North,Wilson Heights",Coffee Shop,Shopping Mall,Pizza Place,Deli / Bodega,Bank,Middle Eastern Restaurant,Restaurant,Fried Chicken Joint,Frozen Yogurt Shop,Bridal Shop
3,"Bedford Park,Lawrence Manor East",Coffee Shop,Italian Restaurant,Fast Food Restaurant,Pharmacy,Thai Restaurant,Pub,Indian Restaurant,Ice Cream Shop,Café,Sushi Restaurant
4,Berczy Park,Coffee Shop,Cocktail Bar,Café,Steakhouse,Seafood Restaurant,Bakery,Beer Bar,Italian Restaurant,Cheese Shop,Farmers Market


##Cluster Neighborhoods

Run *k*-means to cluster the neighborhood into 5 clusters.

In [50]:
# set number of clusters
kclusters = 5

toronto_grouped_clustering = toronto_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([0, 3, 0, 0, 0, 3, 0, 3, 3, 0], dtype=int32)

Let's create a new dataframe that includes the cluster as well as the top 10 venues for each neighborhood.

In [51]:
# add clustering labels
if 'Cluster Labels' not in neighborhoods_venues_sorted:
    neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

toronto_merged = df_toronto

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
toronto_merged = toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighbourhood')
toronto_merged = toronto_merged[-toronto_merged["Cluster Labels"].isin(["NaN"])]
toronto_merged.reset_index(drop=True, inplace=True)
toronto_merged.head() # check the last columns!

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union",43.784535,-79.160497,4.0,Bar,Construction & Landscaping,Women's Store,Diner,Discount Store,Dog Run,Doner Restaurant,Donut Shop,Drugstore,Dumpling Restaurant
1,M1E,Scarborough,"Guildwood,Morningside,West Hill",43.763573,-79.188711,3.0,Rental Car Location,Medical Center,Electronics Store,Spa,Tech Startup,Pizza Place,Breakfast Spot,Intersection,Mexican Restaurant,Department Store
2,M1H,Scarborough,Cedarbrae,43.773136,-79.239476,3.0,Hakka Restaurant,Thai Restaurant,Lounge,Bank,Bakery,Fried Chicken Joint,Caribbean Restaurant,Athletics & Sports,Costume Shop,Coworking Space
3,M1K,Scarborough,"East Birchmount Park,Ionview,Kennedy Park",43.727929,-79.262029,0.0,Department Store,Coffee Shop,Playground,Discount Store,Women's Store,Doner Restaurant,Dessert Shop,Dim Sum Restaurant,Diner,Dog Run
4,M1L,Scarborough,"Clairlea,Golden Mile,Oakridge",43.711112,-79.284577,3.0,Bakery,Bus Line,Park,Metro Station,Fast Food Restaurant,Intersection,Bus Station,Soccer Field,Cosmetics Shop,Comic Shop


Finally, let's visualize the resulting clusters

In [52]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighbourhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[int(cluster-1)],
        fill=True,
        fill_color=rainbow[int(cluster-1)],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

In [53]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 0, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
3,Scarborough,0.0,Department Store,Coffee Shop,Playground,Discount Store,Women's Store,Doner Restaurant,Dessert Shop,Dim Sum Restaurant,Diner,Dog Run
11,North York,0.0,Clothing Store,Fast Food Restaurant,Coffee Shop,Restaurant,Tea Room,Japanese Restaurant,Metro Station,Bakery,Kids Store,Toy / Game Store
12,North York,0.0,Sushi Restaurant,Restaurant,Ramen Restaurant,Coffee Shop,Café,Japanese Restaurant,Sandwich Place,Grocery Store,Pizza Place,Ice Cream Shop
14,North York,0.0,Gym,Coffee Shop,Beer Store,Asian Restaurant,Grocery Store,Sandwich Place,Japanese Restaurant,Sporting Goods Shop,Fast Food Restaurant,Smoke Shop
15,North York,0.0,Coffee Shop,Shopping Mall,Pizza Place,Deli / Bodega,Bank,Middle Eastern Restaurant,Restaurant,Fried Chicken Joint,Frozen Yogurt Shop,Bridal Shop
16,North York,0.0,Falafel Restaurant,Massage Studio,Caribbean Restaurant,Metro Station,Bar,Miscellaneous Shop,Coffee Shop,Drugstore,Donut Shop,Dessert Shop
19,North York,0.0,Portuguese Restaurant,Intersection,Coffee Shop,Hockey Arena,Women's Store,Dessert Shop,Dim Sum Restaurant,Diner,Discount Store,Dog Run
22,East York,0.0,Sporting Goods Shop,Coffee Shop,Grocery Store,Sushi Restaurant,Burger Joint,Furniture / Home Store,Bagel Shop,Liquor Store,Supermarket,Bank
23,East York,0.0,Indian Restaurant,Yoga Studio,Supermarket,Grocery Store,Gym,Housing Development,Intersection,Liquor Store,Discount Store,Park
24,East Toronto,0.0,Greek Restaurant,Coffee Shop,Italian Restaurant,Ice Cream Shop,Furniture / Home Store,Bakery,Indian Restaurant,Diner,Pub,Yoga Studio


In [54]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 1, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
64,Etobicoke,1.0,Rental Car Location,Bar,Drugstore,Dessert Shop,Dim Sum Restaurant,Diner,Discount Store,Dog Run,Doner Restaurant,Donut Shop


In [55]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 2, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
5,Scarborough,2.0,Motel,American Restaurant,Women's Store,Deli / Bodega,Dessert Shop,Dim Sum Restaurant,Diner,Discount Store,Dog Run,Doner Restaurant


In [56]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 3, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
1,Scarborough,3.0,Rental Car Location,Medical Center,Electronics Store,Spa,Tech Startup,Pizza Place,Breakfast Spot,Intersection,Mexican Restaurant,Department Store
2,Scarborough,3.0,Hakka Restaurant,Thai Restaurant,Lounge,Bank,Bakery,Fried Chicken Joint,Caribbean Restaurant,Athletics & Sports,Costume Shop,Coworking Space
4,Scarborough,3.0,Bakery,Bus Line,Park,Metro Station,Fast Food Restaurant,Intersection,Bus Station,Soccer Field,Cosmetics Shop,Comic Shop
6,Scarborough,3.0,Café,General Entertainment,College Stadium,Skating Rink,Dessert Shop,Dim Sum Restaurant,Diner,Discount Store,Dog Run,Doner Restaurant
7,Scarborough,3.0,Indian Restaurant,Latin American Restaurant,Pet Store,Vietnamese Restaurant,Chinese Restaurant,Women's Store,Doner Restaurant,Dessert Shop,Dim Sum Restaurant,Diner
8,Scarborough,3.0,Middle Eastern Restaurant,Shopping Mall,Bakery,Sandwich Place,Breakfast Spot,Auto Garage,Dumpling Restaurant,Drugstore,Donut Shop,Event Space
9,Scarborough,3.0,Lounge,Chinese Restaurant,Sandwich Place,Breakfast Spot,Dim Sum Restaurant,Diner,Discount Store,Dog Run,Doner Restaurant,Donut Shop
10,North York,3.0,Pool,Golf Course,Dog Run,Mediterranean Restaurant,Women's Store,Dessert Shop,Dim Sum Restaurant,Diner,Discount Store,Doner Restaurant
13,North York,3.0,Café,Baseball Field,Gym / Fitness Center,Japanese Restaurant,Caribbean Restaurant,Dim Sum Restaurant,Diner,Discount Store,Dog Run,Doner Restaurant
17,North York,3.0,Grocery Store,Moving Target,Shopping Mall,Bank,Women's Store,Doner Restaurant,Dim Sum Restaurant,Diner,Discount Store,Dog Run


In [57]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 4, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Scarborough,4.0,Bar,Construction & Landscaping,Women's Store,Diner,Discount Store,Dog Run,Doner Restaurant,Donut Shop,Drugstore,Dumpling Restaurant


Refine *k*-means to cluster the neighborhood into 5 clusters. #3

In [58]:
for j in [1,2,3,4]:
    d1 = toronto_merged[toronto_merged["Cluster Labels"].isin([j])]
    d1.reset_index(drop=True, inplace=True)
    for i in range(0,len(d1)):
        df_toronto=df_toronto[-df_toronto['Neighbourhood'].isin([d1['Neighbourhood'][i]])]
df_toronto.reset_index(drop=True, inplace=True)

In [59]:
LIMIT = 100
radius = 500
toronto_venues = getNearbyVenues(names=df_toronto['Neighbourhood'],
                                   latitudes=df_toronto['Latitude'],
                                   longitudes=df_toronto['Longitude']
                                  )
toronto_venues.groupby('Neighborhood').count()
print('There are {} uniques categories.'.format(len(toronto_venues['Venue Category'].unique())))


East Birchmount Park,Ionview,Kennedy Park
Upper Rouge
Fairview,Henry Farm,Oriole
Newtonbrook,Willowdale
Willowdale South
Flemingdon Park,Don Mills South
Bathurst Manor,Downsview North,Wilson Heights
Northwood Park,York University
Victoria Village
Leaside
Thorncliffe Park
The Danforth West,Riverdale
Studio District
North Toronto West
Davisville
Deer Park,Forest Hill SE,Rathnelly,South Hill,Summerhill West
Cabbagetown,St. James Town
Church and Wellesley
Harbourfront,Regent Park
Ryerson,Garden District
St. James Town
Berczy Park
Central Bay Street
Adelaide,King,Richmond
Harbourfront East,Toronto Islands,Union Station
Design Exchange,Toronto Dominion Centre
Commerce Court,Victoria Hotel
Bedford Park,Lawrence Manor East
The Annex,North Midtown,Yorkville
Harbord,University of Toronto
Stn A PO Boxes 25 The Esplanade
First Canadian Place,Underground city
Lawrence Heights,Lawrence Manor
Little Portugal,Trinity
Brockton,Exhibition Place,Parkdale Village
Del Ray,Keelesdale,Mount Dennis,Silverthor

In [60]:
# one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
toronto_onehot['Neighborhood'] = toronto_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]

In [61]:
toronto_grouped = toronto_onehot.groupby('Neighborhood').mean().reset_index()

top 5 most common venues

In [62]:
num_top_venues = 5

for hood in toronto_grouped['Neighborhood']:
    print("----"+hood+"----")
    temp = toronto_grouped[toronto_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})


----Adelaide,King,Richmond----
----Bathurst Manor,Downsview North,Wilson Heights----
----Bedford Park,Lawrence Manor East----
----Berczy Park----
----Brockton,Exhibition Place,Parkdale Village----
----Cabbagetown,St. James Town----
----Central Bay Street----
----Church and Wellesley----
----Commerce Court,Victoria Hotel----
----Davisville----
----Deer Park,Forest Hill SE,Rathnelly,South Hill,Summerhill West----
----Del Ray,Keelesdale,Mount Dennis,Silverthorn----
----Design Exchange,Toronto Dominion Centre----
----East Birchmount Park,Ionview,Kennedy Park----
----Fairview,Henry Farm,Oriole----
----First Canadian Place,Underground city----
----Flemingdon Park,Don Mills South----
----Harbord,University of Toronto----
----Harbourfront East,Toronto Islands,Union Station----
----Harbourfront,Regent Park----
----Humber Bay Shores,Mimico South,New Toronto----
----Lawrence Heights,Lawrence Manor----
----Leaside----
----Little Portugal,Trinity----
----North Toronto West----
----Northwood Park,Yo

In [63]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [64]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = toronto_grouped['Neighborhood']

for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,"Adelaide,King,Richmond",Coffee Shop,Café,Bar,Steakhouse,American Restaurant,Burger Joint,Restaurant,Thai Restaurant,Cosmetics Shop,Hotel
1,"Bathurst Manor,Downsview North,Wilson Heights",Coffee Shop,Pharmacy,Bank,Pizza Place,Deli / Bodega,Middle Eastern Restaurant,Restaurant,Diner,Sandwich Place,Shopping Mall
2,"Bedford Park,Lawrence Manor East",Fast Food Restaurant,Italian Restaurant,Coffee Shop,Grocery Store,Ice Cream Shop,Café,Liquor Store,Sandwich Place,Restaurant,Juice Bar
3,Berczy Park,Coffee Shop,Cocktail Bar,Farmers Market,Café,Beer Bar,Steakhouse,Italian Restaurant,Seafood Restaurant,Cheese Shop,Bakery
4,"Brockton,Exhibition Place,Parkdale Village",Coffee Shop,Café,Breakfast Spot,Yoga Studio,Bar,Performing Arts Venue,Pet Store,Climbing Gym,Restaurant,Caribbean Restaurant


##Cluster Neighborhoods

Run *k*-means to cluster the neighborhood into 5 clusters.

In [65]:
# set number of clusters
kclusters = 5

toronto_grouped_clustering = toronto_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int32)

Let's create a new dataframe that includes the cluster as well as the top 10 venues for each neighborhood.

In [66]:
# add clustering labels
if 'Cluster Labels' not in neighborhoods_venues_sorted:
    neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

toronto_merged = df_toronto

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
toronto_merged = toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighbourhood')
toronto_merged = toronto_merged[-toronto_merged["Cluster Labels"].isin(["NaN"])]
toronto_merged.reset_index(drop=True, inplace=True)
toronto_merged.head() # check the last columns!

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M1K,Scarborough,"East Birchmount Park,Ionview,Kennedy Park",43.727929,-79.262029,3.0,Playground,Coffee Shop,Discount Store,Department Store,Dim Sum Restaurant,Fast Food Restaurant,Farmers Market,Falafel Restaurant,Event Space,Ethiopian Restaurant
1,M2J,North York,"Fairview,Henry Farm,Oriole",43.778517,-79.346556,1.0,Clothing Store,Fast Food Restaurant,Coffee Shop,Restaurant,Toy / Game Store,Bakery,Tea Room,Metro Station,Kids Store,Japanese Restaurant
2,M2N,North York,Willowdale South,43.77012,-79.408493,0.0,Coffee Shop,Sushi Restaurant,Restaurant,Ramen Restaurant,Sandwich Place,Japanese Restaurant,Café,Pet Store,Shopping Mall,Plaza
3,M3C,North York,"Flemingdon Park,Don Mills South",43.7259,-79.340923,1.0,Gym,Beer Store,Asian Restaurant,Coffee Shop,Dim Sum Restaurant,Italian Restaurant,Smoke Shop,Sandwich Place,Sporting Goods Shop,Restaurant
4,M3H,North York,"Bathurst Manor,Downsview North,Wilson Heights",43.754328,-79.442259,0.0,Coffee Shop,Pharmacy,Bank,Pizza Place,Deli / Bodega,Middle Eastern Restaurant,Restaurant,Diner,Sandwich Place,Shopping Mall


Finally, let's visualize the resulting clusters

In [67]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighbourhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[int(cluster-1)],
        fill=True,
        fill_color=rainbow[int(cluster-1)],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

In [68]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 0, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
2,North York,0.0,Coffee Shop,Sushi Restaurant,Restaurant,Ramen Restaurant,Sandwich Place,Japanese Restaurant,Café,Pet Store,Shopping Mall,Plaza
4,North York,0.0,Coffee Shop,Pharmacy,Bank,Pizza Place,Deli / Bodega,Middle Eastern Restaurant,Restaurant,Diner,Sandwich Place,Shopping Mall
5,North York,0.0,Massage Studio,Falafel Restaurant,Metro Station,Coffee Shop,Miscellaneous Shop,Caribbean Restaurant,Bar,Women's Store,Dog Run,Farmers Market
9,East Toronto,0.0,Greek Restaurant,Coffee Shop,Ice Cream Shop,Italian Restaurant,Furniture / Home Store,Liquor Store,Juice Bar,Bookstore,Spa,Brewery
10,East Toronto,0.0,Café,Coffee Shop,Bakery,Gastropub,American Restaurant,Italian Restaurant,Yoga Studio,Brewery,Seafood Restaurant,Sandwich Place
12,Central Toronto,0.0,Pizza Place,Dessert Shop,Sandwich Place,Restaurant,Thai Restaurant,Café,Italian Restaurant,Sushi Restaurant,Coffee Shop,Deli / Bodega
13,Central Toronto,0.0,Pub,Coffee Shop,Fried Chicken Joint,Liquor Store,Sushi Restaurant,Vietnamese Restaurant,Light Rail Station,American Restaurant,Pizza Place,Supermarket
14,Downtown Toronto,0.0,Coffee Shop,Restaurant,Park,Café,Pub,Italian Restaurant,Bakery,Pizza Place,Deli / Bodega,Playground
15,Downtown Toronto,0.0,Japanese Restaurant,Coffee Shop,Sushi Restaurant,Restaurant,Gay Bar,Men's Store,Pub,Burger Joint,Bubble Tea Shop,Café
16,Downtown Toronto,0.0,Coffee Shop,Bakery,Pub,Park,Theater,Café,Mexican Restaurant,Restaurant,Breakfast Spot,Yoga Studio


In [69]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 1, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
1,North York,1.0,Clothing Store,Fast Food Restaurant,Coffee Shop,Restaurant,Toy / Game Store,Bakery,Tea Room,Metro Station,Kids Store,Japanese Restaurant
3,North York,1.0,Gym,Beer Store,Asian Restaurant,Coffee Shop,Dim Sum Restaurant,Italian Restaurant,Smoke Shop,Sandwich Place,Sporting Goods Shop,Restaurant
7,East York,1.0,Sporting Goods Shop,Coffee Shop,Grocery Store,Furniture / Home Store,Sushi Restaurant,Burger Joint,Pet Store,Sandwich Place,Sports Bar,Breakfast Spot
8,East York,1.0,Indian Restaurant,Yoga Studio,Pharmacy,Pizza Place,Coffee Shop,Discount Store,Sandwich Place,Burger Joint,Liquor Store,Supermarket
11,Central Toronto,1.0,Coffee Shop,Yoga Studio,Bagel Shop,Fast Food Restaurant,Park,Spa,Sporting Goods Shop,Mexican Restaurant,Salon / Barbershop,Diner
30,North York,1.0,Clothing Store,Furniture / Home Store,Women's Store,Vietnamese Restaurant,Boutique,Event Space,Coffee Shop,Fraternity House,Accessories Store,Miscellaneous Shop


In [70]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 2, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
33,York,2.0,Check Cashing Service,Discount Store,Restaurant,Sandwich Place,Women's Store,Fast Food Restaurant,Farmers Market,Falafel Restaurant,Event Space,Ethiopian Restaurant


In [71]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 3, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Scarborough,3.0,Playground,Coffee Shop,Discount Store,Department Store,Dim Sum Restaurant,Fast Food Restaurant,Farmers Market,Falafel Restaurant,Event Space,Ethiopian Restaurant


In [72]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 4, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
6,North York,4.0,Hockey Arena,Portuguese Restaurant,Coffee Shop,Intersection,Dim Sum Restaurant,Fast Food Restaurant,Farmers Market,Falafel Restaurant,Event Space,Ethiopian Restaurant


Refine *k*-means to cluster the neighborhood into 5 clusters. #4

In [73]:
for j in [1,2,3,4]:
    d1 = toronto_merged[toronto_merged["Cluster Labels"].isin([j])]
    d1.reset_index(drop=True, inplace=True)
    for i in range(0,len(d1)):
        df_toronto=df_toronto[-df_toronto['Neighbourhood'].isin([d1['Neighbourhood'][i]])]
df_toronto.reset_index(drop=True, inplace=True)

In [74]:
LIMIT = 100
radius = 500
toronto_venues = getNearbyVenues(names=df_toronto['Neighbourhood'],
                                   latitudes=df_toronto['Latitude'],
                                   longitudes=df_toronto['Longitude']
                                  )
toronto_venues.groupby('Neighborhood').count()
print('There are {} uniques categories.'.format(len(toronto_venues['Venue Category'].unique())))


Upper Rouge
Newtonbrook,Willowdale
Willowdale South
Bathurst Manor,Downsview North,Wilson Heights
Northwood Park,York University
The Danforth West,Riverdale
Studio District
Davisville
Deer Park,Forest Hill SE,Rathnelly,South Hill,Summerhill West
Cabbagetown,St. James Town
Church and Wellesley
Harbourfront,Regent Park
Ryerson,Garden District
St. James Town
Berczy Park
Central Bay Street
Adelaide,King,Richmond
Harbourfront East,Toronto Islands,Union Station
Design Exchange,Toronto Dominion Centre
Commerce Court,Victoria Hotel
Bedford Park,Lawrence Manor East
The Annex,North Midtown,Yorkville
Harbord,University of Toronto
Stn A PO Boxes 25 The Esplanade
First Canadian Place,Underground city
Little Portugal,Trinity
Brockton,Exhibition Place,Parkdale Village
Parkdale,Roncesvalles
Runnymede,Swansea
Queen's Park
Humber Bay Shores,Mimico South,New Toronto
Islington Avenue
There are 211 uniques categories.


In [75]:
# one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
toronto_onehot['Neighborhood'] = toronto_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]

In [76]:
toronto_grouped = toronto_onehot.groupby('Neighborhood').mean().reset_index()

top 5 most common venues

In [77]:
num_top_venues = 5

for hood in toronto_grouped['Neighborhood']:
    print("----"+hood+"----")
    temp = toronto_grouped[toronto_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})


----Adelaide,King,Richmond----
----Bathurst Manor,Downsview North,Wilson Heights----
----Bedford Park,Lawrence Manor East----
----Berczy Park----
----Brockton,Exhibition Place,Parkdale Village----
----Cabbagetown,St. James Town----
----Central Bay Street----
----Church and Wellesley----
----Commerce Court,Victoria Hotel----
----Davisville----
----Deer Park,Forest Hill SE,Rathnelly,South Hill,Summerhill West----
----Design Exchange,Toronto Dominion Centre----
----First Canadian Place,Underground city----
----Harbord,University of Toronto----
----Harbourfront East,Toronto Islands,Union Station----
----Harbourfront,Regent Park----
----Humber Bay Shores,Mimico South,New Toronto----
----Little Portugal,Trinity----
----Northwood Park,York University----
----Parkdale,Roncesvalles----
----Queen's Park----
----Runnymede,Swansea----
----Ryerson,Garden District----
----St. James Town----
----Stn A PO Boxes 25 The Esplanade----
----Studio District----
----The Annex,North Midtown,Yorkville----
----

In [78]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [79]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = toronto_grouped['Neighborhood']

for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,"Adelaide,King,Richmond",Café,Coffee Shop,Bar,American Restaurant,Steakhouse,Thai Restaurant,Hotel,Restaurant,Gym,Bakery
1,"Bathurst Manor,Downsview North,Wilson Heights",Coffee Shop,Pizza Place,Diner,Bridal Shop,Shopping Mall,Sandwich Place,Middle Eastern Restaurant,Fried Chicken Joint,Supermarket,Sushi Restaurant
2,"Bedford Park,Lawrence Manor East",Fast Food Restaurant,Coffee Shop,Italian Restaurant,Pizza Place,Café,Butcher,Sandwich Place,Liquor Store,Restaurant,Pub
3,Berczy Park,Coffee Shop,Cocktail Bar,Italian Restaurant,Beer Bar,Steakhouse,Seafood Restaurant,Farmers Market,Café,Cheese Shop,Bakery
4,"Brockton,Exhibition Place,Parkdale Village",Coffee Shop,Café,Breakfast Spot,Yoga Studio,Stadium,Gym,Grocery Store,Intersection,Italian Restaurant,Furniture / Home Store


##Cluster Neighborhoods

Run *k*-means to cluster the neighborhood into 5 clusters.

In [80]:
# set number of clusters
kclusters = 5

toronto_grouped_clustering = toronto_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([3, 3, 0, 3, 3, 3, 1, 3, 3, 0], dtype=int32)

Let's create a new dataframe that includes the cluster as well as the top 10 venues for each neighborhood.

In [81]:
# add clustering labels
if 'Cluster Labels' not in neighborhoods_venues_sorted:
    neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

toronto_merged = df_toronto

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
toronto_merged = toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighbourhood')
toronto_merged = toronto_merged[-toronto_merged["Cluster Labels"].isin(["NaN"])]
toronto_merged.reset_index(drop=True, inplace=True)
toronto_merged.head() # check the last columns!

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M2N,North York,Willowdale South,43.77012,-79.408493,3.0,Sushi Restaurant,Coffee Shop,Restaurant,Ramen Restaurant,Sandwich Place,Japanese Restaurant,Café,Steakhouse,Plaza,Lounge
1,M3H,North York,"Bathurst Manor,Downsview North,Wilson Heights",43.754328,-79.442259,3.0,Coffee Shop,Pizza Place,Diner,Bridal Shop,Shopping Mall,Sandwich Place,Middle Eastern Restaurant,Fried Chicken Joint,Supermarket,Sushi Restaurant
2,M3J,North York,"Northwood Park,York University",43.76798,-79.487262,2.0,Massage Studio,Metro Station,Miscellaneous Shop,Caribbean Restaurant,Falafel Restaurant,Coffee Shop,Bar,Electronics Store,Fish Market,Fish & Chips Shop
3,M4K,East Toronto,"The Danforth West,Riverdale",43.679557,-79.352188,0.0,Greek Restaurant,Coffee Shop,Ice Cream Shop,Italian Restaurant,Furniture / Home Store,Café,Juice Bar,Bookstore,Spa,Brewery
4,M4M,East Toronto,Studio District,43.659526,-79.340923,3.0,Café,Coffee Shop,Gastropub,American Restaurant,Bakery,Italian Restaurant,Latin American Restaurant,Stationery Store,Bookstore,Middle Eastern Restaurant


Finally, let's visualize the resulting clusters

In [82]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighbourhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[int(cluster-1)],
        fill=True,
        fill_color=rainbow[int(cluster-1)],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

In [83]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 0, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
3,East Toronto,0.0,Greek Restaurant,Coffee Shop,Ice Cream Shop,Italian Restaurant,Furniture / Home Store,Café,Juice Bar,Bookstore,Spa,Brewery
5,Central Toronto,0.0,Pizza Place,Dessert Shop,Sandwich Place,Restaurant,Thai Restaurant,Italian Restaurant,Coffee Shop,Café,Sushi Restaurant,Flower Shop
18,North York,0.0,Fast Food Restaurant,Coffee Shop,Italian Restaurant,Pizza Place,Café,Butcher,Sandwich Place,Liquor Store,Restaurant,Pub
19,Central Toronto,0.0,Coffee Shop,Sandwich Place,Café,Pizza Place,Pharmacy,BBQ Joint,Cosmetics Shop,Pub,Burger Joint,Liquor Store
28,Etobicoke,0.0,Pizza Place,Bakery,Mexican Restaurant,Seafood Restaurant,Liquor Store,Sandwich Place,Fried Chicken Joint,Restaurant,Fast Food Restaurant,Coffee Shop


In [84]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 1, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
9,Downtown Toronto,1.0,Coffee Shop,Bakery,Pub,Park,Café,Restaurant,Mexican Restaurant,Theater,Breakfast Spot,Yoga Studio
13,Downtown Toronto,1.0,Coffee Shop,Café,Italian Restaurant,Burger Joint,Middle Eastern Restaurant,Sandwich Place,Bubble Tea Shop,Japanese Restaurant,Bakery,Sushi Restaurant
27,Queen's Park,1.0,Coffee Shop,Park,Japanese Restaurant,Gym,Sushi Restaurant,Yoga Studio,Seafood Restaurant,Nightclub,Burger Joint,Burrito Place


In [85]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 2, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
2,North York,2.0,Massage Studio,Metro Station,Miscellaneous Shop,Caribbean Restaurant,Falafel Restaurant,Coffee Shop,Bar,Electronics Store,Fish Market,Fish & Chips Shop


In [86]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 3, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,North York,3.0,Sushi Restaurant,Coffee Shop,Restaurant,Ramen Restaurant,Sandwich Place,Japanese Restaurant,Café,Steakhouse,Plaza,Lounge
1,North York,3.0,Coffee Shop,Pizza Place,Diner,Bridal Shop,Shopping Mall,Sandwich Place,Middle Eastern Restaurant,Fried Chicken Joint,Supermarket,Sushi Restaurant
4,East Toronto,3.0,Café,Coffee Shop,Gastropub,American Restaurant,Bakery,Italian Restaurant,Latin American Restaurant,Stationery Store,Bookstore,Middle Eastern Restaurant
6,Central Toronto,3.0,Pub,Coffee Shop,Pizza Place,Sushi Restaurant,Fried Chicken Joint,Bagel Shop,Sports Bar,Supermarket,Light Rail Station,American Restaurant
7,Downtown Toronto,3.0,Coffee Shop,Restaurant,Pub,Café,Pizza Place,Bakery,Park,Italian Restaurant,Caribbean Restaurant,Liquor Store
8,Downtown Toronto,3.0,Japanese Restaurant,Coffee Shop,Sushi Restaurant,Gay Bar,Restaurant,Gastropub,Burger Joint,Pub,Café,Men's Store
10,Downtown Toronto,3.0,Coffee Shop,Clothing Store,Cosmetics Shop,Café,Middle Eastern Restaurant,Tea Room,Japanese Restaurant,Italian Restaurant,Diner,Pizza Place
11,Downtown Toronto,3.0,Café,Coffee Shop,Hotel,Restaurant,Clothing Store,Cocktail Bar,Bakery,Breakfast Spot,Cosmetics Shop,Gastropub
12,Downtown Toronto,3.0,Coffee Shop,Cocktail Bar,Italian Restaurant,Beer Bar,Steakhouse,Seafood Restaurant,Farmers Market,Café,Cheese Shop,Bakery
14,Downtown Toronto,3.0,Café,Coffee Shop,Bar,American Restaurant,Steakhouse,Thai Restaurant,Hotel,Restaurant,Gym,Bakery


In [87]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 4, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
25,West Toronto,4.0,Breakfast Spot,Gift Shop,Bar,Movie Theater,Eastern European Restaurant,Coffee Shop,Restaurant,Bank,Bookstore,Dog Run
