# Segmenting and Clustering Neighborhoods in Toronto
## Part1. Collect data from wikipedia and store data into a pandas dataframe

### Load libraries

In [1]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import os
import requests
from geopy.geocoders import Nominatim
import matplotlib.pyplot as plt 
import matplotlib.cm as cm
import matplotlib.colors as colors
from sklearn.cluster import KMeans
from pandas.io.json import json_normalize
import folium
%matplotlib inline

### Collect data 

In [2]:
# Scrape data from wikipedia page with BeautifulSoup
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
r = requests.get(url)
soup = BeautifulSoup(r.content, 'lxml')
table = soup.find('table', {'class': 'wikitable sortable'})
tbody = table.find('tbody')
rows = tbody.find_all('tr')
data = []
for row in rows:
    tds = row.find_all('td')
    tds = [td.text.strip() for td in tds]
    data.append([td for td in tds if td])
data

[[],
 ['M1A', 'Not assigned', 'Not assigned'],
 ['M2A', 'Not assigned', 'Not assigned'],
 ['M3A', 'North York', 'Parkwoods'],
 ['M4A', 'North York', 'Victoria Village'],
 ['M5A', 'Downtown Toronto', 'Harbourfront'],
 ['M5A', 'Downtown Toronto', 'Regent Park'],
 ['M6A', 'North York', 'Lawrence Heights'],
 ['M6A', 'North York', 'Lawrence Manor'],
 ['M7A', "Queen's Park", 'Not assigned'],
 ['M8A', 'Not assigned', 'Not assigned'],
 ['M9A', 'Etobicoke', 'Islington Avenue'],
 ['M1B', 'Scarborough', 'Rouge'],
 ['M1B', 'Scarborough', 'Malvern'],
 ['M2B', 'Not assigned', 'Not assigned'],
 ['M3B', 'North York', 'Don Mills North'],
 ['M4B', 'East York', 'Woodbine Gardens'],
 ['M4B', 'East York', 'Parkview Hill'],
 ['M5B', 'Downtown Toronto', 'Ryerson'],
 ['M5B', 'Downtown Toronto', 'Garden District'],
 ['M6B', 'North York', 'Glencairn'],
 ['M7B', 'Not assigned', 'Not assigned'],
 ['M8B', 'Not assigned', 'Not assigned'],
 ['M9B', 'Etobicoke', 'Cloverdale'],
 ['M9B', 'Etobicoke', 'Islington'],
 ['M

In [3]:
# convert the scraped data above to dataframe and set column names.
df = pd.DataFrame(data, columns = ['PostalCode', 'Borough', 'Neighborhood'])
print('dataset size: ', df.shape)
df.head()

dataset size:  (289, 3)


Unnamed: 0,PostalCode,Borough,Neighborhood
0,,,
1,M1A,Not assigned,Not assigned
2,M2A,Not assigned,Not assigned
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village


### Explore and Clean Data

In [4]:
# The values of the first row  are all none, we drop rows whose values are all none
df.dropna(axis=0, how='all', inplace=True)
print('dataset size: ', df.shape)
df.head()

dataset size:  (288, 3)


Unnamed: 0,PostalCode,Borough,Neighborhood
1,M1A,Not assigned,Not assigned
2,M2A,Not assigned,Not assigned
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,Harbourfront


In [5]:
# get a subset of data where the value of Borough is not 'Not assigned'
# df.Borough.value_counts()
df = df[df.Borough != 'Not assigned']
df.Borough.value_counts()

Etobicoke           45
North York          38
Downtown Toronto    37
Scarborough         37
Central Toronto     17
West Toronto        13
York                 9
East Toronto         7
East York            6
Queen's Park         1
Mississauga          1
Name: Borough, dtype: int64

In [6]:
# combine rows with same postalcode to one row,seperate Neighborhood by ','.
df = df.groupby(['PostalCode', 'Borough'], sort = False).agg(lambda x: ','.join(x))
df.reset_index(inplace= True)
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Harbourfront,Regent Park"
3,M6A,North York,"Lawrence Heights,Lawrence Manor"
4,M7A,Queen's Park,Not assigned


In [7]:
# check the lines where the neighborhood value is 'Not assigned'
df[df.Neighborhood == 'Not assigned']

Unnamed: 0,PostalCode,Borough,Neighborhood
4,M7A,Queen's Park,Not assigned


In [8]:
# where the neighborhood value is 'Not assigned', assign the Borough value to neighborhood value
df.loc[df.Neighborhood == 'Not assigned', 'Neighborhood'] = df[df.Neighborhood == 'Not assigned']['Borough']
df[df.Neighborhood == 'Not assigned']

Unnamed: 0,PostalCode,Borough,Neighborhood


In [9]:
# check again the size of the new dataset
df.shape

(103, 3)

## Part2. Add coordinates to dataset

In [10]:
# load latitude and longitude dataset
ll_df = pd.read_csv('Geospatial_Coordinates.csv')
ll_df.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [11]:
# merge latitude and longitude info to dataset
df = df.merge(ll_df, how = 'left', left_on = 'PostalCode', right_on = 'Postal Code')
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Postal Code,Latitude,Longitude
0,M3A,North York,Parkwoods,M3A,43.753259,-79.329656
1,M4A,North York,Victoria Village,M4A,43.725882,-79.315572
2,M5A,Downtown Toronto,"Harbourfront,Regent Park",M5A,43.65426,-79.360636
3,M6A,North York,"Lawrence Heights,Lawrence Manor",M6A,43.718518,-79.464763
4,M7A,Queen's Park,Queen's Park,M7A,43.662301,-79.389494


In [12]:
# drop column 'Postal Code'
df.drop('Postal Code', axis = 1, inplace=True)
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Harbourfront,Regent Park",43.65426,-79.360636
3,M6A,North York,"Lawrence Heights,Lawrence Manor",43.718518,-79.464763
4,M7A,Queen's Park,Queen's Park,43.662301,-79.389494


## Part3. Explore and cluster the neighborhoods in Toronto

### Use geopy library to get the latitude and longitude values of Toronto 

In [13]:
address = 'Toronto'
geolocator = Nominatim(user_agent="trt_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.653963, -79.387207.


### Create a map of Toronto with neighborhoods superimposed on top.

In [14]:
# creat map of Toronto using latitude and longgitude of Toronto
map_trt = folium.Map(location = [latitude, longitude], zoom_start = 10)
# add markers on the map 
for lat, lng, borough, neighborhood in zip(df['Latitude'], df['Longitude'], df['Borough'], df['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True) 
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_trt)  
map_trt

In [15]:
# check how many neighborhood each borough has
df.Borough.value_counts()

North York          24
Downtown Toronto    18
Scarborough         17
Etobicoke           12
Central Toronto      9
West Toronto         6
East York            5
York                 5
East Toronto         5
Queen's Park         1
Mississauga          1
Name: Borough, dtype: int64

Borough 'North York' has the largest number of neighborhoods, which is 24, we will explore Borough 'North York' using Foursquare API

### Create a map of Borough North York with neighborhoods superimposed on top.

In [16]:
# dataset of Borough North York
df_northy = df[df.Borough=='North York']
df_northy.reset_index(drop=True)

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M6A,North York,"Lawrence Heights,Lawrence Manor",43.718518,-79.464763
3,M3B,North York,Don Mills North,43.745906,-79.352188
4,M6B,North York,Glencairn,43.709577,-79.445073
5,M3C,North York,"Flemingdon Park,Don Mills South",43.7259,-79.340923
6,M2H,North York,Hillcrest Village,43.803762,-79.363452
7,M3H,North York,"Bathurst Manor,Downsview North,Wilson Heights",43.754328,-79.442259
8,M2J,North York,"Fairview,Henry Farm,Oriole",43.778517,-79.346556
9,M3J,North York,"Northwood Park,York University",43.76798,-79.487262


In [17]:
# Use geopy library to get the latitude and longitude values of North York
address = 'North York'

geolocator = Nominatim(user_agent="trt_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of North York are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of North York are 43.7708175, -79.4132998.


In [18]:
# create map of North York using latitude and longitude values
map_northy = folium.Map(location=[latitude, longitude], zoom_start=12)

# add markers to map
for lat, lng, label in zip(df_northy['Latitude'], df_northy['Longitude'], df_northy['Neighborhood']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_northy) 
map_northy

### Define Foursquare Credentials and Version

In [19]:
CLIENT_ID = '1QLDTWGYGVIST1KCL0TY0ADVUHPVTDIUGLAFXPIWOQZEXNWW' 
CLIENT_SECRET = '2QP3AWK3ACZVGBULSKRPG0HS50ZM4MZ2VXVXQDA44FR1H0FZ' 
VERSION = '20180605' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: 1QLDTWGYGVIST1KCL0TY0ADVUHPVTDIUGLAFXPIWOQZEXNWW
CLIENT_SECRET:2QP3AWK3ACZVGBULSKRPG0HS50ZM4MZ2VXVXQDA44FR1H0FZ


### Explore the first neighborhood in Borough 'North York'

In [20]:
# info of the first neighborhood
neighname = df_northy.loc[0, 'Neighborhood']
neighlat = df_northy.loc[0, 'Latitude']
neighlon = df_northy.loc[0, 'Longitude']
print('The first neighborhood is {}, with latitude  and longitude '.format(neighname, neighlat, neighlon))

The first neighborhood is Parkwoods, with latitude  and longitude 


### Get the top 100 venues that are in Parkwoods within a radius of 500 meters.

In [21]:
# creat GET request url
LIMIT = 100
radius = 500

url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&ll={},{}&v={}&radius={}&limit={}'.format(CLIENT_ID, CLIENT_SECRET, latitude, longitude, VERSION, radius, LIMIT)
url

'https://api.foursquare.com/v2/venues/explore?&client_id=1QLDTWGYGVIST1KCL0TY0ADVUHPVTDIUGLAFXPIWOQZEXNWW&client_secret=2QP3AWK3ACZVGBULSKRPG0HS50ZM4MZ2VXVXQDA44FR1H0FZ&ll=43.7708175,-79.4132998&v=20180605&radius=500&limit=100'

In [22]:
# send the GET request and have a look at the result
results = requests.get(url).json()


### Get the result into a pandas dataframe

In [23]:
# function that extracts the category of the venue
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

In [24]:
venues = results['response']['groups'][0]['items']
    
nearby_venues = json_normalize(venues) # flatten JSON

# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues =nearby_venues.loc[:, filtered_columns]

# filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues.head()

Unnamed: 0,name,categories,lat,lng
0,The Captain's Boil,Seafood Restaurant,43.773255,-79.413805
1,Aroma Espresso Bar,Café,43.769449,-79.413081
2,Loblaws,Grocery Store,43.768648,-79.412597
3,Starbucks,Coffee Shop,43.768192,-79.413021
4,Cineplex Cinemas Empress Walk,Movie Theater,43.768625,-79.412613


### Explore Neighborhoods in North York
Create a function to repeat the same process to all the neighborhoods in North York

In [25]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [26]:
# run the function above to get nearby venues for each neiborhood in North York
northy_venues = getNearbyVenues(names=df_northy['Neighborhood'],
                                latitudes=df_northy['Latitude'],
                                longitudes=df_northy['Longitude']
                                  )

Parkwoods
Victoria Village
Lawrence Heights,Lawrence Manor
Don Mills North
Glencairn
Flemingdon Park,Don Mills South
Hillcrest Village
Bathurst Manor,Downsview North,Wilson Heights
Fairview,Henry Farm,Oriole
Northwood Park,York University
Bayview Village
CFB Toronto,Downsview East
Silver Hills,York Mills
Downsview West
Downsview,North Park,Upwood Park
Humber Summit
Newtonbrook,Willowdale
Downsview Central
Bedford Park,Lawrence Manor East
Emery,Humberlea
Willowdale South
Downsview Northwest
York Mills West
Willowdale West


In [27]:
# size of the dataset
northy_venues.shape

(238, 7)

In [28]:
# first 5 lines
northy_venues.head()

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Parkwoods,43.753259,-79.329656,Brookbanks Park,43.751976,-79.33214,Park
1,Parkwoods,43.753259,-79.329656,KFC,43.754387,-79.333021,Fast Food Restaurant
2,Parkwoods,43.753259,-79.329656,TTC stop #8380,43.752672,-79.326351,Bus Stop
3,Parkwoods,43.753259,-79.329656,Variety Store,43.751974,-79.333114,Food & Drink Shop
4,Victoria Village,43.725882,-79.315572,Victoria Village Arena,43.723481,-79.315635,Hockey Arena


In [29]:
# check how many venues are returned for each neighborhood
northy_venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"Bathurst Manor,Downsview North,Wilson Heights",18,18,18,18,18,18
Bayview Village,4,4,4,4,4,4
"Bedford Park,Lawrence Manor East",22,22,22,22,22,22
"CFB Toronto,Downsview East",3,3,3,3,3,3
Don Mills North,5,5,5,5,5,5
Downsview Central,3,3,3,3,3,3
Downsview Northwest,4,4,4,4,4,4
Downsview West,6,6,6,6,6,6
"Downsview,North Park,Upwood Park",4,4,4,4,4,4
"Emery,Humberlea",2,2,2,2,2,2


In [30]:
# find out how many unique categories can be curated from all the returned venues
print('There are {} uniques categories.'.format(len(northy_venues['Venue Category'].unique())))

There are 107 uniques categories.


### Analyze Each Neighborhood

In [31]:
# one hot encoding
northy_onehot = pd.get_dummies(northy_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
northy_onehot['Neighborhood'] = northy_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [northy_onehot.columns[-1]] + list(northy_onehot.columns[:-1])
northy_onehot = northy_onehot[fixed_columns]

northy_onehot.head()

Unnamed: 0,Neighborhood,Accessories Store,Airport,American Restaurant,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,Bakery,Bank,Bar,...,Sushi Restaurant,Tea Room,Thai Restaurant,Theater,Toy / Game Store,Video Game Store,Video Store,Vietnamese Restaurant,Wings Joint,Women's Store
0,Parkwoods,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Parkwoods,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Parkwoods,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Parkwoods,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Victoria Village,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [32]:
# group rows by neighborhood and by taking the mean of the frequency of occurrence of each category
northy_grouped = northy_onehot.groupby('Neighborhood').mean().reset_index()
northy_grouped

Unnamed: 0,Neighborhood,Accessories Store,Airport,American Restaurant,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,Bakery,Bank,Bar,...,Sushi Restaurant,Tea Room,Thai Restaurant,Theater,Toy / Game Store,Video Game Store,Video Store,Vietnamese Restaurant,Wings Joint,Women's Store
0,"Bathurst Manor,Downsview North,Wilson Heights",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.055556,0.0,...,0.055556,0.0,0.0,0.0,0.0,0.0,0.055556,0.0,0.0,0.0
1,Bayview Village,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"Bedford Park,Lawrence Manor East",0.0,0.0,0.045455,0.0,0.0,0.0,0.0,0.0,0.0,...,0.045455,0.0,0.045455,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,"CFB Toronto,Downsview East",0.0,0.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Don Mills North,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,Downsview Central,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,Downsview Northwest,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,Downsview West,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.166667,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,"Downsview,North Park,Upwood Park",0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,"Emery,Humberlea",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [33]:
# print each neighborhood along with the top 5 most common venues
num_top_venues = 5

for hood in northy_grouped['Neighborhood']:
    print("----"+hood+"----")
    temp = northy_grouped[northy_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Bathurst Manor,Downsview North,Wilson Heights----
         venue  freq
0  Coffee Shop  0.11
1  Supermarket  0.06
2     Pharmacy  0.06
3  Pizza Place  0.06
4        Diner  0.06


----Bayview Village----
                 venue  freq
0   Chinese Restaurant  0.25
1                 Bank  0.25
2                 Café  0.25
3  Japanese Restaurant  0.25
4    Accessories Store  0.00


----Bedford Park,Lawrence Manor East----
                venue  freq
0         Coffee Shop  0.09
1  Italian Restaurant  0.09
2      Sandwich Place  0.05
3      Ice Cream Shop  0.05
4       Grocery Store  0.05


----CFB Toronto,Downsview East----
                   venue  freq
0             Playground  0.33
1                   Park  0.33
2                Airport  0.33
3  Vietnamese Restaurant  0.00
4      Korean Restaurant  0.00


----Don Mills North----
                  venue  freq
0  Gym / Fitness Center   0.2
1  Caribbean Restaurant   0.2
2                  Café   0.2
3        Baseball Field   0.2
4   Japane

In [34]:
#write a function to sort the venues in descending order.
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [35]:
# create the new dataframe and display the top 10 venues for each neighborhood.
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = northy_grouped['Neighborhood']

for ind in np.arange(northy_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(northy_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,"Bathurst Manor,Downsview North,Wilson Heights",Coffee Shop,Fried Chicken Joint,Supermarket,Frozen Yogurt Shop,Pharmacy,Pizza Place,Deli / Bodega,Bridal Shop,Restaurant,Diner
1,Bayview Village,Chinese Restaurant,Café,Bank,Japanese Restaurant,Women's Store,Electronics Store,Construction & Landscaping,Convenience Store,Cosmetics Shop,Deli / Bodega
2,"Bedford Park,Lawrence Manor East",Italian Restaurant,Coffee Shop,Greek Restaurant,Sandwich Place,Fast Food Restaurant,Grocery Store,Ice Cream Shop,Indian Restaurant,Juice Bar,Liquor Store
3,"CFB Toronto,Downsview East",Airport,Playground,Park,Women's Store,Discount Store,Coffee Shop,Comfort Food Restaurant,Construction & Landscaping,Convenience Store,Cosmetics Shop
4,Don Mills North,Caribbean Restaurant,Gym / Fitness Center,Café,Baseball Field,Japanese Restaurant,Women's Store,Dog Run,Construction & Landscaping,Convenience Store,Cosmetics Shop


### Use K means to Cluster Neighborhoods into 3 clusters

In [36]:
# set up k means clustering
n = 3

northy_grouped_clustering = northy_grouped.drop('Neighborhood', 1)

kmeans = KMeans(n_clusters=n, random_state=4).fit(northy_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([1, 0, 1, 0, 1, 1, 0, 0, 0, 1], dtype=int32)

In [37]:
# create a new dataframe that includes the cluster as well as the top 10 venues for each neighborhood.
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

northy_merged = df_northy.merge(neighborhoods_venues_sorted, on='Neighborhood')
northy_merged.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M3A,North York,Parkwoods,43.753259,-79.329656,0,Park,Bus Stop,Food & Drink Shop,Fast Food Restaurant,Women's Store,Discount Store,Comfort Food Restaurant,Construction & Landscaping,Convenience Store,Cosmetics Shop
1,M4A,North York,Victoria Village,43.725882,-79.315572,1,Coffee Shop,Portuguese Restaurant,Hockey Arena,Financial or Legal Service,Intersection,Women's Store,Discount Store,Comfort Food Restaurant,Construction & Landscaping,Convenience Store
2,M6A,North York,"Lawrence Heights,Lawrence Manor",43.718518,-79.464763,1,Furniture / Home Store,Women's Store,Miscellaneous Shop,Arts & Crafts Store,Boutique,Clothing Store,Coffee Shop,Event Space,Gift Shop,Accessories Store
3,M3B,North York,Don Mills North,43.745906,-79.352188,1,Caribbean Restaurant,Gym / Fitness Center,Café,Baseball Field,Japanese Restaurant,Women's Store,Dog Run,Construction & Landscaping,Convenience Store,Cosmetics Shop
4,M6B,North York,Glencairn,43.709577,-79.445073,0,Japanese Restaurant,Asian Restaurant,Pub,Park,Fried Chicken Joint,Frozen Yogurt Shop,Coffee Shop,Comfort Food Restaurant,Construction & Landscaping,Convenience Store


In [38]:
# Create map of the cluster
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(n)
ys = [i + x + (i*x)**2 for i in range(n)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(northy_merged['Latitude'], northy_merged['Longitude'], northy_merged['Neighborhood'], northy_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

### Examine Clusters

In [39]:
# cluster 1
northy_merged[northy_merged['Cluster Labels'] == 0]

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M3A,North York,Parkwoods,43.753259,-79.329656,0,Park,Bus Stop,Food & Drink Shop,Fast Food Restaurant,Women's Store,Discount Store,Comfort Food Restaurant,Construction & Landscaping,Convenience Store,Cosmetics Shop
4,M6B,North York,Glencairn,43.709577,-79.445073,0,Japanese Restaurant,Asian Restaurant,Pub,Park,Fried Chicken Joint,Frozen Yogurt Shop,Coffee Shop,Comfort Food Restaurant,Construction & Landscaping,Convenience Store
10,M2K,North York,Bayview Village,43.786947,-79.385975,0,Chinese Restaurant,Café,Bank,Japanese Restaurant,Women's Store,Electronics Store,Construction & Landscaping,Convenience Store,Cosmetics Shop,Deli / Bodega
11,M3K,North York,"CFB Toronto,Downsview East",43.737473,-79.464763,0,Airport,Playground,Park,Women's Store,Discount Store,Coffee Shop,Comfort Food Restaurant,Construction & Landscaping,Convenience Store,Cosmetics Shop
13,M3L,North York,Downsview West,43.739015,-79.506944,0,Grocery Store,Park,Hotel,Bank,Shopping Mall,Dog Run,Coffee Shop,Comfort Food Restaurant,Construction & Landscaping,Convenience Store
14,M6L,North York,"Downsview,North Park,Upwood Park",43.713756,-79.490074,0,Park,Construction & Landscaping,Bakery,Basketball Court,Women's Store,Electronics Store,Comfort Food Restaurant,Convenience Store,Cosmetics Shop,Deli / Bodega
20,M3N,North York,Downsview Northwest,43.761631,-79.520999,0,Grocery Store,Gym / Fitness Center,Athletics & Sports,Liquor Store,Women's Store,Dog Run,Comfort Food Restaurant,Construction & Landscaping,Convenience Store,Cosmetics Shop
21,M2P,North York,York Mills West,43.752758,-79.400049,0,Park,Convenience Store,Bank,Women's Store,Electronics Store,Comfort Food Restaurant,Construction & Landscaping,Cosmetics Shop,Deli / Bodega,Department Store


In [40]:
# cluster 2
northy_merged[northy_merged['Cluster Labels'] == 1]

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
1,M4A,North York,Victoria Village,43.725882,-79.315572,1,Coffee Shop,Portuguese Restaurant,Hockey Arena,Financial or Legal Service,Intersection,Women's Store,Discount Store,Comfort Food Restaurant,Construction & Landscaping,Convenience Store
2,M6A,North York,"Lawrence Heights,Lawrence Manor",43.718518,-79.464763,1,Furniture / Home Store,Women's Store,Miscellaneous Shop,Arts & Crafts Store,Boutique,Clothing Store,Coffee Shop,Event Space,Gift Shop,Accessories Store
3,M3B,North York,Don Mills North,43.745906,-79.352188,1,Caribbean Restaurant,Gym / Fitness Center,Café,Baseball Field,Japanese Restaurant,Women's Store,Dog Run,Construction & Landscaping,Convenience Store,Cosmetics Shop
5,M3C,North York,"Flemingdon Park,Don Mills South",43.7259,-79.340923,1,Gym,Coffee Shop,Beer Store,General Entertainment,Sporting Goods Shop,Chinese Restaurant,Dim Sum Restaurant,Japanese Restaurant,Italian Restaurant,Discount Store
6,M2H,North York,Hillcrest Village,43.803762,-79.363452,1,Golf Course,Dog Run,Pool,Athletics & Sports,Mediterranean Restaurant,Fast Food Restaurant,Comfort Food Restaurant,Construction & Landscaping,Convenience Store,Cosmetics Shop
7,M3H,North York,"Bathurst Manor,Downsview North,Wilson Heights",43.754328,-79.442259,1,Coffee Shop,Fried Chicken Joint,Supermarket,Frozen Yogurt Shop,Pharmacy,Pizza Place,Deli / Bodega,Bridal Shop,Restaurant,Diner
8,M2J,North York,"Fairview,Henry Farm,Oriole",43.778517,-79.346556,1,Clothing Store,Fast Food Restaurant,Coffee Shop,Tea Room,Japanese Restaurant,Food Court,Electronics Store,Women's Store,Bakery,Toy / Game Store
9,M3J,North York,"Northwood Park,York University",43.76798,-79.487262,1,Coffee Shop,Caribbean Restaurant,Bar,Massage Studio,Dog Run,Comfort Food Restaurant,Construction & Landscaping,Convenience Store,Cosmetics Shop,Deli / Bodega
15,M9L,North York,Humber Summit,43.756303,-79.565963,1,Pizza Place,Empanada Restaurant,Shopping Mall,Discount Store,Clothing Store,Coffee Shop,Comfort Food Restaurant,Construction & Landscaping,Convenience Store,Cosmetics Shop
16,M3M,North York,Downsview Central,43.728496,-79.495697,1,Korean Restaurant,Food Truck,Baseball Field,Women's Store,Comfort Food Restaurant,Construction & Landscaping,Convenience Store,Cosmetics Shop,Deli / Bodega,Department Store


In [41]:
# cluster 3
northy_merged[northy_merged['Cluster Labels'] == 2]

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
12,M2L,North York,"Silver Hills,York Mills",43.75749,-79.374714,2,Cafeteria,Women's Store,Dog Run,Coffee Shop,Comfort Food Restaurant,Construction & Landscaping,Convenience Store,Cosmetics Shop,Deli / Bodega,Department Store
