# Segementing and Clustering Neighbourhood Data in Toronto

## Part 1:   getting the postal codes off the wikipedia page

In [1]:
import urllib.request
import requests

url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
#Get the URL
website_url = requests.get(url).text

In [2]:
from bs4 import BeautifulSoup

#Print out the HTML in a readable format
soup = BeautifulSoup(website_url, 'html.parser')
print(soup.prettify())

<!DOCTYPE html>
<html class="client-nojs" dir="ltr" lang="en">
 <head>
  <meta charset="utf-8"/>
  <title>
   List of postal codes of Canada: M - Wikipedia
  </title>
  <script>
   document.documentElement.className="client-js";RLCONF={"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":!1,"wgNamespaceNumber":0,"wgPageName":"List_of_postal_codes_of_Canada:_M","wgTitle":"List of postal codes of Canada: M","wgCurRevisionId":920980179,"wgRevisionId":920980179,"wgArticleId":539066,"wgIsArticle":!0,"wgIsRedirect":!1,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["Communications in Ontario","Postal codes in Canada","Toronto","Ontario-related lists"],"wgBreakFrames":!1,"wgPageContentLanguage":"en","wgPageContentModel":"wikitext","wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June","July","August","September","October","November","December"],"wgMonthNames

In [3]:
#Find the table elements
My_table = soup.find('table', {'class': 'wikitable sortable'})
My_table

<table class="wikitable sortable">
<tbody><tr>
<th>Postcode</th>
<th>Borough</th>
<th>Neighbourhood
</th></tr>
<tr>
<td>M1A</td>
<td>Not assigned</td>
<td>Not assigned
</td></tr>
<tr>
<td>M2A</td>
<td>Not assigned</td>
<td>Not assigned
</td></tr>
<tr>
<td>M3A</td>
<td><a href="/wiki/North_York" title="North York">North York</a></td>
<td><a href="/wiki/Parkwoods" title="Parkwoods">Parkwoods</a>
</td></tr>
<tr>
<td>M4A</td>
<td><a href="/wiki/North_York" title="North York">North York</a></td>
<td><a href="/wiki/Victoria_Village" title="Victoria Village">Victoria Village</a>
</td></tr>
<tr>
<td>M5A</td>
<td><a href="/wiki/Downtown_Toronto" title="Downtown Toronto">Downtown Toronto</a></td>
<td><a href="/wiki/Harbourfront_(Toronto)" title="Harbourfront (Toronto)">Harbourfront</a>
</td></tr>
<tr>
<td>M5A</td>
<td><a href="/wiki/Downtown_Toronto" title="Downtown Toronto">Downtown Toronto</a></td>
<td><a href="/wiki/Regent_Park" title="Regent Park">Regent Park</a>
</td></tr>
<tr>
<td>M6A</td>

In [4]:
import pandas as pd #Import Pandas

#Create a blank data frame to put wikipedia table into
data = pd.DataFrame(columns=['Postcode', 'Borough', 'Neighbourhood'])
data

Unnamed: 0,Postcode,Borough,Neighbourhood


In [5]:
#Get the elements in the table
info = My_table.findAll('td')
info

[<td>M1A</td>, <td>Not assigned</td>, <td>Not assigned
 </td>, <td>M2A</td>, <td>Not assigned</td>, <td>Not assigned
 </td>, <td>M3A</td>, <td><a href="/wiki/North_York" title="North York">North York</a></td>, <td><a href="/wiki/Parkwoods" title="Parkwoods">Parkwoods</a>
 </td>, <td>M4A</td>, <td><a href="/wiki/North_York" title="North York">North York</a></td>, <td><a href="/wiki/Victoria_Village" title="Victoria Village">Victoria Village</a>
 </td>, <td>M5A</td>, <td><a href="/wiki/Downtown_Toronto" title="Downtown Toronto">Downtown Toronto</a></td>, <td><a href="/wiki/Harbourfront_(Toronto)" title="Harbourfront (Toronto)">Harbourfront</a>
 </td>, <td>M5A</td>, <td><a href="/wiki/Downtown_Toronto" title="Downtown Toronto">Downtown Toronto</a></td>, <td><a href="/wiki/Regent_Park" title="Regent Park">Regent Park</a>
 </td>, <td>M6A</td>, <td><a href="/wiki/North_York" title="North York">North York</a></td>, <td><a href="/wiki/Lawrence_Heights" title="Lawrence Heights">Lawrence Heights

In [6]:
#This block turns the table into a data frame
row = []
count = 0
for i in info:
    row.append(i.get_text().rstrip())
    count = count + 1
    if count%3 == 0 and count != 0:
        data.loc[len(data)] = row
        row = []

data

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
...,...,...,...
283,M8Z,Etobicoke,Mimico NW
284,M8Z,Etobicoke,The Queensway West
285,M8Z,Etobicoke,Royal York South West
286,M8Z,Etobicoke,South of Bloor


Unnamed: 0_level_0,Borough,Neighbourhood
Postcode,Unnamed: 1_level_1,Unnamed: 2_level_1
M1A,Not assigned,Not assigned
M1B,Scarborough,"Rouge,Malvern"
M1C,Scarborough,"Rouge Hill,Port Union,Highland Creek"
M1E,Scarborough,"West Hill,Guildwood,Morningside"
M1G,Scarborough,Woburn
...,...,...
M9V,Etobicoke,"Jamestown,Humbergate,Thistletown,Albion Garden..."
M9W,Etobicoke,Northwest
M9X,Not assigned,Not assigned
M9Y,Not assigned,Not assigned


In [7]:
#Drop entries with no assigned borough
data.drop(data[data['Borough'] =="Not assigned"].index, axis =0, inplace = True)
data.head()


Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights


In [8]:
#assign borough name to the entries with no neighbourhood lable
data.loc[data['Neighbourhood'] =='Not assigned', 'Neighbourhood'] = data.loc[data['Neighbourhood']=="Not assigned",'Borough']

# Group neighbourhoods with similar postcodes
data=data.groupby("Postcode").agg(lambda x:','.join(set(x)))

data.shape

(103, 2)

## Part 2, get the longitude and lattitude of each postal code

In [8]:
import geocoder #We use to get longitude and latitude


def get_lat_lon(postcode):
    
    #initialize varaible to none
    lat_lng_coords = None
    
    #loop until you get the coordinates
    while(lat_lng_coords is None):
        g = geocoder.google('{}, Toronto, Ontario'.format(postcode))
        lat_lng_coords = g.latlng
        
    latitude = lat_lng_coords[0]
    longitude = lat_lng_coords[1]
    
    return latitude, longitude
    

In [9]:
url = 'https://cocl.us/Geospatial_data'
lon_lat_data = pd.read_csv(url)
lon_lat_data.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [10]:
#Make a data frame which merges all information into one data frame
df = pd.merge(data, lon_lat_data,
              left_on ='Postcode',
             right_on = 'Postal Code',
             #how = 'left'
             )
df.head()

Unnamed: 0,Borough,Neighbourhood,Postal Code,Latitude,Longitude
0,Scarborough,"Malvern,Rouge",M1B,43.806686,-79.194353
1,Scarborough,"Highland Creek,Rouge Hill,Port Union",M1C,43.784535,-79.160497
2,Scarborough,"Morningside,West Hill,Guildwood",M1E,43.763573,-79.188711
3,Scarborough,Woburn,M1G,43.770992,-79.216917
4,Scarborough,Cedarbrae,M1H,43.773136,-79.239476


## Part 3, run K-clustering on the neighborhoods of toronto

In [11]:
# For vectorized data
import numpy as np

#For handling JSON files
import json

#Transform JSON files into pandas data frames
from pandas.io.json import json_normalize

#Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

#k-means from clustering 
from sklearn.cluster import KMeans

import folium

print("Libraries imported")

Libraries imported


In [17]:
#Create a map of Toronto using lattitude and longitue values
latitude = 43.6532
longitude = -79.3832

map_toronto = folium.Map(location = [latitude, longitude], zoom_start = 10)

#add markers to map
for lat, lng, borough, neighborhood in zip(df['Latitude'], df['Longitude'], df['Borough'], df['Neighbourhood']):
    label ='{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html = True)
    folium.CircleMarker(
        [lat, lng],
        radius = 5,
        popup = label,
        color = 'blue',
        fill = True,
        fill_color ='#3186cc',
        fill_opacity = 0.7,
        parse_html = False).add_to(map_toronto)
    
map_toronto.save('toronto.html')

#### Define Foursquare Credentials and Version

In [26]:
#Definte foursquare credentials and version

CLIENT_ID = 'ARJBUSFHFC0AXGZAHDGMV1G00DCDTMFJQIKRP4PTKQTHAT3L'
CLIENT_SECRET = 'DA3PIG23EJPASMTEW41PTR2Q3Q0G4AVQ1TNEUPJRLRD0RBH4'
VERSION = '20180605'
LIMIT = 100

print('Your credentials:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET: ' + CLIENT_SECRET)

Your credentials:
CLIENT_ID: ARJBUSFHFC0AXGZAHDGMV1G00DCDTMFJQIKRP4PTKQTHAT3L
CLIENT_SECRET: DA3PIG23EJPASMTEW41PTR2Q3Q0G4AVQ1TNEUPJRLRD0RBH4


We will definite a function to get the category of a venue

In [20]:
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

#### We create a function to process neighbourhoods in Manhattan

In [21]:
def getNearbyVenues(names, latitudes, longitudes, radius = 500):
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                'Neighborhood Latitude', 
                'Neighborhood Longitude', 
                'Venue', 
                'Venue Latitude', 
                'Venue Longitude', 
                'Venue Category']
    
    return(nearby_venues)

#### Now we run the above function on our neighbourhoods.

In [27]:
toronto_venues = getNearbyVenues(names= df['Neighbourhood'],
                                   latitudes= df['Latitude'],
                                   longitudes= df['Longitude']
                                  )

Malvern,Rouge
Highland Creek,Rouge Hill,Port Union
Morningside,West Hill,Guildwood
Woburn
Cedarbrae
Scarborough Village
Kennedy Park,Ionview,East Birchmount Park
Oakridge,Golden Mile,Clairlea
Cliffcrest,Cliffside,Scarborough Village West
Birch Cliff,Cliffside West
Scarborough Town Centre,Wexford Heights,Dorset Park
Maryvale,Wexford
Agincourt
Sullivan,Tam O'Shanter,Clarks Corners
L'Amoreaux East,Milliken,Steeles East,Agincourt North
L'Amoreaux West
Upper Rouge
Hillcrest Village
Henry Farm,Oriole,Fairview
Bayview Village
Silver Hills,York Mills
Newtonbrook,Willowdale
Willowdale South
York Mills West
Willowdale West
Parkwoods
Don Mills North
Don Mills South,Flemingdon Park
Bathurst Manor,Downsview North,Wilson Heights
Northwood Park,York University
Downsview East,CFB Toronto
Downsview West
Downsview Central
Downsview Northwest
Victoria Village
Woodbine Gardens,Parkview Hill
Woodbine Heights
The Beaches
Leaside
Thorncliffe Park
East Toronto
The Danforth West,Riverdale
India Bazaar,The Beac

We check the size of the resulting data frame

In [28]:
print(toronto_venues.shape)
toronto_venues.head()

(2268, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,"Malvern,Rouge",43.806686,-79.194353,Wendy's,43.807448,-79.199056,Fast Food Restaurant
1,"Highland Creek,Rouge Hill,Port Union",43.784535,-79.160497,Royal Canadian Legion,43.782533,-79.163085,Bar
2,"Morningside,West Hill,Guildwood",43.763573,-79.188711,Swiss Chalet Rotisserie & Grill,43.767697,-79.189914,Pizza Place
3,"Morningside,West Hill,Guildwood",43.763573,-79.188711,G & G Electronics,43.765309,-79.191537,Electronics Store
4,"Morningside,West Hill,Guildwood",43.763573,-79.188711,Marina Spa,43.766,-79.191,Spa


In [29]:
toronto_venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"Adelaide,Richmond,King",100,100,100,100,100,100
Agincourt,4,4,4,4,4,4
"Bathurst Manor,Downsview North,Wilson Heights",19,19,19,19,19,19
Bayview Village,4,4,4,4,4,4
Berczy Park,57,57,57,57,57,57
...,...,...,...,...,...,...
Woburn,3,3,3,3,3,3
"Woodbine Gardens,Parkview Hill",12,12,12,12,12,12
Woodbine Heights,9,9,9,9,9,9
York Mills West,3,3,3,3,3,3


Let's determine the unique categories that can be found from all the returned venues

In [31]:
print('There are {} unique categories.'.format(len(toronto_venues['Venue Category'].unique())))

There are 277 unique categories.


### Analyze each neighborhood

In [61]:
#One hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix ="", prefix_sep ="")

#Add neighborhood column back to the dataframe
toronto_onehot['Neighborhood'] = toronto_venues['Neighborhood']

#Move neighbourhood column to the first column
toronto_onehot = toronto_onehot.reindex(columns = ['Neighborhood'] + list([a for a in toronto_onehot.columns if a != 'Neighborhood' ]))


toronto_onehot.head()

Unnamed: 0,Neighborhood,Accessories Store,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,...,Turkish Restaurant,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wings Joint,Women's Store,Yoga Studio
0,"Malvern,Rouge",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,"Highland Creek,Rouge Hill,Port Union",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,"Morningside,West Hill,Guildwood",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,"Morningside,West Hill,Guildwood",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,"Morningside,West Hill,Guildwood",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Unnamed: 0,Neighborhood,Accessories Store,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,...,Turkish Restaurant,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wings Joint,Women's Store,Yoga Studio
0,"Malvern,Rouge",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,"Highland Creek,Rouge Hill,Port Union",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,"Morningside,West Hill,Guildwood",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,"Morningside,West Hill,Guildwood",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,"Morningside,West Hill,Guildwood",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [62]:
toronto_onehot.shape

(2268, 277)

Now let's group rows by neighborhood and by taking the mean of the requency of occurence of each category

In [64]:
toronto_grouped = toronto_onehot.groupby('Neighborhood').mean().reset_index()
toronto_grouped

Unnamed: 0,Neighborhood,Accessories Store,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,...,Turkish Restaurant,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wings Joint,Women's Store,Yoga Studio
0,"Adelaide,Richmond,King",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.030000,...,0.0,0.010000,0.0,0.000000,0.0,0.0,0.01,0.0,0.01,0.0
1,Agincourt,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.000000,0.0,0.000000,0.0,0.0,0.00,0.0,0.00,0.0
2,"Bathurst Manor,Downsview North,Wilson Heights",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.000000,0.0,0.052632,0.0,0.0,0.00,0.0,0.00,0.0
3,Bayview Village,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.000000,0.0,0.000000,0.0,0.0,0.00,0.0,0.00,0.0
4,Berczy Park,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.017544,0.0,0.000000,0.0,0.0,0.00,0.0,0.00,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
94,Woburn,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.000000,0.0,0.000000,0.0,0.0,0.00,0.0,0.00,0.0
95,"Woodbine Gardens,Parkview Hill",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.000000,0.0,0.000000,0.0,0.0,0.00,0.0,0.00,0.0
96,Woodbine Heights,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.000000,0.0,0.111111,0.0,0.0,0.00,0.0,0.00,0.0
97,York Mills West,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.000000,0.0,0.000000,0.0,0.0,0.00,0.0,0.00,0.0


Confirm new size

In [65]:
toronto_grouped.shape

(99, 277)

Print each neighborhood along with the top five most common venues

In [67]:
num_top_venues = 5

for hood in toronto_grouped['Neighborhood']:
    print('----'+hood+'----')
    temp = toronto_grouped[toronto_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue', 'freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending = False).reset_index(drop = True).head(num_top_venues))
    print('\n')

----Adelaide,Richmond,King----
             venue  freq
0      Coffee Shop  0.07
1             Café  0.05
2       Steakhouse  0.04
3              Bar  0.04
4  Thai Restaurant  0.04


----Agincourt----
               venue  freq
0       Skating Rink  0.25
1             Lounge  0.25
2     Breakfast Spot  0.25
3     Clothing Store  0.25
4  Accessories Store  0.00


----Bathurst Manor,Downsview North,Wilson Heights----
                venue  freq
0         Coffee Shop  0.11
1       Deli / Bodega  0.05
2  Frozen Yogurt Shop  0.05
3                Bank  0.05
4         Bridal Shop  0.05


----Bayview Village----
                 venue  freq
0  Japanese Restaurant  0.25
1                 Café  0.25
2                 Bank  0.25
3   Chinese Restaurant  0.25
4          Music Venue  0.00


----Berczy Park----
                venue  freq
0         Coffee Shop  0.07
1        Cocktail Bar  0.05
2         Cheese Shop  0.04
3  Seafood Restaurant  0.04
4            Beer Bar  0.04


----Birch Cliff,Cliff

                  venue  freq
0  Fast Food Restaurant   1.0
1     Accessories Store   0.0
2    Miscellaneous Shop   0.0
3         Movie Theater   0.0
4                 Motel   0.0


----Maryvale,Wexford----
                   venue  freq
0      Accessories Store  0.12
1  Vietnamese Restaurant  0.12
2            Auto Garage  0.12
3          Shopping Mall  0.12
4         Sandwich Place  0.12


----Montgomery Road,Old Mill North,The Kingsway----
                       venue  freq
0                 Smoke Shop  0.33
1                      River  0.33
2                       Park  0.33
3  Middle Eastern Restaurant  0.00
4        Monument / Landmark  0.00


----Morningside,West Hill,Guildwood----
                 venue  freq
0          Pizza Place  0.12
1  Rental Car Location  0.12
2       Medical Center  0.12
3       Breakfast Spot  0.12
4   Mexican Restaurant  0.12


----New Toronto,Humber Bay Shores,Mimico South----
                  venue  freq
0                  Café  0.12
1           Pi

### Let's put our above data into a pandas data frame

Lets write a functino to sort venues in descending order

In [72]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending = False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [88]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

#create a column according to the number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))
        
#create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns = columns)
neighborhoods_venues_sorted['Neighborhood'] = toronto_grouped['Neighborhood']

for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)
    
neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,"Adelaide,Richmond,King",Coffee Shop,Café,Thai Restaurant,Steakhouse,Bar,Asian Restaurant,Hotel,Sushi Restaurant,Burger Joint,American Restaurant
1,Agincourt,Lounge,Breakfast Spot,Clothing Store,Skating Rink,Donut Shop,Dim Sum Restaurant,Diner,Discount Store,Dog Run,Doner Restaurant
2,"Bathurst Manor,Downsview North,Wilson Heights",Coffee Shop,Diner,Sushi Restaurant,Restaurant,Supermarket,Middle Eastern Restaurant,Deli / Bodega,Fried Chicken Joint,Frozen Yogurt Shop,Fast Food Restaurant
3,Bayview Village,Chinese Restaurant,Café,Bank,Japanese Restaurant,Yoga Studio,Diner,Discount Store,Dog Run,Doner Restaurant,Donut Shop
4,Berczy Park,Coffee Shop,Cocktail Bar,Café,Seafood Restaurant,Farmers Market,Cheese Shop,Steakhouse,Bakery,Beer Bar,Indian Restaurant


### Cluster the neighborhoods

Run k-means to cluster the neighborhood into 5 clusters.

In [83]:
# set number of clusters
kclusters = 5

toronto_grouped_clustering = toronto_grouped.drop('Neighborhood', 1)

#run k-means clustering
kmeans = KMeans(n_clusters = kclusters, random_state = 0).fit(toronto_grouped_clustering)

#Check cluster labeles generated for each row in the dataframe
kmeans.labels_[0:10]

Let's create a new dataframe that includes the cluster as well as the top 10 venues for each neighborhood.

In [89]:
#add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

In [91]:
toronto_merged = df

#Merged toronto grouped with toronto data to add latitude/longitude for each neighborhood
toronto_merged = toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighbourhood')

toronto_merged.head()

Unnamed: 0,Borough,Neighbourhood,Postal Code,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Scarborough,"Malvern,Rouge",M1B,43.806686,-79.194353,1.0,Fast Food Restaurant,Donut Shop,Dessert Shop,Dim Sum Restaurant,Diner,Discount Store,Dog Run,Doner Restaurant,Drugstore,Farmers Market
1,Scarborough,"Highland Creek,Rouge Hill,Port Union",M1C,43.784535,-79.160497,1.0,Bar,Yoga Studio,Drugstore,Diner,Discount Store,Dog Run,Doner Restaurant,Donut Shop,Dumpling Restaurant,Dessert Shop
2,Scarborough,"Morningside,West Hill,Guildwood",M1E,43.763573,-79.188711,1.0,Pizza Place,Intersection,Rental Car Location,Mexican Restaurant,Medical Center,Breakfast Spot,Electronics Store,Spa,General Travel,General Entertainment
3,Scarborough,Woburn,M1G,43.770992,-79.216917,1.0,Coffee Shop,Korean Restaurant,Yoga Studio,Drugstore,Dim Sum Restaurant,Diner,Discount Store,Dog Run,Doner Restaurant,Donut Shop
4,Scarborough,Cedarbrae,M1H,43.773136,-79.239476,1.0,Fried Chicken Joint,Bakery,Hakka Restaurant,Bank,Athletics & Sports,Thai Restaurant,Caribbean Restaurant,Dog Run,Dim Sum Restaurant,Diner


In [114]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighbourhood'], toronto_merged['Cluster Labels']):
    try:
        label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
        folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[int(cluster-1)],
        fill=True,
        fill_color=rainbow[int(cluster-1)],
        fill_opacity=0.7).add_to(map_clusters)
    except:
        continue
       
map_clusters.save('toronto_Clusters.html')
map_clusters

## Let's explore the clusters

### Cluster 1

In [116]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 0, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1] ))]]

Unnamed: 0,Neighbourhood,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
25,Parkwoods,0.0,BBQ Joint,Food & Drink Shop,Park,Dessert Shop,Dim Sum Restaurant,Diner,Discount Store,Dog Run,Doner Restaurant,Yoga Studio
30,"Downsview East,CFB Toronto",0.0,Airport,Park,Yoga Studio,Drugstore,Dim Sum Restaurant,Diner,Discount Store,Dog Run,Doner Restaurant,Donut Shop
44,Lawrence Park,0.0,Bus Line,Park,Swim School,Yoga Studio,Doner Restaurant,Dim Sum Restaurant,Diner,Discount Store,Dog Run,Donut Shop
50,Rosedale,0.0,Park,Trail,Playground,Building,Yoga Studio,Dessert Shop,Dim Sum Restaurant,Diner,Discount Store,Dog Run
73,Humewood-Cedarvale,0.0,Trail,Field,Park,Hockey Arena,Yoga Studio,Dessert Shop,Dim Sum Restaurant,Diner,Discount Store,Dog Run
74,Caledonia-Fairbanks,0.0,Park,Women's Store,Fast Food Restaurant,Market,Donut Shop,Dim Sum Restaurant,Diner,Discount Store,Dog Run,Doner Restaurant
79,"Downsview,North Park,Upwood Park",0.0,Basketball Court,Bakery,Park,Construction & Landscaping,Yoga Studio,Diner,Discount Store,Dog Run,Doner Restaurant,Donut Shop
90,"Montgomery Road,Old Mill North,The Kingsway",0.0,River,Park,Smoke Shop,Yoga Studio,Doner Restaurant,Dessert Shop,Dim Sum Restaurant,Diner,Discount Store,Dog Run
91,"Humber Bay,King's Mill Park,Sunnylea,The Queen...",0.0,Business Service,Park,Baseball Field,Doner Restaurant,Dessert Shop,Dim Sum Restaurant,Diner,Discount Store,Dog Run,Yoga Studio
100,"Kingsview Village,St. Phillips,Martin Grove Ga...",0.0,Pizza Place,Bus Line,Park,Mobile Phone Shop,Dog Run,Dessert Shop,Dim Sum Restaurant,Diner,Discount Store,Yoga Studio


### Cluster 2

In [117]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 1, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1] ))]]

Unnamed: 0,Neighbourhood,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,"Malvern,Rouge",1.0,Fast Food Restaurant,Donut Shop,Dessert Shop,Dim Sum Restaurant,Diner,Discount Store,Dog Run,Doner Restaurant,Drugstore,Farmers Market
1,"Highland Creek,Rouge Hill,Port Union",1.0,Bar,Yoga Studio,Drugstore,Diner,Discount Store,Dog Run,Doner Restaurant,Donut Shop,Dumpling Restaurant,Dessert Shop
2,"Morningside,West Hill,Guildwood",1.0,Pizza Place,Intersection,Rental Car Location,Mexican Restaurant,Medical Center,Breakfast Spot,Electronics Store,Spa,General Travel,General Entertainment
3,Woburn,1.0,Coffee Shop,Korean Restaurant,Yoga Studio,Drugstore,Dim Sum Restaurant,Diner,Discount Store,Dog Run,Doner Restaurant,Donut Shop
4,Cedarbrae,1.0,Fried Chicken Joint,Bakery,Hakka Restaurant,Bank,Athletics & Sports,Thai Restaurant,Caribbean Restaurant,Dog Run,Dim Sum Restaurant,Diner
...,...,...,...,...,...,...,...,...,...,...,...,...
96,Humber Summit,1.0,Pizza Place,Empanada Restaurant,Doner Restaurant,Department Store,Dessert Shop,Dim Sum Restaurant,Diner,Discount Store,Dog Run,Donut Shop
97,"Humberlea,Emery",1.0,Baseball Field,Yoga Studio,Drugstore,Diner,Discount Store,Dog Run,Doner Restaurant,Donut Shop,Dumpling Restaurant,Dessert Shop
99,Westmount,1.0,Pizza Place,Chinese Restaurant,Sandwich Place,Intersection,Coffee Shop,Discount Store,Middle Eastern Restaurant,Dim Sum Restaurant,Diner,Dog Run
101,"Jamestown,Albion Gardens,Mount Olive,Thistleto...",1.0,Pizza Place,Grocery Store,Fast Food Restaurant,Sandwich Place,Beer Store,Fried Chicken Joint,Pharmacy,Dumpling Restaurant,Eastern European Restaurant,Drugstore


### Cluster 3

In [None]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 2, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1] ))]]

### Cluster 4

In [119]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 3, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1] ))]]

Unnamed: 0,Neighbourhood,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
5,Scarborough Village,3.0,Playground,Doner Restaurant,Department Store,Dessert Shop,Dim Sum Restaurant,Diner,Discount Store,Dog Run,Donut Shop,Falafel Restaurant
14,"L'Amoreaux East,Milliken,Steeles East,Agincour...",3.0,Playground,Park,Yoga Studio,Donut Shop,Dessert Shop,Dim Sum Restaurant,Diner,Discount Store,Dog Run,Doner Restaurant


### Cluster 5

In [120]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 4, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1] ))]]

Unnamed: 0,Neighbourhood,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
21,"Newtonbrook,Willowdale",4.0,Piano Bar,Yoga Studio,Donut Shop,Dessert Shop,Dim Sum Restaurant,Diner,Discount Store,Dog Run,Doner Restaurant,Drugstore
