Explanation on how to do Webscraping: https://alanhylands.com/how-to-web-scrape-wikipedia-python-urllib-beautiful-soup-pandas/

Import libraries

In [147]:
import random # library for random number generation
import numpy as np # library for vectorized computation
import pandas as pd # library to process data as dataframes

import matplotlib.pyplot as plt # plotting library

import urllib.request

from bs4 import BeautifulSoup


In [148]:
url="https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
page = urllib.request.urlopen(url)

Get HTML-Code of Website and show how it looks like

In [149]:
soup = BeautifulSoup(page, "lxml")

In [150]:
print(soup)

<!DOCTYPE html>
<html class="client-nojs" dir="ltr" lang="en">
<head>
<meta charset="utf-8"/>
<title>List of postal codes of Canada: M - Wikipedia</title>
<script>document.documentElement.className="client-js";RLCONF={"wgBreakFrames":!1,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June","July","August","September","October","November","December"],"wgRequestId":"7d43e78e-4d5c-4147-9a80-2822da42ea48","wgCSPNonce":!1,"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":!1,"wgNamespaceNumber":0,"wgPageName":"List_of_postal_codes_of_Canada:_M","wgTitle":"List of postal codes of Canada: M","wgCurRevisionId":1013111980,"wgRevisionId":1013111980,"wgArticleId":539066,"wgIsArticle":!0,"wgIsRedirect":!1,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["Articles with short description","Short description is different from Wikidata","Wikipedia semi-protected pa

Get the html-code for the specific table I want to use

In [151]:
all_tables=soup.find_all("table")
my_table=all_tables[0]
my_table

<table cellpadding="2" cellspacing="0" rules="all" style="width:100%; border-collapse:collapse; border:1px solid #ccc;">
<tbody><tr>
<td style="width:11%; vertical-align:top; color:#ccc;">
<p><b>M1A</b><br/><span style="font-size:85%;"><i>Not assigned</i></span>
</p>
</td>
<td style="width:11%; vertical-align:top; color:#ccc;">
<p><b>M2A</b><br/><span style="font-size:85%;"><i>Not assigned</i></span>
</p>
</td>
<td style="width:11%; vertical-align:top;">
<p><b>M3A</b><br/><span style="font-size:85%;"><a href="/wiki/North_York" title="North York">North York</a><br/>(<a href="/wiki/Parkwoods" title="Parkwoods">Parkwoods</a>)</span>
</p>
</td>
<td style="width:11%; vertical-align:top;">
<p><b>M4A</b><br/><span style="font-size:85%;"><a href="/wiki/North_York" title="North York">North York</a><br/>(<a href="/wiki/Victoria_Village" title="Victoria Village">Victoria Village</a>)</span>
</p>
</td>
<td style="width:11%; vertical-align:top;">
<p><b>M5A</b><br/><span style="font-size:85%;"><a hr

In [152]:
table_contents=[]

for row in my_table.findAll('td'):
    cell = {}
    if row.span.text=='Not assigned':
        pass
    else:
        cell['PostalCode'] = row.p.text[:3]
        cell['Borough'] = (row.span.text).split('(')[0]
        cell['Neighborhood'] = (((((row.span.text).split('(')[1]).strip(')')).replace(' /',',')).replace(')',' ')).strip(' ')
        table_contents.append(cell)

print(table_contents)


[{'PostalCode': 'M3A', 'Borough': 'North York', 'Neighborhood': 'Parkwoods'}, {'PostalCode': 'M4A', 'Borough': 'North York', 'Neighborhood': 'Victoria Village'}, {'PostalCode': 'M5A', 'Borough': 'Downtown Toronto', 'Neighborhood': 'Regent Park, Harbourfront'}, {'PostalCode': 'M6A', 'Borough': 'North York', 'Neighborhood': 'Lawrence Manor, Lawrence Heights'}, {'PostalCode': 'M7A', 'Borough': "Queen's Park", 'Neighborhood': 'Ontario Provincial Government'}, {'PostalCode': 'M9A', 'Borough': 'Etobicoke', 'Neighborhood': 'Islington Avenue'}, {'PostalCode': 'M1B', 'Borough': 'Scarborough', 'Neighborhood': 'Malvern, Rouge'}, {'PostalCode': 'M3B', 'Borough': 'North York', 'Neighborhood': 'Don Mills North'}, {'PostalCode': 'M4B', 'Borough': 'East York', 'Neighborhood': 'Parkview Hill, Woodbine Gardens'}, {'PostalCode': 'M5B', 'Borough': 'Downtown Toronto', 'Neighborhood': 'Garden District, Ryerson'}, {'PostalCode': 'M6B', 'Borough': 'North York', 'Neighborhood': 'Glencairn'}, {'PostalCode': 'M9

In [153]:
df=pd.DataFrame(table_contents)
df['Borough']=df['Borough'].replace({'Downtown TorontoStn A PO Boxes25 The Esplanade':'Downtown Toronto Stn A',
                                             'East TorontoBusiness reply mail Processing Centre969 Eastern':'East Toronto Business',
                                             'EtobicokeNorthwest':'Etobicoke Northwest','East YorkEast Toronto':'East York/East Toronto',
                                             'MississaugaCanada Post Gateway Processing Centre':'Mississauga'})

In [154]:
df

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Queen's Park,Ontario Provincial Government
...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
99,M4Y,Downtown Toronto,Church and Wellesley
100,M7Y,East Toronto Business,Enclave of M4L
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu..."


In [155]:
df.shape

(103, 3)

# Get longitude and latitude

Get .csv with coordinates

In [156]:
coordinates=pd.read_csv("https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBMDeveloperSkillsNetwork-DS0701EN-SkillsNetwork/labs_v1/Geospatial_Coordinates.csv")

Add latitude & longitude to dataframe

In [157]:
df["Latitude"]=coordinates["Latitude"]
df["Longitude"]=coordinates["Longitude"]

In [158]:
df

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.806686,-79.194353
1,M4A,North York,Victoria Village,43.784535,-79.160497
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.763573,-79.188711
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.770992,-79.216917
4,M7A,Queen's Park,Ontario Provincial Government,43.773136,-79.239476
...,...,...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North",43.706876,-79.518188
99,M4Y,Downtown Toronto,Church and Wellesley,43.696319,-79.532242
100,M7Y,East Toronto Business,Enclave of M4L,43.688905,-79.554724
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu...",43.739416,-79.588437


# Neighbourhood Clustering

In [159]:
import json # library to handle JSON files
import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

! pip install folium==0.5.0
import folium # map rendering library



In [160]:
!pip install geopy
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values



In [161]:
address = 'Toronto, Ontario'

geolocator = Nominatim(user_agent="toronto_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.6534817, -79.3839347.


In [162]:
# create map of Toronto using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(df['Latitude'], df['Longitude'], df['Borough'], df['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

Foursquare API:

In [163]:
CLIENT_ID = 'WZS531VYLY11XGK2RBKIKCVHKLSGKVRJIAY1SEGNGBFNTSML' # your Foursquare ID
CLIENT_SECRET = '14N01KRGOOUJXI5D0DAGG0S4Q0OUPSX00S5OTKUIDZ1WH2ZO' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version
LIMIT = 100 # A default Foursquare API limit value

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: WZS531VYLY11XGK2RBKIKCVHKLSGKVRJIAY1SEGNGBFNTSML
CLIENT_SECRET:14N01KRGOOUJXI5D0DAGG0S4Q0OUPSX00S5OTKUIDZ1WH2ZO


### Get Top 100 venues in radius of 500m of following neighbourhood:

In [164]:
neighborhood_latitude = df.loc[2, 'Latitude'] # neighborhood latitude value
neighborhood_longitude = df.loc[2, 'Longitude'] # neighborhood longitude value

neighborhood_name = df.loc[2, 'Neighborhood'] # neighborhood name

neighborhood_name

'Regent Park, Harbourfront'

In [165]:
LIMIT = 10 # limit of number of venues returned by Foursquare API

radius = 500 # define radius

# create URL
url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    neighborhood_latitude, 
    neighborhood_longitude, 
    radius, 
    LIMIT)
url # display URL


'https://api.foursquare.com/v2/venues/explore?&client_id=WZS531VYLY11XGK2RBKIKCVHKLSGKVRJIAY1SEGNGBFNTSML&client_secret=14N01KRGOOUJXI5D0DAGG0S4Q0OUPSX00S5OTKUIDZ1WH2ZO&v=20180605&ll=43.7635726,-79.1887115&radius=500&limit=10'

In [166]:
results = requests.get(url).json()

In [167]:
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

In [168]:
venues = results['response']['groups'][0]['items']
    
nearby_venues = json_normalize(venues) # flatten JSON

# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues =nearby_venues.loc[:, filtered_columns]

# filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues.head()

  nearby_venues = json_normalize(venues) # flatten JSON


Unnamed: 0,name,categories,lat,lng
0,RBC Royal Bank,Bank,43.76679,-79.191151
1,G & G Electronics,Electronics Store,43.765309,-79.191537
2,Sail Sushi,Restaurant,43.765951,-79.191275
3,Big Bite Burrito,Mexican Restaurant,43.766299,-79.19072
4,Enterprise Rent-A-Car,Rental Car Location,43.764076,-79.193406


In [169]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [170]:
toronto_venues = getNearbyVenues(names=df['Neighborhood'],
                                   latitudes=df['Latitude'],
                                   longitudes=df['Longitude']
                                  )

Parkwoods
Victoria Village
Regent Park, Harbourfront
Lawrence Manor, Lawrence Heights
Ontario Provincial Government
Islington Avenue
Malvern, Rouge
Don Mills North
Parkview Hill, Woodbine Gardens
Garden District, Ryerson
Glencairn
West Deane Park, Princess Gardens, Martin Grove, Islington, Cloverdale
Rouge Hill, Port Union, Highland Creek
Don Mills South
Woodbine Heights
St. James Town
Humewood-Cedarvale
Eringate, Bloordale Gardens, Old Burnhamthorpe, Markland Wood
Guildwood, Morningside, West Hill
The Beaches
Berczy Park
Caledonia-Fairbanks
Woburn
Leaside
Central Bay Street
Christie
Cedarbrae
Hillcrest Village
Bathurst Manor, Wilson Heights, Downsview North
Thorncliffe Park
Richmond, Adelaide, King
Dufferin, Dovercourt Village
Scarborough Village
Fairview, Henry Farm, Oriole
Northwood Park, York University
The Danforth  East
Harbourfront East, Union Station, Toronto Islands
Little Portugal, Trinity
Kennedy Park, Ionview, East Birchmount Park
Bayview Village
Downsview East
The Danforth

In [171]:
toronto_venues.head()

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Parkwoods,43.806686,-79.194353,Wendy’s,43.807448,-79.199056,Fast Food Restaurant
1,Victoria Village,43.784535,-79.160497,Chris Effects Painting,43.784343,-79.163742,Construction & Landscaping
2,Victoria Village,43.784535,-79.160497,Royal Canadian Legion,43.782533,-79.163085,Bar
3,"Regent Park, Harbourfront",43.763573,-79.188711,RBC Royal Bank,43.76679,-79.191151,Bank
4,"Regent Park, Harbourfront",43.763573,-79.188711,G & G Electronics,43.765309,-79.191537,Electronics Store


In [172]:
toronto_venues.shape

(687, 7)

In [173]:
toronto_venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Agincourt,10,10,10,10,10,10
"Bathurst Manor, Wilson Heights, Downsview North",10,10,10,10,10,10
Bayview Village,10,10,10,10,10,10
"Bedford Park, Lawrence Manor East",10,10,10,10,10,10
"Birch Cliff, Cliffside West",10,10,10,10,10,10
...,...,...,...,...,...,...
"Willowdale, Newtonbrook",10,10,10,10,10,10
Woburn,10,10,10,10,10,10
Woodbine Heights,3,3,3,3,3,3
York Mills West,10,10,10,10,10,10


### Analyze each neighborhood

In [174]:
# one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
toronto_onehot["Neighbourhood"] = toronto_venues["Neighborhood"] 

# move neighborhood column to the first column
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]

toronto_onehot.head(50)

Unnamed: 0,Neighbourhood,Accessories Store,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,...,Trail,Truck Stop,Vegetarian / Vegan Restaurant,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wings Joint,Women's Store,Yoga Studio
0,Parkwoods,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Victoria Village,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Victoria Village,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,"Regent Park, Harbourfront",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,"Regent Park, Harbourfront",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,"Regent Park, Harbourfront",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,"Regent Park, Harbourfront",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,"Regent Park, Harbourfront",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,"Regent Park, Harbourfront",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,"Regent Park, Harbourfront",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [175]:
toronto_onehot.shape

(687, 181)

In [176]:
toronto_grouped = toronto_onehot.groupby('Neighbourhood').mean().reset_index()
toronto_grouped

Unnamed: 0,Neighbourhood,Accessories Store,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,...,Trail,Truck Stop,Vegetarian / Vegan Restaurant,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wings Joint,Women's Store,Yoga Studio
0,Agincourt,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,"Bathurst Manor, Wilson Heights, Downsview North",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Bayview Village,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.1,0.0,0.0,0.0,0.1
3,"Bedford Park, Lawrence Manor East",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,"Birch Cliff, Cliffside West",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
94,"Willowdale, Newtonbrook",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
95,Woburn,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
96,Woodbine Heights,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
97,York Mills West,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.1


In [177]:
toronto_grouped.shape

(99, 181)

In [178]:
num_top_venues = 5

for hood in toronto_grouped['Neighbourhood']:
    print("----"+hood+"----")
    temp = toronto_grouped[toronto_grouped['Neighbourhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Agincourt----
                    venue  freq
0             Coffee Shop   0.2
1  Furniture / Home Store   0.1
2                     Bar   0.1
3                  Bakery   0.1
4                    Café   0.1


----Bathurst Manor, Wilson Heights, Downsview North----
                       venue  freq
0                Coffee Shop   0.2
1                       Bank   0.2
2                Pizza Place   0.1
3                Bridal Shop   0.1
4  Middle Eastern Restaurant   0.1


----Bayview Village----
               venue  freq
0  Indian Restaurant   0.2
1      Grocery Store   0.1
2       Burger Joint   0.1
3    Warehouse Store   0.1
4     Sandwich Place   0.1


----Bedford Park, Lawrence Manor East----
            venue  freq
0     Coffee Shop   0.2
1      Restaurant   0.1
2             Gym   0.1
3  Cosmetics Shop   0.1
4        Creperie   0.1


----Birch Cliff, Cliffside West----
                           venue  freq
0                   Concert Hall   0.1
1  Vegetarian / Vegan Restaura

                 venue  freq
0                 Café   0.2
1                  Pub   0.1
2               Bakery   0.1
3  Japanese Restaurant   0.1
4           Restaurant   0.1


----Leaside----
                        venue  freq
0                        Park  0.33
1  Construction & Landscaping  0.33
2           Convenience Store  0.33
3           Accessories Store  0.00
4  Modern European Restaurant  0.00


----Little Portugal, Trinity----
               venue  freq
0  Health Food Store   0.2
1       Neighborhood   0.2
2                Pub   0.2
3              Trail   0.2
4   Asian Restaurant   0.2


----Malvern, Rouge----
                venue  freq
0         Coffee Shop   0.2
1          Hobby Shop   0.2
2  Chinese Restaurant   0.2
3      Discount Store   0.2
4    Department Store   0.2


----Milliken, Agincourt North, Steeles East, L'Amoreaux East----
                 venue  freq
0          Coffee Shop   0.2
1          Yoga Studio   0.1
2  Distribution Center   0.1
3             Beer 

                 venue  freq
0                 Café   0.2
1        Grocery Store   0.1
2        Shopping Mall   0.1
3            Juice Bar   0.1
4  Japanese Restaurant   0.1


----Woodbine Heights----
               venue  freq
0               Park  0.33
1         Playground  0.33
2       Intersection  0.33
3  Accessories Store  0.00
4  Mobile Phone Shop  0.00


----York Mills West----
              venue  freq
0       Yoga Studio   0.1
1      Dessert Shop   0.1
2          Beer Bar   0.1
3               Bar   0.1
4  Sushi Restaurant   0.1


----York Mills, Silver Hills----
                  venue  freq
0                   Gym  0.11
1                 Hotel  0.11
2  Gym / Fitness Center  0.11
3           Pizza Place  0.11
4      Department Store  0.11




In [187]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [205]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighbourhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighbourhood'] = toronto_grouped['Neighbourhood']

for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighbourhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Agincourt,Coffee Shop,Pet Store,Gym,Furniture / Home Store,Bar,Italian Restaurant,Café,Breakfast Spot,Bakery,Dessert Shop
1,"Bathurst Manor, Wilson Heights, Downsview North",Coffee Shop,Bank,Deli / Bodega,Bridal Shop,Restaurant,Ice Cream Shop,Middle Eastern Restaurant,Pizza Place,Cosmetics Shop,Dessert Shop
2,Bayview Village,Indian Restaurant,Yoga Studio,Sandwich Place,Gas Station,Warehouse Store,Grocery Store,Gym,Pharmacy,Burger Joint,Deli / Bodega
3,"Bedford Park, Lawrence Manor East",Coffee Shop,Gym,Japanese Restaurant,Cosmetics Shop,Creperie,Restaurant,BBQ Joint,Food Truck,Middle Eastern Restaurant,Department Store
4,"Birch Cliff, Cliffside West",Neighborhood,Café,Lounge,Restaurant,Steakhouse,Hotel,Plaza,Seafood Restaurant,Vegetarian / Vegan Restaurant,Concert Hall


## Cluster neighbourhoods

In [206]:
# set number of clusters
kclusters = 5

toronto_grouped_clustering = toronto_grouped.drop('Neighbourhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([3, 3, 3, 3, 3, 3, 1, 3, 3, 2])

In [207]:
toronto_data=df.drop('PostalCode',1)
toronto_data

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude
0,North York,Parkwoods,43.806686,-79.194353
1,North York,Victoria Village,43.784535,-79.160497
2,Downtown Toronto,"Regent Park, Harbourfront",43.763573,-79.188711
3,North York,"Lawrence Manor, Lawrence Heights",43.770992,-79.216917
4,Queen's Park,Ontario Provincial Government,43.773136,-79.239476
...,...,...,...,...
98,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North",43.706876,-79.518188
99,Downtown Toronto,Church and Wellesley,43.696319,-79.532242
100,East Toronto Business,Enclave of M4L,43.688905,-79.554724
101,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu...",43.739416,-79.588437


In [208]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

toronto_merged = toronto_data

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
toronto_merged = toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighbourhood'), on='Neighborhood', how='right')

toronto_merged.head() # check the last columns!

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
78,Scarborough,Agincourt,43.636847,-79.428191,3,Coffee Shop,Pet Store,Gym,Furniture / Home Store,Bar,Italian Restaurant,Café,Breakfast Spot,Bakery,Dessert Shop
28,North York,"Bathurst Manor, Wilson Heights, Downsview North",43.754328,-79.442259,3,Coffee Shop,Bank,Deli / Bodega,Bridal Shop,Restaurant,Ice Cream Shop,Middle Eastern Restaurant,Pizza Place,Cosmetics Shop,Dessert Shop
39,North York,Bayview Village,43.705369,-79.349372,3,Indian Restaurant,Yoga Studio,Sandwich Place,Gas Station,Warehouse Store,Grocery Store,Gym,Pharmacy,Burger Joint,Deli / Bodega
55,North York,"Bedford Park, Lawrence Manor East",43.651494,-79.375418,3,Coffee Shop,Gym,Japanese Restaurant,Cosmetics Shop,Creperie,Restaurant,BBQ Joint,Food Truck,Middle Eastern Restaurant,Department Store
58,Scarborough,"Birch Cliff, Cliffside West",43.650571,-79.384568,3,Neighborhood,Café,Lounge,Restaurant,Steakhouse,Hotel,Plaza,Seafood Restaurant,Vegetarian / Vegan Restaurant,Concert Hall


In [209]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighborhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

## Examine clusters

### Cluster 1

In [210]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 0, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Parkwoods,Fast Food Restaurant,Yoga Studio,Deli / Bodega,Drugstore,Donut Shop,Dog Run,Distribution Center,Discount Store,Diner,Dessert Shop
80,"University of Toronto, Harbord",Fast Food Restaurant,Discount Store,Sandwich Place,Yoga Studio,Deli / Bodega,Donut Shop,Dog Run,Distribution Center,Diner,Dessert Shop


### Cluster 2


In [212]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 1, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
87,"CN Tower, King and Spadina, Railway Lands, Har...",Auto Workshop,Comic Shop,Brewery,Garden Center,Pizza Place,Farmers Market,Fast Food Restaurant,Restaurant,Skate Park,Burrito Place
82,"Clarks Corners, Tam O'Shanter, Sullivan",Park,Antique Shop,Bar,Café,Italian Restaurant,Arts & Crafts Store,Speakeasy,Flea Market,Furniture / Home Store,Gastropub
79,Davisville,Park,Basketball Court,Construction & Landscaping,Bakery,Department Store,Drugstore,Donut Shop,Dog Run,Distribution Center,Discount Store
56,"Del Ray, Mount Dennis, Keelsdale and Silverthorn",Farmers Market,Cocktail Bar,French Restaurant,Museum,Restaurant,Liquor Store,Beer Bar,Park,Vegetarian / Vegan Restaurant,Fruit & Vegetable Store
7,Don Mills North,Bus Line,Bakery,Park,Soccer Field,Intersection,Metro Station,Ice Cream Shop,Bus Station,Cuban Restaurant,Curling Ice
13,Don Mills South,Thai Restaurant,Noodle House,Gas Station,Pizza Place,Chinese Restaurant,Fried Chicken Joint,Fast Food Restaurant,Italian Restaurant,Bank,Intersection
31,"Dufferin, Dovercourt Village",Grocery Store,Shopping Mall,Park,Bank,Hotel,Deli / Bodega,Donut Shop,Dog Run,Distribution Center,Discount Store
100,Enclave of M4L,Sandwich Place,Mobile Phone Shop,Park,Bus Line,Yoga Studio,Dance Studio,Dog Run,Distribution Center,Discount Store,Diner
92,Enclave of M5E,Supplement Shop,Wings Joint,Sandwich Place,Grocery Store,Convenience Store,Fast Food Restaurant,Hardware Store,Discount Store,Bakery,Burger Joint
17,"Eringate, Bloordale Gardens, Old Burnhamthorpe...",Golf Course,Dog Run,Fast Food Restaurant,Mediterranean Restaurant,Pool,Dance Studio,Distribution Center,Discount Store,Diner,Dessert Shop


### Cluster 3

In [213]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 2, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
25,Christie,Park,Construction & Landscaping,Food & Drink Shop,Yoga Studio,Deli / Bodega,Donut Shop,Dog Run,Distribution Center,Discount Store,Diner
48,"Commerce Court, Victoria Hotel",Park,Yoga Studio,Dance Studio,Donut Shop,Dog Run,Distribution Center,Discount Store,Diner,Dessert Shop,Department Store
40,Downsview East,Park,Convenience Store,Metro Station,Deli / Bodega,Donut Shop,Dog Run,Distribution Center,Discount Store,Diner,Dessert Shop
50,Humber Summit,Park,Playground,Trail,Curling Ice,Donut Shop,Dog Run,Distribution Center,Discount Store,Diner,Dessert Shop
23,Leaside,Park,Construction & Landscaping,Convenience Store,Yoga Studio,Deli / Bodega,Donut Shop,Dog Run,Distribution Center,Discount Store,Diner
30,"Richmond, Adelaide, King",Park,Airport,Dance Studio,Drugstore,Donut Shop,Dog Run,Distribution Center,Discount Store,Diner,Dessert Shop
74,"The Annex, North Midtown, Yorkville",Park,Pool,Women's Store,Gay Bar,Cuban Restaurant,Distribution Center,Discount Store,Diner,Dessert Shop,Department Store
98,"The Kingsway, Montgomery Road, Old Mill North",Park,Convenience Store,Yoga Studio,Dance Studio,Donut Shop,Dog Run,Distribution Center,Discount Store,Diner,Dessert Shop


### Cluster 4

In [214]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 3, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
78,Agincourt,Coffee Shop,Pet Store,Gym,Furniture / Home Store,Bar,Italian Restaurant,Café,Breakfast Spot,Bakery,Dessert Shop
28,"Bathurst Manor, Wilson Heights, Downsview North",Coffee Shop,Bank,Deli / Bodega,Bridal Shop,Restaurant,Ice Cream Shop,Middle Eastern Restaurant,Pizza Place,Cosmetics Shop,Dessert Shop
39,Bayview Village,Indian Restaurant,Yoga Studio,Sandwich Place,Gas Station,Warehouse Store,Grocery Store,Gym,Pharmacy,Burger Joint,Deli / Bodega
55,"Bedford Park, Lawrence Manor East",Coffee Shop,Gym,Japanese Restaurant,Cosmetics Shop,Creperie,Restaurant,BBQ Joint,Food Truck,Middle Eastern Restaurant,Department Store
58,"Birch Cliff, Cliffside West",Neighborhood,Café,Lounge,Restaurant,Steakhouse,Hotel,Plaza,Seafood Restaurant,Vegetarian / Vegan Restaurant,Concert Hall
43,"Brockton, Parkdale Village, Exhibition Place",Coffee Shop,Ice Cream Shop,Seafood Restaurant,Bakery,Fish Market,Bookstore,Pet Store,Gay Bar,Neighborhood,Comic Shop
26,Cedarbrae,Gym,Caribbean Restaurant,Japanese Restaurant,Café,Yoga Studio,Department Store,Drugstore,Donut Shop,Dog Run,Distribution Center
24,Central Bay Street,Coffee Shop,Pharmacy,Pizza Place,Butcher,Supermarket,Dance Studio,Dog Run,Distribution Center,Discount Store,Diner
99,Church and Wellesley,Pizza Place,Coffee Shop,Chinese Restaurant,Discount Store,Playground,Sandwich Place,Intersection,Dance Studio,Dog Run,Distribution Center
51,"Cliffside, Cliffcrest, Scarborough Village West",Café,Jewelry Store,Japanese Restaurant,Italian Restaurant,Diner,Butcher,Bakery,Indian Restaurant,Restaurant,Yoga Studio


### Cluster 5

In [215]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 4, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
94,"Clairville, Humberwood, Woodbine Downs, West H...",Print Shop,Yoga Studio,Dance Studio,Donut Shop,Dog Run,Distribution Center,Discount Store,Diner,Dessert Shop,Department Store
