In [20]:
import numpy as np
import pandas as pd
import requests
from pandas.io.json import json_normalize
from bs4 import BeautifulSoup
import xml

print('Libraries imported.')

Libraries imported.


In [2]:
# Download the Wikipedia page of Toronto ("M") postal codes and collect the three
# columns of the first table on the page into parallel lists.
response = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M', timeout=30)
response.raise_for_status()  # stop on an HTTP error instead of parsing an error page
data = response.text
soup = BeautifulSoup(data, 'html.parser')

table = soup.find('table')
if table is None:
    raise ValueError('No <table> element found on the postal code page.')

postalCode = []
borough = []
neighborhood = []

for row in table.find_all('tr'):
    cells = row.find_all('td')
    # Rows without <td> cells (e.g. the header row) and rows with fewer than the
    # three expected cells are skipped rather than raising IndexError.
    if len(cells) >= 3:
        postalCode.append(cells[0].text.rstrip('\n'))
        borough.append(cells[1].text.rstrip('\n'))
        neighborhood.append(cells[2].text.rstrip('\n'))

In [3]:
# Combine the three scraped lists into one table, one row per scraped table row.
scraped_rows = list(zip(postalCode, borough, neighborhood))
toronto_df = pd.DataFrame(scraped_rows, columns=["PostalCode", "Borough", "Neighborhood"])

toronto_df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,
1,M2A,Not assigned,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Regent Park / Harbourfront


In [4]:
# Keep only the rows whose borough is assigned, renumbering the index from 0.
has_borough = toronto_df["Borough"].ne("Not assigned")
toronto_df_rev = toronto_df.loc[has_borough].reset_index(drop=True)
toronto_df_rev.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Regent Park / Harbourfront
3,M6A,North York,Lawrence Manor / Lawrence Heights
4,M7A,Downtown Toronto,Queen's Park / Ontario Provincial Government


In [5]:
# Collapse rows that share a postal code and borough into one row, joining their
# neighborhood names into a single comma-separated string.
def _join_names(names):
    """Return the names of one group concatenated with ", "."""
    return ", ".join(names)


postcode_groups = toronto_df_rev.groupby(["PostalCode", "Borough"], as_index=False)
toronto_df_group = postcode_groups.agg(_join_names)
toronto_df_group.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,Malvern / Rouge
1,M1C,Scarborough,Rouge Hill / Port Union / Highland Creek
2,M1E,Scarborough,Guildwood / Morningside / West Hill
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [6]:
# if Neighborhood="Not assigned", change the value the same as Borough
# The rows yielded by iterrows() are copies, so assigning into them never changes
# the DataFrame. Select the affected rows with a boolean mask and write through .loc.
neighborhood_missing = toronto_df_group["Neighborhood"] == "Not assigned"
toronto_df_group.loc[neighborhood_missing, "Neighborhood"] = toronto_df_group.loc[neighborhood_missing, "Borough"]

toronto_df_group.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,Malvern / Rouge
1,M1C,Scarborough,Rouge Hill / Port Union / Highland Creek
2,M1E,Scarborough,Guildwood / Morningside / West Hill
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [7]:
# show the dimensions of the cleaned dataframe as a (rows, columns) tuple
toronto_df_group.shape

(103, 3)

In [8]:
# Load the postal-code coordinates from a remote CSV file (requires network access).
# Its columns are "Postal Code", "Latitude" and "Longitude", as the preview below shows.
coordinates = pd.read_csv("http://cocl.us/Geospatial_data")
coordinates.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [9]:
# Rename "Postal Code" to "PostalCode" so the key column matches toronto_df_group.
coordinates = coordinates.rename(columns={"Postal Code": "PostalCode"})
coordinates.head()

Unnamed: 0,PostalCode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [10]:
# Attach latitude/longitude to every postal code row. A left join keeps all rows of
# toronto_df_group even when a code has no match in the coordinates table.
toronto_df_new = pd.merge(toronto_df_group, coordinates, how="left", on="PostalCode")
toronto_df_new.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,Malvern / Rouge,43.806686,-79.194353
1,M1C,Scarborough,Rouge Hill / Port Union / Highland Creek,43.784535,-79.160497
2,M1E,Scarborough,Guildwood / Morningside / West Hill,43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


In [11]:
# Build a spot-check table: the rows of toronto_df_new for a hand-picked list of
# postal codes, in the order the codes are listed.
column_names = ["PostalCode", "Borough", "Neighborhood", "Latitude", "Longitude"]

test_list = ["M5G", "M2H", "M4B", "M1J", "M4G", "M4M", "M1R", "M9V", "M9L", "M5V", "M1B", "M5A"]

# DataFrame.append was removed in pandas 2.0 and re-copied the frame on every
# iteration; collect the matching rows first and concatenate them once.
matching_rows = [toronto_df_new[toronto_df_new["PostalCode"] == postcode] for postcode in test_list]
test_df = pd.concat(matching_rows, ignore_index=True)[column_names]

test_df

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383
1,M2H,North York,Hillcrest Village,43.803762,-79.363452
2,M4B,East York,Parkview Hill / Woodbine Gardens,43.706397,-79.309937
3,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
4,M4G,East York,Leaside,43.70906,-79.363452
5,M4M,East Toronto,Studio District,43.659526,-79.340923
6,M1R,Scarborough,Wexford / Maryvale,43.750072,-79.295849
7,M9V,Etobicoke,South Steeles / Silverstone / Humbergate / Jam...,43.739416,-79.588437
8,M9L,North York,Humber Summit,43.756303,-79.565963
9,M5V,Downtown Toronto,CN Tower / King and Spadina / Railway Lands / ...,43.628947,-79.39442


In [12]:
from geopy.geocoders import Nominatim

# Look up the latitude and longitude of Toronto; they are used to centre the maps.
address = 'Toronto'

geolocator = Nominatim(user_agent="my-application")
location = geolocator.geocode(address)
# geocode() returns None when nothing matches; fail with a clear message instead of
# an AttributeError on the attribute access below.
if location is None:
    raise ValueError('Could not geocode address: {}'.format(address))
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto: {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto: 43.6534817, -79.3839347.


In [23]:
!pip install folium
import folium

Collecting folium
[?25l  Downloading https://files.pythonhosted.org/packages/fd/a0/ccb3094026649cda4acd55bf2c3822bb8c277eb11446d13d384e5be35257/folium-0.10.1-py2.py3-none-any.whl (91kB)
[K     |████████████████████████████████| 92kB 7.5MB/s eta 0:00:011
[?25hCollecting branca>=0.3.0 (from folium)
  Downloading https://files.pythonhosted.org/packages/81/6d/31c83485189a2521a75b4130f1fee5364f772a0375f81afff619004e5237/branca-0.4.0-py3-none-any.whl
Installing collected packages: branca, folium
Successfully installed branca-0.4.0 folium-0.10.1


In [24]:
# create map of Toronto using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# Add one circle marker per postal code row of toronto_df_new.
# The loop variables are deliberately NOT called `borough` / `neighborhood`: those
# names hold the scraped lists that toronto_df is built from, and rebinding them to
# single strings here would break a re-run of the cell that builds toronto_df.
for lat, lng, borough_name, neighborhood_name in zip(toronto_df_new['Latitude'], toronto_df_new['Longitude'], toronto_df_new['Borough'], toronto_df_new['Neighborhood']):
    label = '{}, {}'.format(neighborhood_name, borough_name)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7).add_to(map_toronto)

map_toronto

In [25]:
# Collect the distinct borough names, then keep those containing "toronto"
# (compared case-insensitively).
borough_names = list(toronto_df_new.Borough.unique())

borough_with_toronto = [name for name in borough_names if "toronto" in name.lower()]

borough_with_toronto

['East Toronto', 'Central Toronto', 'Downtown Toronto', 'West Toronto']

In [26]:
# Narrow toronto_df_new down to the boroughs whose name contains "Toronto" and
# renumber the index from 0. Note that this rebinds toronto_df_new.
in_toronto_borough = toronto_df_new['Borough'].isin(borough_with_toronto)
toronto_df_new = toronto_df_new.loc[in_toronto_borough].reset_index(drop=True)
print(toronto_df_new.shape)
toronto_df_new.head()

(39, 5)


Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M4E,East Toronto,The Beaches,43.676357,-79.293031
1,M4K,East Toronto,The Danforth West / Riverdale,43.679557,-79.352188
2,M4L,East Toronto,India Bazaar / The Beaches West,43.668999,-79.315572
3,M4M,East Toronto,Studio District,43.659526,-79.340923
4,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879


In [27]:

# create map of Toronto using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# Add one circle marker per row of toronto_df_new as it stands when this cell runs.
# Loop variables avoid the names `borough` / `neighborhood`, which hold the scraped
# lists used to build toronto_df; rebinding them would break a re-run of that cell.
for lat, lng, borough_name, neighborhood_name in zip(toronto_df_new['Latitude'], toronto_df_new['Longitude'], toronto_df_new['Borough'], toronto_df_new['Neighborhood']):
    popup_text = '{}, {}'.format(neighborhood_name, borough_name)
    label = folium.Popup(popup_text, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7).add_to(map_toronto)

map_toronto

In [31]:
# define Foursquare Credentials and Version
import os

# Credentials are read from environment variables so they are never stored in the
# notebook source or echoed into its saved output.
# NOTE(review): any key pair that was previously committed in this cell should be
# treated as compromised and regenerated.
try:
    CLIENT_ID = os.environ['FOURSQUARE_CLIENT_ID']  # your Foursquare ID
    CLIENT_SECRET = os.environ['FOURSQUARE_CLIENT_SECRET']  # your Foursquare Secret
except KeyError as missing:
    raise RuntimeError(
        'Set the {} environment variable before running this cell.'.format(missing.args[0])
    ) from None
VERSION = '20180605' # Foursquare API version

# Confirm that the values were found without printing them.
print('Foursquare credentials loaded.')

Your credentails:
CLIENT_ID: <redacted>
CLIENT_SECRET:<redacted>


In [32]:
#top 100 venues that are within a radius of 1000 meters
# For every row of toronto_df_new, query the Foursquare "explore" endpoint around the
# row's coordinates and record one tuple per returned venue.
radius = 1000
LIMIT = 100

url = "https://api.foursquare.com/v2/venues/explore"

venues = []

# Loop variables avoid the names `borough` / `neighborhood`, which hold the scraped
# lists used to build toronto_df.
for lat, lng, post, borough_name, neighborhood_name in zip(toronto_df_new['Latitude'], toronto_df_new['Longitude'], toronto_df_new['PostalCode'], toronto_df_new['Borough'], toronto_df_new['Neighborhood']):
    # Let requests build and URL-encode the query string instead of formatting the
    # credentials into the URL by hand.
    response = requests.get(
        url,
        params={
            "client_id": CLIENT_ID,
            "client_secret": CLIENT_SECRET,
            "v": VERSION,
            "ll": "{},{}".format(lat, lng),
            "radius": radius,
            "limit": LIMIT,
        },
        timeout=30,
    )
    response.raise_for_status()  # surface HTTP errors here, not as a KeyError below

    results = response.json()["response"]['groups'][0]['items']

    for venue in results:
        details = venue['venue']
        categories = details['categories']
        venues.append((
            post,
            borough_name,
            neighborhood_name,
            lat,
            lng,
            details['name'],
            details['location']['lat'],
            details['location']['lng'],
            # A venue with an empty category list gets None instead of raising IndexError.
            categories[0]['name'] if categories else None))

In [33]:
# convert the venues list into a new DataFrame
# Column names are passed to the constructor so that an empty `venues` list yields an
# empty frame with the right columns; assigning `.columns` afterwards raises a
# length-mismatch error in that case.
# NOTE: BoroughLatitude / BoroughLongitude hold the coordinates of the postal code row
# that was queried (the Latitude / Longitude values of toronto_df_new).
venue_columns = ['PostalCode', 'Borough', 'Neighborhood', 'BoroughLatitude', 'BoroughLongitude', 'VenueName', 'VenueLatitude', 'VenueLongitude', 'VenueCategory']
venues_df = pd.DataFrame(venues, columns=venue_columns)

print(venues_df.shape)
venues_df.head()

(3194, 9)


Unnamed: 0,PostalCode,Borough,Neighborhood,BoroughLatitude,BoroughLongitude,VenueName,VenueLatitude,VenueLongitude,VenueCategory
0,M4E,East Toronto,The Beaches,43.676357,-79.293031,Glen Manor Ravine,43.676821,-79.293942,Trail
1,M4E,East Toronto,The Beaches,43.676357,-79.293031,Tori's Bakeshop,43.672114,-79.290331,Vegetarian / Vegan Restaurant
2,M4E,East Toronto,The Beaches,43.676357,-79.293031,Beaches Bake Shop,43.680363,-79.289692,Bakery
3,M4E,East Toronto,The Beaches,43.676357,-79.293031,The Beech Tree,43.680493,-79.288846,Gastropub
4,M4E,East Toronto,The Beaches,43.676357,-79.293031,The Fox Theatre,43.672801,-79.287272,Indie Movie Theater


In [34]:
# Check how many venues were returned for each PostalCode: count() reports the number
# of non-null entries per column for every (PostalCode, Borough, Neighborhood) group.
venues_df.groupby(["PostalCode", "Borough", "Neighborhood"]).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,BoroughLatitude,BoroughLongitude,VenueName,VenueLatitude,VenueLongitude,VenueCategory
PostalCode,Borough,Neighborhood,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
M4E,East Toronto,The Beaches,78,78,78,78,78,78
M4K,East Toronto,The Danforth West / Riverdale,100,100,100,100,100,100
M4L,East Toronto,India Bazaar / The Beaches West,81,81,81,81,81,81
M4M,East Toronto,Studio District,100,100,100,100,100,100
M4N,Central Toronto,Lawrence Park,9,9,9,9,9,9
M4P,Central Toronto,Davisville North,100,100,100,100,100,100
M4R,Central Toronto,North Toronto West,45,45,45,45,45,45
M4S,Central Toronto,Davisville,100,100,100,100,100,100
M4T,Central Toronto,Moore Park / Summerhill East,60,60,60,60,60,60
M4V,Central Toronto,Summerhill West / Rathnelly / South Hill / Forest Hill SE / Deer Park,80,80,80,80,80,80


In [35]:
# Report how many distinct venue categories were returned.
category_count = len(venues_df['VenueCategory'].unique())
print('There are {} uniques categories.'.format(category_count))

There are 269 uniques categories.


In [36]:
# Preview the first 50 distinct venue categories, in order of first appearance.
venues_df['VenueCategory'].unique()[:50]

array(['Trail', 'Vegetarian / Vegan Restaurant', 'Bakery', 'Gastropub',
       'Indie Movie Theater', 'Ice Cream Shop', 'Bagel Shop',
       'Toy / Game Store', 'French Restaurant', 'Breakfast Spot',
       'Coffee Shop', 'Pub', 'Park', 'Health Food Store', 'Beach',
       'Mexican Restaurant', 'Japanese Restaurant', 'Nail Salon',
       'Juice Bar', 'Bar', 'Cupcake Shop', 'Caribbean Restaurant',
       'Mediterranean Restaurant', 'Liquor Store', 'Tea Room', 'Diner',
       'Indian Restaurant', 'Pharmacy', 'Thai Restaurant',
       'Chocolate Shop', 'Grocery Store', 'Burger Joint',
       'Sandwich Place', 'Ramen Restaurant', 'Greek Restaurant', 'Café',
       'Asian Restaurant', 'Pizza Place', 'Beer Store', 'Bank',
       'Electronics Store', 'Restaurant', 'Mobile Phone Shop',
       'Camera Store', 'Scenic Lookout', 'Shoe Store',
       'Salon / Barbershop', 'Jewelry Store', 'Shopping Mall',
       'Cosmetics Shop'], dtype=object)

In [37]:
# One-hot encode the venue category: one indicator column per category, one row per venue.
toronto_onehot = pd.get_dummies(venues_df[['VenueCategory']], prefix="", prefix_sep="")

# Add the identifying columns back. The neighborhood column is stored under the
# name "Neighborhoods" in this frame.
toronto_onehot = toronto_onehot.assign(
    PostalCode=venues_df['PostalCode'],
    Borough=venues_df['Borough'],
    Neighborhoods=venues_df['Neighborhood'],
)

# Reorder so the identifying columns come first, followed by the category indicators.
id_columns = ['PostalCode', 'Borough', 'Neighborhoods']
category_columns = [column for column in toronto_onehot.columns if column not in id_columns]
fixed_columns = id_columns + category_columns
toronto_onehot = toronto_onehot[fixed_columns]

print(toronto_onehot.shape)
toronto_onehot.head()

(3194, 272)


Unnamed: 0,PostalCode,Borough,Neighborhoods,Accessories Store,Airport,American Restaurant,Amphitheater,Animal Shelter,Antique Shop,Aquarium,...,Turkish Restaurant,University,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Whisky Bar,Wine Bar,Women's Store,Yoga Studio,Zoo
0,M4E,East Toronto,The Beaches,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,M4E,East Toronto,The Beaches,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
2,M4E,East Toronto,The Beaches,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,M4E,East Toronto,The Beaches,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,M4E,East Toronto,The Beaches,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [38]:
# Average the one-hot indicators within each (PostalCode, Borough, Neighborhoods)
# group, giving the share of each venue category per postal code.
grouping_keys = ["PostalCode", "Borough", "Neighborhoods"]
category_means = toronto_onehot.groupby(grouping_keys).mean()
toronto_group = category_means.reset_index()

print(toronto_group.shape)
toronto_group

(39, 272)


Unnamed: 0,PostalCode,Borough,Neighborhoods,Accessories Store,Airport,American Restaurant,Amphitheater,Animal Shelter,Antique Shop,Aquarium,...,Turkish Restaurant,University,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Whisky Bar,Wine Bar,Women's Store,Yoga Studio,Zoo
0,M4E,East Toronto,The Beaches,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.012821,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,M4K,East Toronto,The Danforth West / Riverdale,0.0,0.0,0.01,0.0,0.0,0.0,0.0,...,0.01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.02,0.0
2,M4L,East Toronto,India Bazaar / The Beaches West,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.012346,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,M4M,East Toronto,Studio District,0.0,0.0,0.04,0.0,0.0,0.01,0.0,...,0.0,0.0,0.01,0.0,0.04,0.0,0.01,0.0,0.01,0.0
4,M4N,Central Toronto,Lawrence Park,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,M4P,Central Toronto,Davisville North,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.01,0.01,0.01,0.0,0.01,0.0,0.02,0.0
6,M4R,Central Toronto,North Toronto West,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.022222,0.022222,0.0,0.022222,0.0,0.022222,0.0
7,M4S,Central Toronto,Davisville,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.01,0.01,0.01,0.0,0.01,0.0,0.01,0.0
8,M4T,Central Toronto,Moore Park / Summerhill East,0.0,0.0,0.016667,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.016667,0.0,0.0,0.0,0.016667,0.0
9,M4V,Central Toronto,Summerhill West / Rathnelly / South Hill / For...,0.0,0.0,0.0125,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0125,0.0,0.0125,0.0,0.0125,0.0,0.0125,0.0


In [42]:
#display the top 10 venues for each PostalCode
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
areaColumns = ['PostalCode', 'Borough', 'Neighborhoods']
freqColumns = []
for ind in np.arange(num_top_venues):
    # The first three ranks use 'st' / 'nd' / 'rd'; every later rank uses 'th'.
    # An explicit bounds check replaces the former bare `except:`, which would also
    # have hidden unrelated errors.
    suffix = indicators[ind] if ind < len(indicators) else 'th'
    freqColumns.append('{}{} Most Common Venue'.format(ind+1, suffix))
columns = areaColumns+freqColumns

# create a new dataframe with one row per row of toronto_group
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['PostalCode'] = toronto_group['PostalCode']
neighborhoods_venues_sorted['Borough'] = toronto_group['Borough']
neighborhoods_venues_sorted['Neighborhoods'] = toronto_group['Neighborhoods']

for ind in np.arange(toronto_group.shape[0]):
    # Skip the three identifying columns, sort the category frequencies in descending
    # order and keep the names of the num_top_venues most frequent categories.
    row_categories = toronto_group.iloc[ind, :].iloc[3:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    neighborhoods_venues_sorted.iloc[ind, 3:] = row_categories_sorted.index.values[0:num_top_venues]

print(neighborhoods_venues_sorted.shape)
neighborhoods_venues_sorted

(39, 13)


Unnamed: 0,PostalCode,Borough,Neighborhoods,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M4E,East Toronto,The Beaches,Coffee Shop,Pizza Place,Pub,Bakery,Japanese Restaurant,Beach,Breakfast Spot,Tea Room,Health Food Store,Bar
1,M4K,East Toronto,The Danforth West / Riverdale,Greek Restaurant,Coffee Shop,Café,Pub,Pizza Place,Fast Food Restaurant,Bank,Italian Restaurant,Furniture / Home Store,Spa
2,M4L,East Toronto,India Bazaar / The Beaches West,Indian Restaurant,Coffee Shop,Restaurant,Beach,Café,Fast Food Restaurant,Light Rail Station,Harbor / Marina,Brewery,Sandwich Place
3,M4M,East Toronto,Studio District,Coffee Shop,Bar,Brewery,Café,American Restaurant,Vietnamese Restaurant,Diner,Bakery,Italian Restaurant,French Restaurant
4,M4N,Central Toronto,Lawrence Park,Trail,Park,Pharmacy,College Quad,Gym / Fitness Center,Bookstore,College Gym,Café,Coffee Shop,Donut Shop
5,M4P,Central Toronto,Davisville North,Coffee Shop,Italian Restaurant,Restaurant,Dessert Shop,Café,Pizza Place,Pharmacy,Sushi Restaurant,Gym,Yoga Studio
6,M4R,Central Toronto,North Toronto West,Skating Rink,Italian Restaurant,Coffee Shop,Sporting Goods Shop,Café,Restaurant,Mexican Restaurant,Diner,Park,Pharmacy
7,M4S,Central Toronto,Davisville,Italian Restaurant,Coffee Shop,Sushi Restaurant,Café,Pizza Place,Dessert Shop,Indian Restaurant,Restaurant,Gastropub,Gym
8,M4T,Central Toronto,Moore Park / Summerhill East,Grocery Store,Italian Restaurant,Coffee Shop,Gym,Thai Restaurant,Park,Pub,Pizza Place,Sushi Restaurant,Bagel Shop
9,M4V,Central Toronto,Summerhill West / Rathnelly / South Hill / For...,Coffee Shop,Sushi Restaurant,Park,Italian Restaurant,Thai Restaurant,Grocery Store,Liquor Store,Sandwich Place,Café,Restaurant


In [45]:
# Rebuild the table of the most common venue categories per postal code.
# NOTE(review): this cell recomputes the same table as the previous cell.
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
areaColumns = ['PostalCode', 'Borough', 'Neighborhoods']
# Ranks 1-3 take their suffix from `indicators`; later ranks use 'th'. The explicit
# bounds check replaces the former bare `except:` used as control flow.
freqColumns = [
    '{}{} Most Common Venue'.format(rank, indicators[rank - 1] if rank <= len(indicators) else 'th')
    for rank in range(1, num_top_venues + 1)
]
columns = areaColumns+freqColumns

# create a new dataframe with one row per row of toronto_group
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
for area_column in areaColumns:
    neighborhoods_venues_sorted[area_column] = toronto_group[area_column]

for ind in np.arange(toronto_group.shape[0]):
    # Drop the three identifying columns, then rank the remaining category
    # frequencies from most to least common.
    row_categories = toronto_group.iloc[ind, :].iloc[3:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    neighborhoods_venues_sorted.iloc[ind, 3:] = row_categories_sorted.index.values[0:num_top_venues]

print(neighborhoods_venues_sorted.shape)
neighborhoods_venues_sorted

(39, 13)


Unnamed: 0,PostalCode,Borough,Neighborhoods,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M4E,East Toronto,The Beaches,Coffee Shop,Pizza Place,Pub,Bakery,Japanese Restaurant,Beach,Breakfast Spot,Tea Room,Health Food Store,Bar
1,M4K,East Toronto,The Danforth West / Riverdale,Greek Restaurant,Coffee Shop,Café,Pub,Pizza Place,Fast Food Restaurant,Bank,Italian Restaurant,Furniture / Home Store,Spa
2,M4L,East Toronto,India Bazaar / The Beaches West,Indian Restaurant,Coffee Shop,Restaurant,Beach,Café,Fast Food Restaurant,Light Rail Station,Harbor / Marina,Brewery,Sandwich Place
3,M4M,East Toronto,Studio District,Coffee Shop,Bar,Brewery,Café,American Restaurant,Vietnamese Restaurant,Diner,Bakery,Italian Restaurant,French Restaurant
4,M4N,Central Toronto,Lawrence Park,Trail,Park,Pharmacy,College Quad,Gym / Fitness Center,Bookstore,College Gym,Café,Coffee Shop,Donut Shop
5,M4P,Central Toronto,Davisville North,Coffee Shop,Italian Restaurant,Restaurant,Dessert Shop,Café,Pizza Place,Pharmacy,Sushi Restaurant,Gym,Yoga Studio
6,M4R,Central Toronto,North Toronto West,Skating Rink,Italian Restaurant,Coffee Shop,Sporting Goods Shop,Café,Restaurant,Mexican Restaurant,Diner,Park,Pharmacy
7,M4S,Central Toronto,Davisville,Italian Restaurant,Coffee Shop,Sushi Restaurant,Café,Pizza Place,Dessert Shop,Indian Restaurant,Restaurant,Gastropub,Gym
8,M4T,Central Toronto,Moore Park / Summerhill East,Grocery Store,Italian Restaurant,Coffee Shop,Gym,Thai Restaurant,Park,Pub,Pizza Place,Sushi Restaurant,Bagel Shop
9,M4V,Central Toronto,Summerhill West / Rathnelly / South Hill / For...,Coffee Shop,Sushi Restaurant,Park,Italian Restaurant,Thai Restaurant,Grocery Store,Liquor Store,Sandwich Place,Café,Restaurant


In [52]:
from sklearn.cluster import KMeans

# set number of clusters
kclusters = 5

# Feature matrix: the per-category frequencies only. The identifying columns are
# dropped by keyword, because passing the axis positionally (`drop(cols, 1)`) is no
# longer accepted by pandas 2.x.
toronto_group_clustering = toronto_group.drop(columns=["PostalCode", "Borough", "Neighborhoods"])

# run k-means clustering (random_state fixed so the labels are reproducible)
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_group_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10]

array([1, 0, 0, 0, 3, 1, 1, 1, 1, 1], dtype=int32)

In [53]:
# create a new dataframe that includes the cluster as well as the top 10 venues for each neighborhood.

# k-means was fitted on toronto_group, so kmeans.labels_[i] belongs to row i of
# toronto_group. Pair each label with that row's postal code and merge on the key
# instead of assigning by position to toronto_df_new: a positional assignment raises
# when the row counts differ and mislabels rows when the order differs.
cluster_labels = pd.DataFrame({
    "PostalCode": toronto_group["PostalCode"].values,
    "Cluster Labels": kmeans.labels_,
})

# An inner merge keeps only the postal codes that were clustered, so the label
# column stays integer-typed.
toronto_merged = toronto_df_new.merge(cluster_labels, on="PostalCode", how="inner")

# Add the most-common-venue columns for each postal code.
top_venues = neighborhoods_venues_sorted.drop(columns=["Borough", "Neighborhoods"])
toronto_merged = toronto_merged.merge(top_venues, on="PostalCode", how="left")

print(toronto_merged.shape)
toronto_merged.head() # check the last columns!

(39, 16)


Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M4E,East Toronto,The Beaches,43.676357,-79.293031,1,Coffee Shop,Pizza Place,Pub,Bakery,Japanese Restaurant,Beach,Breakfast Spot,Tea Room,Health Food Store,Bar
1,M4K,East Toronto,The Danforth West / Riverdale,43.679557,-79.352188,0,Greek Restaurant,Coffee Shop,Café,Pub,Pizza Place,Fast Food Restaurant,Bank,Italian Restaurant,Furniture / Home Store,Spa
2,M4L,East Toronto,India Bazaar / The Beaches West,43.668999,-79.315572,0,Indian Restaurant,Coffee Shop,Restaurant,Beach,Café,Fast Food Restaurant,Light Rail Station,Harbor / Marina,Brewery,Sandwich Place
3,M4M,East Toronto,Studio District,43.659526,-79.340923,0,Coffee Shop,Bar,Brewery,Café,American Restaurant,Vietnamese Restaurant,Diner,Bakery,Italian Restaurant,French Restaurant
4,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879,3,Trail,Park,Pharmacy,College Quad,Gym / Fitness Center,Bookstore,College Gym,Café,Coffee Shop,Donut Shop


In [54]:
# Print the frame's dimensions, then reorder its rows by ascending cluster label.
print(toronto_merged.shape)
toronto_merged = toronto_merged.sort_values(by="Cluster Labels")
toronto_merged

(39, 16)


Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
1,M4K,East Toronto,The Danforth West / Riverdale,43.679557,-79.352188,0,Greek Restaurant,Coffee Shop,Café,Pub,Pizza Place,Fast Food Restaurant,Bank,Italian Restaurant,Furniture / Home Store,Spa
2,M4L,East Toronto,India Bazaar / The Beaches West,43.668999,-79.315572,0,Indian Restaurant,Coffee Shop,Restaurant,Beach,Café,Fast Food Restaurant,Light Rail Station,Harbor / Marina,Brewery,Sandwich Place
3,M4M,East Toronto,Studio District,43.659526,-79.340923,0,Coffee Shop,Bar,Brewery,Café,American Restaurant,Vietnamese Restaurant,Diner,Bakery,Italian Restaurant,French Restaurant
35,M6R,West Toronto,Parkdale / Roncesvalles,43.64896,-79.456325,0,Café,Coffee Shop,Bar,Sushi Restaurant,Pizza Place,Eastern European Restaurant,Grocery Store,Pub,Restaurant,Breakfast Spot
34,M6P,West Toronto,High Park / The Junction South,43.661608,-79.464763,0,Café,Coffee Shop,Bar,Thai Restaurant,Park,Italian Restaurant,Convenience Store,Sushi Restaurant,Bakery,Antique Shop
33,M6K,West Toronto,Brockton / Parkdale Village / Exhibition Place,43.636847,-79.428191,0,Café,Coffee Shop,Restaurant,Bar,Bakery,Furniture / Home Store,Tibetan Restaurant,Gift Shop,Event Space,Thrift / Vintage Store
32,M6J,West Toronto,Little Portugal / Trinity,43.647927,-79.41975,0,Café,Restaurant,Bar,Vegetarian / Vegan Restaurant,Bakery,Asian Restaurant,Cocktail Bar,Italian Restaurant,Pizza Place,Furniture / Home Store
30,M6G,Downtown Toronto,Christie,43.669542,-79.422564,0,Korean Restaurant,Café,Coffee Shop,Grocery Store,Ice Cream Shop,Cocktail Bar,Mexican Restaurant,Ethiopian Restaurant,Park,Bar
26,M5T,Downtown Toronto,Kensington Market / Chinatown / Grange Park,43.653206,-79.400049,0,Café,Bar,Vegetarian / Vegan Restaurant,Coffee Shop,Yoga Studio,Bakery,Mexican Restaurant,Art Gallery,Gaming Cafe,Caribbean Restaurant
25,M5S,Downtown Toronto,University of Toronto / Harbord,43.662696,-79.400049,0,Café,Bakery,Restaurant,Bar,Coffee Shop,Mexican Restaurant,Bookstore,Vegetarian / Vegan Restaurant,Burrito Place,Beer Bar


In [56]:
import matplotlib.cm as cm
import matplotlib.colors as colors

# Base map centred on the geocoded Toronto coordinates.
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# One hex colour per cluster, evenly spaced along matplotlib's rainbow colormap.
x = np.arange(kclusters)
ys = [i+x+(i*x)**2 for i in range(kclusters)]  # only len(ys) == kclusters is used below
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = list(map(colors.rgb2hex, colors_array))

# Draw one circle marker per row of toronto_merged, coloured by its cluster label.
markers_colors = []
marker_fields = ['Latitude', 'Longitude', 'PostalCode', 'Borough', 'Neighborhood', 'Cluster Labels']
for lat, lon, post, bor, poi, cluster in zip(*(toronto_merged[field] for field in marker_fields)):
    # Index cluster-1 is kept as is: cluster 0 wraps around to the last colour.
    cluster_color = rainbow[cluster-1]
    popup_text = '{} ({}): {} - Cluster {}'.format(bor, post, poi, cluster)
    label = folium.Popup(popup_text, parse_html=True)
    marker = folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=cluster_color,
        fill=True,
        fill_color=cluster_color,
        fill_opacity=0.7)
    marker.add_to(map_clusters)

map_clusters