In [1]:
import os

import numpy as np
import pandas as pd

#### PART 1. Scrape the following Wikipedia page, https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M

In [2]:
# pd.read_html returns a list with every table found on the page;
# the postal-code table is the first entry, so keep only that one.
PC_Canada = pd.read_html('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M')[0]

In [3]:
PC_Canada.head()  #need to use the 1st row as column name/header

Unnamed: 0,0,1,2
0,Postal Code,Borough,Neighborhood
1,M1A,Not assigned,Not assigned
2,M2A,Not assigned,Not assigned
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village


In [4]:
# Promote the first row to the header.
# The promotion is guarded so that (a) re-running this cell does not throw away
# a real data row each time, and (b) the cell still works when read_html has
# already parsed the header row into the column labels.
if 'Postal Code' in PC_Canada.columns:
    new_header = PC_Canada.columns  # header is already in place; nothing to drop
else:
    new_header = PC_Canada.iloc[0]  # grab the first row for the header
    PC_Canada = PC_Canada[1:]       # take the data less the header row
    PC_Canada.columns = new_header
PC_Canada.head()
# Notice: more than one neighborhood can exist in one postal code area, e.g. M5A.

Unnamed: 0,Postal Code,Borough,Neighborhood
1,M1A,Not assigned,Not assigned
2,M2A,Not assigned,Not assigned
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,"Regent Park, Harbourfront"


In [5]:
# Keep only the rows that have an assigned borough; rows whose borough is
# 'Not assigned' are discarded and the index is renumbered from 0.
PC_Canada = PC_Canada.loc[PC_Canada['Borough'].ne('Not assigned')].reset_index(drop=True)

In [6]:
# A row with a borough but a 'Not assigned' neighborhood takes the borough
# name as its neighborhood; every other row keeps its value.
PC_Canada['Neighborhood'] = PC_Canada['Neighborhood'].mask(
    PC_Canada['Neighborhood'].eq('Not assigned'),
    PC_Canada['Borough'],
)

In [7]:
#In the last cell of your notebook, use the .shape method to print the number of rows of your dataframe
PC_Canada.shape

(103, 3)

#### PART 2. get the latitude and the longitude coordinates of each neighborhood.

In [8]:
#this piece of code does not run well, so I commented it out.

#import geocoder # import geocoder
#Try the 'M1X' zipcode.
# initialize your variable to None
#lat_lng_coords = None

# loop until you get the coordinates
#while(lat_lng_coords is None):
#  g = geocoder.google('{}, Toronto, Ontario'.format('M1X'))
#  lat_lng_coords = g.latlng

#latitude = lat_lng_coords[0]
#longitude = lat_lng_coords[1]

#notice: this piece of code does not respond when run.

In [9]:
#use the csv file provided by the course.
geo_coor=pd.read_csv('http://cocl.us/Geospatial_data')
geo_coor.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [10]:
geo_coor.shape  #contains coordinate of all 103 PC.

(103, 3)

In [11]:
#join the PC_Canada and geo_coor (left join on the postal code)
full_df=PC_Canada.join(geo_coor.set_index('Postal Code'), on='Postal Code')

# Sanity checks: a left join silently yields NaN coordinates for postal codes
# missing from geo_coor, and multiplies rows if geo_coor repeats a postal code.
# Either would only surface later, when the coordinates are used for mapping.
assert len(full_df) == len(PC_Canada), 'geo_coor contains duplicate postal codes'
assert full_df[['Latitude', 'Longitude']].notna().all().all(), 'some postal codes have no coordinates'

full_df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494


#### PART 3. Explore and cluster the neighborhoods in Toronto.

In [12]:
import json # library to handle JSON files

#!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
# transform JSON file into a pandas dataframe.
# json_normalize lives at the top level of pandas since 1.0; the old
# pandas.io.json location is kept only as a fallback for older installs.
try:
    from pandas import json_normalize
except ImportError:
    from pandas.io.json import json_normalize

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

print('Libraries imported.')

Libraries imported.


In [13]:
# Restrict the data to boroughs whose name contains the word "Toronto",
# then renumber the rows from 0.
Toronto_df = full_df.loc[lambda frame: frame['Borough'].str.contains('Toronto')].reset_index(drop=True)

Toronto_df.shape

(39, 5)

In [14]:
#visualize these 39 areas
Toronto_df

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
1,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
2,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
3,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
4,M4E,East Toronto,The Beaches,43.676357,-79.293031
5,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306
6,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383
7,M6G,Downtown Toronto,Christie,43.669542,-79.422564
8,M5H,Downtown Toronto,"Richmond, Adelaide, King",43.650571,-79.384568
9,M6H,West Toronto,"Dufferin, Dovercourt Village",43.669005,-79.442259


In [15]:
#Obtain the geographical coordinates of Toronto.

address = 'Toronto, Ontario'

geolocator = Nominatim(user_agent="toronto_explorer")
location = geolocator.geocode(address)
# geocode() returns None when the address cannot be resolved; fail with a
# clear message instead of an AttributeError on the next line.
if location is None:
    raise ValueError('Could not geocode address: {}'.format(address))
latitude = location.latitude
longitude = location.longitude
print('The geographical coordinates of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Manhattan are 43.6534817, -79.3839347.


In [16]:
# create map of Toronto using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=11)

# add one circle marker per postal-code area; the popup shows
# "<postal code>,<neighborhood>,<borough>"
for lat, lng, postal_code,borough, neighborhood in zip(Toronto_df['Latitude'], Toronto_df['Longitude'],Toronto_df['Postal Code'], Toronto_df['Borough'], Toronto_df['Neighborhood']):
    label = '{},{},{}'.format(postal_code,neighborhood, borough)
    # parse_html=True makes folium escape the label text (names contain quotes/commas)
    label = folium.Popup(label, parse_html=True)
    # NOTE: parse_html is a Popup option, not a CircleMarker one, so it is
    # no longer passed to CircleMarker here.
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7).add_to(map_toronto)

# visualize the map
map_toronto

In [17]:
#start utilizing the Foursquare API to explore the neighborhoods and segment them
#Define Foursquare Credentials and Version

In [19]:
# Foursquare credentials are read from the environment so that they never
# appear in the notebook (the previous placeholder lines were not valid Python).
# Set FOURSQUARE_CLIENT_ID / FOURSQUARE_CLIENT_SECRET before starting the kernel;
# if they are unset the values are empty strings and the API calls will be rejected.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '') # your Foursquare ID
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', '') # your Foursquare Secret
VERSION = '20180605' # Foursquare API version


In [20]:
#explore the venues in the neighborhood

#create a function to extract the top 100 venues within 500 meters all the neighborhoods in Toronto


radius=500
LIMIT = 100

def getNearbyVenues(names, latitudes, longitudes, radius=500, limit=None):
    """Collect the venues Foursquare reports around each neighborhood.

    For every (name, latitude, longitude) triple one request is sent to the
    Foursquare ``venues/explore`` endpoint, using the module-level
    CLIENT_ID, CLIENT_SECRET and VERSION.

    Parameters
    ----------
    names, latitudes, longitudes : iterables
        Neighborhood names and their coordinates; iterated in parallel with zip.
    radius : int, default 500
        Search radius (meters) sent to the API.
    limit : int or None, default None
        Maximum number of venues requested per neighborhood. ``None`` uses the
        module-level ``LIMIT`` at call time.

    Returns
    -------
    pandas.DataFrame
        One row per venue with the columns 'Neighborhood',
        'Neighborhood Latitude', 'Neighborhood Longitude', 'Venue',
        'Venue Latitude', 'Venue Longitude' and 'Venue Category'.
        The frame is empty (but has all seven columns) when no venue is found.
        'Venue Category' is None for a venue that has no category listed.

    Raises
    ------
    requests.HTTPError
        When the API answers with an error status.
    """
    if limit is None:
        limit = LIMIT

    columns = ['Neighborhood',
               'Neighborhood Latitude',
               'Neighborhood Longitude',
               'Venue',
               'Venue Latitude',
               'Venue Longitude',
               'Venue Category']

    rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)

        # let requests build and encode the query string
        response = requests.get(
            'https://api.foursquare.com/v2/venues/explore',
            params={
                'client_id': CLIENT_ID,
                'client_secret': CLIENT_SECRET,
                'v': VERSION,
                'll': '{},{}'.format(lat, lng),
                'radius': radius,
                'limit': limit,
            },
            timeout=30)  # do not hang forever on a stalled connection
        response.raise_for_status()

        # a response without groups simply contributes no venues
        groups = response.json().get('response', {}).get('groups', [])
        items = groups[0]['items'] if groups else []

        # keep only the relevant information for each nearby venue
        for item in items:
            venue = item['venue']
            categories = venue.get('categories') or []
            rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                categories[0]['name'] if categories else None))

    # passing the column names here keeps the schema even when rows is empty
    return pd.DataFrame(rows, columns=columns)

In [21]:
# Pull the venue information for every Toronto neighborhood by passing the
# name and coordinate columns to getNearbyVenues (default search radius).
toronto_venues = getNearbyVenues(
    Toronto_df['Neighborhood'],
    Toronto_df['Latitude'],
    Toronto_df['Longitude'],
)


Regent Park, Harbourfront
Queen's Park, Ontario Provincial Government
Garden District, Ryerson
St. James Town
The Beaches
Berczy Park
Central Bay Street
Christie
Richmond, Adelaide, King
Dufferin, Dovercourt Village
Harbourfront East, Union Station, Toronto Islands
Little Portugal, Trinity
The Danforth West, Riverdale
Toronto Dominion Centre, Design Exchange
Brockton, Parkdale Village, Exhibition Place
India Bazaar, The Beaches West
Commerce Court, Victoria Hotel
Studio District
Lawrence Park
Roselawn
Davisville North
Forest Hill North & West, Forest Hill Road Park
High Park, The Junction South
North Toronto West, Lawrence Park
The Annex, North Midtown, Yorkville
Parkdale, Roncesvalles
Davisville
University of Toronto, Harbord
Runnymede, Swansea
Moore Park, Summerhill East
Kensington Market, Chinatown, Grange Park
Summerhill West, Rathnelly, South Hill, Forest Hill SE, Deer Park
CN Tower, King and Spadina, Railway Lands, Harbourfront West, Bathurst Quay, South Niagara, Island airport
R

In [22]:
#check toronto_venues df
print(toronto_venues.shape)
toronto_venues.head()

(1622, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,"Regent Park, Harbourfront",43.65426,-79.360636,Roselle Desserts,43.653447,-79.362017,Bakery
1,"Regent Park, Harbourfront",43.65426,-79.360636,Tandem Coffee,43.653559,-79.361809,Coffee Shop
2,"Regent Park, Harbourfront",43.65426,-79.360636,Cooper Koo Family YMCA,43.653249,-79.358008,Distribution Center
3,"Regent Park, Harbourfront",43.65426,-79.360636,Body Blitz Spa East,43.654735,-79.359874,Spa
4,"Regent Park, Harbourfront",43.65426,-79.360636,Dominion Pub and Kitchen,43.656919,-79.358967,Pub


In [23]:
#check the count of venue categories
toronto_venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Berczy Park,58,58,58,58,58,58
"Brockton, Parkdale Village, Exhibition Place",24,24,24,24,24,24
"Business reply mail Processing Centre, South Central Letter Processing Plant Toronto",17,17,17,17,17,17
"CN Tower, King and Spadina, Railway Lands, Harbourfront West, Bathurst Quay, South Niagara, Island airport",17,17,17,17,17,17
Central Bay Street,65,65,65,65,65,65
Christie,16,16,16,16,16,16
Church and Wellesley,77,77,77,77,77,77
"Commerce Court, Victoria Hotel",100,100,100,100,100,100
Davisville,35,35,35,35,35,35
Davisville North,8,8,8,8,8,8


In [24]:
print('There are {} uniques categories.'.format(len(toronto_venues['Venue Category'].unique())))

There are 233 uniques categories.


In [25]:
# Analyse the neighborhoods with one-hot encoding: one indicator column per
# venue category, one row per venue.
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# Put the neighborhood back as the very first column. It is stored under
# 'Neighborhood_name' rather than 'Neighborhood'.
toronto_onehot.insert(0, 'Neighborhood_name', toronto_venues['Neighborhood'])

toronto_onehot.shape


(1622, 234)

In [26]:
#check the head of the toronto_onehot df
toronto_onehot.head()

Unnamed: 0,Neighborhood_name,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,...,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wine Shop,Women's Store,Yoga Studio
0,"Regent Park, Harbourfront",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,"Regent Park, Harbourfront",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,"Regent Park, Harbourfront",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,"Regent Park, Harbourfront",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,"Regent Park, Harbourfront",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [27]:
# One row per neighborhood: the mean of each indicator column is the relative
# frequency of that venue category within the neighborhood.
toronto_grouped = toronto_onehot.groupby('Neighborhood_name', as_index=False).mean()
toronto_grouped

Unnamed: 0,Neighborhood_name,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,...,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wine Shop,Women's Store,Yoga Studio
0,Berczy Park,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.017241,0.0,0.0,0.0,0.0,0.0,0.0
1,"Brockton, Parkdale Village, Exhibition Place",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"Business reply mail Processing Centre, South C...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.058824
3,"CN Tower, King and Spadina, Railway Lands, Har...",0.0,0.058824,0.058824,0.058824,0.117647,0.176471,0.117647,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Central Bay Street,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.015385,0.0,0.0,0.015385,0.0,0.0,0.015385
5,Christie,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,Church and Wellesley,0.012987,0.0,0.0,0.0,0.0,0.0,0.0,0.012987,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.012987,0.0,0.025974
7,"Commerce Court, Victoria Hotel",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.04,0.0,...,0.0,0.0,0.0,0.02,0.0,0.0,0.01,0.0,0.0,0.0
8,Davisville,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.028571,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,Davisville North,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [28]:
# For every neighborhood, print its five most frequent venue categories.
num_top_venues = 5

for current_neighborhood in toronto_grouped['Neighborhood_name']:
    print("----"+current_neighborhood+"----")
    # single-row selection, transposed so that every column becomes a row
    temp = toronto_grouped.loc[toronto_grouped['Neighborhood_name'] == current_neighborhood].T.reset_index()
    temp.columns = ['venue','freq']
    # the first row holds the neighborhood name itself, not a frequency
    temp = (
        temp.iloc[1:]
        .assign(freq=lambda frame: frame['freq'].astype(float))
        .round({'freq': 2})
    )
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Berczy Park----
          venue  freq
0   Coffee Shop  0.09
1  Cocktail Bar  0.05
2      Beer Bar  0.03
3          Café  0.03
4   Cheese Shop  0.03


----Brockton, Parkdale Village, Exhibition Place----
            venue  freq
0            Café  0.12
1  Breakfast Spot  0.08
2     Coffee Shop  0.08
3          Bakery  0.08
4       Pet Store  0.04


----Business reply mail Processing Centre, South Central Letter Processing Plant Toronto----
                  venue  freq
0  Gym / Fitness Center  0.06
1               Brewery  0.06
2            Skate Park  0.06
3            Restaurant  0.06
4      Recording Studio  0.06


----CN Tower, King and Spadina, Railway Lands, Harbourfront West, Bathurst Quay, South Niagara, Island airport----
                 venue  freq
0      Airport Service  0.18
1       Airport Lounge  0.12
2     Airport Terminal  0.12
3  Rental Car Location  0.06
4   Airport Food Court  0.06


----Central Bay Street----
                 venue  freq
0          Coffee Shop  0

In [29]:
def return_most_common_venues(row, num_top_venues):
    """Return the labels of the largest values in ``row``, skipping its first entry.

    The first element of ``row`` is ignored (in this notebook it holds the
    neighborhood name); the remaining entries are sorted in descending order
    and the index labels of the first ``num_top_venues`` are returned as an array.
    """
    ranked = row.iloc[1:].sort_values(ascending=False)
    return ranked.index.values[:num_top_venues]

In [37]:
#create the new dataframe and display the top 10 venues for each neighborhood.
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues:
# '1st', '2nd', '3rd' use the suffixes above, every later rank uses 'th'.
# (An explicit bounds check replaces the former bare `except:`, which would
# also have hidden unrelated errors.)
columns = ['Neighborhood_name']
for ind in range(num_top_venues):
    suffix = indicators[ind] if ind < len(indicators) else 'th'
    columns.append('{}{} Most Common Venue'.format(ind+1, suffix))

# create a new dataframe with one row per neighborhood
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood_name'] = toronto_grouped['Neighborhood_name']

# fill in the top venue categories row by row; column 0 is the neighborhood name
for ind in range(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood_name,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Berczy Park,Coffee Shop,Cocktail Bar,Seafood Restaurant,Bakery,Restaurant,Café,Beer Bar,Cheese Shop,Department Store,Jazz Club
1,"Brockton, Parkdale Village, Exhibition Place",Café,Bakery,Coffee Shop,Breakfast Spot,Pet Store,Nightclub,Performing Arts Venue,Convenience Store,Climbing Gym,Restaurant
2,"Business reply mail Processing Centre, South C...",Yoga Studio,Skate Park,Auto Workshop,Brewery,Burrito Place,Butcher,Comic Shop,Farmers Market,Fast Food Restaurant,Garden
3,"CN Tower, King and Spadina, Railway Lands, Har...",Airport Service,Airport Lounge,Airport Terminal,Boutique,Coffee Shop,Airport,Airport Food Court,Airport Gate,Sculpture Garden,Harbor / Marina
4,Central Bay Street,Coffee Shop,Italian Restaurant,Sandwich Place,Japanese Restaurant,Café,Thai Restaurant,Department Store,Salad Place,Burger Joint,Bubble Tea Shop


In [38]:
#Run k-means to cluster the neighborhoods into 6 clusters.

kclusters = 6

# features only: drop the name column (keyword form; the positional axis
# argument of DataFrame.drop is not accepted by current pandas)
toronto_grouped_clustering = toronto_grouped.drop(columns='Neighborhood_name')

# run k-means clustering; n_init is pinned to 10 (the long-standing default)
# so results do not change with the scikit-learn version
kmeans = KMeans(n_clusters=kclusters, random_state=0, n_init=10).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_

array([0, 5, 5, 0, 0, 5, 0, 0, 5, 0, 5, 0, 2, 0, 0, 5, 5, 5, 4, 5, 1, 0,
       5, 0, 0, 0, 2, 3, 5, 0, 5, 0, 0, 5, 5, 5, 5, 0, 5])

In [39]:
# Number of neighborhoods per cluster, as a {label: count} mapping.
# np.unique(..., return_counts=True) yields (labels, counts), which zip pairs up.
dict(zip(*np.unique(kmeans.labels_, return_counts=True)))


{0: 17, 1: 1, 2: 2, 3: 1, 4: 1, 5: 17}

The result is interesting: two clusters contain 17 neighborhoods each, while the other four contain only 1 or 2 each.

In [40]:
# create a new dataframe that includes the cluster as well as the top 10 venues for each neighborhood.

# add clustering labels; an existing 'Cluster Labels' column is dropped first
# because DataFrame.insert raises if the column is already there, which made
# this cell fail when run a second time
if 'Cluster Labels' in neighborhoods_venues_sorted.columns:
    neighborhoods_venues_sorted = neighborhoods_venues_sorted.drop(columns='Cluster Labels')
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

#rename the 'Neighborhood' column in the df to 'Neighborhood_name' to perform the join in the next step.
toronto_merged = Toronto_df.rename(columns={'Neighborhood': 'Neighborhood_name'})
# join Toronto_df with neighborhoods_venues_sorted to put the coordinates,
# the cluster label and the top venues of each neighborhood in one frame
toronto_merged = toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood_name'), on='Neighborhood_name')

toronto_merged.head() # check the last columns!

Unnamed: 0,Postal Code,Borough,Neighborhood_name,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636,0,Coffee Shop,Pub,Bakery,Park,Breakfast Spot,Café,Theater,Yoga Studio,Electronics Store,Restaurant
1,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494,0,Coffee Shop,Sushi Restaurant,Diner,Park,Bar,Beer Bar,Smoothie Shop,Sandwich Place,Burrito Place,Café
2,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937,0,Clothing Store,Coffee Shop,Bubble Tea Shop,Middle Eastern Restaurant,Café,Japanese Restaurant,Cosmetics Shop,Lingerie Store,Fast Food Restaurant,Bookstore
3,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418,0,Café,Coffee Shop,Cocktail Bar,Restaurant,American Restaurant,Gastropub,Clothing Store,Creperie,Hotel,Gym
4,M4E,East Toronto,The Beaches,43.676357,-79.293031,5,Trail,Neighborhood,Health Food Store,Pub,Dog Run,Dessert Shop,Diner,Discount Store,Distribution Center,Yoga Studio


In [41]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters: one evenly spaced rainbow color per cluster
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighborhood_name'], toronto_merged['Cluster Labels']):
    # a neighborhood without a cluster label (NaN after the join) cannot be colored
    if pd.isna(cluster):
        continue
    cluster = int(cluster)  # labels become floats if the join introduced NaNs
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    # index the palette with the label itself; the former `cluster-1` only
    # worked for label 0 through negative-index wraparound
    marker_color = rainbow[cluster]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=marker_color,
        fill=True,
        fill_color=marker_color,
        fill_opacity=0.7).add_to(map_clusters)

map_clusters

Now, let's examine these clusters.

In [42]:
# cluster 1 (label 0): borough, neighborhood name, then the cluster label and top-venue columns
toronto_merged.loc[
    toronto_merged['Cluster Labels'].eq(0),
    toronto_merged.columns[[1, 2] + list(range(5, len(toronto_merged.columns)))],
]

Unnamed: 0,Borough,Neighborhood_name,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Downtown Toronto,"Regent Park, Harbourfront",0,Coffee Shop,Pub,Bakery,Park,Breakfast Spot,Café,Theater,Yoga Studio,Electronics Store,Restaurant
1,Downtown Toronto,"Queen's Park, Ontario Provincial Government",0,Coffee Shop,Sushi Restaurant,Diner,Park,Bar,Beer Bar,Smoothie Shop,Sandwich Place,Burrito Place,Café
2,Downtown Toronto,"Garden District, Ryerson",0,Clothing Store,Coffee Shop,Bubble Tea Shop,Middle Eastern Restaurant,Café,Japanese Restaurant,Cosmetics Shop,Lingerie Store,Fast Food Restaurant,Bookstore
3,Downtown Toronto,St. James Town,0,Café,Coffee Shop,Cocktail Bar,Restaurant,American Restaurant,Gastropub,Clothing Store,Creperie,Hotel,Gym
5,Downtown Toronto,Berczy Park,0,Coffee Shop,Cocktail Bar,Seafood Restaurant,Bakery,Restaurant,Café,Beer Bar,Cheese Shop,Department Store,Jazz Club
6,Downtown Toronto,Central Bay Street,0,Coffee Shop,Italian Restaurant,Sandwich Place,Japanese Restaurant,Café,Thai Restaurant,Department Store,Salad Place,Burger Joint,Bubble Tea Shop
8,Downtown Toronto,"Richmond, Adelaide, King",0,Coffee Shop,Restaurant,Café,Gym,Hotel,Thai Restaurant,Deli / Bodega,Sushi Restaurant,Salad Place,Bookstore
10,Downtown Toronto,"Harbourfront East, Union Station, Toronto Islands",0,Coffee Shop,Aquarium,Hotel,Café,Scenic Lookout,Sporting Goods Shop,Brewery,Fried Chicken Joint,Italian Restaurant,Restaurant
13,Downtown Toronto,"Toronto Dominion Centre, Design Exchange",0,Coffee Shop,Hotel,Café,Restaurant,Salad Place,American Restaurant,Seafood Restaurant,Japanese Restaurant,Italian Restaurant,Sushi Restaurant
16,Downtown Toronto,"Commerce Court, Victoria Hotel",0,Coffee Shop,Restaurant,Café,Hotel,American Restaurant,Gym,Japanese Restaurant,Italian Restaurant,Seafood Restaurant,Thai Restaurant


In [43]:
# cluster 2 (label 1): borough, neighborhood name, then the cluster label and top-venue columns
toronto_merged.loc[
    toronto_merged['Cluster Labels'].eq(1),
    toronto_merged.columns[[1, 2] + list(range(5, len(toronto_merged.columns)))],
]

Unnamed: 0,Borough,Neighborhood_name,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
29,Central Toronto,"Moore Park, Summerhill East",1,Gym,Trail,Department Store,Falafel Restaurant,Event Space,Ethiopian Restaurant,Electronics Store,Eastern European Restaurant,Donut Shop,Doner Restaurant


In [44]:
# cluster 3 (label 2): borough, neighborhood name, then the cluster label and top-venue columns
toronto_merged.loc[
    toronto_merged['Cluster Labels'].eq(2),
    toronto_merged.columns[[1, 2] + list(range(5, len(toronto_merged.columns)))],
]

Unnamed: 0,Borough,Neighborhood_name,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
21,Central Toronto,"Forest Hill North & West, Forest Hill Road Park",2,Park,Trail,Jewelry Store,Sushi Restaurant,Yoga Studio,Department Store,Ethiopian Restaurant,Electronics Store,Eastern European Restaurant,Donut Shop
33,Downtown Toronto,Rosedale,2,Park,Playground,Trail,Yoga Studio,Deli / Bodega,Ethiopian Restaurant,Electronics Store,Eastern European Restaurant,Donut Shop,Doner Restaurant


In [45]:
# cluster 4 (label 3): borough, neighborhood name, then the cluster label and top-venue columns
toronto_merged.loc[
    toronto_merged['Cluster Labels'].eq(3),
    toronto_merged.columns[[1, 2] + list(range(5, len(toronto_merged.columns)))],
]

Unnamed: 0,Borough,Neighborhood_name,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
19,Central Toronto,Roselawn,3,Music Venue,Garden,Deli / Bodega,Event Space,Ethiopian Restaurant,Electronics Store,Eastern European Restaurant,Donut Shop,Doner Restaurant,Dog Run


In [46]:
# cluster 5 (label 4): borough, neighborhood name, then the cluster label and top-venue columns
toronto_merged.loc[
    toronto_merged['Cluster Labels'].eq(4),
    toronto_merged.columns[[1, 2] + list(range(5, len(toronto_merged.columns)))],
]

Unnamed: 0,Borough,Neighborhood_name,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
18,Central Toronto,Lawrence Park,4,Park,Swim School,Bus Line,Yoga Studio,Dessert Shop,Event Space,Ethiopian Restaurant,Electronics Store,Eastern European Restaurant,Donut Shop


In [47]:
# cluster 6 (label 5): borough, neighborhood name, then the cluster label and top-venue columns
toronto_merged.loc[
    toronto_merged['Cluster Labels'].eq(5),
    toronto_merged.columns[[1, 2] + list(range(5, len(toronto_merged.columns)))],
]

Unnamed: 0,Borough,Neighborhood_name,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
4,East Toronto,The Beaches,5,Trail,Neighborhood,Health Food Store,Pub,Dog Run,Dessert Shop,Diner,Discount Store,Distribution Center,Yoga Studio
7,Downtown Toronto,Christie,5,Grocery Store,Café,Park,Baby Store,Nightclub,Italian Restaurant,Diner,Candy Store,Restaurant,Coffee Shop
9,West Toronto,"Dufferin, Dovercourt Village",5,Bakery,Pharmacy,Bank,Supermarket,Bar,Middle Eastern Restaurant,Café,Portuguese Restaurant,Brewery,Park
11,West Toronto,"Little Portugal, Trinity",5,Bar,Vietnamese Restaurant,Vegetarian / Vegan Restaurant,Asian Restaurant,Men's Store,Restaurant,Café,Record Shop,Beer Store,Japanese Restaurant
12,East Toronto,"The Danforth West, Riverdale",5,Greek Restaurant,Coffee Shop,Italian Restaurant,Ice Cream Shop,Restaurant,Bookstore,Furniture / Home Store,Yoga Studio,Spa,Indian Restaurant
14,West Toronto,"Brockton, Parkdale Village, Exhibition Place",5,Café,Bakery,Coffee Shop,Breakfast Spot,Pet Store,Nightclub,Performing Arts Venue,Convenience Store,Climbing Gym,Restaurant
15,East Toronto,"India Bazaar, The Beaches West",5,Park,Sandwich Place,Fast Food Restaurant,Pizza Place,Coffee Shop,Pub,Liquor Store,Light Rail Station,Burrito Place,Restaurant
22,West Toronto,"High Park, The Junction South",5,Thai Restaurant,Mexican Restaurant,Café,Discount Store,Italian Restaurant,Bar,Bakery,Speakeasy,Flea Market,Furniture / Home Store
24,Central Toronto,"The Annex, North Midtown, Yorkville",5,Coffee Shop,Café,Sandwich Place,Pizza Place,Indian Restaurant,Pub,Donut Shop,BBQ Joint,History Museum,Flower Shop
25,West Toronto,"Parkdale, Roncesvalles",5,Breakfast Spot,Gift Shop,Cuban Restaurant,Eastern European Restaurant,Dog Run,Bar,Bank,Italian Restaurant,Restaurant,Movie Theater


#### Take-home of these clustering results:

Clusters 1 and 6 contain most of the neighborhoods (17 each). Cluster 1 clearly covers most of the downtown area, where the neighborhoods are full of restaurants. Cluster 6, by contrast, looks more like suburban life: there are restaurants, but also gyms and other stores that are not usually located downtown.
Clusters 2–5 each contain only one or two neighborhoods. I think this is because the data are very sparse: the one or two neighborhoods in these clusters must have very unusual (sparse) venue categories, which sets them apart when the k-means algorithm is applied.

#### This is the end of this report. Below is some additional code that I used to check the data.

In [48]:
#example: check Moore Park, Summerhill East. This is the only neighborhood in the cluster #2.
toronto_onehot.loc[toronto_onehot['Neighborhood_name']=='Moore Park, Summerhill East']

#it contains only two venues.

Unnamed: 0,Neighborhood_name,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,...,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wine Shop,Women's Store,Yoga Studio
1189,"Moore Park, Summerhill East",0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
1190,"Moore Park, Summerhill East",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [49]:
#check if the venue category is very 'sparse': total number of venues per category, rarest first
# (drop uses the keyword form; the positional axis argument is not accepted by current pandas)
toronto_onehot.drop(columns='Neighborhood_name').sum(axis = 0, skipna = True).sort_values(ascending=True)

Afghan Restaurant                  1
Gas Station                        1
Garden Center                      1
Fruit & Vegetable Store            1
Frozen Yogurt Shop                 1
Portuguese Restaurant              1
Filipino Restaurant                1
Ethiopian Restaurant               1
Doner Restaurant                   1
Snack Place                        1
Coworking Space                    1
Convention Center                  1
College Rec Center                 1
College Gym                        1
College Cafeteria                  1
College Auditorium                 1
College Arts Building              1
Stadium                            1
Stationery Store                   1
Strip Club                         1
Skating Rink                       1
German Restaurant                  1
Skate Park                         1
Women's Store                      1
Plane                              1
Recording Studio                   1
Other Great Outdoors               1
O