### Web Scriping 

In [58]:
# Import packages
import requests # library to handle requests
import pandas as pd # library for data analsysis
import numpy as np # library to handle data in a vectorized manner
import random # library for random number generation
import urllib.request
from bs4 import BeautifulSoup # scripping web

from geopy.geocoders import Nominatim # module to convert an address into latitude and longitude values
import geocoder
from sklearn.cluster import KMeans

# libraries for displaying images
from IPython.display import Image 
from IPython.core.display import HTML 

import matplotlib.cm as cm
import matplotlib.colors as colors
    
# tranforming json file into a pandas dataframe library
from pandas.io.json import json_normalize
import folium # plotting library

In [59]:
# scripping wiki page
url = 'https://en.wikipedia.org/wiki/Special_wards_of_Tokyo'
page = urllib.request.urlopen(url)

In [60]:
# parse html from url 
soup = BeautifulSoup(page, 'lxml')
# find the table class
table = soup.find('table', class_='wikitable sortable')

In [82]:
# initiate empty list for store information
boroughs = []
for row in table.findAll('td'):
    # find content and append to the list
    try:
        borough = row.select_one('a').text
        boroughs.append(borough)
    except:
        continue

In [86]:
boroughs_clean = [i for i in boroughs if i]
boroughs_clean

['Chiyoda',
 'Nagatachō',
 'Chūō',
 'Nihonbashi',
 'Minato',
 'Odaiba',
 'Shinjuku',
 'Shinjuku',
 'Bunkyō',
 'Hongō',
 'Taitō',
 'Ueno',
 'Sumida',
 'Kinshichō',
 'Kōtō',
 'Kiba',
 'Shinagawa',
 'Shinagawa',
 'Meguro',
 'Meguro',
 'Ōta',
 'Ōmori',
 'Setagaya',
 'Setagaya',
 'Shibuya',
 'Shibuya',
 'Nakano',
 'Nakano',
 'Suginami',
 'Kōenji',
 'Toshima',
 'Ikebukuro',
 'Kita',
 'Akabane',
 'Arakawa',
 'Nippori',
 'Itabashi',
 'Itabashi',
 'Nerima',
 'Nerima',
 'Adachi',
 'Kitasenju',
 'Katsushika',
 'Tateishi',
 'Edogawa']

In [124]:
# create empty dataframe and append data into dataframe 
df = pd.DataFrame(columns=['Borough', 'City', 'Latitude', 'Longitude'])
df['Borough'] = boroughs_clean
df['City'] = 'Tokyo'
df.head()

Unnamed: 0,Borough,City,Latitude,Longitude
0,Chiyoda,Tokyo,,
1,Nagatachō,Tokyo,,
2,Chūō,Tokyo,,
3,Nihonbashi,Tokyo,,
4,Minato,Tokyo,,


Seems like wiki has fixed these problems by itself!

### Coordinates matching

In [125]:
def get_coord(x):
    address = '{},{}'.format(x[0], x[1])

    geolocator = Nominatim(user_agent="Tokyo_explorer")
    location = geolocator.geocode(address)
    latitude = location.latitude
    longitude = location.longitude
    return latitude, longitude

In [126]:
df['coords'] = df.apply(get_coord, axis=1)

In [127]:
df.head()

Unnamed: 0,Borough,City,Latitude,Longitude,coords
0,Chiyoda,Tokyo,,,"(35.6777691, 139.7646365)"
1,Nagatachō,Tokyo,,,"(35.675618, 139.7434685)"
2,Chūō,Tokyo,,,"(35.666255, 139.775565)"
3,Nihonbashi,Tokyo,,,"(35.68406775, 139.77450291683806)"
4,Minato,Tokyo,,,"(35.6432274, 139.7400553)"


In [130]:
for i in range(df.shape[0]):
    df['Latitude'][i] = df.coords[i][0]
    df['Longitude'][i] = df.coords[i][1]

In [133]:
df.drop('coords', axis=1, inplace=True)

In [204]:
df.head()

Unnamed: 0,Borough,City,Latitude,Longitude
0,Chiyoda,Tokyo,35.6778,139.765
1,Nagatachō,Tokyo,35.6756,139.743
2,Chūō,Tokyo,35.6663,139.776
3,Nihonbashi,Tokyo,35.6841,139.775
4,Minato,Tokyo,35.6432,139.74


### Clustering Analysis

In [135]:
import getpass

In [146]:
CLIENT_ID = getpass.getpass()

········


In [147]:
CLIENT_SECRET = getpass.getpass()

········


In [148]:
VERSION = '20200601' # Foursquare API version

In [149]:
# print out all unique boroughs
df.Borough.unique()

array(['Chiyoda', 'Nagatachō', 'Chūō', 'Nihonbashi', 'Minato', 'Odaiba',
       'Shinjuku', 'Bunkyō', 'Hongō', 'Taitō', 'Ueno', 'Sumida',
       'Kinshichō', 'Kōtō', 'Kiba', 'Shinagawa', 'Meguro', 'Ōta', 'Ōmori',
       'Setagaya', 'Shibuya', 'Nakano', 'Suginami', 'Kōenji', 'Toshima',
       'Ikebukuro', 'Kita', 'Akabane', 'Arakawa', 'Nippori', 'Itabashi',
       'Nerima', 'Adachi', 'Kitasenju', 'Katsushika', 'Tateishi',
       'Edogawa'], dtype=object)

Some of the boroughs contains multiple zipcodes, some of the neighborhoods overlaps with each other (different postal code share same neighborhoods). Therefore, I decide to use the postal code to analyze the cluster.

In [150]:
# get top 100 vanues in M3A within a radius of 500 meters
Radius = 1000
LIMIT = 100
url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    df.iloc[0, 2], 
    df.iloc[0, 3], 
    Radius, 
    LIMIT)

In [152]:
# request foursquare API and get the results
results = requests.get(url).json()
results

{'meta': {'code': 200, 'requestId': '5ee81cb5c6a68d348c16f091'},
 'response': {'suggestedFilters': {'header': 'Tap to show:',
   'filters': [{'name': 'Open now', 'key': 'openNow'}]},
  'headerLocation': 'Chūō',
  'headerFullLocation': 'Chūō, Tokyo',
  'headerLocationGranularity': 'neighborhood',
  'totalResults': 246,
  'suggestedBounds': {'ne': {'lat': 35.68676910900001,
    'lng': 139.77569533993514},
   'sw': {'lat': 35.66876909099999, 'lng': 139.75357766006485}},
  'groups': [{'type': 'Recommended Places',
    'name': 'recommended',
    'items': [{'reasons': {'count': 0,
       'items': [{'summary': 'This spot is popular',
         'type': 'general',
         'reasonName': 'globalInteractionReason'}]},
      'venue': {'id': '535ac87611d2d5f9d8bc1606',
       'name': 'Keiyō Line Tōkyō Station (京葉線 東京駅)',
       'location': {'address': '丸の内1-9-1',
        'crossStreet': '京葉線/武蔵野線',
        'lat': 35.67772716044116,
        'lng': 139.76484775543213,
        'labeledLatLngs': [{'label

In [153]:
# function that extracts the category of the venue
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

In [154]:
results

{'meta': {'code': 200, 'requestId': '5ee81cb5c6a68d348c16f091'},
 'response': {'suggestedFilters': {'header': 'Tap to show:',
   'filters': [{'name': 'Open now', 'key': 'openNow'}]},
  'headerLocation': 'Chūō',
  'headerFullLocation': 'Chūō, Tokyo',
  'headerLocationGranularity': 'neighborhood',
  'totalResults': 246,
  'suggestedBounds': {'ne': {'lat': 35.68676910900001,
    'lng': 139.77569533993514},
   'sw': {'lat': 35.66876909099999, 'lng': 139.75357766006485}},
  'groups': [{'type': 'Recommended Places',
    'name': 'recommended',
    'items': [{'reasons': {'count': 0,
       'items': [{'summary': 'This spot is popular',
         'type': 'general',
         'reasonName': 'globalInteractionReason'}]},
      'venue': {'id': '535ac87611d2d5f9d8bc1606',
       'name': 'Keiyō Line Tōkyō Station (京葉線 東京駅)',
       'location': {'address': '丸の内1-9-1',
        'crossStreet': '京葉線/武蔵野線',
        'lat': 35.67772716044116,
        'lng': 139.76484775543213,
        'labeledLatLngs': [{'label

In [161]:
# get the result into a panda dataframe
venues = results['response']['groups'][0]['items']
nearby_venues = json_normalize(venues) # flatten JSON
# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues =nearby_venues.loc[:, filtered_columns]
# filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)
# clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]
nearby_venues.head()

  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,name,categories,lat,lng
0,Keiyō Line Tōkyō Station (京葉線 東京駅),Train Station,35.677727,139.764848
1,Mitsubishi Ichigokan Museum (三菱一号館美術館),Art Museum,35.67842,139.76326
2,KITTE Garden (屋上庭園 KITTEガーデン),Roof Deck,35.679654,139.765169
3,Indian Curry (インデアンカレー),Japanese Curry Restaurant,35.678395,139.765008
4,VIRON,Café,35.678635,139.765147


We next will write a code to run through all the postal code.

In [176]:
def getNearbyVenues(postal, latitudes, longitudes, radius=1000, LIMIT=100):
    
    venues_list=[]
    for postal, lat, lng in zip(postal, latitudes, longitudes):
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        try:
            results = requests.get(url).json()["response"]['groups'][0]['items']
        
            # return only relevant information for each nearby venue
            venues_list.append([(
                postal, 
                lat, 
                lng, 
                v['venue']['name'], 
                v['venue']['location']['lat'], 
                v['venue']['location']['lng'],  
                v['venue']['categories'][0]['name']) for v in results])

            nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
            nearby_venues.columns = ['Borough', 
                          'Neighborhood Latitude', 
                          'Neighborhood Longitude', 
                          'Venue', 
                          'Venue Latitude', 
                          'Venue Longitude', 
                          'Venue Category']
        except KeyError:
            continue
    
    return(nearby_venues)

In [177]:
# run through the df_merge get neighborhoods for all postal codes
# type your answer here
tokyo_venues = getNearbyVenues(postal=df['Borough'],
                                   latitudes=df['Latitude'],
                                   longitudes=df['Longitude']
                                  )

In [178]:
# check the size of the dataframe
print(tokyo_venues.shape)
tokyo_venues.head()

(4062, 7)


Unnamed: 0,Borough,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Chiyoda,35.677769,139.764636,Keiyō Line Tōkyō Station (京葉線 東京駅),35.677727,139.764848,Train Station
1,Chiyoda,35.677769,139.764636,Mitsubishi Ichigokan Museum (三菱一号館美術館),35.67842,139.76326,Art Museum
2,Chiyoda,35.677769,139.764636,KITTE Garden (屋上庭園 KITTEガーデン),35.679654,139.765169,Roof Deck
3,Chiyoda,35.677769,139.764636,Indian Curry (インデアンカレー),35.678395,139.765008,Japanese Curry Restaurant
4,Chiyoda,35.677769,139.764636,VIRON,35.678635,139.765147,Café


In [179]:
# check how many vanues by each postal code
tokyo_venues.groupby('Borough')['Venue'].count()

Borough
Akabane       100
Arakawa        96
Bunkyō        100
Chiyoda       100
Chūō          100
Edogawa        38
Hongō         100
Ikebukuro     100
Itabashi      146
Katsushika     68
Kiba          100
Kinshichō     100
Kita          100
Kitasenju     100
Kōenji        100
Kōtō           57
Meguro        200
Minato        100
Nagatachō     100
Nakano        200
Nerima        160
Nihonbashi    100
Nippori       100
Odaiba        100
Setagaya      200
Shibuya       200
Shinagawa     200
Shinjuku      100
Suginami      100
Sumida        100
Taitō         100
Tateishi       97
Toshima       100
Ueno          100
Ōmori         100
Ōta           100
Name: Venue, dtype: int64

In [182]:
# check unique categories of all venues
print('There are {} uniques categories.'.format(len(tokyo_venues['Venue Category'].unique())))

There are 268 uniques categories.


Next we want to use onehot encoding to analyze venue categories for each postal code.

In [185]:
# one hot encoding
tokyo_onehot = pd.get_dummies(tokyo_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
tokyo_onehot['Borough'] = tokyo_venues['Borough'] 

# move neighborhood column to the first column
fixed_columns = [tokyo_onehot.columns[-1]] + list(tokyo_onehot.columns[:-1])
tokyo_onehot = tokyo_onehot[fixed_columns]
tokyo_onehot.head()

Unnamed: 0,Borough,ATM,Accessories Store,American Restaurant,Aquarium,Arcade,Art Gallery,Art Museum,Arts & Crafts Store,Asian Restaurant,...,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Wagashi Place,Wine Bar,Wine Shop,Yakitori Restaurant,Yoga Studio,Yoshoku Restaurant
0,Chiyoda,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Chiyoda,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Chiyoda,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Chiyoda,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Chiyoda,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


And let's examine the new dataframe size.

In [187]:
tokyo_onehot.shape

(4062, 269)

**Next, let's group rows by neighborhood and by taking the mean of the frequency of occurrence of each category.**

In [189]:
tokyo_grouped = tokyo_onehot.groupby('Borough').mean().reset_index()
tokyo_grouped.head()

Unnamed: 0,Borough,ATM,Accessories Store,American Restaurant,Aquarium,Arcade,Art Gallery,Art Museum,Arts & Crafts Store,Asian Restaurant,...,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Wagashi Place,Wine Bar,Wine Shop,Yakitori Restaurant,Yoga Studio,Yoshoku Restaurant
0,Akabane,0.01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.02,0.0,0.0
1,Arakawa,0.0,0.0,0.0,0.0,0.0,0.010417,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Bunkyō,0.0,0.0,0.0,0.0,0.0,0.01,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.01,0.0,0.04,0.0,0.01
3,Chiyoda,0.0,0.0,0.0,0.0,0.0,0.01,0.02,0.0,0.0,...,0.0,0.0,0.0,0.0,0.01,0.01,0.0,0.03,0.0,0.01
4,Chūō,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.01,0.0,0.01,0.0,0.01


In [191]:
tokyo_grouped.shape

(36, 269)

In [192]:
tokyo_grouped.to_csv('tokyo_grouped.csv')

In [193]:
# look at the top n venue types in toronto each postal code
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

Create top 10 venues for each postal code.

In [196]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Borough']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Borough'] = tokyo_grouped['Borough']

for ind in np.arange(tokyo_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(tokyo_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Borough,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Akabane,Sake Bar,Convenience Store,Ramen Restaurant,BBQ Joint,Soba Restaurant,Coffee Shop,Clothing Store,Bar,Italian Restaurant,Indian Restaurant
1,Arakawa,Convenience Store,Grocery Store,Ramen Restaurant,Tram Station,Italian Restaurant,Noodle House,Intersection,Donburi Restaurant,Chinese Restaurant,Park
2,Bunkyō,Ramen Restaurant,Sake Bar,Café,Italian Restaurant,BBQ Joint,Hotel,Yakitori Restaurant,Japanese Curry Restaurant,Baseball Stadium,Indian Restaurant
3,Chiyoda,Dessert Shop,Café,Japanese Restaurant,Sushi Restaurant,Chinese Restaurant,South Indian Restaurant,Liquor Store,Gourmet Shop,French Restaurant,Bakery
4,Chūō,Sushi Restaurant,Japanese Restaurant,Monjayaki Restaurant,Italian Restaurant,Soba Restaurant,Ramen Restaurant,Seafood Restaurant,Bakery,Coffee Shop,Donburi Restaurant


### Cluster Neighborhood analysis 

In [197]:
# set number of clusters
kclusters = 5

tokyo_grouped_clustering = tokyo_grouped.drop('Borough', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(tokyo_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:20] 

array([2, 3, 2, 1, 0, 3, 2, 2, 3, 3, 2, 2, 1, 2, 2, 4, 4, 1, 1, 3],
      dtype=int32)

Let's create a new dataframe that includes the cluster as well as the top 10 venues for each neighborhood.

In [198]:
# add clustering labels
neighborhoods_venues_sorted['Cluster Labels'] = kmeans.labels_
neighborhoods_venues_sorted.head()

Unnamed: 0,Borough,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue,Cluster Labels
0,Akabane,Sake Bar,Convenience Store,Ramen Restaurant,BBQ Joint,Soba Restaurant,Coffee Shop,Clothing Store,Bar,Italian Restaurant,Indian Restaurant,2
1,Arakawa,Convenience Store,Grocery Store,Ramen Restaurant,Tram Station,Italian Restaurant,Noodle House,Intersection,Donburi Restaurant,Chinese Restaurant,Park,3
2,Bunkyō,Ramen Restaurant,Sake Bar,Café,Italian Restaurant,BBQ Joint,Hotel,Yakitori Restaurant,Japanese Curry Restaurant,Baseball Stadium,Indian Restaurant,2
3,Chiyoda,Dessert Shop,Café,Japanese Restaurant,Sushi Restaurant,Chinese Restaurant,South Indian Restaurant,Liquor Store,Gourmet Shop,French Restaurant,Bakery,1
4,Chūō,Sushi Restaurant,Japanese Restaurant,Monjayaki Restaurant,Italian Restaurant,Soba Restaurant,Ramen Restaurant,Seafood Restaurant,Bakery,Coffee Shop,Donburi Restaurant,0


In [199]:
df.head()

Unnamed: 0,Borough,City,Latitude,Longitude
0,Chiyoda,Tokyo,35.6778,139.765
1,Nagatachō,Tokyo,35.6756,139.743
2,Chūō,Tokyo,35.6663,139.776
3,Nihonbashi,Tokyo,35.6841,139.775
4,Minato,Tokyo,35.6432,139.74


In [200]:
# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
df_sum = pd.merge(df, neighborhoods_venues_sorted, on='Borough')

df_sum.head() # check the last columns!

Unnamed: 0,Borough,City,Latitude,Longitude,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue,Cluster Labels
0,Chiyoda,Tokyo,35.6778,139.765,Dessert Shop,Café,Japanese Restaurant,Sushi Restaurant,Chinese Restaurant,South Indian Restaurant,Liquor Store,Gourmet Shop,French Restaurant,Bakery,1
1,Nagatachō,Tokyo,35.6756,139.743,Japanese Restaurant,BBQ Joint,Hotel,Coffee Shop,Chinese Restaurant,Szechuan Restaurant,Steakhouse,Theater,Ramen Restaurant,Yakitori Restaurant,1
2,Chūō,Tokyo,35.6663,139.776,Sushi Restaurant,Japanese Restaurant,Monjayaki Restaurant,Italian Restaurant,Soba Restaurant,Ramen Restaurant,Seafood Restaurant,Bakery,Coffee Shop,Donburi Restaurant,0
3,Nihonbashi,Tokyo,35.6841,139.775,Japanese Restaurant,Café,Department Store,BBQ Joint,Hotel,Soba Restaurant,Gift Shop,Beer Bar,Bakery,Yoshoku Restaurant,1
4,Minato,Tokyo,35.6432,139.74,Japanese Restaurant,Chinese Restaurant,Italian Restaurant,Coffee Shop,Soba Restaurant,Sake Bar,Indian Restaurant,Bistro,Ramen Restaurant,BBQ Joint,1


In [201]:
df_sum.to_csv('tokyo_sum.csv')

In [202]:
# get toronto lat and lon
address = 'Tokyo, Japan'

geolocator = Nominatim(user_agent="foursquare_agent")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print(latitude, longitude)

35.6828387 139.7594549


In [203]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(df_sum['Latitude'], df_sum['Longitude'], df_sum['Borough'], df_sum['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

### Examine clusters

One interesting thing is that the cluster0, 2, 3 only has one postal code. Let's check what makes it unique.

In [290]:
# check cluster 0 first
c_0 = df_sum.loc[df_sum['Cluster Labels']==0, :].PostalCode.values[0]
toronto_grouped[toronto_grouped['Postal Code'] == c_0].T[1:].sort_values(by=31, ascending=False)[:5]

Unnamed: 0,31
Vietnamese Restaurant,0.5
Food Truck,0.25
Baseball Field,0.25
Accessories Store,0.0
Nightclub,0.0


In [287]:
# check cluster 2
c_2 = df_sum.loc[df_sum['Cluster Labels']==2, :].PostalCode.values[0]
toronto_grouped[toronto_grouped['Postal Code'] == c_2].T[1:].sort_values(by=101, ascending=False)[:5]

Unnamed: 0,101
Coffee Shop,0.5
Lounge,0.5
Accessories Store,0.0
Other Great Outdoors,0.0
Organic Grocery,0.0


In [289]:
# check cluster 3
c_3 = df_sum.loc[df_sum['Cluster Labels']==3, :].PostalCode.values[0]
toronto_grouped[toronto_grouped['Postal Code'] == c_3].T[1:].sort_values(by=19, ascending=False)[:5]

Unnamed: 0,19
Park,0.75
Pool,0.25
Accessories Store,0.0
New American Restaurant,0.0
Organic Grocery,0.0


In [310]:
# check cluster 1
c_1 = df_sum.loc[df_sum['Cluster Labels']==1, :].PostalCode.values
toronto_grouped[toronto_grouped['Postal Code'].isin(c_1)].iloc[:,1:].mean().sort_values(ascending=False)[:10]

Coffee Shop            0.082615
Café                   0.045896
Restaurant             0.028735
Pizza Place            0.027611
Park                   0.025748
Italian Restaurant     0.022873
Bakery                 0.022656
Japanese Restaurant    0.018203
Grocery Store          0.018132
Sandwich Place         0.017383
dtype: float64

In [309]:
# check cluster 3
c_4 = df_sum.loc[df_sum['Cluster Labels']==4, :].PostalCode.values
toronto_grouped[toronto_grouped['Postal Code'].isin(c_4)].iloc[:,1:].mean().sort_values(ascending=False)[:10]

Park                    0.088410
Pizza Place             0.069247
Coffee Shop             0.061051
Grocery Store           0.043590
Pharmacy                0.043048
Bank                    0.035407
Convenience Store       0.033504
Chinese Restaurant      0.027027
Fast Food Restaurant    0.024118
Sandwich Place          0.023073
dtype: float64

**Conclusion:**

- We can see that the cluster 0, 2, 3 have too little information. Cluster 0 seems to have vietnam resturant, food truck and basketball field. Cluster 2 has coffee shop and lounge. Cluster 3 has pool and park. <br/>
- Cluster 1 has a lot of coffee shops, cafe, resturants and pizza/italian resturants. From the map cluster 1 are clustered in downtown Toronto. <br/>
- Cluster 4 has parks, pizza place, coffee shop, some grocery store and covenience stores. From the map, they are clustered in the boroughs surround downtown, such as North York and Scarborough.