# Segmenting and Clustering Neighborhoods in Toronto

### Read data from webpage

In [1]:
import numpy as np # library to handle data in a vectorized manner
import pandas as pd # library for data analsysis

#Read data from Wikipedia page
toronto_rawdata = pd.read_html("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M")

#assign the table data to a new data frame
dfTorontoRaw = toronto_rawdata[0]

#Assignment requires three columns: PostalCode, Borough, and Neighborhood. Rename the column "Postcal Code" to "PostcalCode"
dfTorontoRaw.rename(columns = {'Postal Code':'PostalCode'}, inplace = True) 

#Only process the cells that have an assigned borough. Ignore cells with a borough that is Not assigned.
#Filter out the not assigned borough
dfToronto_AssignedBorough = dfTorontoRaw[dfTorontoRaw['Borough']!='Not assigned']

#merge Neighbourhood with same PostcalCode
dtToronto_Merged = dfToronto_AssignedBorough.groupby(['PostalCode','Borough'])['Neighbourhood'].apply(', '.join).reset_index()

#If a cell has a borough but a Not assigned neighborhood, then the neighborhood will be the same as the borough.
#CHeck if there is any not assigned neighborhoud, The merge happened first, a not-assigned neighborhood could be merged
dtToronto_Merged.loc[(dtToronto_Merged['Neighbourhood'].str.contains('Not assigned')),'Neighbourhood'] = dtToronto_Merged.loc[(dtToronto_Merged['Neighbourhood'].str.contains('Not assigned')),'Borough']


In [2]:
#use the .shape method to print the number of rows of your dataframe.
dtToronto_Merged.shape

(103, 3)

In [34]:
#Install the geocoder package
!pip install geocoder

Collecting geocoder
  Downloading geocoder-1.38.1-py2.py3-none-any.whl (98 kB)
[K     |████████████████████████████████| 98 kB 6.8 MB/s eta 0:00:011
[?25hCollecting ratelim
  Downloading ratelim-0.1.6-py2.py3-none-any.whl (4.0 kB)
Installing collected packages: ratelim, geocoder
Successfully installed geocoder-1.38.1 ratelim-0.1.6


In [37]:
import geocoder # import geocoder

#********************************************* 
#Give a try, the geocoder doesn't work
#********************************************* 

def get_geocoder(postal_code_from_df):
     # initialize your variable to None
     lat_lng_coords = None
     # loop until you get the coordinates
     while(lat_lng_coords is None):
       g = geocoder.google('{}, Toronto, Ontario'.format(postal_code_from_df))
       lat_lng_coords = g.latlng
     latitude = lat_lng_coords[0]
     longitude = lat_lng_coords[1]
     return latitude,longitude

#testlat,testlong = get_geocoder('M5G')
print('Geocoder doesnot work, load from csv')

#for i in range(0,len(dtToronto_Merged)):
#    dtToronto_Merged['Latitude'][i],dtToronto_Merged['Longitude'][i]=get_geocoder(dtToronto_Merged.iloc[i]['PostalCode'])


Geocoder doesnot work, load from csv


In [3]:
#Read CSV file to get Lat/Long from Postal Code
dfPostalCode2LatLong = pd.read_csv('https://cocl.us/Geospatial_data')

In [4]:
#Merge using Inner Join
dfToronto = pd.merge(left=dtToronto_Merged, right=dfPostalCode2LatLong, left_on='PostalCode', right_on='Postal Code')

dfToronto.drop('Postal Code', axis=1, inplace=True)

#Size of new data.
dfToronto.shape
dfToronto


Unnamed: 0,PostalCode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
...,...,...,...,...,...
98,M9N,York,Weston,43.706876,-79.518188
99,M9P,Etobicoke,Westmount,43.696319,-79.532242
100,M9R,Etobicoke,"Kingsview Village, St. Phillips, Martin Grove ...",43.688905,-79.554724
101,M9V,Etobicoke,"South Steeles, Silverstone, Humbergate, Jamest...",43.739416,-79.588437


In [5]:
#only borough contains Toronto
dfTorontoOnly = dfToronto[dfToronto['Borough'].str.contains('Toronto')]
dfTorontoOnly.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood,Latitude,Longitude
37,M4E,East Toronto,The Beaches,43.676357,-79.293031
41,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188
42,M4L,East Toronto,"India Bazaar, The Beaches West",43.668999,-79.315572
43,M4M,East Toronto,Studio District,43.659526,-79.340923
44,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879


In [7]:
#install folium (the below install doesnot work)
#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
!pip install folium

Collecting folium
  Downloading folium-0.11.0-py2.py3-none-any.whl (93 kB)
[K     |████████████████████████████████| 93 kB 2.0 MB/s eta 0:00:011
Collecting branca>=0.3.0
  Downloading branca-0.4.1-py3-none-any.whl (24 kB)
Installing collected packages: branca, folium
Successfully installed branca-0.4.1 folium-0.11.0


In [8]:
from pandas import json_normalize

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library


### Display Toronto area map

In [9]:
#Googole Toronto and find its latitude and longitude
latitude = 43.6532
longitude = -79.3832
# create map of Toronto area using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)


#### Set each neighborhoods containing "Tonronto" as markers to the Toronto Map

In [11]:
# add markers to map
for lat, lng, borough, neighborhood in zip(dfTorontoOnly['Latitude'], dfTorontoOnly['Longitude'], dfTorontoOnly['Borough'], dfTorontoOnly['Neighbourhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
map_toronto

#### Prepare to use the Foursquare API to explore the neighborhoods and segment

In [12]:
CLIENT_ID = 'B2ZH2JO3RL4G5LRC3OGRI53GM4OFDNMK42Y1UCASPLRPRIJD' # your Foursquare ID
CLIENT_SECRET = 'SHRAIHQ3E1DGLRND5IXDWPDEC1NQE0QXWABLK4YBWI2IJRD2' # your Foursquare Secret
VERSION = '20201124'
LIMIT = 50 # A default Foursquare API limit value

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: B2ZH2JO3RL4G5LRC3OGRI53GM4OFDNMK42Y1UCASPLRPRIJD
CLIENT_SECRET:SHRAIHQ3E1DGLRND5IXDWPDEC1NQE0QXWABLK4YBWI2IJRD2


#### Check the venue around one neighbourhood (the first one)

###### Import request for calling FourSquare API

In [18]:
import requests # library to handle requests

#### Call to get venues around one neighborhood (the first one)

In [20]:
#Get the first neighborhood's latitude and longitude values.
neighborhood_latitude = dfTorontoOnly['Latitude'].iloc[0] # neighborhood latitude value
neighborhood_longitude = dfTorontoOnly['Longitude'].iloc[0] # neighborhood longitude value

neighborhood_name = dfTorontoOnly['Neighbourhood'].iloc[0] # neighborhood name

search_query = neighborhood_name
radius = 500

#define URL
url = 'https://api.foursquare.com/v2/venues/search?client_id={}&client_secret={}&ll={},{}&v={}&query={}&radius={}&limit={}'.format(CLIENT_ID, CLIENT_SECRET, latitude, longitude, VERSION, search_query, radius, LIMIT)

#Send the GET Request and get results
results = requests.get(url).json()
venues = results['response']['venues']

In [22]:
nearby_venues = json_normalize(venues)
nearby_venues

Unnamed: 0,id,name,categories,referralId,hasPerk,location.lat,location.lng,location.labeledLatLngs,location.distance,location.cc,location.city,location.state,location.country,location.formattedAddress,location.postalCode,venuePage.id,location.address,location.crossStreet,location.neighborhood
0,59ba3308f870fd242067a05f,The Rec Room Roundhouse,"[{'id': '4bf58dd8d48988d1e5931735', 'name': 'M...",v-1606605382,False,43.653226,-79.383184,"[{'label': 'display', 'lat': 43.653226, 'lng':...",3,CA,Toronto,ON,Canada,"[Toronto ON, Canada]",,,,,
1,5a2c551a95d986072897bb18,Christmas Fair In The Square,"[{'id': '4bf58dd8d48988d1f1931735', 'name': 'G...",v-1606605382,False,43.652734,-79.383543,"[{'label': 'display', 'lat': 43.652734, 'lng':...",58,CA,Toronto,ON,Canada,"[Toronto ON M5H 2N2, Canada]",M5H 2N2,,,,
2,5f7f6c030eed4b4a8296b37e,Dog Walking in the Village,"[{'id': '5032897c91d4c4b30a586d69', 'name': 'P...",v-1606605382,False,43.652728,-79.383532,"[{'label': 'display', 'lat': 43.65272847479262...",58,CA,Toronto,ON,Canada,"[Toronto ON, Canada]",,592830878.0,,,
3,4f23983ee4b0476578d9c52f,The Supermarket,"[{'id': '4bf58dd8d48988d116941735', 'name': 'B...",v-1606605382,False,43.653247,-79.381865,"[{'label': 'display', 'lat': 43.65324668644528...",107,CA,Toronto,ON,Canada,"[Augusta Ave (College), Toronto ON, Canada]",,,Augusta Ave,College,
4,4e5b997218388cd5cbbeb1c1,The Factory,[],v-1606605382,False,43.653524,-79.383907,"[{'label': 'display', 'lat': 43.653524, 'lng':...",67,CA,Toronto,ON,Canada,"[3593 Lakeshore Blvd West, Toronto ON, Canada]",,,3593 Lakeshore Blvd West,,
5,5255f0bc11d2121363221aa3,BRIKA Popup Shop at The Hudson Bay,"[{'id': '4bf58dd8d48988d1ff941735', 'name': 'M...",v-1606605382,False,43.652608,-79.383581,"[{'label': 'display', 'lat': 43.65260822309432...",72,CA,,,Canada,[Canada],,,,,
6,5cce76ece1f0aa002caa2e08,In Front Of The Line,"[{'id': '52e81612bcbc57f1066b7a33', 'name': 'S...",v-1606605382,False,43.653388,-79.38401,"[{'label': 'display', 'lat': 43.65338830142172...",68,CA,Toronto,ON,Canada,"[330 Bay St Suite 1400, Toronto ON M5H 2S8, Ca...",M5H 2S8,,330 Bay St Suite 1400,,
7,5e72bd0bbf653f0008a7624d,The Source,"[{'id': '4bf58dd8d48988d122951735', 'name': 'E...",v-1606605382,False,43.652721,-79.38041,"[{'label': 'display', 'lat': 43.652721, 'lng':...",230,CA,Toronto,ON,Canada,"[220 Yonge St, Unit 1114, Toronto ON M5B 2H1, ...",M5B 2H1,,"220 Yonge St, Unit 1114",,
8,4fa43f81e4b098f42a5681a1,the Archer / Three-Way Piece No. 2,"[{'id': '4bf58dd8d48988d162941735', 'name': 'O...",v-1606605382,False,43.652622,-79.383923,"[{'label': 'display', 'lat': 43.652622, 'lng':...",86,CA,Toronto,ON,Canada,"[Nathan Phillips Square, Toronto ON, Canada]",,,Nathan Phillips Square,,
9,4e8889c961afee1b75b4cc4a,Jimmy The Greek,"[{'id': '4bf58dd8d48988d10e941735', 'name': 'G...",v-1606605382,False,43.654503,-79.380888,"[{'label': 'display', 'lat': 43.6545029224916,...",236,CA,Toronto,ON,Canada,"[220 Yonge St. (in Urban Eatery, Toronto Eaton...",M5B 2H1,,220 Yonge St.,"in Urban Eatery, Toronto Eaton Centre",


##### Only name, category, lat and long columns

In [23]:
filtered_columns = ['name', 'categories', 'location.lat', 'location.lng']
nearby_venues =nearby_venues.loc[:, filtered_columns]
nearby_venues

Unnamed: 0,name,categories,location.lat,location.lng
0,The Rec Room Roundhouse,"[{'id': '4bf58dd8d48988d1e5931735', 'name': 'M...",43.653226,-79.383184
1,Christmas Fair In The Square,"[{'id': '4bf58dd8d48988d1f1931735', 'name': 'G...",43.652734,-79.383543
2,Dog Walking in the Village,"[{'id': '5032897c91d4c4b30a586d69', 'name': 'P...",43.652728,-79.383532
3,The Supermarket,"[{'id': '4bf58dd8d48988d116941735', 'name': 'B...",43.653247,-79.381865
4,The Factory,[],43.653524,-79.383907
5,BRIKA Popup Shop at The Hudson Bay,"[{'id': '4bf58dd8d48988d1ff941735', 'name': 'M...",43.652608,-79.383581
6,In Front Of The Line,"[{'id': '52e81612bcbc57f1066b7a33', 'name': 'S...",43.653388,-79.38401
7,The Source,"[{'id': '4bf58dd8d48988d122951735', 'name': 'E...",43.652721,-79.38041
8,the Archer / Three-Way Piece No. 2,"[{'id': '4bf58dd8d48988d162941735', 'name': 'O...",43.652622,-79.383923
9,Jimmy The Greek,"[{'id': '4bf58dd8d48988d10e941735', 'name': 'G...",43.654503,-79.380888


##### Reprocess category

In [24]:
#define get_category_type function
# function that extracts the category of the venue
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']
    
# filter the category for each row
nearby_venues['categories'] = nearby_venues.apply(get_category_type, axis=1)
nearby_venues

Unnamed: 0,name,categories,location.lat,location.lng
0,The Rec Room Roundhouse,Music Venue,43.653226,-79.383184
1,Christmas Fair In The Square,General Entertainment,43.652734,-79.383543
2,Dog Walking in the Village,Pet Service,43.652728,-79.383532
3,The Supermarket,Bar,43.653247,-79.381865
4,The Factory,,43.653524,-79.383907
5,BRIKA Popup Shop at The Hudson Bay,Miscellaneous Shop,43.652608,-79.383581
6,In Front Of The Line,Social Club,43.653388,-79.38401
7,The Source,Electronics Store,43.652721,-79.38041
8,the Archer / Three-Way Piece No. 2,Other Great Outdoors,43.652622,-79.383923
9,Jimmy The Greek,Greek Restaurant,43.654503,-79.380888


#### Get all venues around neighborhood with name containing "Toronto"

In [26]:
#Explore Neighborhoods (name containing 'Toronto') in Toronto area
#create a function to repeat the same process to all the neighborhoods in Toronto area
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        #print(url)   
        # make the GET request
        
        results = requests.get(url).json()["response"]['groups']
        #results
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results[0]['items']])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

toronto_venues = getNearbyVenues(names=dfTorontoOnly['Neighbourhood'],
                                   latitudes=dfTorontoOnly['Latitude'],
                                   longitudes=dfTorontoOnly['Longitude']
                                  )

toronto_venues.head()

The Beaches
The Danforth West, Riverdale
India Bazaar, The Beaches West
Studio District
Lawrence Park
Davisville North
North Toronto West, Lawrence Park
Davisville
Moore Park, Summerhill East
Summerhill West, Rathnelly, South Hill, Forest Hill SE, Deer Park
Rosedale
St. James Town, Cabbagetown
Church and Wellesley
Regent Park, Harbourfront
Garden District, Ryerson
St. James Town
Berczy Park
Central Bay Street
Richmond, Adelaide, King
Harbourfront East, Union Station, Toronto Islands
Toronto Dominion Centre, Design Exchange
Commerce Court, Victoria Hotel
Roselawn
Forest Hill North & West, Forest Hill Road Park
The Annex, North Midtown, Yorkville
University of Toronto, Harbord
Kensington Market, Chinatown, Grange Park
CN Tower, King and Spadina, Railway Lands, Harbourfront West, Bathurst Quay, South Niagara, Island airport
Stn A PO Boxes
First Canadian Place, Underground city
Christie
Dufferin, Dovercourt Village
Little Portugal, Trinity
Brockton, Parkdale Village, Exhibition Place
High 

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,The Beaches,43.676357,-79.293031,Glen Manor Ravine,43.676821,-79.293942,Trail
1,The Beaches,43.676357,-79.293031,The Big Carrot Natural Food Market,43.678879,-79.297734,Health Food Store
2,The Beaches,43.676357,-79.293031,Grover Pub and Grub,43.679181,-79.297215,Pub
3,The Beaches,43.676357,-79.293031,Upper Beaches,43.680563,-79.292869,Neighborhood
4,"The Danforth West, Riverdale",43.679557,-79.352188,Pantheon,43.677621,-79.351434,Greek Restaurant


#### Check number of venues returned for each neighborhood

In [29]:
#check how many venues were returned for each neighborhood
toronto_venues.groupby('Neighborhood').count()


Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Berczy Park,50,50,50,50,50,50
"Brockton, Parkdale Village, Exhibition Place",25,25,25,25,25,25
"Business reply mail Processing Centre, South Central Letter Processing Plant Toronto",16,16,16,16,16,16
"CN Tower, King and Spadina, Railway Lands, Harbourfront West, Bathurst Quay, South Niagara, Island airport",17,17,17,17,17,17
Central Bay Street,50,50,50,50,50,50
Christie,16,16,16,16,16,16
Church and Wellesley,50,50,50,50,50,50
"Commerce Court, Victoria Hotel",50,50,50,50,50,50
Davisville,34,34,34,34,34,34
Davisville North,9,9,9,9,9,9


##### Find top 5 categories for each neighborhood

In [34]:
#analyze each neighbourhood
# one hot encoding
Toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
Toronto_onehot['Neighborhood'] = toronto_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [Toronto_onehot.columns[-1]] + list(Toronto_onehot.columns[:-1])
Toronto_onehot = Toronto_onehot[fixed_columns]

Toronto_onehot.head()

#group rows by neighborhood and by taking the mean of the frequency of occurrence of each category
Toronto_grouped = Toronto_onehot.groupby('Neighborhood').mean().reset_index()
Toronto_grouped

#print each neighborhood along with the top 5 most common venues
num_top_venues = 5

for hood in Toronto_grouped['Neighborhood']:
    print("----"+hood+"----")
    temp = Toronto_grouped[Toronto_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Berczy Park----
                venue  freq
0         Coffee Shop  0.08
1      Farmers Market  0.04
2  Seafood Restaurant  0.04
3              Bakery  0.04
4          Restaurant  0.04


----Brockton, Parkdale Village, Exhibition Place----
            venue  freq
0            Café  0.12
1     Coffee Shop  0.08
2       Nightclub  0.08
3  Breakfast Spot  0.08
4             Gym  0.04


----Business reply mail Processing Centre, South Central Letter Processing Plant Toronto----
                venue  freq
0          Restaurant  0.06
1    Recording Studio  0.06
2          Comic Shop  0.06
3       Burrito Place  0.06
4  Light Rail Station  0.06


----CN Tower, King and Spadina, Railway Lands, Harbourfront West, Bathurst Quay, South Niagara, Island airport----
             venue  freq
0   Airport Lounge  0.12
1  Airport Service  0.12
2         Boutique  0.06
3          Airport  0.06
4            Plane  0.06


----Central Bay Street----
                    venue  freq
0             Coffee S

#### Found top 10 venue

In [37]:
#Write a function to sort the venues in descending order.
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]


#create the new dataframe and display the top 10 venues for each neighborhood.
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = Toronto_grouped['Neighborhood']

for ind in np.arange(Toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(Toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()


Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Berczy Park,Coffee Shop,Bakery,Beer Bar,Cocktail Bar,Seafood Restaurant,Cheese Shop,Restaurant,Farmers Market,Japanese Restaurant,Beach
1,"Brockton, Parkdale Village, Exhibition Place",Café,Nightclub,Coffee Shop,Breakfast Spot,Performing Arts Venue,Bakery,Stadium,Restaurant,Intersection,Italian Restaurant
2,"Business reply mail Processing Centre, South C...",Garden,Gym / Fitness Center,Light Rail Station,Brewery,Burrito Place,Restaurant,Butcher,Recording Studio,Farmers Market,Fast Food Restaurant
3,"CN Tower, King and Spadina, Railway Lands, Har...",Airport Lounge,Airport Service,Boat or Ferry,Coffee Shop,Boutique,Rental Car Location,Bar,Historic Site,Harbor / Marina,Plane
4,Central Bay Street,Coffee Shop,Italian Restaurant,Burger Joint,Café,New American Restaurant,Japanese Restaurant,Juice Bar,Middle Eastern Restaurant,Miscellaneous Shop,Modern European Restaurant


##### Run K-Means to cluster the neighborhood into 5 clusters.

In [49]:
#Run k-means to cluster the neighborhood into 5 clusters.
# set number of clusters
kclusters = 5

Toronto_grouped_clustering = Toronto_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(Toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

#create a new dataframe that includes the cluster as well as the top 10 venues for each neighborhood.
# add clustering labels
#neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

Toronto_merged = dfTorontoOnly


# merge Toronto_grouped with Toronto_data to add latitude/longitude for each neighborhood
Toronto_merged = Toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

Toronto_merged.head() # check the last columns!

#visualize the resulting clusters
# create map
latitude = 43.6532
longitude = -79.3832
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(Toronto_merged['Latitude'], Toronto_merged['Longitude'], Toronto_merged['Neighborhood'], Toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

In [47]:
print('Thanks')

Thanks
