# Segmenting and Clustering Neighborhoods in Toronto

## by Sharif A

In [1]:
# First we will be installing beautiful soup in order to scrape through the wikipedia page to obtain all the info needed

!pip install BeautifulSoup4
!pip install lxml
!pip install request
!conda install -c conda-forge geocoder --yes
!conda install -c conda-forge geopy --yes

Collecting BeautifulSoup4
[?25l  Downloading https://files.pythonhosted.org/packages/cb/a1/c698cf319e9cfed6b17376281bd0efc6bfc8465698f54170ef60a485ab5d/beautifulsoup4-4.8.2-py3-none-any.whl (106kB)
[K     |████████████████████████████████| 112kB 24.9MB/s eta 0:00:01
[?25hCollecting soupsieve>=1.2 (from BeautifulSoup4)
  Downloading https://files.pythonhosted.org/packages/05/cf/ea245e52f55823f19992447b008bcbb7f78efc5960d77f6c34b5b45b36dd/soupsieve-2.0-py2.py3-none-any.whl
Installing collected packages: soupsieve, BeautifulSoup4
Successfully installed BeautifulSoup4-4.8.2 soupsieve-2.0
Collecting lxml
[?25l  Downloading https://files.pythonhosted.org/packages/dd/ba/a0e6866057fc0bbd17192925c1d63a3b85cf522965de9bc02364d08e5b84/lxml-4.5.0-cp36-cp36m-manylinux1_x86_64.whl (5.8MB)
[K     |████████████████████████████████| 5.8MB 8.4MB/s eta 0:00:01     |████████████████████████████▌   | 5.1MB 8.4MB/s eta 0:00:01
[?25hInstalling collected packages: lxml
Successfully installed lxml-4.5.0
C

In [102]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np
from geopy.geocoders import Nominatim

import json
from pandas.io.json import json_normalize

import matplotlib.cm as cm
import matplotlib.colors as colors

from sklearn.cluster import KMeans
import folium

Using Beautifulsoup I am scraping the data off the wiki page:

In [103]:
#Creating our data frame by scraping the wiki page using the suggested Beautiful soup method.

source = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text

soup = BeautifulSoup(source,'lxml')

Table = soup.find('table')
rows= Table.find_all('tr')
our_columns =[r.text.replace('\n','') for r in rows[0].find_all('th')]
df = pd.DataFrame(columns=our_columns)

for i in range (1,len(rows)):
    table_ds=rows[i].find_all('td')
    #print(tds)
    values=[table_ds[0].text,table_ds[1].text,table_ds[2].text.replace('\n','')]
    df=df.append(pd.Series(values,index=our_columns),ignore_index=True)
print(df)



    Postcode           Borough          Neighbourhood
0        M1A      Not assigned           Not assigned
1        M2A      Not assigned           Not assigned
2        M3A        North York              Parkwoods
3        M4A        North York       Victoria Village
4        M5A  Downtown Toronto           Harbourfront
..       ...               ...                    ...
282      M8Z         Etobicoke              Mimico NW
283      M8Z         Etobicoke     The Queensway West
284      M8Z         Etobicoke  Royal York South West
285      M8Z         Etobicoke         South of Bloor
286      M9Z      Not assigned           Not assigned

[287 rows x 3 columns]


Now we drop the rows with 'Borough' values that are 'Not assigned'

In [104]:
df.drop(df.loc[df['Borough']=='Not assigned'].index,inplace=True)
#This removes the vales with boroughs that are not assigned.

Checking that the values in the dataframe are correct:

In [105]:
print(df)

    Postcode           Borough             Neighbourhood
2        M3A        North York                 Parkwoods
3        M4A        North York          Victoria Village
4        M5A  Downtown Toronto              Harbourfront
5        M6A        North York          Lawrence Heights
6        M6A        North York            Lawrence Manor
..       ...               ...                       ...
281      M8Z         Etobicoke  Kingsway Park South West
282      M8Z         Etobicoke                 Mimico NW
283      M8Z         Etobicoke        The Queensway West
284      M8Z         Etobicoke     Royal York South West
285      M8Z         Etobicoke            South of Bloor

[210 rows x 3 columns]


Joining the different Neighbourhoods together according to the postcode: 

In [106]:
df_new= df.groupby(by=['Postcode','Borough']).agg(lambda x: ', '.join(x))
df_new=df_new.reset_index()
df_new

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
...,...,...,...
98,M9N,York,Weston
99,M9P,Etobicoke,Westmount
100,M9R,Etobicoke,"Kingsview Village, Martin Grove Gardens, Richv..."
101,M9V,Etobicoke,"Albion Gardens, Beaumond Heights, Humbergate, ..."


In [107]:
df_new['Neighbourhood']=df_new['Neighbourhood'].replace('Not assigned',df['Borough'])


In [108]:
df_new.shape

(103, 3)

geolocator =Nominatim(user_agent="ca_explorer")
location = geolocator.geocode('M1B, Toronto, CA')
print(location.latitude)

for row in df_new['Postcode']:
    g=geolocator.geocode('{}, Toronto, Ontario'.format(row))
    print(g)

def get_geocoder(code_from_df):
    lat_lng_coords = []
    N=0
    while (N<130):
        g = geolocator.geocode('{}, Toronto, CA'.format(code_from_df))
        latitudes = g.latitude
        longitudes = g.longitude
        N+=1
        return latitudes,longitudes
for i in range(0,(len(df_new['Postcode']))):
    df_new.iloc[i]['Latitude'],df_new.iloc[i]['Longitude']=get_geocoder(df_new.iloc[i]['Postcode'])

In [109]:
#This is the csv file that contains the coordinates
extr_df = pd.read_csv("http://cocl.us/Geospatial_data")
extr_df

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476
...,...,...,...
98,M9N,43.706876,-79.518188
99,M9P,43.696319,-79.532242
100,M9R,43.688905,-79.554724
101,M9V,43.739416,-79.588437


Now we merge the df_new (which is our original dataframe) with the csv file containing coordinates:

In [110]:
merged_df = pd.merge(df_new,extr_df,left_on='Postcode', right_on='Postal Code', how='inner')
merged_df

Unnamed: 0,Postcode,Borough,Neighbourhood,Postal Code,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",M1B,43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",M1C,43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",M1E,43.763573,-79.188711
3,M1G,Scarborough,Woburn,M1G,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,M1H,43.773136,-79.239476
...,...,...,...,...,...,...
98,M9N,York,Weston,M9N,43.706876,-79.518188
99,M9P,Etobicoke,Westmount,M9P,43.696319,-79.532242
100,M9R,Etobicoke,"Kingsview Village, Martin Grove Gardens, Richv...",M9R,43.688905,-79.554724
101,M9V,Etobicoke,"Albion Gardens, Beaumond Heights, Humbergate, ...",M9V,43.739416,-79.588437


***
***
***

#### Visualizing our data on a map of toronto:

In [111]:
#First Find the coordinates of Toronto:

address = 'Toronto, Ontario'

geolocator = Nominatim(user_agent="ny_explorer")
location_Tor = geolocator.geocode(address)
latitude_Tor = location_Tor.latitude
longitude_Tor = location_Tor.longitude
print('The geograpical coordinate of New York City are {}, {}.'.format(latitude_Tor, longitude_Tor))

#Create a Folium object and map:
map_Toronto= folium.Map(location=[latitude_Tor, longitude_Tor],zoom_start =12)
map_Toronto



The geograpical coordinate of New York City are 43.653963, -79.387207.


#### Now we are only going to look at the Neighbourhoods that are in Boroughs in Downtown Toronto:


In [112]:
Toronto_borough = merged_df[merged_df['Borough'].str.contains('Downtown Toronto')].reset_index(drop=True)
Toronto_borough

Unnamed: 0,Postcode,Borough,Neighbourhood,Postal Code,Latitude,Longitude
0,M4W,Downtown Toronto,Rosedale,M4W,43.679563,-79.377529
1,M4X,Downtown Toronto,"Cabbagetown, St. James Town",M4X,43.667967,-79.367675
2,M4Y,Downtown Toronto,Church and Wellesley,M4Y,43.66586,-79.38316
3,M5A,Downtown Toronto,Harbourfront,M5A,43.65426,-79.360636
4,M5B,Downtown Toronto,"Ryerson, Garden District",M5B,43.657162,-79.378937
5,M5C,Downtown Toronto,St. James Town,M5C,43.651494,-79.375418
6,M5E,Downtown Toronto,Berczy Park,M5E,43.644771,-79.373306
7,M5G,Downtown Toronto,Central Bay Street,M5G,43.657952,-79.387383
8,M5H,Downtown Toronto,"Adelaide, King, Richmond",M5H,43.650571,-79.384568
9,M5J,Downtown Toronto,"Harbourfront East, Toronto Islands, Union Station",M5J,43.640816,-79.381752


In [113]:
CLIENT_ID = 'P2RKBCLNGEUZZPFUB4WOAMEDAOV4MY52ONV5EPK24BSGQERD' # your Foursquare ID
CLIENT_SECRET = 'F3Q3FZTJQGRY4NSD5Q45AIKABKZ5I51TIJGJJE5SWBX2AW5C' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

print('credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

credentails:
CLIENT_ID: P2RKBCLNGEUZZPFUB4WOAMEDAOV4MY52ONV5EPK24BSGQERD
CLIENT_SECRET:F3Q3FZTJQGRY4NSD5Q45AIKABKZ5I51TIJGJJE5SWBX2AW5C


In [114]:
address_DTor = 'Downtown Toronto'

geolocator_DTor = Nominatim(user_agent="ny_explorer")
location_DTor = geolocator_DTor.geocode(address_DTor)
latitude_DTor = location_DTor.latitude
longitude_DTor = location_DTor.longitude
print('The geograpical coordinate of Manhattan are {}, {}.'.format(latitude_DTor, longitude_DTor))

The geograpical coordinate of Manhattan are 43.6541737, -79.38081164513409.


In [115]:
map_DownTor = folium.Map(location=[latitude_DTor, longitude_DTor], zoom_start=11)

# add markers to map
for lat, lng, label in zip(Toronto_borough['Latitude'], Toronto_borough['Longitude'], Toronto_borough['Neighbourhood']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_DownTor)  
    
map_DownTor


***
***
***

#### Let's investigate our first neighborhood as give by our Downtown Toronto dataset:

In [116]:
first_neigh_name =Toronto_borough.loc[0,'Neighbourhood']
first_neigh_name

'Rosedale'

In [117]:
# Obtaining the latitude and longitude of Rosedale:
first_neigh_name_lat = Toronto_borough.loc[0,'Latitude']
first_neigh_name_long = Toronto_borough.loc[0,'Longitude']

In [118]:
# now we will fetch the top 50 venues for 'rosedale' by using foursquare API:
limit =100 # number of venues
radius=400 # our radius is 300 meters from the coord

url ='https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(CLIENT_ID, CLIENT_SECRET, VERSION, first_neigh_name_lat, 
    first_neigh_name_long, 
    radius, 
    limit)

In [119]:
results = requests.get(url).json()
results

{'meta': {'code': 200, 'requestId': '5e5e5cf347e0d60028af9768'},
 'response': {'headerLocation': 'Rosedale',
  'headerFullLocation': 'Rosedale, Toronto',
  'headerLocationGranularity': 'neighborhood',
  'totalResults': 4,
  'suggestedBounds': {'ne': {'lat': 43.6831626036, 'lng': -79.37256090619498},
   'sw': {'lat': 43.67596259639999, 'lng': -79.38249789380505}},
  'groups': [{'type': 'Recommended Places',
    'name': 'recommended',
    'items': [{'reasons': {'count': 0,
       'items': [{'summary': 'This spot is popular',
         'type': 'general',
         'reasonName': 'globalInteractionReason'}]},
      'venue': {'id': '4bae2150f964a520df873be3',
       'name': 'Mooredale House',
       'location': {'address': '146 Crescent Rd.',
        'crossStreet': 'btwn. Lamport Ave. and Mt. Pleasant Rd.',
        'lat': 43.678630645646535,
        'lng': -79.38009142511322,
        'labeledLatLngs': [{'label': 'display',
          'lat': 43.678630645646535,
          'lng': -79.3800914251132

In [120]:
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

In [121]:
venues = results['response']['groups'][0]['items']
nearby_venues = json_normalize(venues)

# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues =nearby_venues.loc[:, filtered_columns]

# filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues

Unnamed: 0,name,categories,lat,lng
0,Mooredale House,Building,43.678631,-79.380091
1,Mooredale Day Camp,Campground,43.678332,-79.380491
2,Rosedale Park,Playground,43.682328,-79.378934
3,Betline Trail at Roxborough dr.,Bike Trail,43.68053,-79.38149


***
***
***

#### Now let's do the same as before but lets go through all the neighborhoods:


In [122]:
# let's define a function:

def GetNearbyVenues(names, latitudes, longitudes, radius=400):
    venues_list = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            limit)
            
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighbourhood', 'Neighbourhood Latitude', 'Neighbourhood Longitude', 'Venue', 'Venue Latitude', 'Venue Longitude', 'Venue Category']
    
    return(nearby_venues)

In [123]:
Toronto_borough_venues = GetNearbyVenues(names=Toronto_borough['Neighbourhood'], latitudes=Toronto_borough['Latitude'],longitudes=Toronto_borough['Longitude'])
Toronto_borough_venues                      

Unnamed: 0,Neighbourhood,Neighbourhood Latitude,Neighbourhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Rosedale,43.679563,-79.377529,Mooredale House,43.678631,-79.380091,Building
1,Rosedale,43.679563,-79.377529,Mooredale Day Camp,43.678332,-79.380491,Campground
2,Rosedale,43.679563,-79.377529,Rosedale Park,43.682328,-79.378934,Playground
3,Rosedale,43.679563,-79.377529,Betline Trail at Roxborough dr.,43.680530,-79.381490,Bike Trail
4,"Cabbagetown, St. James Town",43.667967,-79.367675,Butter Chicken Factory,43.667072,-79.369184,Indian Restaurant
...,...,...,...,...,...,...,...
1069,Queen's Park,43.662301,-79.389494,Subway,43.659132,-79.391114,Sandwich Place
1070,Queen's Park,43.662301,-79.389494,Tim Hortons,43.661038,-79.393797,Coffee Shop
1071,Queen's Park,43.662301,-79.389494,Starbucks,43.660887,-79.393720,Coffee Shop
1072,Queen's Park,43.662301,-79.389494,Tim Hortons,43.658906,-79.388696,Coffee Shop


In [124]:
# Analysing the number of venues per neigborhood:
Toronto_borough_venues.groupby('Neighbourhood').count().sort_values(by='Venue',ascending=False)


Unnamed: 0_level_0,Neighbourhood Latitude,Neighbourhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighbourhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"Design Exchange, Toronto Dominion Centre",100,100,100,100,100,100
"Commerce Court, Victoria Hotel",100,100,100,100,100,100
"Ryerson, Garden District",100,100,100,100,100,100
"Harbourfront East, Toronto Islands, Union Station",100,100,100,100,100,100
"First Canadian Place, Underground city",100,100,100,100,100,100
"Adelaide, King, Richmond",97,97,97,97,97,97
Central Bay Street,76,76,76,76,76,76
"Chinatown, Grange Park, Kensington Market",69,69,69,69,69,69
St. James Town,67,67,67,67,67,67
Church and Wellesley,65,65,65,65,65,65


***
***
***
#### Analysing each neighbourhood by first using one hot coding:

In [125]:
Toronto_onehot = pd.get_dummies(Toronto_borough_venues[['Venue Category']],prefix="",prefix_sep="")
Toronto_onehot['Neighbourhood']=Toronto_borough_venues['Neighbourhood']
fixed_columns = [Toronto_onehot.columns[-1]] + list(Toronto_onehot.columns[:-1])
Toronto_onehot = Toronto_onehot[fixed_columns]

Toronto_onehot.head()

Unnamed: 0,Neighbourhood,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Aquarium,Art Gallery,...,Toy / Game Store,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Whisky Bar,Wine Bar,Wings Joint,Women's Store,Yoga Studio
0,Rosedale,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Rosedale,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Rosedale,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Rosedale,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,"Cabbagetown, St. James Town",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


#### Grouping each of the neighbourhoods then finding the frequency of each category:

In [126]:
Toronto_onehot_grouped = Toronto_onehot.groupby('Neighbourhood').mean().reset_index()
Toronto_onehot_grouped

Unnamed: 0,Neighbourhood,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Aquarium,Art Gallery,...,Toy / Game Store,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Whisky Bar,Wine Bar,Wings Joint,Women's Store,Yoga Studio
0,"Adelaide, King, Richmond",0.0,0.0,0.0,0.0,0.0,0.0,0.030928,0.0,0.0,...,0.0,0.0,0.020619,0.0,0.0,0.0,0.010309,0.0,0.010309,0.0
1,Berczy Park,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.058824,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"CN Tower, Bathurst Quay, Island airport, Harbo...",0.076923,0.076923,0.076923,0.153846,0.076923,0.153846,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,"Cabbagetown, St. James Town",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Central Bay Street,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.013158,0.0,0.0
5,"Chinatown, Grange Park, Kensington Market",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.057971,0.0,0.057971,0.0,0.014493,0.0,0.0,0.0
6,Christie,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,Church and Wellesley,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.015385,0.0,0.0,0.015385,0.0,0.0
8,"Commerce Court, Victoria Hotel",0.0,0.0,0.0,0.0,0.0,0.0,0.04,0.0,0.01,...,0.0,0.0,0.0,0.0,0.0,0.0,0.01,0.0,0.0,0.0
9,"Design Exchange, Toronto Dominion Centre",0.0,0.0,0.0,0.0,0.0,0.0,0.03,0.0,0.01,...,0.0,0.01,0.0,0.0,0.0,0.0,0.01,0.0,0.0,0.0


#### Now we will create a dataframe to analyse the top 10 venues from each of the neighbourhoods:


In [127]:
# Let first start off by defining our function to sort the venues in descending order:

def common_venues(row,num_venues):
    row_category = row.iloc[1:]
    sort_row_category = row_category.sort_values(ascending=False)
    return sort_row_category.index.values[0:num_venues]

num_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighbourhood']
for ind in np.arange(num_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighbourhood'] = Toronto_onehot_grouped['Neighbourhood']

for ind in np.arange(Toronto_onehot_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = common_venues(Toronto_onehot_grouped.iloc[ind, :], num_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighbourhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,"Adelaide, King, Richmond",Coffee Shop,American Restaurant,Steakhouse,Sushi Restaurant,Thai Restaurant,Bar,Café,Japanese Restaurant,Burger Joint,Pizza Place
1,Berczy Park,Comfort Food Restaurant,French Restaurant,Park,Liquor Store,Beer Bar,Department Store,Restaurant,Pub,Fountain,Italian Restaurant
2,"CN Tower, Bathurst Quay, Island airport, Harbo...",Airport Lounge,Airport Terminal,Airport,Bar,Coffee Shop,Rental Car Location,Boutique,Plane,Airport Service,Airport Food Court
3,"Cabbagetown, St. James Town",Restaurant,Coffee Shop,Café,Pizza Place,Liquor Store,Bank,Market,Butcher,Breakfast Spot,Snack Place
4,Central Bay Street,Coffee Shop,Sandwich Place,Chinese Restaurant,Italian Restaurant,Sushi Restaurant,Japanese Restaurant,Thai Restaurant,Indian Restaurant,Tea Room,Spa


***
***
***
#### Clustering our neighourhoods using KMEANS:

In [128]:
Num_clusters = 5

Toronto_onehot_grouped_cluster = Toronto_onehot_grouped.drop('Neighbourhood',1)

Toronto_KMeans= KMeans(n_clusters=Num_clusters,random_state=0)
Toronto_KMeans.fit(Toronto_onehot_grouped_cluster)

Toronto_KMeans.labels_[0:10]

array([3, 1, 0, 3, 3, 1, 3, 3, 3, 3], dtype=int32)

In [129]:
#Now we will include the top venues:
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', Toronto_KMeans.labels_)

Toronto_merge = Toronto_borough

Toronto_merge = Toronto_merge.join(neighborhoods_venues_sorted.set_index('Neighbourhood'), on='Neighbourhood')
Toronto_merge.head()


Unnamed: 0,Postcode,Borough,Neighbourhood,Postal Code,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M4W,Downtown Toronto,Rosedale,M4W,43.679563,-79.377529,4,Playground,Building,Campground,Bike Trail,Concert Hall,Electronics Store,Eastern European Restaurant,Dumpling Restaurant,Donut Shop,Dog Run
1,M4X,Downtown Toronto,"Cabbagetown, St. James Town",M4X,43.667967,-79.367675,3,Restaurant,Coffee Shop,Café,Pizza Place,Liquor Store,Bank,Market,Butcher,Breakfast Spot,Snack Place
2,M4Y,Downtown Toronto,Church and Wellesley,M4Y,43.66586,-79.38316,3,Coffee Shop,Japanese Restaurant,Gay Bar,Restaurant,Hotel,Burger Joint,Bubble Tea Shop,Men's Store,Gym,Dance Studio
3,M5A,Downtown Toronto,Harbourfront,M5A,43.65426,-79.360636,3,Coffee Shop,Breakfast Spot,Restaurant,Yoga Studio,Spa,Bakery,Bank,Electronics Store,Greek Restaurant,Gym / Fitness Center
4,M5B,Downtown Toronto,"Ryerson, Garden District",M5B,43.657162,-79.378937,3,Coffee Shop,Clothing Store,Middle Eastern Restaurant,Sandwich Place,Hotel,Café,Diner,Movie Theater,Bar,Ramen Restaurant


### Final map with cluster

In [130]:
Toronto_map_clusters = folium.Map(location=[latitude_DTor, longitude_DTor], zoom_start=12)

# set color scheme for the clusters
x = np.arange(Num_clusters)
ys = [i + x + (i*x)**2 for i in range(Num_clusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(Toronto_merge['Latitude'], Toronto_merge['Longitude'], Toronto_merge['Neighbourhood'], Toronto_merge['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(Toronto_map_clusters)
       
Toronto_map_clusters