# Segmenting and Clustering Neighborhoods in Toronto

## Part 1

#### Install and import libraries

In [24]:
# libraries
import pandas as pd
from pandas.io.json import json_normalize  # tranform JSON file into a pandas dataframe

# install beautifulsoup4
!pip install beautifulsoup4
from bs4 import BeautifulSoup

import requests

import numpy as np

# install geopy
!pip install geopy 
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import folium # map rendering library

# install scikit-learn
!pip install -U scikit-learn 
from sklearn.cluster import KMeans # import k-means from clustering stage

import matplotlib.cm as cm # Matplotlib and associated plotting modules
import matplotlib.colors as colors

# libraries for displaying images
from IPython.display import Image 
from IPython.core.display import HTML 
from IPython.display import display_html

Requirement already up-to-date: scikit-learn in c:\users\hogar\appdata\local\programs\arcgis\pro\bin\python\envs\arcgispro-py3\lib\site-packages (0.24.2)


#### Scraping raw data from Wikipedia and creating dataframe

In [40]:
# fetching Wikipedia page data using requests
data = requests.get('https://en.wikipedia.org/w/index.php?title=List_of_postal_codes_of_Canada:_M&oldid=890001695').text

In [41]:
# parsing data using BeautifulSoup
soup = BeautifulSoup(data, 'html.parser')

In [42]:
# initiating lists that will be used to create dataframe
postalCodeList = []
boroughList = []
neighborhoodList = []

In [43]:
# looking for any tables on the Wikipedia page
soup.find('table').find_all('tr')

soup.find('table').find_all('tr')

for row in soup.find('table').find_all('tr'):
    cells = row.find_all('td')

In [48]:
# looking for the necessary information we need to create our dataframe
for row in soup.find('table').find_all('tr'):
    cells = row.find_all('td')
    if(len(cells) > 0):
        postalCodeList.append(cells[0].text)
        boroughList.append(cells[1].text)
        neighborhoodList.append(cells[2].text.rstrip('\n'))

In [65]:
# creating the dataframe with the columns we want using pandas
toronto_df = pd.DataFrame({"PostalCode": postalCodeList,
                           "Borough": boroughList,
                           "Neighborhood": neighborhoodList})

toronto_df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned\n
1,M2A,Not assigned,Not assigned\n
2,M3A,North York,Parkwoods\n
3,M4A,North York,Victoria Village\n
4,M5A,Downtown Toronto,Harbourfront\n


#### Cleaning "n/" of column

In [66]:
# cleaning up the column values and taking off the unnessary "\n"
toronto_df['Neighborhood'] = toronto_df['Neighborhood'].str.replace("\n","")
toronto_df['Borough'] = toronto_df['Borough'].str.replace("\n","")
toronto_df['PostalCode'] = toronto_df['PostalCode'].str.replace("\n","")
toronto_df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


#### Ignoring cells with value "Not Assigned"

In [67]:
# ignoring the cells that have Borough =! 'Not Assigned'
toronto_df_drop = toronto_df[toronto_df.Borough != "Not assigned"].reset_index(drop=True)
toronto_df_drop.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights


#### Grouping neighborhoods with the same borough

In [149]:
# grouping the neighborhoods with the same Borough on the same row
toronto_df_grouped = toronto_df_drop.groupby(["PostalCode", "Borough"], as_index=False).agg(lambda x: ", ".join(x))
toronto_df_grouped.rename(columns={'PostalCode':'Postcode'},inplace=True)
toronto_df_grouped

Unnamed: 0,Postcode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern, Rouge, Malvern, Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union, Highla..."
2,M1E,Scarborough,"Guildwood, Morningside, West Hill, Guildwood, ..."
3,M1G,Scarborough,"Woburn, Woburn, Woburn"
4,M1H,Scarborough,"Cedarbrae, Cedarbrae, Cedarbrae"
...,...,...,...
98,M9N,York,"Weston, Weston, Weston"
99,M9P,Etobicoke,"Westmount, Westmount, Westmount"
100,M9R,Etobicoke,"Kingsview Village, Martin Grove Gardens, Richv..."
101,M9V,Etobicoke,"Albion Gardens, Beaumond Heights, Humbergate, ..."


In [150]:
toronto_df_grouped.shape

(103, 3)

#### Making Neighborhood "Not assigned", same as Borough name

In [84]:
# making Neighborhood == 'Not Assigned' to same Borough name
for index, row in toronto_df_grouped.iterrows():
    if row["Neighborhood"] == "Not assigned":
        row["Neighborhood"] = row["Borough"]
        
toronto_df_grouped.head()


Unnamed: 0,Postcode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern, Rouge, Malvern, Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union, Highla..."
2,M1E,Scarborough,"Guildwood, Morningside, West Hill, Guildwood, ..."
3,M1G,Scarborough,"Woburn, Woburn, Woburn"
4,M1H,Scarborough,"Cedarbrae, Cedarbrae, Cedarbrae"


## Part 2

#### Importing geospatial_coordinate.csv

In [88]:
# Importing the csv file conatining the latitudes/longitudes for various neighbourhoods in Canada
toronto_geo_coor = pd.read_csv("https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBMDeveloperSkillsNetwork-DS0701EN-SkillsNetwork/labs_v1/Geospatial_Coordinates.csv")
toronto_geo_coor.rename(columns={'Postal Code':'Postcode'},inplace=True)
df_geo_coor.head()


Unnamed: 0,Postcode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


#### Merging the two tables

In [90]:
# Merging the tables: the Latitudes/Longitudes and neighbourhoods
toronto_df_merging = pd.merge(toronto_df_grouped, toronto_geo_coor, on='Postcode')
toronto_df_merging.head()

Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern, Rouge, Malvern, Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union, Highla...",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill, Guildwood, ...",43.763573,-79.188711
3,M1G,Scarborough,"Woburn, Woburn, Woburn",43.770992,-79.216917
4,M1H,Scarborough,"Cedarbrae, Cedarbrae, Cedarbrae",43.773136,-79.239476


## Part 3

#### Selecting rows with "Toronto" word

In [108]:
# Getting all the rows from the data frame which contains Toronto in their Borough
toronto_word = toronto_df_merging[toronto_df_merging['Borough'].str.contains('Toronto',regex=False)].reset_index(drop=True)
toronto_word.head()


Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude
0,M4E,East Toronto,"The Beaches, The Beaches, The Beaches",43.676357,-79.293031
1,M4K,East Toronto,"The Danforth West, Riverdale, The Danforth Wes...",43.679557,-79.352188
2,M4L,East Toronto,"The Beaches West, India Bazaar, The Beaches We...",43.668999,-79.315572
3,M4M,East Toronto,"Studio District, Studio District, Studio District",43.659526,-79.340923
4,M4N,Central Toronto,"Lawrence Park, Lawrence Park, Lawrence Park",43.72802,-79.38879


#### Getting latitude and longitude of Toronto


In [103]:
# getting the latitude/longitude of Toronto
address = "Toronto, ON"

geolocator = Nominatim(user_agent="toronto_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto city are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto city are 43.6534817, -79.3839347.


#### Creating Toronto map and adding map markers


In [106]:
# creating map of Toronto using latitude/longitude values
map_toronto = folium.Map(location=[latitude,longitude],zoom_start=10)

# adding in the markers for Toronto neighborhoods
for lat,lng,borough,neighbourhood in zip(
        toronto_word['Latitude'],
        toronto_word['Longitude'],
        toronto_word['Borough'],
        toronto_word['Neighborhood']):
    label = '{}, {}'.format(neighbourhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat,lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)
    
map_toronto

#### Defining Foursquare Credentials 

In [109]:
CLIENT_ID = '40NILBPPO50ZRI5NLFXRLXJOFM2VVIGPK4GZ15EPQULLFATR' # your Foursquare ID
CLIENT_SECRET = 'NZRHTX2GIN0QK5AWM2W1DTIBMUMKBW1ZEIUTKXTU5SN4QJZI' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version
LIMIT = 100 # A default Foursquare API limit value

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: 40NILBPPO50ZRI5NLFXRLXJOFM2VVIGPK4GZ15EPQULLFATR
CLIENT_SECRET:NZRHTX2GIN0QK5AWM2W1DTIBMUMKBW1ZEIUTKXTU5SN4QJZI


#### Explore the first neighborhood in our dataframe

In [110]:
# assigning a value to neighborhood_name with which we'll work with
neighborhood_name = toronto_word.loc[3, 'Neighborhood']
print(f"The first neighborhood's name is '{neighborhood_name}'.")

The first neighborhood's name is 'Studio District, Studio District, Studio District'.


In [111]:
# obtaining the longitude and latitude of the neighborhood we are using
neighborhood_latitude = toronto_word.loc[3, 'Latitude'] # neighborhood latitude value
neighborhood_longitude = toronto_word.loc[3, 'Longitude'] # neighborhood longitude value

print('Latitude and longitude values of {} are {}, {}.'.format(neighborhood_name, 
                                                               neighborhood_latitude, 
                                                               neighborhood_longitude))

Latitude and longitude values of Studio District, Studio District, Studio District are 43.6595255, -79.340923.


In [113]:
# just a sanity check to see if our indexing was correct above
toronto_word.head()

Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude
0,M4E,East Toronto,"The Beaches, The Beaches, The Beaches",43.676357,-79.293031
1,M4K,East Toronto,"The Danforth West, Riverdale, The Danforth Wes...",43.679557,-79.352188
2,M4L,East Toronto,"The Beaches West, India Bazaar, The Beaches We...",43.668999,-79.315572
3,M4M,East Toronto,"Studio District, Studio District, Studio District",43.659526,-79.340923
4,M4N,Central Toronto,"Lawrence Park, Lawrence Park, Lawrence Park",43.72802,-79.38879


#### Getting top 100 venues that are within a radius of 500 m from Studio District

In [114]:
# create URL
url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID,
    CLIENT_SECRET, 
    VERSION,
    neighborhood_latitude, 
    neighborhood_longitude, 
    500, 
    100)
url

'https://api.foursquare.com/v2/venues/explore?&client_id=40NILBPPO50ZRI5NLFXRLXJOFM2VVIGPK4GZ15EPQULLFATR&client_secret=NZRHTX2GIN0QK5AWM2W1DTIBMUMKBW1ZEIUTKXTU5SN4QJZI&v=20180605&ll=43.6595255,-79.340923&radius=500&limit=100'

In [116]:
# printing the results we obtained from scraping
results = requests.get(url).json()
results

{'meta': {'code': 200, 'requestId': '60b1299e89fd3c71695717d0'},
 'response': {'suggestedFilters': {'header': 'Tap to show:',
   'filters': [{'name': 'Open now', 'key': 'openNow'}]},
  'headerLocation': 'Leslieville',
  'headerFullLocation': 'Leslieville, Toronto',
  'headerLocationGranularity': 'neighborhood',
  'totalResults': 36,
  'suggestedBounds': {'ne': {'lat': 43.6640255045, 'lng': -79.33471445573701},
   'sw': {'lat': 43.6550254955, 'lng': -79.347131544263}},
  'groups': [{'type': 'Recommended Places',
    'name': 'recommended',
    'items': [{'reasons': {'count': 0,
       'items': [{'summary': 'This spot is popular',
         'type': 'general',
         'reasonName': 'globalInteractionReason'}]},
      'venue': {'id': '4ad7e958f964a520001021e3',
       'name': "Ed's Real Scoop",
       'location': {'address': '920 Queen St. E',
        'crossStreet': 'btwn Logan Ave. & Morse St.',
        'lat': 43.660655832455014,
        'lng': -79.3420187548006,
        'labeledLatLngs': 

In [117]:
# function that extracts the category of the venue
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

#### Searching nearby venues

In [118]:
venues = results['response']['groups'][0]['items']
    
nearby_venues = json_normalize(venues) # flatten JSON

# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues =nearby_venues.loc[:, filtered_columns]

# filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues.head()

  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,name,categories,lat,lng
0,Ed's Real Scoop,Ice Cream Shop,43.660656,-79.342019
1,Queen Books,Bookstore,43.660651,-79.342267
2,Te Aro,Coffee Shop,43.661373,-79.338577
3,The Bone House,Pet Store,43.660894,-79.341097
4,Hooked,Fish Market,43.660407,-79.343257


#### Number of venues obtanied

In [119]:
print('{} venues were returned by Foursquare.'.format(nearby_venues.shape[0]))

36 venues were returned by Foursquare.


#### Exploring neighborhoods of Toronto 

In [120]:
# function for getting the venues nearby 
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    venues_list=[]
    
    for name, lat, lng in zip(names, latitudes, longitudes):
        # print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

#### Create a new dataframe toronto_word_venues

In [121]:
# function to get venues nearby for each Neighborhood
toronto_word_venues = getNearbyVenues(names=toronto_word['Neighborhood'],
                                   latitudes=toronto_word['Latitude'],
                                   longitudes=toronto_word['Longitude']
                                  )

In [124]:
print(toronto_word_venues.shape)
toronto_word_venues.head()

(1568, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,"The Beaches, The Beaches, The Beaches",43.676357,-79.293031,Glen Manor Ravine,43.676821,-79.293942,Trail
1,"The Beaches, The Beaches, The Beaches",43.676357,-79.293031,The Big Carrot Natural Food Market,43.678879,-79.297734,Health Food Store
2,"The Beaches, The Beaches, The Beaches",43.676357,-79.293031,Grover Pub and Grub,43.679181,-79.297215,Pub
3,"The Beaches, The Beaches, The Beaches",43.676357,-79.293031,Upper Beaches,43.680563,-79.292869,Neighborhood
4,"The Danforth West, Riverdale, The Danforth Wes...",43.679557,-79.352188,MenEssentials,43.67782,-79.351265,Cosmetics Shop


In [126]:
# group by neighborhood
toronto_word_venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"Adelaide, King, Richmond, Adelaide, King, Richmond, Adelaide, King, Richmond",92,92,92,92,92,92
"Berczy Park, Berczy Park, Berczy Park",59,59,59,59,59,59
"Brockton, Exhibition Place, Parkdale Village, Brockton, Exhibition Place, Parkdale Village, Brockton, Exhibition Place, Parkdale Village",24,24,24,24,24,24
"Business Reply Mail Processing Centre 969 Eastern, Business Reply Mail Processing Centre 969 Eastern, Business Reply Mail Processing Centre 969 Eastern",16,16,16,16,16,16
"CN Tower, Bathurst Quay, Island airport, Harbourfront West, King and Spadina, Railway Lands, South Niagara, CN Tower, Bathurst Quay, Island airport, Harbourfront West, King and Spadina, Railway Lands, South Niagara, CN Tower, Bathurst Quay, Island airport, Harbourfront West, King and Spadina, Railway Lands, South Niagara",14,14,14,14,14,14
"Cabbagetown, St. James Town, Cabbagetown, St. James Town, Cabbagetown, St. James Town",43,43,43,43,43,43
"Central Bay Street, Central Bay Street, Central Bay Street",62,62,62,62,62,62
"Chinatown, Grange Park, Kensington Market, Chinatown, Grange Park, Kensington Market, Chinatown, Grange Park, Kensington Market",61,61,61,61,61,61
"Christie, Christie, Christie",16,16,16,16,16,16
"Church and Wellesley, Church and Wellesley, Church and Wellesley",79,79,79,79,79,79


In [129]:
# print the number of distinct categories
print('There are {} uniques categories.'.format(len(toronto_word_venues['Venue Category'].unique())))

There are 229 uniques categories.


#### Analyzing each neighborhood

In [130]:
# one hot encoding
toronto_onehot = pd.get_dummies(toronto_word_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
toronto_onehot['Neighborhood'] = toronto_word_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]

toronto_onehot.head()

Unnamed: 0,Yoga Studio,Adult Boutique,Afghan Restaurant,Airport,Airport Food Court,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,...,Theater,Theme Restaurant,Tibetan Restaurant,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [131]:
#Create dataframe where venues are grouped by frequency for each Neighborhood
toronto_grouped = toronto_onehot.groupby('Neighborhood').mean().reset_index()
toronto_grouped

Unnamed: 0,Neighborhood,Yoga Studio,Adult Boutique,Afghan Restaurant,Airport,Airport Food Court,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,...,Theater,Theme Restaurant,Tibetan Restaurant,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar
0,"Adelaide, King, Richmond, Adelaide, King, Rich...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.021739,...,0.01087,0.0,0.0,0.0,0.0,0.0,0.01087,0.0,0.0,0.0
1,"Berczy Park, Berczy Park, Berczy Park",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.016949,0.0,0.0,0.0
2,"Brockton, Exhibition Place, Parkdale Village, ...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Business Reply Mail Processing Centre 969 East...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,"CN Tower, Bathurst Quay, Island airport, Harbo...",0.0,0.0,0.0,0.071429,0.071429,0.142857,0.142857,0.142857,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,"Cabbagetown, St. James Town, Cabbagetown, St. ...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,"Central Bay Street, Central Bay Street, Centra...",0.016129,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.016129,0.0,0.0,0.016129
7,"Chinatown, Grange Park, Kensington Market, Chi...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.04918,0.0,0.04918,0.016393
8,"Christie, Christie, Christie",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,"Church and Wellesley, Church and Wellesley, Ch...",0.025316,0.012658,0.012658,0.0,0.0,0.0,0.0,0.0,0.012658,...,0.012658,0.012658,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [132]:
toronto_grouped.shape

(38, 229)

#### Printing Neighborhood with the top 20 most common venues

In [135]:
num_top_venues = 20

for hood in toronto_grouped['Neighborhood']:
    print("----"+hood+"----")
    temp = toronto_grouped[toronto_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Adelaide, King, Richmond, Adelaide, King, Richmond, Adelaide, King, Richmond----
                  venue  freq
0           Coffee Shop  0.11
1                  Café  0.05
2            Restaurant  0.04
3       Thai Restaurant  0.03
4        Clothing Store  0.03
5                   Gym  0.03
6         Deli / Bodega  0.03
7                 Hotel  0.03
8          Concert Hall  0.02
9           Salad Place  0.02
10               Bakery  0.02
11   Seafood Restaurant  0.02
12          Pizza Place  0.02
13        Burrito Place  0.02
14            Bookstore  0.02
15           Steakhouse  0.02
16     Sushi Restaurant  0.02
17       Cosmetics Shop  0.02
18  American Restaurant  0.02
19                Plaza  0.01


----Berczy Park, Berczy Park, Berczy Park----
                          venue  freq
0                   Coffee Shop  0.10
1                  Cocktail Bar  0.07
2                        Bakery  0.05
3                    Restaurant  0.03
4            Seafood Restaurant  0.03
5        

#### Putting the top 20 most common venues into a pandas dataframe

In [138]:
# function to retrieve 20 most common venues for each Neighborhood
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    return row_categories_sorted.index.values[0:num_top_venues]

num_top_venues = 20

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_vsorted = pd.DataFrame(columns=columns)
neighborhoods_vsorted['Neighborhood'] = toronto_grouped['Neighborhood']

for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_vsorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_vsorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,...,11th Most Common Venue,12th Most Common Venue,13th Most Common Venue,14th Most Common Venue,15th Most Common Venue,16th Most Common Venue,17th Most Common Venue,18th Most Common Venue,19th Most Common Venue,20th Most Common Venue
0,"Adelaide, King, Richmond, Adelaide, King, Rich...",Coffee Shop,Café,Restaurant,Hotel,Clothing Store,Deli / Bodega,Thai Restaurant,Gym,Salad Place,...,Pizza Place,Steakhouse,Sushi Restaurant,Burrito Place,American Restaurant,Cosmetics Shop,Bookstore,Concert Hall,Seafood Restaurant,Breakfast Spot
1,"Berczy Park, Berczy Park, Berczy Park",Coffee Shop,Cocktail Bar,Bakery,Restaurant,Pharmacy,Cheese Shop,Seafood Restaurant,Beer Bar,Farmers Market,...,Café,Lounge,Hotel,Shopping Mall,Beach,Liquor Store,Department Store,Park,Breakfast Spot,Bistro
2,"Brockton, Exhibition Place, Parkdale Village, ...",Café,Coffee Shop,Breakfast Spot,Nightclub,Pet Store,Bar,Burrito Place,Restaurant,Climbing Gym,...,Bakery,Office,Convenience Store,Italian Restaurant,Intersection,Furniture / Home Store,Gym,Stadium,Grocery Store,Baby Store
3,Business Reply Mail Processing Centre 969 East...,Light Rail Station,Pizza Place,Skate Park,Brewery,Farmers Market,Spa,Fast Food Restaurant,Restaurant,Burrito Place,...,Auto Workshop,Comic Shop,Park,Garden,Garden Center,Coworking Space,Discount Store,Donut Shop,Comfort Food Restaurant,Doner Restaurant
4,"CN Tower, Bathurst Quay, Island airport, Harbo...",Airport Service,Airport Terminal,Airport Lounge,Sculpture Garden,Rental Car Location,Boutique,Plane,Harbor / Marina,Airport Food Court,...,Boat or Ferry,Ethiopian Restaurant,Distribution Center,Escape Room,Electronics Store,Eastern European Restaurant,Dumpling Restaurant,Donut Shop,Event Space,Doner Restaurant


#### Clustering Neighborhood and K-Means

In [139]:
# initiating the k-means algorithm to cluster the neighborhoods
# set number of clusters
kclusters = 5

toronto_grouped_clustering = toronto_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [147]:
# clustering top 20 venues for each Neighborhood and creating a dataframe
toronto_merged = toronto_word

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
toronto_merged = toronto_merged.join(neighborhoods_vsorted.set_index('Neighborhood'), on='Neighborhood')

toronto_merged.head()

Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,...,11th Most Common Venue,12th Most Common Venue,13th Most Common Venue,14th Most Common Venue,15th Most Common Venue,16th Most Common Venue,17th Most Common Venue,18th Most Common Venue,19th Most Common Venue,20th Most Common Venue
0,M4E,East Toronto,"The Beaches, The Beaches, The Beaches",43.676357,-79.293031,0,Trail,Health Food Store,Pub,Wine Bar,...,Doner Restaurant,Dog Run,Distribution Center,Discount Store,Diner,Dessert Shop,Dance Studio,Deli / Bodega,Event Space,Cupcake Shop
1,M4K,East Toronto,"The Danforth West, Riverdale, The Danforth Wes...",43.679557,-79.352188,0,Greek Restaurant,Coffee Shop,Italian Restaurant,Ice Cream Shop,...,Liquor Store,Juice Bar,Indian Restaurant,Frozen Yogurt Shop,Spa,Dessert Shop,Cosmetics Shop,Caribbean Restaurant,Café,Bubble Tea Shop
2,M4L,East Toronto,"The Beaches West, India Bazaar, The Beaches We...",43.668999,-79.315572,0,Fast Food Restaurant,Park,Sandwich Place,Ice Cream Shop,...,Pet Store,Brewery,Gym,Movie Theater,Liquor Store,Board Shop,Fish & Chips Shop,Discount Store,Diner,Dessert Shop
3,M4M,East Toronto,"Studio District, Studio District, Studio District",43.659526,-79.340923,0,Coffee Shop,Bakery,Gastropub,Brewery,...,Clothing Store,Pet Store,Comfort Food Restaurant,Yoga Studio,Bar,Middle Eastern Restaurant,Diner,Latin American Restaurant,Italian Restaurant,Fish Market
4,M4N,Central Toronto,"Lawrence Park, Lawrence Park, Lawrence Park",43.72802,-79.38879,4,Bus Line,Swim School,Park,Comfort Food Restaurant,...,Donut Shop,Doner Restaurant,Dog Run,Distribution Center,Discount Store,Diner,Colombian Restaurant,Comic Shop,Event Space,Department Store


#### Visualizing clusters using folium

In [148]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(
        toronto_merged['Latitude'], 
        toronto_merged['Longitude'], 
        toronto_merged['Neighborhood'], 
        toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters