<a href="https://colab.research.google.com/github/U-Power/Coursera_Capstone/blob/main/TorontoNeighborhoods1.2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Segmenting and Clustering Neighborhoods in Toronto**

## Getting Toronto information on the web and scraping the url page

In [1]:
# import the beautiful soup package to scrape the web page
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np

In [2]:
# create a bs4 object
r = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M')
soup = BeautifulSoup(r.text, 'html.parser')
soup.prettify();

In [3]:
# from a closer look you can find the lines containing the three informations needed for each PostalCode (TAG 'p')
soup.tbody.find_all('p');

In [4]:
# extract the row string containing the 3 metrics needed
row_data = [element.text for element in soup.tbody.find_all('p')]
row_data[0:5]

['M1ANot assigned\n',
 'M2ANot assigned\n',
 'M3ANorth York(Parkwoods)\n',
 'M4ANorth York(Victoria Village)\n',
 'M5ADowntown Toronto(Regent Park / Harbourfront)\n']

In [5]:
# extract the postal codes, getting the first 3 digits of each string. Add them to a list
pc = [];
for i in range(len(row_data)):
  pc.append(row_data[i][0:3])
pc[0:5]

['M1A', 'M2A', 'M3A', 'M4A', 'M5A']

In [6]:
# extract the rest of data and then split it, adding the first element of the split to the Borough list
borough = [];
for i in range(len(row_data)):
  borough.append(row_data[i][3:].split('(')[0].split('\n')[0])
borough[0:5]


['Not assigned',
 'Not assigned',
 'North York',
 'North York',
 'Downtown Toronto']

In [7]:
# extract the rest of data and then split it, adding the second element of the split to the Neighborhoods list (add 'Not assigned', first element if there is not a second one)
neighb = [];
for i in range(len(row_data)):
  try: neighb.append(row_data[i][3:].split('(')[1].split(')')[0].strip().replace(' / ',', ')) 
  except: neighb.append(row_data[i][3:].split('(')[0].split('\n')[0])

neighb[0:5]

['Not assigned',
 'Not assigned',
 'Parkwoods',
 'Victoria Village',
 'Regent Park, Harbourfront']

## Create a pandas DataFrame and cleanse the data

In [8]:
# create a dictionary to store the 3 columns needed in the dataFrame
data = {}
data['PostalCode'] = pc
data['Borough'] = borough
data['Neighborhood'] = neighb

#create a DataFrame
df = pd.DataFrame(data)
df[0:5]

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


In [9]:
# assign NaN values to 'Not assigned' rows and drop them
df.Borough.replace('Not assigned', np.nan, inplace=True)
df.dropna(subset=['Borough'], inplace=True)
df.reset_index(drop=True, inplace=True)

In [10]:
# take a look at the data grouping by 'Borough' and adjust the rows labels
df.groupby('Borough').describe()

Unnamed: 0_level_0,PostalCode,PostalCode,PostalCode,PostalCode,Neighborhood,Neighborhood,Neighborhood,Neighborhood
Unnamed: 0_level_1,count,unique,top,freq,count,unique,top,freq
Borough,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
Central Toronto,9,9,M5P,1,9,9,"The Annex, North Midtown, Yorkville",1
Downtown Toronto,17,17,M5T,1,17,17,"Harbourfront East, Union Station, Toronto Islands",1
Downtown TorontoStn A PO Boxes25 The Esplanade,1,1,M5W,1,1,1,Enclave of M5E,1
East Toronto,4,4,M4M,1,4,4,Studio District,1
East TorontoBusiness reply mail Processing Centre969 Eastern,1,1,M7Y,1,1,1,Enclave of M4L,1
East York,4,4,M4B,1,4,4,"Parkview Hill, Woodbine Gardens",1
East YorkEast Toronto,1,1,M4J,1,1,1,The Danforth East,1
Etobicoke,11,11,M9V,1,11,11,Westmount,1
EtobicokeNorthwest,1,1,M9W,1,1,1,"Clairville, Humberwood, Woodbine Downs, West H...",1
MississaugaCanada Post Gateway Processing Centre,1,1,M7R,1,1,1,Enclave of L4W,1


In [11]:
# adjust the label of some borough names
df['Borough']=df['Borough'].replace({'Downtown TorontoStn A PO Boxes25 The Esplanade':'Downtown Toronto Stn A',
                                             'East TorontoBusiness reply mail Processing Centre969 Eastern':'East Toronto Business',
                                             'EtobicokeNorthwest':'Etobicoke Northwest','East YorkEast Toronto':'East York/East Toronto',
                                             'MississaugaCanada Post Gateway Processing Centre':'Mississauga'})

In [12]:
print('The dimensions of the Data Frame are:', df.shape)

The dimensions of the Data Frame are: (103, 3)


### It is possible to add more rows for every Neighborhood (to make a better analysis)

In [13]:
# try to duplicate the rows which have multiple Neighborhood values in order to map them separately
new_df = pd.DataFrame(df.Neighborhood.str.split(', ').tolist(), index=df.PostalCode).stack()
new_df = new_df.reset_index([0, 'PostalCode'])
new_df.columns = ['PostalCode', 'Neighborhood']
new_df.head(10)

Unnamed: 0,PostalCode,Neighborhood
0,M3A,Parkwoods
1,M4A,Victoria Village
2,M5A,Regent Park
3,M5A,Harbourfront
4,M6A,Lawrence Manor
5,M6A,Lawrence Heights
6,M7A,Ontario Provincial Government
7,M9A,Islington Avenue
8,M1B,Malvern
9,M1B,Rouge


In [14]:
# create a separate df which links postal codes to boroughs
df_bor = df.drop('Neighborhood', axis=1)
df_bor

Unnamed: 0,PostalCode,Borough
0,M3A,North York
1,M4A,North York
2,M5A,Downtown Toronto
3,M6A,North York
4,M7A,Queen's Park
...,...,...
98,M8X,Etobicoke
99,M4Y,Downtown Toronto
100,M7Y,East Toronto Business
101,M8Y,Etobicoke


In [15]:
# merge the dataFrames to have a complete single row for each neighborhood with the borough assigned
df_adj = new_df.merge(df_bor, how='inner')
df_adj

Unnamed: 0,PostalCode,Neighborhood,Borough
0,M3A,Parkwoods,North York
1,M4A,Victoria Village,North York
2,M5A,Regent Park,Downtown Toronto
3,M5A,Harbourfront,Downtown Toronto
4,M6A,Lawrence Manor,North York
...,...,...,...
211,M8Z,Mimico NW,Etobicoke
212,M8Z,The Queensway West,Etobicoke
213,M8Z,South of Bloor,Etobicoke
214,M8Z,Kingsway Park South West,Etobicoke


# Use the geopy Python library to get Neighborhood coordinates
**In order to simplify you can drop this section and use the CSV provided instead.**
**It is suggested since getting the coordinates for every Neighborhood does not give additional informations compared with using postal codes. Also, it takes 2-3 min to run the cycle for all the Neighborhoods**

In [17]:
# import the libraries
!pip install geopy # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values



In [18]:
# get the coordinates for every neighborhood in Toronto and append them to the dataFrame (it takes 2-3 min)
latitudes = [];
longitudes = [];
 
for i in range(len(df_adj['Neighborhood'])):
  try:
    address = 'Toronto, Ontario, ' + df_adj['Neighborhood'][i]
    geolocator = Nominatim(user_agent="toronto_explorer")
    location = geolocator.geocode(address)
    latitude = location.latitude
    longitude = location.longitude
    latitudes.append(latitude)
    longitudes.append(longitude)
  except:
    latitudes.append('NaN')
    longitudes.append('NaN')


df_fin = df_adj
df_fin['Latitude'] = latitudes
df_fin['Longitude'] = longitudes

In [19]:
# pay attention at the unique coordinates values we get for every Neighborhood
# it can be compared with the unique values we get from the csv provided
df_fin.describe()

Unnamed: 0,PostalCode,Neighborhood,Borough,Latitude,Longitude
count,216,216,216,216.0,216.0
unique,103,208,15,179.0,179.0
top,M9W,Downsview,Etobicoke,,
freq,9,4,44,10.0,10.0


In [20]:
# the NaN rows can be eliminated in order to ease the next part of the 4square API 
df_fin = df_fin[df_fin.Latitude != 'NaN']
df_fin.describe()

Unnamed: 0,PostalCode,Neighborhood,Borough,Latitude,Longitude
count,206,206,206,206.0,206.0
unique,98,198,11,178.0,178.0
top,M9W,Downsview,Etobicoke,43.749299,-79.629129
freq,9,4,41,4.0,4.0


In [21]:
# get the csv file, put it in a dataFrame
url = 'https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBMDeveloperSkillsNetwork-DS0701EN-SkillsNetwork/labs_v1/Geospatial_Coordinates.csv'
latlong_df = pd.read_csv(url)
latlong_df.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [22]:
# notice that the unique coordinates values in the CSV and in the dataframe built with the API
print('The number of neighborhood coord for the CSV: {} \nThe number of neighborhood coord fetched with the API: {} \nSo we can make a better analysis using the data fetched from the API'.format(latlong_df['Latitude'].nunique(), df_fin['Latitude'].nunique()))

The number of neighborhood coord for the CSV: 103 
The number of neighborhood coord fetched with the API: 178 
So we can make a better analysis using the data fetched from the API


# Segmenting Toronto Neighborhoods using 4square API

## Map the neighborhood with Folium library

In [26]:
# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

!pip install folium # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

print('Libraries seccessfully imported')

Libraries seccessfully imported


In [27]:
# get the latitude and longitude of Toronto
address = 'Toronto, Ontario, Canada'

geolocator = Nominatim(user_agent="toronto_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.6534817, -79.3839347.


In [28]:
# create map of New York using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)
map_toronto

# add markers to map
for lat, lng, borough, neighborhood in zip(df_fin['Latitude'], df_fin['Longitude'], df_fin['Borough'], df_fin['Neighborhood']):
  try:
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto) 
  except Exception:
    pass

map_toronto

## Now let's get informaton on the venues nearby every Neighborhood

In [23]:
#@title Personal data

CLIENT_ID = '5JKJ2KFPVYPAREZ2AOHPCMO3WXDUSKYGR2ZAO0QO3JUQOQC0' # your Foursquare ID
CLIENT_SECRET = 'VUFHA0PCC343VCPF4ULZLUHVNRZMNJDPFNT0EWW3HVHVKDY2' # your Foursquare Secret
ACCESS_TOKEN = 'G51ZR5XS3NMIKFO53YZ4RKDVW3OO1GBG1HIZ0VEHXCE0ROND' # your FourSquare Access Token
VERSION = '20180605' # Foursquare API version
LIMIT = 100 # A default Foursquare API limit value


### Run the 4square API to fetch infos on the venues of each Neighborhood

In [29]:
# let's make an example
neighborhood_latitude = df_fin.loc[4, 'Latitude'] # neighborhood latitude value
neighborhood_longitude = df_fin.loc[4, 'Longitude'] # neighborhood longitude value

neighborhood_name = df_fin.loc[4, 'Neighborhood'] # neighborhood name

print('Latitude and longitude values of {} are {}, {}.'.format(neighborhood_name, 
                                                               neighborhood_latitude, 
                                                               neighborhood_longitude))

Latitude and longitude values of Lawrence Manor are 43.7220788, -79.4375067.


In [30]:
#@title Example API
radius = 500
#search_query = 'TopPicks' # this query reduces the venues fetched by the API so it depends case by case if can be useful
url = 'https://api.foursquare.com/v2/venues/explore?client_id={}&client_secret={}&ll={},{}&oauth_token={}&v={}&radius={}&limit={}'.format(CLIENT_ID, CLIENT_SECRET, neighborhood_latitude, neighborhood_longitude,ACCESS_TOKEN, VERSION, radius, LIMIT)

In [31]:
results = requests.get(url).json()
results;

In [32]:
# function to create a dataframe with the API informations fetched
# credits for the function to Alex Aklson and Polong Lin
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [33]:
toronto_venues = getNearbyVenues(names=df_fin['Neighborhood'],
                                   latitudes=df_fin['Latitude'],
                                   longitudes=df_fin['Longitude']
                                  )

Parkwoods
Victoria Village
Regent Park
Harbourfront
Lawrence Manor
Lawrence Heights
Islington Avenue
Malvern
Rouge
Don Mills
Parkview Hill
Woodbine Gardens
Garden District
Ryerson
Glencairn
West Deane Park
Princess Gardens
Martin Grove
Islington
Cloverdale
Rouge Hill
Port Union
Highland Creek
Don Mills
Woodbine Heights
St. James Town
Humewood-Cedarvale
Eringate
Bloordale Gardens
Old Burnhamthorpe
Markland Wood
Guildwood
Morningside
West Hill
The Beaches
Berczy Park
Woburn
Leaside
Central Bay Street
Christie
Cedarbrae
Hillcrest Village
Bathurst Manor
Wilson Heights
Downsview North
Thorncliffe Park
Richmond
Adelaide
King
Dufferin
Dovercourt Village
Scarborough Village
Fairview
Henry Farm
Oriole
Northwood Park
York University
The Danforth  East
Harbourfront East
Union Station
Toronto Islands
Little Portugal
Trinity
Kennedy Park
Ionview
East Birchmount Park
Bayview Village
Downsview
The Danforth West
Riverdale
Toronto Dominion Centre
Design Exchange
Brockton
Parkdale Village
Exhibition Pla

In [34]:
# take a look at the resul set
print(toronto_venues.shape)
toronto_venues.head()

(5596, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Parkwoods,43.761124,-79.324059,Allwyn's Bakery,43.75984,-79.324719,Caribbean Restaurant
1,Parkwoods,43.761124,-79.324059,Tim Hortons,43.760668,-79.326368,Café
2,Parkwoods,43.761124,-79.324059,A&W,43.760643,-79.326865,Fast Food Restaurant
3,Parkwoods,43.761124,-79.324059,Food Basics,43.760549,-79.326045,Supermarket
4,Parkwoods,43.761124,-79.324059,Shoppers Drug Mart,43.760857,-79.324961,Pharmacy


### Analyse each Neighborhood

In [35]:
# Let's check how many venues were returned for each neighborhood
toronto_venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Adelaide,100,100,100,100,100,100
Agincourt,11,11,11,11,11,11
Agincourt North,26,26,26,26,26,26
Albion Gardens,10,10,10,10,10,10
Alderwood,8,8,8,8,8,8
...,...,...,...,...,...,...
Woodbine Heights,8,8,8,8,8,8
York Mills,17,17,17,17,17,17
York Mills West,17,17,17,17,17,17
York University,25,25,25,25,25,25


In [36]:
print('There are {} uniques categories.'.format(len(toronto_venues['Venue Category'].unique())))

There are 331 uniques categories.


In [37]:
# create dummies for the venues categories in order to run a clustering model
# one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
toronto_onehot['Neighborhood'] = toronto_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]

print(toronto_onehot.shape)
toronto_onehot.head()

(5596, 331)


Unnamed: 0,Yoga Studio,ATM,Accessories Store,Adult Boutique,Afghan Restaurant,African Restaurant,Airport,Airport Service,Airport Terminal,American Restaurant,Animal Shelter,Antique Shop,Aquarium,Art Gallery,Art Museum,Arts & Crafts Store,Arts & Entertainment,Asian Restaurant,Athletics & Sports,Auto Dealership,Auto Garage,Auto Workshop,Automotive Shop,BBQ Joint,Bagel Shop,Bakery,Bank,Bar,Baseball Field,Baseball Stadium,Basketball Stadium,Beach,Beach Bar,Bed & Breakfast,Beer Bar,Beer Store,Belgian Restaurant,Big Box Store,Bike Trail,Bistro,...,Speakeasy,Sporting Goods Shop,Sports Bar,Steakhouse,Storage Facility,Strip Club,Supermarket,Supplement Shop,Sushi Restaurant,Syrian Restaurant,Taco Place,Tailor Shop,Taiwanese Restaurant,Tanning Salon,Tapas Restaurant,Tea Room,Tennis Court,Thai Restaurant,Theater,Theme Park,Theme Park Ride / Attraction,Theme Restaurant,Thrift / Vintage Store,Tibetan Restaurant,Toy / Game Store,Trail,Train Station,Tree,Tunnel,Turkish Restaurant,University,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Whisky Bar,Wine Bar,Wings Joint,Women's Store
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [38]:
# group rows by neighborhood and by taking the mean of the frequency of occurrence of each category
toronto_grouped = toronto_onehot.groupby('Neighborhood').mean().reset_index()
toronto_grouped

Unnamed: 0,Neighborhood,Yoga Studio,ATM,Accessories Store,Adult Boutique,Afghan Restaurant,African Restaurant,Airport,Airport Service,Airport Terminal,American Restaurant,Animal Shelter,Antique Shop,Aquarium,Art Gallery,Art Museum,Arts & Crafts Store,Arts & Entertainment,Asian Restaurant,Athletics & Sports,Auto Dealership,Auto Garage,Auto Workshop,Automotive Shop,BBQ Joint,Bagel Shop,Bakery,Bank,Bar,Baseball Field,Baseball Stadium,Basketball Stadium,Beach,Beach Bar,Bed & Breakfast,Beer Bar,Beer Store,Belgian Restaurant,Big Box Store,Bike Trail,...,Speakeasy,Sporting Goods Shop,Sports Bar,Steakhouse,Storage Facility,Strip Club,Supermarket,Supplement Shop,Sushi Restaurant,Syrian Restaurant,Taco Place,Tailor Shop,Taiwanese Restaurant,Tanning Salon,Tapas Restaurant,Tea Room,Tennis Court,Thai Restaurant,Theater,Theme Park,Theme Park Ride / Attraction,Theme Restaurant,Thrift / Vintage Store,Tibetan Restaurant,Toy / Game Store,Trail,Train Station,Tree,Tunnel,Turkish Restaurant,University,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Whisky Bar,Wine Bar,Wings Joint,Women's Store
0,Adelaide,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.03,0.0,0.0,0.0,0.01,0.0,0.0,0.0,0.020000,0.000,0.0,0.0,0.0,0.0,0.0,0.00,0.010000,0.000000,0.01,0.0,0.0,0.0,0.0,0.0,0.0,0.01,0.000000,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.01,0.0,0.0,0.0,0.0,0.01,0.0,0.0,0.01,0.0,0.0,0.0,0.01,0.0,0.000000,0.01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.01,0.0,0.00,0.000000,0.0,0.0,0.01,0.0,0.00
1,Agincourt,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.090909,0.000,0.0,0.0,0.0,0.0,0.0,0.00,0.000000,0.000000,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.000000,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.00,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.00,0.0,0.0,0.0,0.00,0.0,0.000000,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.090909,0.0,0.0,0.0,0.0,0.00,0.0,0.00,0.090909,0.0,0.0,0.00,0.0,0.00
2,Agincourt North,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.000000,0.000,0.0,0.0,0.0,0.0,0.0,0.00,0.038462,0.076923,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.038462,0.0,0.0,0.0,...,0.0,0.038462,0.0,0.00,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.00,0.0,0.0,0.0,0.00,0.0,0.000000,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.00,0.0,0.00,0.038462,0.0,0.0,0.00,0.0,0.00
3,Albion Gardens,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.000000,0.000,0.0,0.1,0.0,0.0,0.0,0.00,0.000000,0.000000,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.100000,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.00,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.00,0.0,0.0,0.0,0.00,0.0,0.000000,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.00,0.0,0.00,0.000000,0.0,0.0,0.00,0.0,0.00
4,Alderwood,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.000000,0.000,0.0,0.0,0.0,0.0,0.0,0.00,0.000000,0.000000,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.000000,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.00,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.00,0.0,0.0,0.0,0.00,0.0,0.000000,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.00,0.0,0.00,0.000000,0.0,0.0,0.00,0.0,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
192,Woodbine Heights,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.000000,0.125,0.0,0.0,0.0,0.0,0.0,0.00,0.000000,0.000000,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.000000,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.00,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.00,0.0,0.0,0.0,0.00,0.0,0.000000,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.00,0.0,0.00,0.000000,0.0,0.0,0.00,0.0,0.00
193,York Mills,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.000000,0.000,0.0,0.0,0.0,0.0,0.0,0.00,0.000000,0.000000,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.000000,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.00,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.00,0.0,0.0,0.0,0.00,0.0,0.058824,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.00,0.0,0.00,0.000000,0.0,0.0,0.00,0.0,0.00
194,York Mills West,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.000000,0.000,0.0,0.0,0.0,0.0,0.0,0.00,0.000000,0.000000,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.000000,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.00,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.00,0.0,0.0,0.0,0.00,0.0,0.058824,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.00,0.0,0.00,0.000000,0.0,0.0,0.00,0.0,0.00
195,York University,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.04,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.000000,0.040,0.0,0.0,0.0,0.0,0.0,0.04,0.000000,0.040000,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.000000,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.00,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.00,0.0,0.0,0.0,0.00,0.0,0.000000,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.00,0.0,0.00,0.000000,0.0,0.0,0.00,0.0,0.00


In [40]:
# function to sort the venues in descending order
# credits for the function to Alex Aklson and Polong Lin
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [41]:
# new dataframe that includes the cluster as well as the top N venues for each neighborhood
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = toronto_grouped['Neighborhood']

for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Adelaide,Coffee Shop,Café,Restaurant,Gym,Hotel,Japanese Restaurant,Italian Restaurant,Clothing Store,Gastropub,American Restaurant
1,Agincourt,Chinese Restaurant,Hong Kong Restaurant,Coffee Shop,Korean Restaurant,Cantonese Restaurant,Asian Restaurant,Train Station,Food Court,Peking Duck Restaurant,Vietnamese Restaurant
2,Agincourt North,Bank,Chinese Restaurant,Liquor Store,Fast Food Restaurant,Bakery,Beer Store,Sporting Goods Shop,Frozen Yogurt Shop,Fried Chicken Joint,Spa
3,Albion Gardens,Grocery Store,Pharmacy,Fast Food Restaurant,Fried Chicken Joint,Beer Store,Sandwich Place,Auto Garage,Caribbean Restaurant,Pizza Place,Cuban Restaurant
4,Alderwood,Pizza Place,Pharmacy,Sandwich Place,Playground,Coffee Shop,Pub,Gym,Eastern European Restaurant,Doctor's Office,Dog Run


## Run k-means to cluster the neighborhood into N clusters

In [42]:
# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

In [43]:
# set number of clusters
kclusters = 5

toronto_grouped_clustering = toronto_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:20]

array([0, 0, 0, 2, 2, 1, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
      dtype=int32)

In [44]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

toronto_merged = df_fin

# merge toronto_grouped with df_fin to add latitude/longitude for each neighborhood
toronto_merged = toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

toronto_merged.head() # check the last columns!

Unnamed: 0,PostalCode,Neighborhood,Borough,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M3A,Parkwoods,North York,43.7611,-79.3241,0.0,Caribbean Restaurant,Shopping Mall,Discount Store,Chinese Restaurant,Laundry Service,Supermarket,Café,Playground,Coffee Shop,Pizza Place
1,M4A,Victoria Village,North York,43.7327,-79.3112,0.0,Thai Restaurant,Spa,Mediterranean Restaurant,Asian Restaurant,Middle Eastern Restaurant,Event Space,Dumpling Restaurant,Eastern European Restaurant,Egyptian Restaurant,Electronics Store
2,M5A,Regent Park,Downtown Toronto,43.6607,-79.3605,0.0,Coffee Shop,Thai Restaurant,Restaurant,Pub,Beer Store,Electronics Store,Grocery Store,Pool,Indian Restaurant,Animal Shelter
3,M5A,Harbourfront,Downtown Toronto,43.6401,-79.3801,0.0,Coffee Shop,Café,Hotel,Restaurant,Pizza Place,Italian Restaurant,Steakhouse,Chinese Restaurant,Brewery,Music Venue
4,M6A,Lawrence Manor,North York,43.7221,-79.4375,1.0,Doctor's Office,Bank,Electronics Store,Park,Kids Store,Women's Store,Event Space,Dumpling Restaurant,Eastern European Restaurant,Egyptian Restaurant


In [45]:
# look for NaN values in the dataframe
print('Are there NaN values in the dataframe?:', toronto_merged.isnull().values.any(), '\n')
is_NaN = toronto_merged.isnull()
row_has_NaN = is_NaN.any(axis=1)
rows_with_NaN = toronto_merged[row_has_NaN]

print(rows_with_NaN)

Are there NaN values in the dataframe?: True 

   PostalCode     Neighborhood  ... 9th Most Common Venue 10th Most Common Venue
16        M9B  West Deane Park  ...                   NaN                    NaN

[1 rows x 16 columns]


In [46]:
# eliminate the useless row
toronto_merged.drop([rows_with_NaN.index[0]], inplace = True)

In [47]:
# check if the cleansing is ok
print('Are there NaN values in the dataframe?:', toronto_merged.isnull().values.any(), '\n')

Are there NaN values in the dataframe?: False 



In [48]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighborhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[int(round(cluster-1))],
        fill=True,
        fill_color=rainbow[int(round(cluster-1))],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

## Look at the clusters

In [49]:
# cluster 1 
toronto_merged.loc[toronto_merged['Cluster Labels'] == 0].head()


Unnamed: 0,PostalCode,Neighborhood,Borough,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M3A,Parkwoods,North York,43.7611,-79.3241,0.0,Caribbean Restaurant,Shopping Mall,Discount Store,Chinese Restaurant,Laundry Service,Supermarket,Café,Playground,Coffee Shop,Pizza Place
1,M4A,Victoria Village,North York,43.7327,-79.3112,0.0,Thai Restaurant,Spa,Mediterranean Restaurant,Asian Restaurant,Middle Eastern Restaurant,Event Space,Dumpling Restaurant,Eastern European Restaurant,Egyptian Restaurant,Electronics Store
2,M5A,Regent Park,Downtown Toronto,43.6607,-79.3605,0.0,Coffee Shop,Thai Restaurant,Restaurant,Pub,Beer Store,Electronics Store,Grocery Store,Pool,Indian Restaurant,Animal Shelter
3,M5A,Harbourfront,Downtown Toronto,43.6401,-79.3801,0.0,Coffee Shop,Café,Hotel,Restaurant,Pizza Place,Italian Restaurant,Steakhouse,Chinese Restaurant,Brewery,Music Venue
5,M6A,Lawrence Heights,North York,43.7228,-79.4509,0.0,Clothing Store,Coffee Shop,Restaurant,Furniture / Home Store,Women's Store,Sporting Goods Shop,Men's Store,Toy / Game Store,American Restaurant,Food Court


In [50]:
# cluster 2 
toronto_merged.loc[toronto_merged['Cluster Labels'] == 1].head()

Unnamed: 0,PostalCode,Neighborhood,Borough,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
4,M6A,Lawrence Manor,North York,43.7221,-79.4375,1.0,Doctor's Office,Bank,Electronics Store,Park,Kids Store,Women's Store,Event Space,Dumpling Restaurant,Eastern European Restaurant,Egyptian Restaurant
9,M1B,Rouge,Scarborough,43.8049,-79.1658,1.0,Park,Caribbean Restaurant,Fast Food Restaurant,Event Space,Donut Shop,Dumpling Restaurant,Eastern European Restaurant,Egyptian Restaurant,Electronics Store,Escape Room
22,M1C,Port Union,Scarborough,43.7755,-79.135,1.0,Park,Falafel Restaurant,Donut Shop,Dumpling Restaurant,Eastern European Restaurant,Egyptian Restaurant,Electronics Store,Escape Room,Ethiopian Restaurant,Event Space
28,M9C,Eringate,Etobicoke,43.6623,-79.5765,1.0,Park,Electronics Store,Falafel Restaurant,Donut Shop,Dumpling Restaurant,Eastern European Restaurant,Egyptian Restaurant,Escape Room,Ethiopian Restaurant,Event Space
30,M9C,Old Burnhamthorpe,Etobicoke,43.6394,-79.5844,1.0,Dog Run,Flower Shop,Gas Station,Park,Fish & Chips Shop,Ethiopian Restaurant,Donut Shop,Dumpling Restaurant,Eastern European Restaurant,Egyptian Restaurant


In [51]:
# cluster 3 
toronto_merged.loc[toronto_merged['Cluster Labels'] == 2].head()

Unnamed: 0,PostalCode,Neighborhood,Borough,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
91,M1M,Cliffside,Scarborough,43.7112,-79.2482,2.0,Auto Workshop,Breakfast Spot,Pizza Place,Sandwich Place,Park,Coffee Shop,Grocery Store,Pub,Escape Room,Donut Shop
93,M1M,Scarborough Village West,Scarborough,43.7469,-79.1997,2.0,Grocery Store,Pizza Place,Sandwich Place,Park,Coffee Shop,Bus Line,Women's Store,Escape Room,Donut Shop,Dumpling Restaurant
100,M6M,Del Ray,York,43.689,-79.494,2.0,Coffee Shop,Bus Line,Pizza Place,Fast Food Restaurant,American Restaurant,Tennis Court,Donut Shop,Dumpling Restaurant,Eastern European Restaurant,Egyptian Restaurant
101,M6M,Mount Dennis,York,43.687,-79.4896,2.0,Coffee Shop,Furniture / Home Store,Tennis Court,Bus Line,Pizza Place,Grocery Store,Doner Restaurant,Donut Shop,Dumpling Restaurant,Eastern European Restaurant
106,M1N,Cliffside West,Scarborough,43.7112,-79.2482,2.0,Auto Workshop,Breakfast Spot,Pizza Place,Sandwich Place,Park,Coffee Shop,Grocery Store,Pub,Escape Room,Donut Shop


In [52]:
# cluster 4 
toronto_merged.loc[toronto_merged['Cluster Labels'] == 3].head()

Unnamed: 0,PostalCode,Neighborhood,Borough,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
11,M4B,Parkview Hill,East York,43.7063,-79.3219,3.0,Construction & Landscaping,Women's Store,Farmers Market,Dumpling Restaurant,Eastern European Restaurant,Egyptian Restaurant,Electronics Store,Escape Room,Ethiopian Restaurant,Event Space
98,M5M,Bedford Park,North York,43.7374,-79.4109,3.0,Construction & Landscaping,Women's Store,Farmers Market,Dumpling Restaurant,Eastern European Restaurant,Egyptian Restaurant,Electronics Store,Escape Room,Ethiopian Restaurant,Event Space


In [53]:
# cluster 5 
toronto_merged.loc[toronto_merged['Cluster Labels'] == 4].head()

Unnamed: 0,PostalCode,Neighborhood,Borough,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
153,M1V,Steeles East,Scarborough,43.8162,-79.3145,4.0,Playground,Women's Store,Event Space,Doner Restaurant,Donut Shop,Dumpling Restaurant,Eastern European Restaurant,Egyptian Restaurant,Electronics Store,Escape Room
170,M9V,South Steeles,Etobicoke,43.8162,-79.3145,4.0,Playground,Women's Store,Event Space,Doner Restaurant,Donut Shop,Dumpling Restaurant,Eastern European Restaurant,Egyptian Restaurant,Electronics Store,Escape Room
178,M1W,Steeles West,Scarborough,43.8162,-79.3145,4.0,Playground,Women's Store,Event Space,Doner Restaurant,Donut Shop,Dumpling Restaurant,Eastern European Restaurant,Egyptian Restaurant,Electronics Store,Escape Room


In [54]:
# for a custom number of N cluster for K-Means, to have a sum up of the clusters
for i in range(len(np.unique(kmeans.labels_))):
  print(toronto_merged.loc[toronto_merged['Cluster Labels'] == i].head());

  PostalCode      Neighborhood  ... 9th Most Common Venue 10th Most Common Venue
0        M3A         Parkwoods  ...           Coffee Shop            Pizza Place
1        M4A  Victoria Village  ...   Egyptian Restaurant      Electronics Store
2        M5A       Regent Park  ...     Indian Restaurant         Animal Shelter
3        M5A      Harbourfront  ...               Brewery            Music Venue
5        M6A  Lawrence Heights  ...   American Restaurant             Food Court

[5 rows x 16 columns]
   PostalCode  ... 10th Most Common Venue
4         M6A  ...    Egyptian Restaurant
9         M1B  ...            Escape Room
22        M1C  ...            Event Space
28        M9C  ...            Event Space
30        M9C  ...    Egyptian Restaurant

[5 rows x 16 columns]
    PostalCode  ...       10th Most Common Venue
91         M1M  ...                   Donut Shop
93         M1M  ...          Dumpling Restaurant
100        M6M  ...          Egyptian Restaurant
101        M6M  ... 