# Segmenting and Clustering Neighborhoods in Toronto

## Exercise #1

In [88]:
#!pip install html5lib lxml bs4
import numpy as np
import pandas as pd

# Scrape the Wikipedia page into pandas: read_html returns one DataFrame per
# matching <table>; only tables with the "wikitable" CSS class are requested
# and the first match is kept (copied so later edits do not touch `dfs`).
WIKI_POSTCODES_URL = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
dfs = pd.read_html(WIKI_POSTCODES_URL, attrs={'class': 'wikitable'})
df = dfs[0].copy()

The dataset contains the string 'Not assigned' for missing values. We convert these to NA. We can fill in missing Neighbourhood values with the Borough value of the same row if that's available. Then we delete the rows that still contain missing data.

In [None]:
# 'Not assigned' is the table's placeholder for missing data; turn it into real NaN.
df = df.replace('Not assigned', np.nan)
# A missing Neighbourhood falls back to the Borough value of the same row.
df['Neighbourhood'] = df['Neighbourhood'].fillna(df['Borough'])
# Rows that still contain NaN (e.g. no Borough to fall back on) are dropped.
df = df.dropna()

Join rows with the same Postcode: the new row shall list all Neighbourhoods with that Postcode.

In [94]:
# Collapse rows sharing a Postcode into one: keep the first Borough seen and
# concatenate all Neighbourhood names with ', '.
aggregate_fns = dict(Borough='first', Neighbourhood=', '.join)
df = df.groupby('Postcode', as_index=False).agg(aggregate_fns)
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [95]:
# (number of rows, number of columns) of df
df.shape

(103, 3)

## Exercise #2

Load latitude, longitude coordinates of postal codes:

In [98]:
# CSV listing a latitude/longitude pair for each postal code.
GEOSPATIAL_CSV_URL = 'http://cocl.us/Geospatial_data'
coordinates = pd.read_csv(GEOSPATIAL_CSV_URL)
coordinates.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


Join the two dataframes on their common attribute, the postcode. The column has different names in the two, therefore
we need to specify both names using the `left_on` and `right_on` parameters.

In [96]:
# Inner join: only postcodes present in both tables survive.
df = df.merge(
    coordinates,
    how='inner',
    left_on='Postcode',
    right_on='Postal Code',
)

Remove the Postal Code coming from the coordinates dataframe as it's redundant

In [103]:
# After the merge 'Postal Code' carries the same values as 'Postcode'; keep only the latter.
df = df.drop(columns=['Postal Code'])
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


## Exercise #3

In [184]:
import requests
import folium
from geopy.geocoders import Nominatim
from sklearn.cluster import KMeans
import matplotlib.cm as cm
import matplotlib.colors as colors

Let's start with a general map of Toronto:

In [158]:
# Base map; `location` is a [latitude, longitude] pair.
toronto_map = folium.Map(location=[43.69, -79.38], zoom_start=10)
toronto_map

Let's add the neighbourhood info from our dataset to the map:

In [159]:
# One blue circle per row of df; its popup reads "<neighbourhoods>, <borough>".
marker_rows = df[['Latitude', 'Longitude', 'Borough', 'Neighbourhood']]
for lat, lng, borough, neighborhood in marker_rows.itertuples(index=False, name=None):
    popup = folium.Popup(f'{neighborhood}, {borough}', parse_html=True)
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=popup,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False,
    )
    marker.add_to(toronto_map)
toronto_map

Let's pick borough 'Downtown Toronto' for further investigation:

In [132]:
# Keep only the 'Downtown Toronto' rows and renumber them from 0.
is_downtown = df['Borough'].eq('Downtown Toronto')
downtown_data = df.loc[is_downtown].reset_index(drop=True)
downtown_data.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M4W,Downtown Toronto,Rosedale,43.679563,-79.377529
1,M4X,Downtown Toronto,"Cabbagetown, St. James Town",43.667967,-79.367675
2,M4Y,Downtown Toronto,Church and Wellesley,43.66586,-79.38316
3,M5A,Downtown Toronto,Harbourfront,43.65426,-79.360636
4,M5B,Downtown Toronto,"Ryerson, Garden District",43.657162,-79.378937


In [147]:
# Number of distinct Neighbourhood entries; dropna=False counts NaN (if any)
# as a value, exactly like len(unique()) does.
downtown_data['Neighbourhood'].nunique(dropna=False)

19

OK. Downtown has 19 neighborhoods. Let's borrow from the labs the function for fetching nearby venues using Foursquare:

In [127]:
CLIENT_ID = 'removed' # your Foursquare ID
CLIENT_SECRET = 'removed' # your Foursquare Secret
ACCESS_TOKEN = 'removed' # Auth Access Token
VERSION = '20180604'  # value sent as the API's `v` parameter
LIMIT = 30  # value sent as the API's `limit` parameter (max venues per location)

def getNearbyVenues(names, latitudes, longitudes, radius=500):
    """Fetch the venues around each location from the Foursquare "explore" endpoint.

    Parameters
    ----------
    names, latitudes, longitudes : iterables
        Parallel sequences describing the locations to query; they are walked
        together with ``zip``, so iteration stops at the shortest one.
    radius : int, default 500
        Forwarded unchanged as the request's ``radius`` parameter.

    Returns
    -------
    pandas.DataFrame
        One row per returned venue with the columns 'Neighborhood',
        'Neighborhood Latitude', 'Neighborhood Longitude', 'Venue',
        'Venue Latitude', 'Venue Longitude' and 'Venue Category'.
        The frame has these columns even when no venue was returned at all.
        'Venue Category' is None for a venue that carries no category.

    Raises
    ------
    requests.HTTPError
        If the API answers with an HTTP error status.
    """
    columns = ['Neighborhood',
               'Neighborhood Latitude',
               'Neighborhood Longitude',
               'Venue',
               'Venue Latitude',
               'Venue Longitude',
               'Venue Category']
    explore_url = 'https://api.foursquare.com/v2/venues/explore'

    rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)

        # let requests build (and URL-encode) the query string
        params = {
            'client_id': CLIENT_ID,
            'client_secret': CLIENT_SECRET,
            'v': VERSION,
            'll': '{},{}'.format(lat, lng),
            'radius': radius,
            'limit': LIMIT,
        }

        # a timeout keeps a stalled connection from hanging the notebook;
        # raise_for_status surfaces HTTP errors instead of a KeyError further down
        response = requests.get(explore_url, params=params, timeout=30)
        response.raise_for_status()
        results = response.json()["response"]['groups'][0]['items']

        # keep only the relevant information for each nearby venue
        for item in results:
            venue = item['venue']
            categories = venue['categories']
            rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                # a venue without categories would otherwise raise IndexError
                categories[0]['name'] if categories else None))

    # passing `columns` here (rather than assigning them afterwards) also works
    # when `rows` is empty
    nearby_venues = pd.DataFrame(rows, columns=columns)

    return(nearby_venues)

Get the venues for all Downtown Toronto neighbourhoods:

In [134]:
# Query Foursquare once per Downtown Toronto row (name + coordinates).
downtown_venues = getNearbyVenues(
    downtown_data['Neighbourhood'],
    downtown_data['Latitude'],
    downtown_data['Longitude'],
)
downtown_venues.head()

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Rosedale,43.679563,-79.377529,Rosedale Park,43.682328,-79.378934,Playground
1,Rosedale,43.679563,-79.377529,Whitney Park,43.682036,-79.373788,Park
2,Rosedale,43.679563,-79.377529,Alex Murray Parkette,43.6783,-79.382773,Park
3,Rosedale,43.679563,-79.377529,Milkman's Lane,43.676352,-79.373842,Trail
4,"Cabbagetown, St. James Town",43.667967,-79.367675,Cranberries,43.667843,-79.369407,Diner


Let's check how many venues were returned for each neighborhood:

In [153]:
# Number of (non-null) 'Venue Category' entries per neighborhood,
# i.e. how many venues came back for each one.
(
    downtown_venues
    .groupby('Neighborhood')['Venue Category']
    .count()
    .reset_index()
)

Unnamed: 0,Neighborhood,Venue Category
0,"Adelaide, King, Richmond",30
1,Berczy Park,30
2,"CN Tower, Bathurst Quay, Island airport, Harbo...",14
3,"Cabbagetown, St. James Town",30
4,Central Bay Street,30
5,"Chinatown, Grange Park, Kensington Market",30
6,Christie,17
7,Church and Wellesley,30
8,"Commerce Court, Victoria Hotel",30
9,"Design Exchange, Toronto Dominion Centre",30


Most neighborhoods return plenty of venues. Note that a count of 30 only means the query hit the `LIMIT` of 30 we set above, so those neighborhoods may well have more than 30 venues nearby. Let's find out how many unique categories there are among all the returned venues:

In [143]:
# nunique(dropna=False) matches len(unique()): NaN, if present, counts as one category.
n_categories = downtown_venues['Venue Category'].nunique(dropna=False)
print(f'There are {n_categories} unique categories.')

There are 143 unique categories.


Let's count the parks in each neighborhood and display them in descending order!

In [156]:
def _count_parks(categories):
    """Return how many entries of one neighborhood's categories equal 'Park'."""
    return (categories == 'Park').sum()

# Parks per neighborhood, largest count first.
(
    downtown_venues
    .groupby('Neighborhood')['Venue Category']
    .apply(_count_parks)
    .reset_index(name='Count')
    .sort_values('Count', ascending=False)
)

Unnamed: 0,Neighborhood,Count
12,Harbourfront,3
6,Christie,2
15,Rosedale,2
14,Queen's Park,2
13,"Harbourfront East, Toronto Islands, Union Station",2
18,Stn A PO Boxes 25 The Esplanade,1
3,"Cabbagetown, St. James Town",1
4,Central Bay Street,1
7,Church and Wellesley,1
1,Berczy Park,1


### Cluster neighborhoods

Run *k-means* to cluster the neighborhoods into 6 clusters. First we'll **one-hot-encode** the `Venue Category` column.

In [160]:
# one hot encoding: one indicator column per venue category
downtown_onehot = pd.get_dummies(downtown_venues[['Venue Category']], prefix="", prefix_sep="")

# If one of the venue categories is itself named "Neighborhood", its indicator
# column would be silently overwritten by the neighborhood labels added below
# (and the labels would not end up as the first column). Give that category a
# distinct name so both pieces of information survive; this is a no-op when no
# such category exists.
downtown_onehot = downtown_onehot.rename(columns={'Neighborhood': 'Neighborhood (Venue Category)'})

# add the neighborhood each venue belongs to as the first column
downtown_onehot.insert(0, 'Neighborhood', downtown_venues['Neighborhood'])

downtown_onehot.head()

Unnamed: 0,Yoga Studio,Airport,Airport Food Court,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Art Gallery,Arts & Crafts Store,Asian Restaurant,...,Tea Room,Thai Restaurant,Theater,Theme Restaurant,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [163]:
# Mean of every indicator column per neighborhood, i.e. the share of that
# neighborhood's venues falling into each category.
downtown_grouped = downtown_onehot.groupby('Neighborhood', as_index=False).mean()
downtown_grouped.head()

Unnamed: 0,Neighborhood,Yoga Studio,Airport,Airport Food Court,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Art Gallery,Arts & Crafts Store,...,Tea Room,Thai Restaurant,Theater,Theme Restaurant,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar
0,"Adelaide, King, Richmond",0.0,0.0,0.0,0.0,0.0,0.0,0.033333,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.033333,0.0,0.0,0.0
1,Berczy Park,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.033333,0.033333,0.0,0.0,0.0,0.0,0.033333,0.0,0.0,0.0
2,"CN Tower, Bathurst Quay, Island airport, Harbo...",0.0,0.071429,0.071429,0.142857,0.214286,0.142857,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,"Cabbagetown, St. James Town",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.033333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Central Bay Street,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.033333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [172]:
kclusters = 6

# Feature matrix for clustering: everything except the neighborhood name.
# `columns=` replaces the positional axis argument (`drop('Neighborhood', 1)`),
# which recent pandas versions no longer accept. A 'Cluster Labels' column left
# over from a previous run must not be fed back in as a feature either.
downtown_grouped_clustering = (
    downtown_grouped
    .drop(columns='Neighborhood')
    .drop(columns='Cluster Labels', errors='ignore')
)

# run k-means clustering; n_init is pinned so the number of restarts does not
# depend on the installed scikit-learn version's default
kmeans = KMeans(n_clusters=kclusters, n_init=10, random_state=0).fit(downtown_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_

array([1, 1, 4, 1, 5, 1, 0, 5, 2, 2, 2, 2, 5, 1, 5, 3, 5, 2, 1])

Let's add the cluster labels to our table of Downtown neighborhoods.

In [181]:
# Put each neighborhood's k-means label next to its category profile.
# This works on a copy (drop returns a new frame), so downtown_grouped - the
# clustering input - is not modified and this cell can be re-run safely.
downtown_clustered = downtown_grouped.drop(columns='Cluster Labels', errors='ignore')
downtown_clustered.insert(0, 'Cluster Labels', kmeans.labels_)

# merge the clustered profiles with downtown_data to add postcode/borough and
# latitude/longitude for each neighborhood. rename() returns a new frame, so
# downtown_data keeps its original 'Neighbourhood' column for earlier cells.
# Neighborhoods without any returned venue get NaN in the joined columns.
downtown_merged = (
    downtown_data
    .rename(columns={'Neighbourhood': 'Neighborhood'})
    .join(downtown_clustered.set_index('Neighborhood'), on='Neighborhood')
)
downtown_merged

Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,Yoga Studio,Airport,Airport Food Court,Airport Lounge,...,Tea Room,Thai Restaurant,Theater,Theme Restaurant,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar
0,M4W,Downtown Toronto,Rosedale,43.679563,-79.377529,3,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0
1,M4X,Downtown Toronto,"Cabbagetown, St. James Town",43.667967,-79.367675,1,0.0,0.0,0.0,0.0,...,0.0,0.033333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,M4Y,Downtown Toronto,Church and Wellesley,43.66586,-79.38316,5,0.0,0.0,0.0,0.0,...,0.0,0.033333,0.0,0.033333,0.0,0.0,0.0,0.0,0.0,0.0
3,M5A,Downtown Toronto,Harbourfront,43.65426,-79.360636,5,0.033333,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,M5B,Downtown Toronto,"Ryerson, Garden District",43.657162,-79.378937,5,0.0,0.0,0.0,0.0,...,0.066667,0.033333,0.033333,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418,2,0.0,0.0,0.0,0.0,...,0.0,0.033333,0.033333,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306,1,0.0,0.0,0.0,0.0,...,0.033333,0.033333,0.0,0.0,0.0,0.0,0.033333,0.0,0.0,0.0
7,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383,5,0.0,0.0,0.0,0.0,...,0.033333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,M5H,Downtown Toronto,"Adelaide, King, Richmond",43.650571,-79.384568,1,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.033333,0.0,0.0,0.0
9,M5J,Downtown Toronto,"Harbourfront East, Toronto Islands, Union Station",43.640816,-79.381752,1,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Use the geolocator to get the coordinates of Downtown.

In [139]:
# Geocode the address with Nominatim; the coordinates are used to centre the cluster map.
address = 'Downtown Toronto, Toronto'

geolocator = Nominatim(user_agent="toronto_explorer")
location = geolocator.geocode(address)
latitude, longitude = location.latitude, location.longitude
print(f'The geographical coordinates of Downtown Toronto are {latitude:.4f}, {longitude:.4f}.')

The geographical coordinates of Downtown Toronto are 43.6542, -79.3808.


Finally, let's visualize the resulting clusters

In [188]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=14)

# set color scheme for the clusters: one hex colour per cluster id 0 .. kclusters-1
rainbow = [colors.rgb2hex(rgba) for rgba in cm.rainbow(np.linspace(0, 1, kclusters))]

# Rows without a cluster label (NaN after the join) cannot be coloured, so they
# are left off the map.
clustered = downtown_merged.dropna(subset=['Cluster Labels'])

# add markers to the map
for lat, lon, poi, cluster in zip(clustered['Latitude'], clustered['Longitude'], clustered['Neighborhood'], clustered['Cluster Labels']):
    # the label column is float when the join produced NaNs; normalise to int
    cluster = int(cluster)
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        # index by the cluster id itself (cluster-1 sent cluster 0 to index -1)
        color=rainbow[cluster],
        fill=True,
        fill_color=rainbow[cluster],
        fill_opacity=0.7).add_to(map_clusters)

map_clusters