# SEGMENTING AND CLUSTERING NEIGHBORHOODS IN TORONTO

In [1]:
import numpy as np
import pandas as pd

from pandas.io.json import json_normalize 
import json

from geopy.geocoders import Nominatim 
import requests 

import matplotlib.cm as cm
import matplotlib.colors as colors
from sklearn.cluster import KMeans

print("Libraries  have been imported")

Libraries  have been imported


### one has to first install the module "folium" onto the machine before importing it.

In [2]:
!pip install folium

Requirement not upgraded as not directly required: folium in /opt/conda/envs/DSX-Python35/lib/python3.5/site-packages
Requirement not upgraded as not directly required: requests in /opt/conda/envs/DSX-Python35/lib/python3.5/site-packages (from folium)
Requirement not upgraded as not directly required: jinja2 in /opt/conda/envs/DSX-Python35/lib/python3.5/site-packages (from folium)
Requirement not upgraded as not directly required: six in /opt/conda/envs/DSX-Python35/lib/python3.5/site-packages (from folium)
Requirement not upgraded as not directly required: branca in /opt/conda/envs/DSX-Python35/lib/python3.5/site-packages (from folium)
Requirement not upgraded as not directly required: chardet<3.1.0,>=3.0.2 in /opt/conda/envs/DSX-Python35/lib/python3.5/site-packages (from requests->folium)
Requirement not upgraded as not directly required: idna<2.7,>=2.5 in /opt/conda/envs/DSX-Python35/lib/python3.5/site-packages (from requests->folium)
Requirement not upgraded as not directly require

### above line is required only in the first run of the code

In [3]:
import folium

print('Folium imported')

Folium imported


In [4]:
from bs4 import BeautifulSoup

print('BeautifulSoup imported')

BeautifulSoup imported


### LET'S OBTAIN DATA FROM  WIKIPEDIA PAGE

In [5]:
toronto_postcodes_wiki_url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

r = requests.get(toronto_postcodes_wiki_url)
page = r.text

soup = BeautifulSoup(page, 'html.parser')
table = soup.find('table')

In [6]:
def parse_postcodes_table(table, verbose=False):
    data = []
    for td in table.find_all('td'): # Find all <td> elements in our table
        is_gray = 'color:#ccc' in td.get('style') # Greyed out <td> cells have color set to #ccc in 'style' attribute
        postcode = td.p.b.text # Postcode is the first bolded text within the paragraph inside the <td> cell
        span = td.p.span.text # Let's extract the text contents of the first <span> element within the first paragraph within the <td> cell
        not_assigned = span == 'Not assigned' # Check if text is 'Not assigned'
        if not(is_gray or not_assigned): # We're skipping greyed out and/or not assigned cells
            lines = span.split('\n') # Let's get lines of text within the span            
            borough = lines[0] # First line is the borough
            
            borough_neighborhoods = []
            for line in lines[1:]: # Other lines are neighborhoods
                line = line.strip('() ') # Let's remove '(', ')' and potential spaces from the start and end of string
                for token in line.split(' / '): # Neighborhoods within the line are separated with ' / '
                    borough_neighborhoods.append(token)
            if len(borough_neighborhoods)==0: # If no neighborhoods are defined...
                borough_neighborhoods.append(borough) #...use the borough
                
            borough_neighborhoods = ', '.join(borough_neighborhoods) # Convert array of neighborhoods into single string with comma-separated values
            data.append([postcode, borough, borough_neighborhoods]) # Store data extracted from <td> into array to be used for dataframe
            if verbose:
                print(postcode, '/', borough, '/', borough_neighborhoods)
    
    # Create the dataframe from extracted data
    df = pd.DataFrame(data=data, columns=['PostalCode', 'Borough', 'Neighborhood'])
    temp1 =[]
    temp2=[]
    for var in df['Borough'].values:
        temp1.append((var[var.find(r'(')+1 : var.find(r')')]))
        temp2.append((var[:var.find(r'(')]))
    df["Neighborhood"] = temp1
    df["Borough"] = temp2
    
    return df

In [7]:
df = parse_postcodes_table(table)
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Regent Park / Harbourfront
3,M6A,North York,Lawrence Manor / Lawrence Heights
4,M7A,Queen's Par,Queen's Par


In [8]:
print('Number of rows in dataframe: ', df.shape[0])

Number of rows in dataframe:  103


### Latitude and Longitude of each neighborhood

In [9]:
# The code was removed by DSX for sharing.

In [10]:
def get_postcode_coordinates(api_key, postcode, country='Canada', verbose=False):
    try:
        url = 'https://maps.googleapis.com/maps/api/geocode/json?key={}&address={},{}'.format(api_key, postcode, country)
        response = requests.get(url).json()
        if verbose:
            print('Google Maps API JSON result =>', response)
        results = response['results']
        geographical_data = results[0]['geometry']['location'] # get geographical coordinates
        lat = geographical_data['lat']
        lon = geographical_data['lng']
        return lat, lon
    except:
        return None, None

In [11]:
lat, lon = get_postcode_coordinates(API_key, 'M5G')
print('Lat:', lat, 'Lon:', lon)

Lat: 43.6579524 Lon: -79.3873826


#### Traverse over the postal codes to obtain the co-ordinates

In [12]:
postcodes = df['PostalCode'].values

latitudes = []
longitudes = []
for postcode in postcodes:
    lat, lon = get_postcode_coordinates(API_key, postcode)
    latitudes.append(lat)
    longitudes.append(lon)

In [13]:
# Use the arrays with lat/lon values to fill the columns in our dataframe
df['Latitude'] = latitudes
df['Longitude'] = longitudes

### Clustering Neighborhood data

In [14]:
df_toronto = df[df['Borough'].str.contains("Toronto")].reset_index(drop=True)

In [15]:
df_toronto.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M5A,Downtown Toronto,Regent Park / Harbourfront,43.65426,-79.360636
1,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
2,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
3,M4E,East Toronto,The Beaches,43.676357,-79.293031
4,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306


#### Obtain the geographical co-ordinates of Toronto an plot it on a map

In [16]:
geolocator = Nominatim()

location = geolocator.geocode('Toronto, Canada')
toronto_lat = location.latitude
toronto_lon = location.longitude
print('The geograpical coordinate of Toronto: {}, {}.'.format(toronto_lat, toronto_lon))

The geograpical coordinate of Toronto: 43.653963, -79.387207.


In [17]:
map_toronto = folium.Map(location=[toronto_lat, toronto_lon], zoom_start=12)

for lat, lon, postcode in zip(df_toronto['Latitude'], df_toronto['Longitude'], df_toronto['PostalCode']):
    label = folium.Popup(postcode)
    folium.CircleMarker([lat, lon], radius=7, popup=label, color='blue', fill=True, fill_color='#3186cc', fill_opacity=0.5).add_to(map_toronto)  
    
map_toronto

#### Clustering the data

In [18]:
# Create dataframe with lat/lon only
df_toronto_latlon = df_toronto[['Latitude','Longitude']].copy()

# Cluster neighborhoods based on location (use 4 clusters)
kmeans = KMeans(n_clusters=4, random_state=0).fit(df_toronto_latlon)

# Add column containing cluster labels
df_toronto_latlon['Cluster'] = kmeans.labels_

In [19]:
map_toronto_latlon = folium.Map(location=[toronto_lat, toronto_lon], zoom_start=12)

colors = ['red', 'green', 'blue', 'magenta']

for lat, lon, cluster in zip(df_toronto_latlon['Latitude'], df_toronto_latlon['Longitude'], df_toronto_latlon['Cluster']):
    color = colors[cluster]
    folium.CircleMarker([lat, lon], radius=8, color=color, fill=True, fill_color=color, fill_opacity=0.7).add_to(map_toronto_latlon)  
    
map_toronto_latlon

#### Using the Foursquare API

In [20]:
# The code was removed by DSX for sharing.

In [25]:
# Function that extracts the venue category
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue']['categories']
        
    return categories_list[0]['name'] if len(categories_list)>0 else None
    
#Function to return top venues based of location
def get_top_venues(loc_name, lat, lon, client_id, client_secret, radius=500, limit=100):
    version = '20180605'
    url = 'https://api.foursquare.com/v2/venues/explore?client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
        client_id, client_secret, version, lat, lon, radius, limit)
    results = requests.get(url).json()['response']['groups'][0]['items']
    venues = [(loc_name,
               lat,
               lon,
               row['venue']['name'],
               row['venue']['location']['lat'],
               row['venue']['location']['lng'],
               get_category_type(row)) for row in results]
    
    return venues

In [26]:
# Let's try it on first postcode / first location in our table
loc_name = df_toronto.loc[0, 'PostalCode']
lat = df_toronto.loc[0, 'Latitude']
lon = df_toronto.loc[0, 'Longitude']
venues = get_top_venues(loc_name, lat, lon, CLIENT_ID, CLIENT_SECRET)

print('Number of venues returned for postcode {}: {}'.format(loc_name, len(venues)))
for v in venues[:5]:
    print(v)

Number of venues returned for postcode M5A: 50
('M5A', 43.6542599, -79.360635899999991, 'Roselle Desserts', 43.653446723052674, -79.3620167174383, 'Bakery')
('M5A', 43.6542599, -79.360635899999991, 'Tandem Coffee', 43.65355870959944, -79.36180945913513, 'Coffee Shop')
('M5A', 43.6542599, -79.360635899999991, 'Body Blitz Spa East', 43.65473505045365, -79.35987433132891, 'Spa')
('M5A', 43.6542599, -79.360635899999991, 'Cooper Koo YMCA', 43.65319052672638, -79.35794700053884, 'Gym / Fitness Center')
('M5A', 43.6542599, -79.360635899999991, 'Morning Glory Cafe', 43.653946942635294, -79.36114884214422, 'Breakfast Spot')


In [27]:
#Let's get venues for all postcodes / neighborhoods
loc_names = df_toronto['PostalCode']
lats = df_toronto['Latitude']
lons = df_toronto['Longitude']
all_venues = []
print('Obtaining venues in neighborhoods: ', end='')
for loc_name, lat, lon in zip(loc_names, lats, lons):
    venues = get_top_venues(loc_name, lat, lon, CLIENT_ID, CLIENT_SECRET)
    all_venues.extend(venues)
    print(loc_name, ' ', end='')
print('done.')

Obtaining venues in neighborhoods: M5A  M5B  M5C  M4E  M5E  M5G  M6G  M5H  M6H  M4J  M5J  M6J  M4K  M5K  M6K  M4L  M5L  M4M  M4N  M5N  M4P  M5P  M6P  M4R  M5R  M6R  M4S  M5S  M6S  M4T  M5T  M4V  M5V  M4W  M5W  M4X  M5X  M4Y  M7Y  done.


In [28]:
# Let's create dataframe from this data
toronto_venues = pd.DataFrame(data=all_venues, columns=['Neighborhood', 'Neighborhood Latitude', 'Neighborhood Longitude', 'Venue', 'Venue Latitude', 'Venue Longitude', 'Venue Category'])

In [29]:
print('Total number of venues: ', toronto_venues.shape[0])

Total number of venues:  1699


In [30]:
# Let's check how many venues were returned for each neighborhood
toronto_venues.groupby('Neighborhood').count()['Venue']

Neighborhood
M4E      3
M4J      3
M4K     42
M4L     18
M4M     40
M4N      4
M4P      9
M4R     20
M4S     36
M4T      3
M4V     13
M4W      4
M4X     43
M4Y     82
M5A     50
M5B    100
M5C    100
M5E     58
M5G     88
M5H    100
M5J    100
M5K    100
M5L    100
M5N      1
M5P      5
M5R     22
M5S     34
M5T    100
M5V     14
M5W     95
M5X    100
M6G     15
M6H     16
M6J     64
M6K     22
M6P     23
M6R     14
M6S     42
M7Y     16
Name: Venue, dtype: int64

In [31]:
# Let's find out how many unique categories can be curated from all the returned venues
print('There are {} unique venue categories.'.format(len(toronto_venues['Venue Category'].unique())))

There are 235 unique venue categories.


In [32]:
# Create one-hot-encoding of venue categories
categories_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")
categories_onehot['Neighborhood'] = toronto_venues['Neighborhood']
categories_onehot.head()

Unnamed: 0,Adult Boutique,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,...,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Whisky Bar,Wine Bar,Women's Store,Yoga Studio
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [33]:
# Let's group rows by neighborhood and calculate the mean of the category occurrence frequency
categories_grouped = categories_onehot.groupby('Neighborhood').mean().reset_index()
categories_grouped.head(10)

Unnamed: 0,Neighborhood,Adult Boutique,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,...,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Whisky Bar,Wine Bar,Women's Store,Yoga Studio
0,M4E,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,M4J,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,M4K,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.02381,...,0.0,0.02381,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.02381
3,M4L,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,M4M,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.05,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.025
5,M4N,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,M4P,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,M4R,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.05
8,M4S,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.027778,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,M4T,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [34]:
# Function to sort the venues in descending order

def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)    
    return row_categories_sorted.index.values[0:num_top_venues]

In [35]:
num_top_venues = 10

# Create column names according to number of top venues
columns = ['Neighborhood']
for i in np.arange(num_top_venues):
    columns.append('{}{} Most Common Venue'.format(i+1, 'st' if i==0 else 'nd' if i==1 else 'rd' if i==2 else 'th'))

# Create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = categories_grouped['Neighborhood']

for i in np.arange(categories_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[i, 1:] = return_most_common_venues(categories_grouped.iloc[i, :], num_top_venues)

In [36]:
neighborhoods_venues_sorted

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M4E,Coffee Shop,Pub,Yoga Studio,Dim Sum Restaurant,Falafel Restaurant,Event Space,Ethiopian Restaurant,Electronics Store,Eastern European Restaurant,Dumpling Restaurant
1,M4J,Convenience Store,Park,Metro Station,Yoga Studio,Diner,Falafel Restaurant,Event Space,Ethiopian Restaurant,Electronics Store,Eastern European Restaurant
2,M4K,Greek Restaurant,Ice Cream Shop,Coffee Shop,Bookstore,Italian Restaurant,Yoga Studio,Fruit & Vegetable Store,Pizza Place,Liquor Store,Juice Bar
3,M4L,Park,Coffee Shop,Sushi Restaurant,Burrito Place,Steakhouse,Burger Joint,Sandwich Place,Fish & Chips Shop,Pub,Movie Theater
4,M4M,Café,Coffee Shop,Bakery,American Restaurant,Italian Restaurant,Yoga Studio,Restaurant,Juice Bar,Bookstore,New American Restaurant
5,M4N,Dim Sum Restaurant,Bus Line,Park,Swim School,Falafel Restaurant,Event Space,Ethiopian Restaurant,Electronics Store,Eastern European Restaurant,Dumpling Restaurant
6,M4P,Hotel,Gym / Fitness Center,Sandwich Place,Park,Playground,Breakfast Spot,Clothing Store,Burger Joint,Food & Drink Shop,Doner Restaurant
7,M4R,Coffee Shop,Sporting Goods Shop,Cosmetics Shop,Yoga Studio,Furniture / Home Store,Park,Spa,Mexican Restaurant,Sandwich Place,Salon / Barbershop
8,M4S,Dessert Shop,Sandwich Place,Seafood Restaurant,Pizza Place,Coffee Shop,Café,Sushi Restaurant,Italian Restaurant,Deli / Bodega,Indoor Play Area
9,M4T,Playground,Park,Tennis Court,Yoga Studio,Dessert Shop,Event Space,Ethiopian Restaurant,Electronics Store,Eastern European Restaurant,Dumpling Restaurant


In [37]:
# Remove the column 'Neighborhood'
categories_grouped_clustering = categories_grouped.drop('Neighborhood', 1)

# ...and cluster based on venue categories occurence frequency only (using 4 clusters)
kmeans = KMeans(n_clusters=5, init='k-means++').fit(categories_grouped_clustering)

toronto_merged = df_toronto
toronto_merged['Cluster'] = kmeans.labels_ # Add column containing cluster label
toronto_merged = toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='PostalCode')

In [38]:
toronto_merged.head(10)

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M5A,Downtown Toronto,Regent Park / Harbourfront,43.65426,-79.360636,3,Coffee Shop,Bakery,Café,Park,Mexican Restaurant,Pub,Breakfast Spot,Theater,Gym / Fitness Center,Historic Site
1,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937,0,Coffee Shop,Clothing Store,Café,Cosmetics Shop,Italian Restaurant,Japanese Restaurant,Tea Room,Middle Eastern Restaurant,Ramen Restaurant,Sandwich Place
2,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418,3,Coffee Shop,Café,Hotel,Restaurant,Park,Bakery,Italian Restaurant,Cosmetics Shop,Cocktail Bar,Clothing Store
3,M4E,East Toronto,The Beaches,43.676357,-79.293031,3,Coffee Shop,Pub,Yoga Studio,Dim Sum Restaurant,Falafel Restaurant,Event Space,Ethiopian Restaurant,Electronics Store,Eastern European Restaurant,Dumpling Restaurant
4,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306,3,Cocktail Bar,Coffee Shop,Seafood Restaurant,Farmers Market,Bakery,Steakhouse,Restaurant,Cheese Shop,Café,Beer Bar
5,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383,2,Coffee Shop,Café,Italian Restaurant,Sandwich Place,Bubble Tea Shop,Burger Joint,Bar,Chinese Restaurant,Bakery,Spa
6,M6G,Downtown Toronto,Christie,43.669542,-79.422564,3,Grocery Store,Café,Park,Baby Store,Convenience Store,Coffee Shop,Italian Restaurant,Restaurant,Diner,Nightclub
7,M5H,Downtown Toronto,Richmond / Adelaide / King,43.650571,-79.384568,3,Coffee Shop,Café,American Restaurant,Steakhouse,Bar,Thai Restaurant,Breakfast Spot,Hotel,Burger Joint,Restaurant
8,M6H,West Toronto,Dufferin / Dovercourt Village,43.669005,-79.442259,3,Bakery,Gym / Fitness Center,Supermarket,Pharmacy,Music Venue,Discount Store,Café,Middle Eastern Restaurant,Brewery,Liquor Store
9,M4J,East YorkEast Toronto,The Danforth East,43.685347,-79.338106,4,Convenience Store,Park,Metro Station,Yoga Studio,Diner,Falafel Restaurant,Event Space,Ethiopian Restaurant,Electronics Store,Eastern European Restaurant


#### Visualising the clusters

In [39]:
map_clusters = folium.Map(location=[toronto_lat, toronto_lon], zoom_start=12)

colors = ['yellow', 'green', 'blue', 'red', 'magenta']

for lat, lon, postcode, borough, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['PostalCode'], toronto_merged['Borough'], toronto_merged['Cluster']):
    label = '{} / {} (cluster {})'.format(postcode, borough, cluster)
    folium.CircleMarker([lat, lon], radius=7, popup=folium.Popup(label), color=colors[cluster], fill=True, fill_color=colors[cluster], fill_opacity=0.7).add_to(map_clusters)
       
map_clusters