# Segmenting and Clustering Neighborhoods in Toronto

# ******** Part 1 ********

### Retrieving data from Wikipedia and putting it in a dataframe

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

In [2]:
URL = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
Soup = BeautifulSoup(requests.get(URL).content, 'html.parser')
table_contents = []
table = Soup.find('table')
for row in table.findAll('td'):
    cell = {}
    if row.span.text == 'Not assigned':
        pass
    else:
        cell['PostalCode'] = row.p.text[:3]
        cell['Borough'] = (row.span.text).split('(')[0]
        cell['Neighborhood'] = (((((row.span.text).split('(')[1]).strip(')')).replace(' /', ',')).replace(')', ' ')).strip(' ')
        table_contents.append(cell)

In [3]:
df = pd.DataFrame(table_contents)
df['Borough'] = df['Borough'].replace({'Downtown TorontoStn A PO Boxes25 The Esplanade':'Downtown Toronto Stn A',
                                             'East TorontoBusiness reply mail Processing Centre969 Eastern':'East Toronto Business',
                                             'EtobicokeNorthwest':'Etobicoke Northwest','East YorkEast Toronto':'East York/East Toronto',
                                             'MississaugaCanada Post Gateway Processing Centre':'Mississauga'})
df.shape
df

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Queen's Park,Ontario Provincial Government
...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
99,M4Y,Downtown Toronto,Church and Wellesley
100,M7Y,East Toronto Business,Enclave of M4L
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu..."


# ******** Part 2 ********

### Adding the coordinates to the dataset

In [4]:
df_coords = pd.read_csv(r'Geospatial_Coordinates.csv')
toronto_data = df.set_index('PostalCode').join(df_coords.set_index('Postal Code'))
toronto_data

Unnamed: 0_level_0,Borough,Neighborhood,Latitude,Longitude
PostalCode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
M3A,North York,Parkwoods,43.753259,-79.329656
M4A,North York,Victoria Village,43.725882,-79.315572
M5A,Downtown Toronto,"Regent Park, Harbourfront",43.654260,-79.360636
M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
M7A,Queen's Park,Ontario Provincial Government,43.662301,-79.389494
...,...,...,...,...
M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North",43.653654,-79.506944
M4Y,Downtown Toronto,Church and Wellesley,43.665860,-79.383160
M7Y,East Toronto Business,Enclave of M4L,43.662744,-79.321558
M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu...",43.636258,-79.498509


# ******** Part 3 ********

### Displaying the dataset as a map with markers

In [5]:
!pip install geopy
from geopy.geocoders import Nominatim
import pandas as pd
import numpy as np
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
from geopy.geocoders import Nominatim
import requests
import matplotlib.cm as cm
import matplotlib.colors as colors
from sklearn.cluster import KMeans
import folium



In [6]:
# Retrieve coordinates for Toronto
address = 'Toronto'
geolocator = Nominatim(user_agent = "toronto_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinates of Toronto are {}, {}.'.format(latitude, longitude))

# Plot a map of Toronto with markers for each neighborhood
map_toronto = folium.Map(location = [latitude, longitude], zoom_start = 10)
for lat, lng, borough, neighborhood in zip(toronto_data['Latitude'], toronto_data['Longitude'], toronto_data['Borough'], toronto_data['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html = True)
    folium.CircleMarker(
        [lat, lng],
        radius = 4,
        popup = label,
        color = 'blue',
        fill = True,
        fill_color = '#3186cc',
        fill_opacity = 0.5,
        parse_html = False).add_to(map_toronto)
map_toronto

The geograpical coordinates of Toronto are 43.6534817, -79.3839347.


### Using Foursquare to retrieve venues in the neighborhoods

In [7]:
# PLEASE REPLACE THESE STRINGS WITH YOUR FOURSQUARE'S PERSONAL ACCOUNT
CLIENT_ID = '...'
CLIENT_SECRET = '...'
VERSION = '20180605'
LIMIT = 100

In [8]:
# Function that retrieves data from a neighborhood

def getNearbyVenues(names, latitudes, longitudes, radius = 500):
    venues_list = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        print("Processing", name)
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID,
            CLIENT_SECRET,
            VERSION,
            lat,
            lng,
            radius,
            LIMIT)
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        # return only relevant information for each nearby venue
        venues_list.append([(
            name,
            lat,
            lng,
            v['venue']['name'],
            v['venue']['location']['lat'],
            v['venue']['location']['lng'],
            v['venue']['categories'][0]['name']) for v in results])
    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood',
                  'Neighborhood Latitude',
                  'Neighborhood Longitude',
                  'Venue',
                  'Venue Latitude',
                  'Venue Longitude',
                  'Venue Category']
    return(nearby_venues)

In [9]:
toronto_venues = getNearbyVenues(names = toronto_data['Neighborhood'],
                                   latitudes = toronto_data['Latitude'],
                                   longitudes = toronto_data['Longitude']
                                  )
print('\n\nThere are {} uniques categories.'.format(len(toronto_venues['Venue Category'].unique())))

Processing Parkwoods
Processing Victoria Village
Processing Regent Park, Harbourfront
Processing Lawrence Manor, Lawrence Heights
Processing Ontario Provincial Government
Processing Islington Avenue
Processing Malvern, Rouge
Processing Don Mills North
Processing Parkview Hill, Woodbine Gardens
Processing Garden District, Ryerson
Processing Glencairn
Processing West Deane Park, Princess Gardens, Martin Grove, Islington, Cloverdale
Processing Rouge Hill, Port Union, Highland Creek
Processing Don Mills South
Processing Woodbine Heights
Processing St. James Town
Processing Humewood-Cedarvale
Processing Eringate, Bloordale Gardens, Old Burnhamthorpe, Markland Wood
Processing Guildwood, Morningside, West Hill
Processing The Beaches
Processing Berczy Park
Processing Caledonia-Fairbanks
Processing Woburn
Processing Leaside
Processing Central Bay Street
Processing Christie
Processing Cedarbrae
Processing Hillcrest Village
Processing Bathurst Manor, Wilson Heights, Downsview North
Processing Tho

### Encoding and grouping the neighborhoods

In [10]:
# one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix = "", prefix_sep = "")
toronto_onehot['Neighborhood'] = toronto_venues['Neighborhood'] 
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]
toronto_onehot.shape

(2119, 276)

In [11]:
toronto_grouped = toronto_onehot.groupby('Neighborhood').mean().reset_index()
toronto_grouped.shape

(100, 276)

In [12]:
# Function to retrieve the most common venues given a certain group
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending = False)
    return row_categories_sorted.index.values[0:num_top_venues]

In [13]:
num_top_venues = 5
indicators = ['st', 'nd', 'rd']
# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind + 1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind + 1))
# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns = columns)
neighborhoods_venues_sorted['Neighborhood'] = toronto_grouped['Neighborhood']
for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)
neighborhoods_venues_sorted

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
0,Agincourt,Lounge,Skating Rink,Breakfast Spot,Latin American Restaurant,Falafel Restaurant
1,"Alderwood, Long Branch",Pizza Place,Playground,Skating Rink,Gym,Coffee Shop
2,"Bathurst Manor, Wilson Heights, Downsview North",Bank,Coffee Shop,Mobile Phone Shop,Sushi Restaurant,Intersection
3,Bayview Village,Café,Chinese Restaurant,Japanese Restaurant,Bank,Discount Store
4,"Bedford Park, Lawrence Manor East",Sandwich Place,Italian Restaurant,Restaurant,Coffee Shop,Pizza Place
5,Berczy Park,Coffee Shop,Cocktail Bar,Bakery,Beer Bar,Farmers Market
6,"Birch Cliff, Cliffside West",Café,General Entertainment,Skating Rink,College Stadium,Concert Hall
7,"Brockton, Parkdale Village, Exhibition Place",Café,Coffee Shop,Bakery,Breakfast Spot,Grocery Store
8,"CN Tower, King and Spadina, Railway Lands, Har...",Airport Terminal,Airport Lounge,Harbor / Marina,Coffee Shop,Plane
9,Caledonia-Fairbanks,Park,Women's Store,Dumpling Restaurant,Discount Store,Distribution Center


### Clustering

In [14]:
kclusters = 5 # Number of predefined clusters
toronto_grouped_clustering = toronto_grouped.drop('Neighborhood', 1)
kmeans = KMeans(n_clusters = kclusters, random_state = 0).fit(toronto_grouped_clustering)

In [15]:
# Cluster labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

In [16]:
# Merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
toronto_merged = pd.merge(toronto_data, neighborhoods_venues_sorted, on = 'Neighborhood')

In [17]:
# Create map
map_clusters = folium.Map(location = [latitude, longitude], zoom_start = 10)

# Set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i * x) ** 2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# Add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighborhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html = True)
    print(type(cluster))
    folium.CircleMarker(
        [lat, lon],
        radius = 4,
        popup = label,
        color = rainbow[cluster - 1],
        fill = True,
        fill_color = rainbow[cluster - 1],
        fill_opacity = 0.5).add_to(map_clusters)
map_clusters

<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class

In [18]:
print("Cluster 1")
toronto_merged.loc[toronto_merged['Cluster Labels'] == 0, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Cluster 1


Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
11,"Rouge Hill, Port Union, Highland Creek",Bar,Women's Store,Distribution Center,Dog Run,Doner Restaurant


In [19]:
print("Cluster 2")
toronto_merged.loc[toronto_merged['Cluster Labels'] == 1, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Cluster 2


Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
0,Parkwoods,Food & Drink Shop,Park,Fast Food Restaurant,Falafel Restaurant,Event Space
1,Victoria Village,Hockey Arena,Portuguese Restaurant,Intersection,Financial or Legal Service,Coffee Shop
2,"Regent Park, Harbourfront",Coffee Shop,Park,Bakery,Breakfast Spot,Café
3,"Lawrence Manor, Lawrence Heights",Clothing Store,Accessories Store,Furniture / Home Store,Boutique,Vietnamese Restaurant
4,Ontario Provincial Government,Coffee Shop,Sushi Restaurant,Yoga Studio,Persian Restaurant,Beer Bar
5,"Malvern, Rouge",Fast Food Restaurant,Dumpling Restaurant,Discount Store,Distribution Center,Dog Run
6,Don Mills North,Gym,Caribbean Restaurant,Japanese Restaurant,Café,Women's Store
7,"Parkview Hill, Woodbine Gardens",Pizza Place,Gym / Fitness Center,Gastropub,Intersection,Breakfast Spot
8,"Garden District, Ryerson",Coffee Shop,Clothing Store,Hotel,Bubble Tea Shop,Italian Restaurant
9,Glencairn,Park,Bakery,Sushi Restaurant,Japanese Restaurant,Ethiopian Restaurant


In [20]:
print("Cluster 3")
toronto_merged.loc[toronto_merged['Cluster Labels'] == 2, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Cluster 3


Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
51,Downsview Central,Baseball Field,Food Truck,Business Service,Event Space,Ethiopian Restaurant
55,"Humberlea, Emery",Baseball Field,Eastern European Restaurant,Distribution Center,Dog Run,Doner Restaurant
98,"Old Mill South, King's Mill Park, Sunnylea, Hu...",Baseball Field,Business Service,Farmers Market,Falafel Restaurant,Event Space


In [21]:
print("Cluster 4")
toronto_merged.loc[toronto_merged['Cluster Labels'] == 3, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Cluster 4


Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
95,"The Kingsway, Montgomery Road, Old Mill North",River,Drugstore,Diner,Discount Store,Distribution Center


In [22]:
print("Cluster 5")
toronto_merged.loc[toronto_merged['Cluster Labels'] == 4, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Cluster 5


Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
20,Caledonia-Fairbanks,Park,Women's Store,Dumpling Restaurant,Discount Store,Distribution Center
62,Weston,Park,Women's Store,Dumpling Restaurant,Discount Store,Distribution Center
64,York Mills West,Park,Convenience Store,Women's Store,Dumpling Restaurant,Discount Store
89,Rosedale,Park,Playground,Trail,Donut Shop,Diner
