<h2>Segmenting and Clustering Neighborhoods in Toronto</h2>

<h3>Part 1: Preparing Data</h3>

In [1]:
#import libraries
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np

In [2]:
#web-scraping
webpage = requests.get("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M")
soup = BeautifulSoup(webpage.content, "html.parser")
# saving to DataFrame
table = soup.find_all('table')
df_toronto = pd.read_html(str(table))[0]
df_toronto.head(7)

Unnamed: 0,Postal code,Borough,Neighborhood
0,M1A,Not assigned,
1,M2A,Not assigned,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Regent Park / Harbourfront
5,M6A,North York,Lawrence Manor / Lawrence Heights
6,M7A,Downtown Toronto,Queen's Park / Ontario Provincial Government


<strong>Pre-processing Data</strong>
<ol>
    <li>If a cell has a borough but a Not assigned neighborhood, then the neighborhood will be the same as the borough.</li>
    <li>Only process the cells that have an assigned borough. Ignore cells with a borough that is Not assigned</li>
    <li>More than one neighborhood can exist in one postal code area. For example, in the table on the Wikipedia page, you will notice that M5A is listed twice and has two neighborhoods: Harbourfront and Regent Park. These two rows will be combined into one row with the neighborhoods separated with a comma. We should split them to multiple rows.</li>
</ol>

In [3]:
# replace 'Not assigned to NaN'
df_toronto.replace('Not assigned', np.nan, inplace = True)
#item1: if df.Neighborhood is NaN and df.Borough is not NaN, then df.Neighborhood = df.Borough
df_toronto.Neighborhood.fillna(df_toronto.Borough, inplace = True)
#item2: drop rows with df.Neighborhood is NaN
df_toronto.dropna(inplace=True)
#item3: split combined Neighborhoods into multiple rows
#df_toronto = \
#(df.set_index(df.columns.drop('Neighborhood',1).tolist())
#   .Neighborhood.str.split('/', expand=True)
#   .stack()
#   .reset_index()
#   .rename(columns={0:'Neighborhood'})
#   .loc[:, df.columns]
#)
df_toronto['Neighborhood'].str.replace('/',',')
df_toronto.head(7)

Unnamed: 0,Postal code,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Regent Park / Harbourfront
5,M6A,North York,Lawrence Manor / Lawrence Heights
6,M7A,Downtown Toronto,Queen's Park / Ontario Provincial Government
8,M9A,Etobicoke,Islington Avenue
9,M1B,Scarborough,Malvern / Rouge


In [4]:
df_toronto.shape

(103, 3)

<h3>Part 2: Add latitude and longitude for each neighborhood</h3>

In [5]:
#I've had some troubles with geocoder, so I use .csv data
df_geo = pd.read_csv('Geospatial_Coordinates.csv')
df_geo.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [6]:
df_toronto.rename(columns={'Postal code':'PostalCode'}, inplace=True)
df_geo.rename(columns={'Postal Code':'PostalCode'}, inplace=True)
toronto_data = pd.merge(df_toronto, df_geo)
print(toronto_data.shape)
toronto_data.head()

(103, 5)


Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,Regent Park / Harbourfront,43.65426,-79.360636
3,M6A,North York,Lawrence Manor / Lawrence Heights,43.718518,-79.464763
4,M7A,Downtown Toronto,Queen's Park / Ontario Provincial Government,43.662301,-79.389494


<h3>Part 3: Segmentation</h3>

In [7]:
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values
import folium # map rendering library
from sklearn.cluster import KMeans # import k-means from clustering stage
# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

In [8]:
etobicoke = toronto_data[toronto_data['Borough'] == 'Etobicoke'].reset_index(drop=True)
print(etobicoke.shape)
etobicoke.head()

(12, 5)


Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M9A,Etobicoke,Islington Avenue,43.667856,-79.532242
1,M9B,Etobicoke,West Deane Park / Princess Gardens / Martin Gr...,43.650943,-79.554724
2,M9C,Etobicoke,Eringate / Bloordale Gardens / Old Burnhamthor...,43.643515,-79.577201
3,M9P,Etobicoke,Westmount,43.696319,-79.532242
4,M9R,Etobicoke,Kingsview Village / St. Phillips / Martin Grov...,43.688905,-79.554724


In [9]:
address = 'Etobicoke, Toronto'

geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Etobicoke are {}, {}.'.format(latitude, longitude))

# create map of Etobicoke using latitude and longitude values
map_etobicoke = folium.Map(location=[latitude, longitude], zoom_start=11)

# add markers to map
for lat, lng, label in zip(etobicoke['Latitude'], etobicoke['Longitude'], etobicoke['Neighborhood']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_etobicoke)  
    
map_etobicoke

The geograpical coordinate of Etobicoke are 43.671459150000004, -79.55249206611668.


In [10]:
LIMIT = 100 # limit of number of venues returned by Foursquare API
radius = 500 # define radius
CLIENT_ID = 'V525NYINSNTY55CWHRJQVZEVFFXIWQTLLXAJJR3P45JX4KXM' # your Foursquare ID
CLIENT_SECRET = 'CUJDXV0XIURO5AIQTKIRRHR3XF3BHHZ5NJMAEFD0Q0UDVBMW' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

def getNearbyVenues(names, latitudes, longitudes, radius=500):
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [11]:
etobicoke_venues = getNearbyVenues(names=etobicoke['Neighborhood'],
                                   latitudes=etobicoke['Latitude'],
                                   longitudes=etobicoke['Longitude']
                                  )


Islington Avenue
West Deane Park / Princess Gardens / Martin Grove / Islington / Cloverdale
Eringate / Bloordale Gardens / Old Burnhamthorpe / Markland Wood
Westmount
Kingsview Village / St. Phillips / Martin Grove Gardens / Richview Gardens
New Toronto / Mimico South / Humber Bay Shores
South Steeles / Silverstone / Humbergate / Jamestown / Mount Olive / Beaumond Heights / Thistletown / Albion Gardens
Alderwood / Long Branch
Northwest
The Kingsway / Montgomery Road / Old Mill North
Old Mill South / King's Mill Park / Sunnylea / Humber Bay / Mimico NE / The Queensway East / Royal York South East / Kingsway Park South East
Mimico NW / The Queensway West / South of Bloor / Kingsway Park South West / Royal York South West


In [12]:
print(etobicoke_venues.shape)
etobicoke_venues.head()

(75, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Eringate / Bloordale Gardens / Old Burnhamthor...,43.643515,-79.577201,LCBO,43.642099,-79.576592,Liquor Store
1,Eringate / Bloordale Gardens / Old Burnhamthor...,43.643515,-79.577201,Starbucks,43.641312,-79.576924,Coffee Shop
2,Eringate / Bloordale Gardens / Old Burnhamthor...,43.643515,-79.577201,The Beer Store,43.641313,-79.576925,Beer Store
3,Eringate / Bloordale Gardens / Old Burnhamthor...,43.643515,-79.577201,Shoppers Drug Mart,43.641312,-79.576924,Cosmetics Shop
4,Eringate / Bloordale Gardens / Old Burnhamthor...,43.643515,-79.577201,Pizza Hut,43.641845,-79.576556,Pizza Place


In [13]:
print('There are {} uniques categories.'.format(len(etobicoke_venues['Venue Category'].unique())))

There are 42 uniques categories.


In [14]:
# one hot encoding
etobicoke_onehot = pd.get_dummies(etobicoke_venues[['Venue Category']], prefix="", prefix_sep="")
# add neighborhood column back to dataframe
etobicoke_onehot['Neighborhood'] = etobicoke_venues['Neighborhood'] 
# move neighborhood column to the first column
fixed_columns = [etobicoke_onehot.columns[-1]] + list(etobicoke_onehot.columns[:-1])
etobicoke_onehot = etobicoke_onehot[fixed_columns]
#group rows by neighborhood and by taking the mean of the frequency of occurrence of each category
etobicoke_grouped = etobicoke_onehot.groupby('Neighborhood').mean().reset_index()

num_top_venues = 5
for hood in etobicoke_grouped['Neighborhood']:
    print("----"+hood+"----")
    temp = etobicoke_grouped[etobicoke_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Alderwood / Long Branch----
          venue  freq
0   Pizza Place  0.22
1           Gym  0.11
2      Pharmacy  0.11
3  Skating Rink  0.11
4          Pool  0.11


----Eringate / Bloordale Gardens / Old Burnhamthorpe / Markland Wood----
            venue  freq
0  Cosmetics Shop  0.12
1            Park  0.12
2      Beer Store  0.12
3     Pizza Place  0.12
4            Café  0.12


----Kingsview Village / St. Phillips / Martin Grove Gardens / Richview Gardens----
                 venue  freq
0    Mobile Phone Shop  0.25
1                 Park  0.25
2          Pizza Place  0.25
3       Sandwich Place  0.25
4  American Restaurant  0.00


----Mimico NW / The Queensway West / South of Bloor / Kingsway Park South West / Royal York South West----
               venue  freq
0        Wings Joint  0.06
1  Convenience Store  0.06
2         Kids Store  0.06
3             Bakery  0.06
4     Hardware Store  0.06


----New Toronto / Mimico South / Humber Bay Shores----
                 venue  freq
0

In [15]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [16]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = etobicoke_grouped['Neighborhood']

for ind in np.arange(etobicoke_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(etobicoke_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Alderwood / Long Branch,Pizza Place,Pub,Sandwich Place,Coffee Shop,Pharmacy,Pool,Gym,Skating Rink,Burger Joint,Bakery
1,Eringate / Bloordale Gardens / Old Burnhamthor...,Coffee Shop,Cosmetics Shop,Beer Store,Shopping Plaza,Liquor Store,Café,Park,Pizza Place,Fried Chicken Joint,Flower Shop
2,Kingsview Village / St. Phillips / Martin Grov...,Mobile Phone Shop,Park,Sandwich Place,Pizza Place,Wings Joint,Convenience Store,Fried Chicken Joint,Flower Shop,Fast Food Restaurant,Drugstore
3,Mimico NW / The Queensway West / South of Bloo...,Wings Joint,Burrito Place,Gym,Hardware Store,Thrift / Vintage Store,Flower Shop,Kids Store,Fast Food Restaurant,Discount Store,Convenience Store
4,New Toronto / Mimico South / Humber Bay Shores,American Restaurant,Café,Fried Chicken Joint,Flower Shop,Fast Food Restaurant,Liquor Store,Pharmacy,Pizza Place,Restaurant,Gym


In [17]:
# set number of clusters
kclusters = 5

etobicoke_grouped_clustering = etobicoke_grouped.drop('Neighborhood', 1)
# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(etobicoke_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([4, 2, 4, 2, 2, 0, 3, 2, 1, 4])

In [18]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

etobicoke_merged = etobicoke

# merge to add latitude/longitude for each neighborhood
etobicoke_merged = etobicoke_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood', how = 'right')

etobicoke_merged.head() # check the last columns!


Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
2,M9C,Etobicoke,Eringate / Bloordale Gardens / Old Burnhamthor...,43.643515,-79.577201,2,Coffee Shop,Cosmetics Shop,Beer Store,Shopping Plaza,Liquor Store,Café,Park,Pizza Place,Fried Chicken Joint,Flower Shop
3,M9P,Etobicoke,Westmount,43.696319,-79.532242,4,Pizza Place,Intersection,Sandwich Place,Discount Store,Chinese Restaurant,Coffee Shop,Burger Joint,Burrito Place,Beer Store,Café
4,M9R,Etobicoke,Kingsview Village / St. Phillips / Martin Grov...,43.688905,-79.554724,4,Mobile Phone Shop,Park,Sandwich Place,Pizza Place,Wings Joint,Convenience Store,Fried Chicken Joint,Flower Shop,Fast Food Restaurant,Drugstore
5,M8V,Etobicoke,New Toronto / Mimico South / Humber Bay Shores,43.605647,-79.501321,2,American Restaurant,Café,Fried Chicken Joint,Flower Shop,Fast Food Restaurant,Liquor Store,Pharmacy,Pizza Place,Restaurant,Gym
6,M9V,Etobicoke,South Steeles / Silverstone / Humbergate / Jam...,43.739416,-79.588437,2,Grocery Store,Fried Chicken Joint,Japanese Restaurant,Fast Food Restaurant,Beer Store,Liquor Store,Sandwich Place,Discount Store,Pharmacy,Pizza Place


In [19]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(etobicoke_merged['Latitude'], etobicoke_merged['Longitude'], etobicoke_merged['Neighborhood'], etobicoke_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

In [20]:
etobicoke_merged.loc[etobicoke_merged['Cluster Labels'] == 0, etobicoke_merged.columns[[1] + list(range(5, etobicoke_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
8,Etobicoke,0,Rental Car Location,Bar,Drugstore,Wings Joint,Grocery Store,Fried Chicken Joint,Flower Shop,Fast Food Restaurant,Discount Store,Cosmetics Shop


In [21]:
etobicoke_merged.loc[etobicoke_merged['Cluster Labels'] == 1, etobicoke_merged.columns[[1] + list(range(5, etobicoke_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
9,Etobicoke,1,Pool,Smoke Shop,Park,River,Wings Joint,Fried Chicken Joint,Flower Shop,Fast Food Restaurant,Drugstore,Discount Store


In [22]:
etobicoke_merged.loc[etobicoke_merged['Cluster Labels'] == 2, etobicoke_merged.columns[[1] + list(range(5, etobicoke_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
2,Etobicoke,2,Coffee Shop,Cosmetics Shop,Beer Store,Shopping Plaza,Liquor Store,Café,Park,Pizza Place,Fried Chicken Joint,Flower Shop
5,Etobicoke,2,American Restaurant,Café,Fried Chicken Joint,Flower Shop,Fast Food Restaurant,Liquor Store,Pharmacy,Pizza Place,Restaurant,Gym
6,Etobicoke,2,Grocery Store,Fried Chicken Joint,Japanese Restaurant,Fast Food Restaurant,Beer Store,Liquor Store,Sandwich Place,Discount Store,Pharmacy,Pizza Place
11,Etobicoke,2,Wings Joint,Burrito Place,Gym,Hardware Store,Thrift / Vintage Store,Flower Shop,Kids Store,Fast Food Restaurant,Discount Store,Convenience Store


In [23]:
etobicoke_merged.loc[etobicoke_merged['Cluster Labels'] == 3, etobicoke_merged.columns[[1] + list(range(5, etobicoke_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
10,Etobicoke,3,Baseball Field,Wings Joint,Convenience Store,Gym,Grocery Store,Fried Chicken Joint,Flower Shop,Fast Food Restaurant,Drugstore,Discount Store


In [24]:
etobicoke_merged.loc[etobicoke_merged['Cluster Labels'] == 4, etobicoke_merged.columns[[1] + list(range(5, etobicoke_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
3,Etobicoke,4,Pizza Place,Intersection,Sandwich Place,Discount Store,Chinese Restaurant,Coffee Shop,Burger Joint,Burrito Place,Beer Store,Café
4,Etobicoke,4,Mobile Phone Shop,Park,Sandwich Place,Pizza Place,Wings Joint,Convenience Store,Fried Chicken Joint,Flower Shop,Fast Food Restaurant,Drugstore
7,Etobicoke,4,Pizza Place,Pub,Sandwich Place,Coffee Shop,Pharmacy,Pool,Gym,Skating Rink,Burger Joint,Bakery
