 # Segmenting and Clustering Neighborhoods in east_york

This code will import pandas library and use .read_html() to scrape the Wikipedia page containing east_york postal codes, boroughs and neighbourhood.

In [1]:
import pandas as pd  #import pandas library
import numpy as np
!pip install geopy
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

!pip install folium
import folium # map rendering library

print('Libraries imported.')
url='https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'  #assign wikipedia page containing table to url
dfs = pd.read_html(url)
df=dfs[0]

#inspect data
print(df.shape)
df.head(4) 

Libraries imported.
(180, 3)


Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village


The code below assigns rows where borough are assigned to df_ba.

In [2]:
df_ba = df[df['Borough']!='Not assigned'] 

The code below resets the index and inspects the dataframe.

In [3]:
df_ba.reset_index(inplace=True)
df_ba=df_ba.drop(columns=['index'])
df_ba.head(10)

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
6,M1B,Scarborough,"Malvern, Rouge"
7,M3B,North York,Don Mills
8,M4B,East York,"Parkview Hill, Woodbine Gardens"
9,M5B,Downtown Toronto,"Garden District, Ryerson"


The code below prints the number of rows and columns in the dataframe.

In [4]:
df_ba.shape

(103, 3)

The code below loads the geospatial data in a dataframe and inspects the dataframe.

In [5]:
geo_coord = pd.read_csv('https://cocl.us/Geospatial_data')
print(geo_coord.head())
geo_coord.shape

  Postal Code   Latitude  Longitude
0         M1B  43.806686 -79.194353
1         M1C  43.784535 -79.160497
2         M1E  43.763573 -79.188711
3         M1G  43.770992 -79.216917
4         M1H  43.773136 -79.239476


(103, 3)

The code below merged the two dataframe on postal code and inspects the new dataframe.

In [6]:
df = pd.merge(df_ba,geo_coord, on='Postal Code',how='inner')
df.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494


In order to define an instance of the geocoder, we need to define a user_agent. We will name our agent tr_explorer, as shown below.

In [7]:
address = 'Toronto, ON'

geolocator = Nominatim(user_agent="tr_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.6534817, -79.3839347.


The code below will create a map of Toronto with neighborhoods superimposed on top.

In [8]:
# create map of east_york using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(df['Latitude'], df['Longitude'], df['Borough'], df['Neighbourhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

 The code below will slice the original dataframe and create a new dataframe of only neighbourhoods with 'Toronto' in their borough name.

In [9]:
toronto_data = df[df['Borough'].str.contains("Toronto")].reset_index(drop=True)
toronto_data.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
1,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
2,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
3,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
4,M4E,East Toronto,The Beaches,43.676357,-79.293031


In order to define an instance of the geocoder, we need to define a user_agent. We will name our agent tr_explorer, as shown below.

In [10]:
address = 'Toronto, ON'

geolocator = Nominatim(user_agent="tr_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.6534817, -79.3839347.


The code below will create a map of Toronto with neighborhoods with 'Toronto' in their borough name superimposed on top.

In [11]:
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(df['Latitude'], df['Longitude'], df['Borough'], df['Neighbourhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

Here, we will define Foursquare credentials and versions. Insert your credentials to test code.

In [12]:
CLIENT_ID = '' # my Foursquare ID
CLIENT_SECRET = '' # my Foursquare Secret
VERSION = '' # Foursquare API version
LIMIT = 100 # A default Foursquare API limit value

print('My credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

My credentails:
CLIENT_ID: 
CLIENT_SECRET:


The code below returns the 5th neighborhood's name.

In [13]:
toronto_data.loc[4, 'Neighbourhood']

'The Beaches'

The code below gets the neighborhood's latitude and longitude values.

In [14]:
neighborhood_latitude = toronto_data.loc[4, 'Latitude'] # neighborhood latitude value
neighborhood_longitude = toronto_data.loc[4, 'Longitude'] # neighborhood longitude value

neighborhood_name = toronto_data.loc[4, 'Neighbourhood'] # neighborhood name

print('Latitude and longitude values of {} are {}, {}.'.format(neighborhood_name, 
                                                               neighborhood_latitude, 
                                                               neighborhood_longitude))

Latitude and longitude values of The Beaches are 43.67635739999999, -79.2930312.


Now, let's get the top 100 venues that are in The Beaches within a radius of 500 meters. The code below create the GET request URL.

In [15]:
LIMIT=100
radius = 500 # define radius


# create URL
url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    neighborhood_latitude, 
    neighborhood_longitude, 
    radius, 
    LIMIT)

The code below sends the GET request and examine the resutls.

In [16]:
results = requests.get(url).json()
results

{'meta': {'code': 400,
  'errorType': 'invalid_auth',
  'errorDetail': 'Missing access credentials. See https://developer.foursquare.com/docs/api/configuration/authentication for details.',
  'requestId': '602144982819f13b48816f4b'},
 'response': {}}

The code below creates a get_category_type function.

In [17]:
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

Code below will clean the json and structure it into a pandas dataframe.

In [18]:
venues = results['response']['groups'][0]['items']
    
nearby_venues = json_normalize(venues) # flatten JSON

# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues =nearby_venues.loc[:, filtered_columns]

# filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues.head()

KeyError: 'groups'

In [None]:
print('{} venues were returned by Foursquare.'.format(nearby_venues.shape[0]))

The function below repeats the same process to all the neighborhoods 'Toronto' in their borough name.

In [None]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    return(nearby_venues)

The code below runs the above function on each neighborhood and create a new dataframe called b_with_toronto_venues.

In [None]:
b_with_toronto_venues = getNearbyVenues(names=toronto_data['Neighbourhood'],
                                   latitudes=toronto_data['Latitude'],
                                   longitudes=toronto_data['Longitude']
                                  )

Let us inspect the resulting dataframe.

In [None]:
print(b_with_toronto_venues.shape)
b_with_toronto_venues.head()

Let's check how many venues were returned for each neighborhood.

In [None]:
b_with_toronto_venues.groupby('Neighborhood').count()

Let's find out how many unique categories can be curated from all the returned venues.

In [None]:
print('There are {} uniques categories.'.format(len(b_with_toronto_venues['Venue Category'].unique())))

Let's analyse each neighbourhood.

In [None]:
# one hot encoding
b_with_toronto_onehot = pd.get_dummies(b_with_toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
b_with_toronto_onehot['Neighbourhood'] = b_with_toronto_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [b_with_toronto_onehot.columns[-1]] + list(b_with_toronto_onehot.columns[:-1])
b_with_toronto_onehot = b_with_toronto_onehot[fixed_columns]

b_with_toronto_onehot.head()

Let's examine the dataframe.

In [None]:
b_with_toronto_onehot.shape

The code below groups rows by neighborhood and by taking the mean of the frequency of occurrence of each category

In [None]:
b_with_toronto_grouped = b_with_toronto_onehot.groupby('Neighbourhood').mean().reset_index()
b_with_toronto_grouped

Let's examine the new size.

In [None]:
b_with_toronto_grouped.shape

The code below prints each neighborhood along with the top 5 most common venues

In [None]:
num_top_venues = 5

for hood in b_with_toronto_grouped['Neighbourhood']:
    print("----"+hood+"----")
    temp = b_with_toronto_grouped[b_with_toronto_grouped['Neighbourhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

The function below sorts the venues in descending order.

In [None]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

We will create a new dataframe and display the top 10 venues for each neighborhood.

In [None]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighbourhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighbourhoods_venues_sorted = pd.DataFrame(columns=columns)
neighbourhoods_venues_sorted['Neighbourhood'] = b_with_toronto_grouped['Neighbourhood']

for ind in np.arange(b_with_toronto_grouped.shape[0]):
    neighbourhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(b_with_toronto_grouped.iloc[ind, :], num_top_venues)

neighbourhoods_venues_sorted.head()

The code below runs k-means to cluster the neighborhood into 5 clusters.

In [None]:
# set number of clusters
kclusters = 5

b_with_toronto_grouped_clustering = b_with_toronto_grouped.drop('Neighbourhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(b_with_toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

Let's create a new dataframe that includes the cluster as well as the top 10 venues for each neighborhood.

In [None]:
# add clustering labels
neighbourhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

b_with_toronto_merged = toronto_data

# merge manhattan_grouped with manhattan_data to add latitude/longitude for each neighborhood
b_with_toronto_merged = b_with_toronto_merged.join(neighbourhoods_venues_sorted.set_index('Neighbourhood'), on='Neighbourhood')

b_with_toronto_merged.head() # check the last columns!

Let's visualize the resulting clusters.

In [None]:
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(b_with_toronto_merged['Latitude'], b_with_toronto_merged['Longitude'], b_with_toronto_merged['Neighbourhood'], b_with_toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

The following lines of code examine each cluster.

In [None]:
b_with_toronto_merged.loc[b_with_toronto_merged['Cluster Labels'] == 0, b_with_toronto_merged.columns[[1] + list(range(5, b_with_toronto_merged.shape[1]))]]

In [None]:
b_with_toronto_merged.loc[b_with_toronto_merged['Cluster Labels'] == 1, b_with_toronto_merged.columns[[1] + list(range(5, b_with_toronto_merged.shape[1]))]]

In [None]:
b_with_toronto_merged.loc[b_with_toronto_merged['Cluster Labels'] == 2, b_with_toronto_merged.columns[[1] + list(range(5, b_with_toronto_merged.shape[1]))]]

In [None]:
b_with_toronto_merged.loc[b_with_toronto_merged['Cluster Labels'] == 3, b_with_toronto_merged.columns[[1] + list(range(5, b_with_toronto_merged.shape[1]))]]

In [None]:
b_with_toronto_merged.loc[b_with_toronto_merged['Cluster Labels'] == 4, b_with_toronto_merged.columns[[1] + list(range(5, b_with_toronto_merged.shape[1]))]]