# Part 1: Scraper

In [9]:
#%%
#import packages
import pandas as pd
from bs4 import BeautifulSoup
import requests

#%%
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
r = requests.get(url)

#%%
bs = BeautifulSoup(r.text, 'lxml')
table = bs.findAll('tbody')

#%%
#get table
t = table[0]

#%%
#build list for the table
table_cont = []
#get table head
thead = t.findAll('th')
head = []
for h in thead: head.append(h.text.replace('\n',''))
#get table content
trow = t.findAll('tr')
for i in range(len(trow)):
    r1 = trow[i].findAll('td')
    rc = []
    for element in r1:
        rc.append(element.text.replace('\n',''))
    table_cont.append(rc)
#change list to dataframe
df_table = pd.DataFrame(data = table_cont, columns = head)
#dataframe wrangling
df_table.dropna(axis=0, how = 'all', inplace = True)
df_table = df_table[df_table['Borough']!='Not assigned']
df_table.reset_index(drop = True, inplace = True)

df_table.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


# Part 2: Get coordinates

In [7]:
#Read coordinates
df_coord = pd.read_csv('https://cocl.us/Geospatial_data/Geospatial_Coordinates.csv')
df_ctab = df_table.merge(df_coord,left_on='Postal Code',right_on='Postal Code',how='left',suffixes=('_l','_r'))

In [8]:
print(df_ctab.head(),'\n\n',df_ctab.shape)

  Postal Code           Borough                                Neighbourhood  \
0         M3A        North York                                    Parkwoods   
1         M4A        North York                             Victoria Village   
2         M5A  Downtown Toronto                    Regent Park, Harbourfront   
3         M6A        North York             Lawrence Manor, Lawrence Heights   
4         M7A  Downtown Toronto  Queen's Park, Ontario Provincial Government   

    Latitude  Longitude  
0  43.753259 -79.329656  
1  43.725882 -79.315572  
2  43.654260 -79.360636  
3  43.718518 -79.464763  
4  43.662301 -79.389494   

 (103, 5)


# Part 3: Neighborhood analysis

In [11]:
#Get Neighborhood with borough name containing 'Toronto'
df_toronto = df_ctab[df_ctab['Borough'].str.contains('Toronto')]
df_toronto = df_toronto.reset_index()
df_toronto.head(3)

Unnamed: 0,index,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
1,4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
2,9,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937


In [12]:
#Define Foursquare credential
CLIENT_ID = 'IRKXXFZ3W5AWF25ZVSUVTXCZPCCZROGG3JH024WGIY3DYJNR' # your Foursquare ID
CLIENT_SECRET = 'DAMJQDDXDTMVPPR13DPM0KX3XKMMGJK40FFP1XFMWE4Z4ACK' # your Foursquare Secret
VERSION = '20200101' # Foursquare API version

In [13]:
#Get the top 100 venues in the first Neighborhood within 500 m radius
radius = 500
LIMIT = 100
neighborhood_latitude = df_toronto['Latitude'][0]
neighborhood_longitude = df_toronto['Longitude'][0]
url = 'https://api.foursquare.com/v2/venues/search?client_id={}&client_secret={}&ll={},{}&v={}&radius={}&limit={}'.format(CLIENT_ID, CLIENT_SECRET, neighborhood_latitude, neighborhood_longitude, VERSION, radius, LIMIT)
url

'https://api.foursquare.com/v2/venues/search?client_id=IRKXXFZ3W5AWF25ZVSUVTXCZPCCZROGG3JH024WGIY3DYJNR&client_secret=DAMJQDDXDTMVPPR13DPM0KX3XKMMGJK40FFP1XFMWE4Z4ACK&ll=43.6542599,-79.3606359&v=20200101&radius=500&limit=100'

In [14]:
#Get the json file of the 100 venues

import json

results = requests.get(url).json()

In [None]:
results

In [15]:
# tranform JSON file into a pandas dataframe

from pandas.io.json import json_normalize

venues = results['response']['venues']
nearby_venues = json_normalize(venues) # flatten JSON

# function that extracts the category of the venue
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

nearby_venues['categories'] = nearby_venues.apply(get_category_type, axis=1)
nearby_venues = nearby_venues.loc[:,['name','categories','location.lat','location.lng']]
nearby_venues = nearby_venues[nearby_venues['categories']!='Moving Target']
nearby_venues.head()

Unnamed: 0,name,categories,location.lat,location.lng
0,Oldtown Bodega,Café,43.653966,-79.360752
1,Sackville Playground,Park,43.654656,-79.359871
2,Tandem Coffee,Coffee Shop,43.653559,-79.361809
4,Terroni Sud Forno Produzione e Spaccio,Gourmet Shop,43.653903,-79.360018
5,Cam's Auto Service,Automotive Shop,43.654195,-79.360545


In [19]:
#define function to get venue list
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [23]:
#Get Toronto neighborhood venues

toronto_venues = getNearbyVenues(names=df_toronto['Neighbourhood'],
                                   latitudes=df_toronto['Latitude'],
                                   longitudes=df_toronto['Longitude']
                                  )
print(toronto_venues.shape)

Regent Park, Harbourfront
Queen's Park, Ontario Provincial Government
Garden District, Ryerson
St. James Town
The Beaches
Berczy Park
Central Bay Street
Christie
Richmond, Adelaide, King
Dufferin, Dovercourt Village
Harbourfront East, Union Station, Toronto Islands
Little Portugal, Trinity
The Danforth West, Riverdale
Toronto Dominion Centre, Design Exchange
Brockton, Parkdale Village, Exhibition Place
India Bazaar, The Beaches West
Commerce Court, Victoria Hotel
Studio District
Lawrence Park
Roselawn
Davisville North
Forest Hill North & West, Forest Hill Road Park
High Park, The Junction South
North Toronto West,  Lawrence Park
The Annex, North Midtown, Yorkville
Parkdale, Roncesvalles
Davisville
University of Toronto, Harbord
Runnymede, Swansea
Moore Park, Summerhill East
Kensington Market, Chinatown, Grange Park
Summerhill West, Rathnelly, South Hill, Forest Hill SE, Deer Park
CN Tower, King and Spadina, Railway Lands, Harbourfront West, Bathurst Quay, South Niagara, Island airport


In [24]:
#Analyze neighborhood
# one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
toronto_onehot['Neighborhood Name'] = toronto_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]

#Group the venues by neighborhood and calculate the frequency of the each category
toronto_grouped = toronto_onehot.groupby('Neighborhood Name').mean().reset_index()
toronto_grouped = toronto_grouped.set_index('Neighborhood Name')

toronto_grouped.head()

Unnamed: 0_level_0,Afghan Restaurant,Airport,Airport Food Court,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,Aquarium,Art Gallery,...,Theme Restaurant,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Women's Store,Yoga Studio
Neighborhood Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Berczy Park,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.017241,...,0.0,0.0,0.0,0.0,0.017241,0.0,0.0,0.0,0.0,0.0
"Brockton, Parkdale Village, Exhibition Place",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"Business reply mail Processing Centre, South Central Letter Processing Plant Toronto",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.058824
"CN Tower, King and Spadina, Railway Lands, Harbourfront West, Bathurst Quay, South Niagara, Island airport",0.0,0.0625,0.0625,0.125,0.125,0.125,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Central Bay Street,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.015625,0.0,0.0,0.015625,0.0,0.015625


In [25]:
#Get the 5 most frequent venues for each neighborhood

t_trans = toronto_grouped.T

list_5l = []
for i in range(len(toronto_grouped)):
    t = t_trans.nlargest(5,t_trans.columns[i])
    list_5l.append(list(t.index))
df_5l = pd.DataFrame(list_5l,index = toronto_grouped.index,columns = ['1st','2nd','3rd','4th','5th'])

df_5l.head()

Unnamed: 0_level_0,1st,2nd,3rd,4th,5th
Neighborhood Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Berczy Park,Coffee Shop,Bakery,Beer Bar,Café,Cheese Shop
"Brockton, Parkdale Village, Exhibition Place",Café,Breakfast Spot,Coffee Shop,Bakery,Bar
"Business reply mail Processing Centre, South Central Letter Processing Plant Toronto",Light Rail Station,Auto Workshop,Brewery,Burrito Place,Comic Shop
"CN Tower, King and Spadina, Railway Lands, Harbourfront West, Bathurst Quay, South Niagara, Island airport",Airport Lounge,Airport Service,Airport Terminal,Airport,Airport Food Court
Central Bay Street,Coffee Shop,Sandwich Place,Café,Italian Restaurant,Japanese Restaurant


In [26]:
#Clustering of the neighborhoods

# import k-means from clustering stage
from sklearn.cluster import KMeans

# set number of clusters
kclusters = 5

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped)

# check cluster labels generated for each row in the dataframe
kmeans.labels_

array([0, 0, 3, 3, 0, 0, 0, 0, 0, 3, 0, 0, 3, 0, 0, 0, 3, 0, 2, 0, 0, 0,
       3, 0, 0, 0, 1, 4, 0, 0, 0, 0, 0, 3, 0, 3, 0, 0, 0])

In [27]:
#Check the clusters
toronto_clustered = toronto_grouped
toronto_clustered['Cluster'] = kmeans.labels_
toronto_cluster = toronto_clustered.groupby('Cluster').mean()

#Get the 5 most frequent venues for each cluster

tc_trans = toronto_cluster.T

listc_5l = []
for i in range(len(toronto_cluster)):
    tc = tc_trans.nlargest(5,tc_trans.columns[i])
    listc_5l.append(list(tc.index))
dfc_5l = pd.DataFrame(listc_5l,index = toronto_cluster.index,columns = ['1st','2nd','3rd','4th','5th'])

dfc_5l

Unnamed: 0_level_0,1st,2nd,3rd,4th,5th
Cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,Coffee Shop,Café,Restaurant,Park,Gym
1,Park,Playground,Trail,Afghan Restaurant,Airport
2,Bus Line,Park,Swim School,Afghan Restaurant,Airport
3,Trail,Pub,Sushi Restaurant,Coffee Shop,Breakfast Spot
4,Garden,Health & Beauty Service,Home Service,Afghan Restaurant,Airport


In [28]:
#Summarize the results
df_sum = df_5l
df_sum['Cluster'] = kmeans.labels_
df_ctab.columns = ['Postal Code','Borough','Neighborhood Name','Latitude','Longitude']
df_sum = df_sum.join(df_ctab.set_index('Neighborhood Name'),on = 'Neighborhood Name',how='left')

df_sum.head(3)

Unnamed: 0_level_0,1st,2nd,3rd,4th,5th,Cluster,Postal Code,Borough,Latitude,Longitude
Neighborhood Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Berczy Park,Coffee Shop,Bakery,Beer Bar,Café,Cheese Shop,0,M5E,Downtown Toronto,43.644771,-79.373306
"Brockton, Parkdale Village, Exhibition Place",Café,Breakfast Spot,Coffee Shop,Bakery,Bar,0,M6K,West Toronto,43.636847,-79.428191
"Business reply mail Processing Centre, South Central Letter Processing Plant Toronto",Light Rail Station,Auto Workshop,Brewery,Burrito Place,Comic Shop,3,M7Y,East Toronto,43.662744,-79.321558


In [34]:
# create map to show the results

import folium # map rendering library

import numpy as np # library to handle data in a vectorized manner

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors


latitude = neighborhood_latitude
longitude = neighborhood_longitude

map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(df_sum['Latitude'], df_sum['Longitude'], df_sum.index, df_sum['Cluster']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)

map_clusters