### Web Scraping

In [228]:
# Import packages
import requests # library to handle requests
import pandas as pd # library for data analsysis
import numpy as np # library to handle data in a vectorized manner
import random # library for random number generation
import urllib.request
from bs4 import BeautifulSoup # scripping web

from geopy.geocoders import Nominatim # module to convert an address into latitude and longitude values
import geocoder
from sklearn.cluster import KMeans

# libraries for displaying images
from IPython.display import Image 
from IPython.core.display import HTML 

import matplotlib.cm as cm
import matplotlib.colors as colors
    
# tranforming json file into a pandas dataframe library
from pandas.io.json import json_normalize
import folium # plotting library

In [81]:
# scripping wiki page
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
page = urllib.request.urlopen(url)

In [82]:
# parse html from url 
soup = BeautifulSoup(page, 'lxml')
# find the table class
table = soup.find('table', class_='wikitable sortable')

In [84]:
# collect the text of each three-cell table row into three parallel lists
postal_code, borough, neighborhood = [], [], []
for table_row in table.findAll('tr'):
    values = [cell.text.strip() for cell in table_row.findAll('td')]
    # rows that do not have exactly three <td> cells are skipped
    if len(values) != 3:
        continue
    code, boro, hood = values
    postal_code.append(code)
    borough.append(boro)
    neighborhood.append(hood)

In [140]:
# build the dataframe directly from the three scraped lists
df = pd.DataFrame({
    'PostalCode': postal_code,
    'Borough': borough,
    'Neighborhood': neighborhood,
})
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


In [141]:
# clean dataframe, ignore cells with a borough that is Not assigned.
df = df[df.Borough != 'Not assigned']
df.Borough.value_counts()

North York          24
Downtown Toronto    19
Scarborough         17
Etobicoke           12
Central Toronto      9
West Toronto         6
East Toronto         5
East York            5
York                 5
Mississauga          1
Name: Borough, dtype: int64

In [142]:
# duplicated postal code 
df.duplicated().sum()

0

In [143]:
# not assigned neighbors
df[df.Neighborhood == 'Not assigned']

Unnamed: 0,PostalCode,Borough,Neighborhood


Seems like wiki has fixed these problems by itself!

In [144]:
# renumber the rows from 0 after filtering and discard the old index
df = df.reset_index(drop=True)
print(df.shape)

(103, 3)


### Coordinates matching

In [145]:
# use geocoder takes really long, use csv instead
df_coord = pd.read_csv('Geospatial_Coordinates.csv')
df_coord.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [146]:
df_merge = pd.merge(df, df_coord, left_on='PostalCode', right_on='Postal Code', how='left').drop('Postal Code', axis=1)

In [147]:
df_merge.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494


### Clustering Analysis

In [149]:
import getpass

In [150]:
CLIENT_ID = getpass.getpass()

········


In [151]:
CLIENT_SECRET = getpass.getpass()

········


In [153]:
VERSION = '20200601' # Foursquare API version

In [158]:
# print out all unique boroughs
df_merge.Borough.unique()

array(['North York', 'Downtown Toronto', 'Etobicoke', 'Scarborough',
       'East York', 'York', 'East Toronto', 'West Toronto',
       'Central Toronto', 'Mississauga'], dtype=object)

Some of the boroughs contain multiple postal codes, and some neighborhoods overlap (different postal codes share the same neighborhood). Therefore, I decided to use the postal code as the unit of analysis for the clustering.

In [173]:
# get top 100 venues around the first row of df_merge (M3A) within a radius of 1000 meters
Radius = 1000
LIMIT = 100
url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    df_merge.iloc[0, 3],  # column 3 of df_merge is Latitude
    df_merge.iloc[0, 4],  # column 4 of df_merge is Longitude
    Radius, 
    LIMIT)

In [174]:
# request foursquare API and get the results
results = requests.get(url).json()

In [175]:
# function that extracts the category of the venue
def get_category_type(row):
    """Return the name of the first category listed for a venue.

    Parameters
    ----------
    row : mapping or pandas.Series
        A venue record. The category list is read from ``row['categories']``
        and, when that key is absent, from ``row['venue.categories']``.

    Returns
    -------
    str or None
        ``categories_list[0]['name']``, or ``None`` when the category list is
        empty or ``None``.

    Raises
    ------
    KeyError
        If neither ``'categories'`` nor ``'venue.categories'`` is present.
    """
    try:
        categories_list = row['categories']
    except KeyError:
        # only a missing key triggers the fallback; any other error propagates
        categories_list = row['venue.categories']

    if categories_list is None or len(categories_list) == 0:
        return None
    return categories_list[0]['name']

In [176]:
# get the result into a pandas dataframe
venues = results['response']['groups'][0]['items']
# use the top-level pd.json_normalize; the pandas.io.json alias is deprecated
# since pandas 1.0
nearby_venues = pd.json_normalize(venues) # flatten JSON
# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues = nearby_venues.loc[:, filtered_columns]
# replace each row's category list by the name of its first category
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)
# clean columns: keep only the part of each column name after the last dot
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]
nearby_venues.head()

  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,name,categories,lat,lng
0,Allwyn's Bakery,Caribbean Restaurant,43.75984,-79.324719
1,Brookbanks Park,Park,43.751976,-79.33214
2,Tim Hortons,Café,43.760668,-79.326368
3,A&W,Fast Food Restaurant,43.760643,-79.326865
4,Bruno's valu-mart,Grocery Store,43.746143,-79.32463


Next, we will write a function that runs through all the postal codes.

In [191]:
def getNearbyVenues(postal, latitudes, longitudes, radius=1000, LIMIT=100):
    """Query the Foursquare explore endpoint once per location and tabulate the venues.

    Parameters
    ----------
    postal, latitudes, longitudes : iterables of equal length
        Iterated together with ``zip``; each triple describes one location.
    radius : int, default 1000
        Value sent as the ``radius`` query parameter.
    LIMIT : int, default 100
        Value sent as the ``limit`` query parameter.

    Returns
    -------
    pandas.DataFrame
        One row per returned venue with the columns 'Postal Code',
        'Neighborhood Latitude', 'Neighborhood Longitude', 'Venue',
        'Venue Latitude', 'Venue Longitude' and 'Venue Category'.
        'Venue Category' is ``None`` for a venue without categories. The frame
        is empty (but has these columns) when no venue is returned.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status code.
    """
    column_names = ['Postal Code',
                    'Neighborhood Latitude',
                    'Neighborhood Longitude',
                    'Venue',
                    'Venue Latitude',
                    'Venue Longitude',
                    'Venue Category']

    rows = []
    # distinct loop names so the `postal` argument is not rebound while iterating
    for code, lat, lng in zip(postal, latitudes, longitudes):
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID,
            CLIENT_SECRET,
            VERSION,
            lat,
            lng,
            radius,
            LIMIT)

        # make the GET request; fail with the HTTP error rather than a KeyError
        # on the JSON body, and do not wait forever on a stalled connection
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        groups = response.json()["response"]['groups']
        items = groups[0]['items'] if groups else []

        # keep only the relevant information for each nearby venue
        for item in items:
            venue = item['venue']
            categories = venue['categories']
            rows.append((
                code,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                # a venue may carry no category at all
                categories[0]['name'] if categories else None))

    # passing the columns to the constructor also works when `rows` is empty
    nearby_venues = pd.DataFrame(rows, columns=column_names)

    return(nearby_venues)

In [190]:
# run through df_merge and fetch the nearby venues for every postal code
toronto_venues = getNearbyVenues(postal=df_merge['PostalCode'],
                                   latitudes=df_merge['Latitude'],
                                   longitudes=df_merge['Longitude']
                                  )

M3A
M4A
M5A
M6A
M7A
M9A
M1B
M3B
M4B
M5B
M6B
M9B
M1C
M3C
M4C
M5C
M6C
M9C
M1E
M4E
M5E
M6E
M1G
M4G
M5G
M6G
M1H
M2H
M3H
M4H
M5H
M6H
M1J
M2J
M3J
M4J
M5J
M6J
M1K
M2K
M3K
M4K
M5K
M6K
M1L
M2L
M3L
M4L
M5L
M6L
M9L
M1M
M2M
M3M
M4M
M5M
M6M
M9M
M1N
M2N
M3N
M4N
M5N
M6N
M9N
M1P
M2P
M4P
M5P
M6P
M9P
M1R
M2R
M4R
M5R
M6R
M7R
M9R
M1S
M4S
M5S
M6S
M1T
M4T
M5T
M1V
M4V
M5V
M8V
M9V
M1W
M4W
M5W
M8W
M9W
M1X
M4X
M5X
M8X
M4Y
M7Y
M8Y
M8Z


In [192]:
# check the size of the dataframe
print(toronto_venues.shape)
toronto_venues.head()

(4893, 7)


Unnamed: 0,Postal Code,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,M3A,43.753259,-79.329656,Allwyn's Bakery,43.75984,-79.324719,Caribbean Restaurant
1,M3A,43.753259,-79.329656,Brookbanks Park,43.751976,-79.33214,Park
2,M3A,43.753259,-79.329656,Tim Hortons,43.760668,-79.326368,Café
3,M3A,43.753259,-79.329656,A&W,43.760643,-79.326865,Fast Food Restaurant
4,M3A,43.753259,-79.329656,Bruno's valu-mart,43.746143,-79.32463,Grocery Store


In [196]:
# check how many vanues by each postal code
toronto_venues.groupby('Postal Code')['Venue'].count()

Postal Code
M1B    17
M1C     5
M1E    25
M1G     9
M1H    30
       ..
M9N    19
M9P    17
M9R    16
M9V    17
M9W     2
Name: Venue, Length: 102, dtype: int64

In [197]:
# check unique categories of all venues
print('There are {} uniques categories.'.format(len(toronto_venues['Venue Category'].unique())))

There are 331 uniques categories.


Next we want to use onehot encoding to analyze venue categories for each postal code.

In [198]:
# one hot encoding: one indicator column per venue category
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# put the postal code in front of the indicator columns
toronto_onehot.insert(0, 'Postal Code', toronto_venues['Postal Code'])
toronto_onehot.head()

Unnamed: 0,Postal Code,Accessories Store,Afghan Restaurant,African Restaurant,Airport,Airport Lounge,American Restaurant,Amphitheater,Animal Shelter,Antique Shop,...,Video Store,Vietnamese Restaurant,Warehouse Store,Whisky Bar,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio,Zoo
0,M3A,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,M3A,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,M3A,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,M3A,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,M3A,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


And let's examine the new dataframe size.

In [200]:
toronto_onehot.shape

(4893, 332)

**Next, let's group rows by postal code and take the mean of the frequency of occurrence of each category.**

In [202]:
toronto_grouped = toronto_onehot.groupby('Postal Code').mean().reset_index()
toronto_grouped.head()

Unnamed: 0,Postal Code,Accessories Store,Afghan Restaurant,African Restaurant,Airport,Airport Lounge,American Restaurant,Amphitheater,Animal Shelter,Antique Shop,...,Video Store,Vietnamese Restaurant,Warehouse Store,Whisky Bar,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio,Zoo
0,M1B,0.0,0.0,0.058824,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,M1C,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,M1E,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,M1G,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,M1H,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.033333,0.0,0.033333,0.0


In [203]:
toronto_grouped.shape

(102, 332)

In [214]:
# look at the top n venue types in toronto each postal code
def return_most_common_venues(row, num_top_venues):
    """Return the index labels of the largest entries of ``row``.

    The first element of ``row`` is skipped before ranking (the caller in this
    notebook passes rows whose first element is the postal code). The
    remaining values are sorted in descending order and the labels of the
    first ``num_top_venues`` of them are returned as an array.
    """
    ranked = row.iloc[1:].sort_values(ascending=False)
    return ranked.index.values[:num_top_venues]

Create top 10 venues for each postal code.

In [216]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Postal Code']
for ind in np.arange(num_top_venues):
    # ranks 1-3 take their own ordinal suffix, every later rank takes 'th';
    # an explicit bounds check replaces the former bare `except:`
    suffix = indicators[ind] if ind < len(indicators) else 'th'
    columns.append('{}{} Most Common Venue'.format(ind+1, suffix))

# create a new dataframe with one row per postal code
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Postal Code'] = toronto_grouped['Postal Code']

# fill the ranked venue categories row by row
for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Postal Code,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M1B,Restaurant,Fast Food Restaurant,Trail,Coffee Shop,Supermarket,Paper / Office Supplies Store,Auto Workshop,Spa,Caribbean Restaurant,Chinese Restaurant
1,M1C,Breakfast Spot,Italian Restaurant,Playground,Burger Joint,Park,Electronics Store,Ethiopian Restaurant,Event Space,Falafel Restaurant,Farm
2,M1E,Pizza Place,Bank,Fast Food Restaurant,Coffee Shop,Fried Chicken Joint,Liquor Store,Discount Store,Supermarket,Beer Store,Electronics Store
3,M1G,Coffee Shop,Park,Mobile Phone Shop,Chinese Restaurant,Indian Restaurant,Pharmacy,Fast Food Restaurant,Farmers Market,Eastern European Restaurant,Electronics Store
4,M1H,Bakery,Coffee Shop,Gas Station,Bank,Indian Restaurant,Hakka Restaurant,Caribbean Restaurant,Bus Line,Martial Arts Dojo,Thai Restaurant


### Cluster Neighborhood analysis 

In [244]:
# set number of clusters
kclusters = 5

# keep only the category-frequency columns for clustering; `columns=` replaces
# the positional axis argument, which pandas 2.0 no longer accepts
toronto_grouped_clustering = toronto_grouped.drop(columns='Postal Code')

# run k-means clustering (fixed random_state for repeatable labels)
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for the first 20 rows of the dataframe
kmeans.labels_[0:20]

array([1, 4, 4, 4, 1, 4, 4, 4, 4, 4, 1, 1, 1, 4, 4, 4, 4, 1, 4, 3],
      dtype=int32)

Let's create a new dataframe that includes the cluster label as well as the top 10 venues for each postal code.

In [245]:
# add clustering labels
neighborhoods_venues_sorted['Cluster Labels'] = kmeans.labels_
neighborhoods_venues_sorted.head()

Unnamed: 0,Cluster Labels,Postal Code,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,1,M1B,Restaurant,Fast Food Restaurant,Trail,Coffee Shop,Supermarket,Paper / Office Supplies Store,Auto Workshop,Spa,Caribbean Restaurant,Chinese Restaurant
1,4,M1C,Breakfast Spot,Italian Restaurant,Playground,Burger Joint,Park,Electronics Store,Ethiopian Restaurant,Event Space,Falafel Restaurant,Farm
2,4,M1E,Pizza Place,Bank,Fast Food Restaurant,Coffee Shop,Fried Chicken Joint,Liquor Store,Discount Store,Supermarket,Beer Store,Electronics Store
3,4,M1G,Coffee Shop,Park,Mobile Phone Shop,Chinese Restaurant,Indian Restaurant,Pharmacy,Fast Food Restaurant,Farmers Market,Eastern European Restaurant,Electronics Store
4,1,M1H,Bakery,Coffee Shop,Gas Station,Bank,Indian Restaurant,Hakka Restaurant,Caribbean Restaurant,Bus Line,Martial Arts Dojo,Thai Restaurant


In [246]:
# merge df_merge (borough, neighborhood, latitude/longitude) with the cluster labels and top venues of each postal code
df_sum = pd.merge(df_merge, neighborhoods_venues_sorted, left_on='PostalCode', right_on='Postal Code').drop('Postal Code', axis=1)

df_sum.head() # check the last columns!

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M3A,North York,Parkwoods,43.753259,-79.329656,4,Park,Convenience Store,Pharmacy,Bus Stop,Shopping Mall,Café,Tennis Court,Chinese Restaurant,Caribbean Restaurant,Laundry Service
1,M4A,North York,Victoria Village,43.725882,-79.315572,1,Coffee Shop,Men's Store,Lounge,Golf Course,Park,Grocery Store,Café,Gym / Fitness Center,Portuguese Restaurant,Sporting Goods Shop
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636,1,Coffee Shop,Café,Theater,Pub,Park,Diner,Italian Restaurant,Restaurant,Bakery,Breakfast Spot
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763,1,Fast Food Restaurant,Furniture / Home Store,Coffee Shop,Clothing Store,Dessert Shop,Restaurant,Vietnamese Restaurant,Fried Chicken Joint,Sushi Restaurant,Bank
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494,1,Coffee Shop,Sushi Restaurant,Park,Japanese Restaurant,Ramen Restaurant,Café,Burger Joint,Clothing Store,Italian Restaurant,Bookstore


In [247]:
# get toronto lat and lon
address = 'Toronto, Ontario'

geolocator = Nominatim(user_agent="foursquare_agent")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print(latitude, longitude)

43.6534817 -79.3839347


In [248]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters: one evenly spaced rainbow color per label
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add one circle marker per postal code to the map
for lat, lon, poi, cluster in zip(df_sum['Latitude'], df_sum['Longitude'], df_sum['Borough'], df_sum['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        # labels run from 0 to kclusters-1, so index the palette directly;
        # `cluster-1` sent label 0 to the last color via negative indexing
        color=rainbow[cluster],
        fill=True,
        fill_color=rainbow[cluster],
        fill_opacity=0.7).add_to(map_clusters)

map_clusters

### Examine clusters

One interesting thing is that clusters 0, 2 and 3 each contain only one postal code. Let's check what makes them unique.

In [290]:
# check cluster 0 first
c_0 = df_sum.loc[df_sum['Cluster Labels']==0, :].PostalCode.values[0]
# sort by the selected row's own index label instead of a hard-coded number,
# so the cell keeps working when the row order or the clustering changes
c_0_row = toronto_grouped[toronto_grouped['Postal Code'] == c_0]
c_0_row.T[1:].sort_values(by=c_0_row.index[0], ascending=False)[:5]

Unnamed: 0,31
Vietnamese Restaurant,0.5
Food Truck,0.25
Baseball Field,0.25
Accessories Store,0.0
Nightclub,0.0


In [287]:
# check cluster 2
c_2 = df_sum.loc[df_sum['Cluster Labels']==2, :].PostalCode.values[0]
# sort by the selected row's own index label instead of a hard-coded number,
# so the cell keeps working when the row order or the clustering changes
c_2_row = toronto_grouped[toronto_grouped['Postal Code'] == c_2]
c_2_row.T[1:].sort_values(by=c_2_row.index[0], ascending=False)[:5]

Unnamed: 0,101
Coffee Shop,0.5
Lounge,0.5
Accessories Store,0.0
Other Great Outdoors,0.0
Organic Grocery,0.0


In [289]:
# check cluster 3
c_3 = df_sum.loc[df_sum['Cluster Labels']==3, :].PostalCode.values[0]
# sort by the selected row's own index label instead of a hard-coded number,
# so the cell keeps working when the row order or the clustering changes
c_3_row = toronto_grouped[toronto_grouped['Postal Code'] == c_3]
c_3_row.T[1:].sort_values(by=c_3_row.index[0], ascending=False)[:5]

Unnamed: 0,19
Park,0.75
Pool,0.25
Accessories Store,0.0
New American Restaurant,0.0
Organic Grocery,0.0


In [310]:
# check cluster 1
c_1 = df_sum.loc[df_sum['Cluster Labels']==1, :].PostalCode.values
toronto_grouped[toronto_grouped['Postal Code'].isin(c_1)].iloc[:,1:].mean().sort_values(ascending=False)[:10]

Coffee Shop            0.082615
Café                   0.045896
Restaurant             0.028735
Pizza Place            0.027611
Park                   0.025748
Italian Restaurant     0.022873
Bakery                 0.022656
Japanese Restaurant    0.018203
Grocery Store          0.018132
Sandwich Place         0.017383
dtype: float64

In [309]:
# check cluster 4: mean category frequency over all of its postal codes, top 10
c_4 = df_sum.loc[df_sum['Cluster Labels']==4, :].PostalCode.values
toronto_grouped[toronto_grouped['Postal Code'].isin(c_4)].iloc[:,1:].mean().sort_values(ascending=False)[:10]

Park                    0.088410
Pizza Place             0.069247
Coffee Shop             0.061051
Grocery Store           0.043590
Pharmacy                0.043048
Bank                    0.035407
Convenience Store       0.033504
Chinese Restaurant      0.027027
Fast Food Restaurant    0.024118
Sandwich Place          0.023073
dtype: float64

**Conclusion:**

- We can see that clusters 0, 2 and 3 have too little information. Cluster 0 seems to have Vietnamese restaurants, a food truck and a baseball field. Cluster 2 has a coffee shop and a lounge. Cluster 3 has a pool and parks. <br/>
- Cluster 1 has a lot of coffee shops, cafés, restaurants and pizza/Italian restaurants. From the map, the cluster 1 postal codes are concentrated in downtown Toronto. <br/>
- Cluster 4 has parks, pizza places, coffee shops, some grocery stores and convenience stores. From the map, they are clustered in the boroughs surrounding downtown, such as North York and Scarborough.