# Segmenting and Clustering Neighborhoods in Toronto
This notebook is used for the Week 3 Assignment

## Import libraries

In [1]:
import re
import requests
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup

## Scrape basic neighborhood information

In [2]:
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
# Bound the wait and stop on an HTTP error status, so an error page is never
# handed to the HTML parser as if it were the postal code table.
response = requests.get(url, timeout = 30)
response.raise_for_status()
result = response.content

In [3]:
# Parse the downloaded page and collect every table row (<tr>) found under
# the first <tbody> element.
soup = BeautifulSoup(result, "html.parser")
code_collections = soup.tbody.find_all('tr')

In [4]:
def parse_neighborhood(content):
    """Return the neighborhood names found inside parentheses in ``content``.

    Every ``(...)`` group is extracted, each ``/`` inside a group is replaced
    by a comma, and the groups are joined with commas. Whitespace around the
    names is left untouched.

    Parameters
    ----------
    content : str
        Text of one table cell.

    Returns
    -------
    str
        Comma-separated names, or an empty string when ``content`` contains
        no parenthesised group.
    """
    # Raw string: "\(" and "\)" are invalid escape sequences in a normal
    # literal and raise a warning on recent Python versions.
    groups = re.findall(r"(?<=\()[^\)]+(?=\))", content)
    return ",".join(group.replace("/", ",") for group in groups)

In [5]:
# Collect one record per assigned postal code and build the frame once at the
# end: DataFrame.append was removed in pandas 2.0 and copied the whole frame
# on every call.
records = []
for row in code_collections:
    for entry in row.find_all('td'):
        # skip cells whose postal code is not assigned
        if 'Not assigned' in entry.get_text():
            continue
        content = entry.find('span')
        # the borough is whatever sits directly before the first <br>
        borough_element = content.find('br').previous_sibling
        try:
            # when the element is a hyperlink, take its text
            borough = borough_element.get_text()
        except AttributeError:
            # when the element is just plain text
            borough = borough_element
        records.append({
            "PostalCode": entry.find_all('b')[0].get_text(),
            "Borough": borough,
            "Neighborhood": parse_neighborhood(content.get_text()),
        })
toronto_data = pd.DataFrame(records, columns = ["PostalCode", "Borough", "Neighborhood"])

In [6]:
# Show the scraped postal code table.
toronto_data

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park , Harbourfront"
3,M6A,North York,"Lawrence Manor , Lawrence Heights"
4,M7A,Queen's Park,Ontario Provincial Government
...,...,...,...
98,M8X,Etobicoke,"The Kingsway , Montgomery Road , Old Mill North"
99,M4Y,Downtown Toronto,Church and Wellesley
100,M7Y,East Toronto,Enclave of M4L
101,M8Y,Etobicoke,"Old Mill South , King's Mill Park , Sunnylea ,..."


In [7]:
# Order the rows by postal code and renumber the index from 0.
toronto_data = toronto_data.sort_values(by = "PostalCode", ignore_index = True)
toronto_data.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Malvern , Rouge"
1,M1C,Scarborough,"Rouge Hill , Port Union , Highland Creek"
2,M1E,Scarborough,"Guildwood , Morningside , West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [8]:
# (rows, columns) of the sorted postal code table
toronto_data.shape

(103, 3)

## Add coordinates to the dataframe

Using the CSV file of geospatial coordinates, the latitude and longitude of each postal code can be added to the dataframe.

In [9]:
!pip install wget

  from cryptography.utils import int_from_bytes
  from cryptography.utils import int_from_bytes
Collecting wget
  Downloading wget-3.2.zip (10 kB)
Building wheels for collected packages: wget
  Building wheel for wget (setup.py) ... [?25ldone
[?25h  Created wheel for wget: filename=wget-3.2-py3-none-any.whl size=9681 sha256=b251cf442f8ea93fb92330390c7985ff42db1ea40953f20fe6affe68c651af7a
  Stored in directory: /tmp/wsuser/.cache/pip/wheels/a1/b6/7c/0e63e34eb06634181c63adacca38b79ff8f35c37e3c13e3c02
Successfully built wget
Installing collected packages: wget
Successfully installed wget-3.2


In [11]:
import wget

# CSV with one latitude/longitude pair per postal code; wget.download saves it
# in the working directory and returns the local file name.
geo_csv_url = "https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBMDeveloperSkillsNetwork-DS0701EN-SkillsNetwork/labs_v1/Geospatial_Coordinates.csv"
wget.download(geo_csv_url)

'Geospatial_Coordinates.csv'

In [12]:
# Load the downloaded coordinates file into a dataframe.
geo_data = pd.read_csv("Geospatial_Coordinates.csv")

In [13]:
# Preview the first rows of the coordinates table.
geo_data.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [14]:
# Attach the coordinates by postal code, then discard the duplicate key column
# that comes from the coordinates table.
toronto_data = (
    toronto_data
    .merge(geo_data, left_on = "PostalCode", right_on = "Postal Code")
    .drop(columns = ["Postal Code"])
)
toronto_data.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern , Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill , Port Union , Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood , Morningside , West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


In [15]:
# (rows, columns) after adding Latitude and Longitude
toronto_data.shape

(103, 5)

## Explore the venues for each neighborhood

Use the Foursquare API to explore every neighborhood and find the most common types of venue nearby.

In [16]:
!pip install folium

  from cryptography.utils import int_from_bytes
  from cryptography.utils import int_from_bytes
Collecting folium
  Downloading folium-0.12.1-py2.py3-none-any.whl (94 kB)
[K     |████████████████████████████████| 94 kB 6.4 MB/s  eta 0:00:01
[?25hCollecting branca>=0.3.0
  Downloading branca-0.4.2-py3-none-any.whl (24 kB)
Installing collected packages: branca, folium
Successfully installed branca-0.4.2 folium-0.12.1


In [17]:
import requests
import folium

In [18]:
# Draw one blue circle marker per row of toronto_data; clicking a marker shows
# the neighborhood names of that row.
toronto_location = [43.670, -79.3570]
toronto_map = folium.Map(location = toronto_location, zoom_start = 11)
marker_inputs = zip(toronto_data['Neighborhood'], toronto_data['Latitude'], toronto_data['Longitude'])
for neighbor, latitude, longitude in marker_inputs:
    marker = folium.CircleMarker(
        [latitude, longitude],
        radius = 5,
        popup = neighbor,
        color = 'blue',
        fill_color = 'blue'
    )
    marker.add_to(toronto_map)
toronto_map

In [32]:
import os

# Foursquare credentials are read from environment variables so they are never
# saved with the notebook; a variable that is not set yields an empty string.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '')
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', '')
ACCESS_TOKEN = os.environ.get('FOURSQUARE_ACCESS_TOKEN', '')
VERSION = os.environ.get('FOURSQUARE_API_VERSION', '')  # sent as the `v` query parameter
radius = 1000  # sent as the `radius` query parameter of each venue request
LIMIT = 30  # sent as the `limit` query parameter of each venue request
# Report only whether each credential is present: printing the values would
# store them in the saved cell output.
print('Your credentials:')
print('CLIENT_ID: ' + ('set' if CLIENT_ID else 'missing'))
print('CLIENT_SECRET: ' + ('set' if CLIENT_SECRET else 'missing'))

Your credentails:
CLIENT_ID: 
CLIENT_SECRET:


In [20]:
# Query Foursquare once per postal code and keep one record per returned
# venue. The frame is built once at the end: DataFrame.append was removed in
# pandas 2.0 and copied the whole frame on every call.
venue_records = []
for code, neighbor, lat, lng in zip(toronto_data.PostalCode, toronto_data.Neighborhood, toronto_data.Latitude, toronto_data.Longitude):
    # create the API request URL
    url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
        CLIENT_ID, CLIENT_SECRET, VERSION, lat, lng, radius, LIMIT)
    # the timeout keeps one hung request from stalling the whole loop
    results = requests.get(url, timeout = 30).json()['response']['groups'][0]['items']
    for nearby in results:
        venue_records.append({
            'PostalCode': code,
            'Neighborhood': neighbor,
            'Venue': nearby['venue']['name'],
            # only the first listed category of a venue is used
            'VenueType': nearby['venue']['categories'][0]['name'],
        })
# The original column list lacked a comma between "PostalCode" and
# "Neighborhood", which fused them into one bogus column name.
nearby_df = pd.DataFrame(venue_records, columns = ["PostalCode", "Neighborhood", "Venue", "VenueType"])

In [21]:
# One indicator column per venue category (column names are the bare category
# names), plus the postal code of each venue row.
nearby_onehot = (
    pd.get_dummies(nearby_df['VenueType'], prefix = "", prefix_sep = "")
    .assign(PostalCode = nearby_df['PostalCode'])
)
nearby_onehot.head()

Unnamed: 0,ATM,Accessories Store,Adult Boutique,Afghan Restaurant,African Restaurant,Airport,American Restaurant,Aquarium,Art Gallery,Arts & Crafts Store,...,Vegetarian / Vegan Restaurant,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio,PostalCode
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,M1B
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,M1B
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,M1B
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,M1B
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,M1B


In [22]:
# Move the last column to the front and keep the other columns in their
# existing order.
*leading_columns, last_column = nearby_onehot.columns
columns_oders = [last_column] + leading_columns
nearby_onehot = nearby_onehot[columns_oders]
nearby_onehot.head()

Unnamed: 0,PostalCode,ATM,Accessories Store,Adult Boutique,Afghan Restaurant,African Restaurant,Airport,American Restaurant,Aquarium,Art Gallery,...,Turkish Restaurant,Vegetarian / Vegan Restaurant,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio
0,M1B,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,M1B,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,M1B,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,M1B,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,M1B,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [23]:
# Average the indicator columns per postal code: each value is the share of
# that postal code's venue rows that fall in the category.
nearby_grouped = nearby_onehot.groupby('PostalCode', as_index = False).mean()
nearby_grouped

Unnamed: 0,PostalCode,ATM,Accessories Store,Adult Boutique,Afghan Restaurant,African Restaurant,Airport,American Restaurant,Aquarium,Art Gallery,...,Turkish Restaurant,Vegetarian / Vegan Restaurant,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio
0,M1B,0.0,0.0,0.0,0.0,0.055556,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000
1,M1C,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000
2,M1E,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000
3,M1G,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000
4,M1H,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.033333,0.0,0.033333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
97,M9N,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000
98,M9P,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000
99,M9R,0.0,0.0,0.0,0.0,0.000000,0.0,0.076923,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000
100,M9V,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000


In [24]:
# special neighborhood is the neighborhood where nothing is nearby:
# its postal code is in toronto_data but has no row in nearby_grouped
codes_with_venues = set(nearby_grouped.PostalCode)
specialNeighborhoodCode = [postal_code for postal_code in toronto_data.PostalCode.tolist() if postal_code not in codes_with_venues]
if specialNeighborhoodCode:
    # one all-zero row per special code, added in a single concat
    # (DataFrame.append was removed in pandas 2.0)
    specialRows = pd.DataFrame({"PostalCode": specialNeighborhoodCode}).reindex(columns = nearby_grouped.columns, fill_value = 0)
    nearby_grouped = pd.concat([nearby_grouped, specialRows], ignore_index = True)
nearby_grouped = nearby_grouped.sort_values(["PostalCode"], ignore_index = True)
nearby_grouped

Unnamed: 0,PostalCode,ATM,Accessories Store,Adult Boutique,Afghan Restaurant,African Restaurant,Airport,American Restaurant,Aquarium,Art Gallery,...,Turkish Restaurant,Vegetarian / Vegan Restaurant,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio
0,M1B,0.0,0.0,0.0,0.0,0.055556,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000
1,M1C,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000
2,M1E,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000
3,M1G,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000
4,M1H,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.033333,0.0,0.033333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
98,M9N,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000
99,M9P,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000
100,M9R,0.0,0.0,0.0,0.0,0.000000,0.0,0.076923,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000
101,M9V,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000


In [25]:
freq_columns = ["postal_code", "1stVenue", "2ndVenue", "3rdVenue", "4thVenue", "5thVenue"]
# Build every row first and create the frame once: DataFrame.append was
# removed in pandas 2.0 and copied the whole frame on every call.
top_venue_rows = []
for row_idx in range(nearby_grouped.shape[0]):
    # sort this row's category frequencies in descending order and keep the
    # names of the first five categories
    top_venues = nearby_grouped.iloc[row_idx, 1:].sort_values(ascending = False).head(5).index.tolist()
    top_venue_rows.append([nearby_grouped.loc[row_idx, "PostalCode"]] + top_venues)
frequent_venue = pd.DataFrame(top_venue_rows, columns = freq_columns)
frequent_venue = frequent_venue.sort_values(['postal_code'])
frequent_venue

Unnamed: 0,postal_code,1stVenue,2ndVenue,3rdVenue,4thVenue,5thVenue
0,M1B,Fast Food Restaurant,Trail,Hobby Shop,Bakery,Park
1,M1C,Playground,Italian Restaurant,Park,Burger Joint,Breakfast Spot
2,M1E,Pizza Place,Bank,Coffee Shop,Fast Food Restaurant,Pharmacy
3,M1G,Park,Coffee Shop,Chinese Restaurant,Indian Restaurant,Pharmacy
4,M1H,Bakery,Gas Station,Indian Restaurant,Bank,Hakka Restaurant
...,...,...,...,...,...,...
98,M9N,Pizza Place,Train Station,Breakfast Spot,Skating Rink,Sandwich Place
99,M9P,Gas Station,Pizza Place,Ice Cream Shop,Breakfast Spot,Flea Market
100,M9R,Pharmacy,Bank,Sandwich Place,Supermarket,Gas Station
101,M9V,Grocery Store,Pizza Place,Fast Food Restaurant,Auto Garage,Park


In [26]:
# Join the top-five venue columns onto each postal code, then discard the
# duplicate key column that comes from frequent_venue.
with_top_venues = toronto_data.merge(frequent_venue, left_on = "PostalCode", right_on = "postal_code")
toronto_data = with_top_venues.drop(columns = 'postal_code')
toronto_data

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,1stVenue,2ndVenue,3rdVenue,4thVenue,5thVenue
0,M1B,Scarborough,"Malvern , Rouge",43.806686,-79.194353,Fast Food Restaurant,Trail,Hobby Shop,Bakery,Park
1,M1C,Scarborough,"Rouge Hill , Port Union , Highland Creek",43.784535,-79.160497,Playground,Italian Restaurant,Park,Burger Joint,Breakfast Spot
2,M1E,Scarborough,"Guildwood , Morningside , West Hill",43.763573,-79.188711,Pizza Place,Bank,Coffee Shop,Fast Food Restaurant,Pharmacy
3,M1G,Scarborough,Woburn,43.770992,-79.216917,Park,Coffee Shop,Chinese Restaurant,Indian Restaurant,Pharmacy
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476,Bakery,Gas Station,Indian Restaurant,Bank,Hakka Restaurant
...,...,...,...,...,...,...,...,...,...,...
98,M9N,York,Weston,43.706876,-79.518188,Pizza Place,Train Station,Breakfast Spot,Skating Rink,Sandwich Place
99,M9P,Etobicoke,Westmount,43.696319,-79.532242,Gas Station,Pizza Place,Ice Cream Shop,Breakfast Spot,Flea Market
100,M9R,Etobicoke,"Kingsview Village , St. Phillips , Martin Grov...",43.688905,-79.554724,Pharmacy,Bank,Sandwich Place,Supermarket,Gas Station
101,M9V,Etobicoke,"South Steeles , Silverstone , Humbergate , Jam...",43.739416,-79.588437,Grocery Store,Pizza Place,Fast Food Restaurant,Auto Garage,Park


## Cluster Analysis

Apply the k-means algorithm to cluster the neighborhoods based on what kinds of venues are nearby.

In [27]:
from sklearn.cluster import KMeans

In [28]:
n_cluster = 5
# Fit on the venue-frequency columns only (every column after PostalCode).
# n_init is set explicitly because its default differs between scikit-learn
# releases (10 in older ones, 'auto' in newer ones), which would change the
# clusters between environments.
kmeans_model = KMeans(n_clusters = n_cluster, n_init = 10, random_state = 0).fit(nearby_grouped.iloc[:,1:])

In [29]:
# Look the label up by postal code instead of relying on toronto_data and
# nearby_grouped having the same row order and length.
cluster_by_code = pd.Series(kmeans_model.labels_, index = nearby_grouped["PostalCode"].to_numpy())
cluster_labels = toronto_data["PostalCode"].map(cluster_by_code)
assert cluster_labels.notna().all(), "a postal code has no cluster label"
toronto_data["Cluster"] = cluster_labels.astype(int)

In [30]:
# Preview the table with the new Cluster column.
toronto_data.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,1stVenue,2ndVenue,3rdVenue,4thVenue,5thVenue,Cluster
0,M1B,Scarborough,"Malvern , Rouge",43.806686,-79.194353,Fast Food Restaurant,Trail,Hobby Shop,Bakery,Park,4
1,M1C,Scarborough,"Rouge Hill , Port Union , Highland Creek",43.784535,-79.160497,Playground,Italian Restaurant,Park,Burger Joint,Breakfast Spot,0
2,M1E,Scarborough,"Guildwood , Morningside , West Hill",43.763573,-79.188711,Pizza Place,Bank,Coffee Shop,Fast Food Restaurant,Pharmacy,4
3,M1G,Scarborough,Woburn,43.770992,-79.216917,Park,Coffee Shop,Chinese Restaurant,Indian Restaurant,Pharmacy,0
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476,Bakery,Gas Station,Indian Restaurant,Bank,Hakka Restaurant,4


In [31]:
import matplotlib.pyplot as plt
import matplotlib.colors as colors
toronto_location = [43.670, -79.3570]
# one hex colour per cluster, sampled at evenly spaced points of the rainbow
# colormap
colors_array = plt.cm.rainbow(np.linspace(0, 1, n_cluster))
rainbow = [colors.rgb2hex(i) for i in colors_array]

toronto_map = folium.Map(location = toronto_location, zoom_start = 11)
for neighbor, latitude, longitude, label in zip(toronto_data['Neighborhood'], toronto_data['Latitude'], toronto_data['Longitude'], toronto_data['Cluster']):
    # outline and fill both use the cluster colour; the fill was previously
    # hard-coded to blue for every cluster
    cluster_color = rainbow[label]
    folium.CircleMarker(
        [latitude, longitude],
        radius = 5,
        popup = neighbor,
        color = cluster_color,
        fill = True,
        fill_color = cluster_color
    ).add_to(toronto_map)
toronto_map