# <font color="blue">Toronto Neighbourhoods</font>

## In this notebook, we will explore clusters of Toronto's neighbourhoods

### Import Libraries

In [1]:
import pandas as pd
import requests

print('Libraries imported.')

Libraries imported.


#### Load the dataframe using pandas

In [2]:
# assign url
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

# create dataframe from page
df = pd.read_html(url, flavor="bs4", header=0)[0]
df.tail()

Unnamed: 0,Postcode,Borough,Neighbourhood
284,M8Z,Etobicoke,Mimico NW
285,M8Z,Etobicoke,The Queensway West
286,M8Z,Etobicoke,Royal York South West
287,M8Z,Etobicoke,South of Bloor
288,M9Z,Not assigned,Not assigned


#### Clean the data

In [3]:
# remove rows where borough not assigned
df = df[df.Borough != 'Not assigned']

# merge neighbourhoods that share a postcode
df["Neighbourhood"] = df.groupby("Postcode")["Neighbourhood"].transform(lambda x: ', '.join(x))
df.drop_duplicates(inplace=True)

# if Neighbourhood is not assigned, give borough name to neighbourhood
df[(df.Neighbourhood == "Not assigned")] = df.Borough

# reset the index
df.reset_index(drop=True, inplace=True)

In [4]:
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Harbourfront, Regent Park"
3,M6A,North York,"Lawrence Heights, Lawrence Manor"
4,Queen's Park,Queen's Park,Queen's Park


### Display shape of data frame

In [5]:
df.shape

(103, 3)

## Get each borough's latitude and longitude

### Import csv with latitudes and longitudes

In [6]:
lat_lng_df = pd.read_csv("http://cocl.us/Geospatial_data")
lat_lng_df.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


### Add latitude and longitude to dataframe

In [7]:
# we need to rename the name of postal code in one of the columns so they match
lat_lng_df.rename(columns={"Postal Code":"Postcode"}, inplace=True)

# merge dataframes
df = pd.merge(df, lat_lng_df, on="Postcode")

In [8]:
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Harbourfront, Regent Park",43.65426,-79.360636
3,M6A,North York,"Lawrence Heights, Lawrence Manor",43.718518,-79.464763
4,M9A,Etobicoke,Islington Avenue,43.667856,-79.532242


## Explore and cluster the neighbourhoods

In [9]:
import json
import requests
#!conda install -c conda-forge geopy --yes # uncomment this line if you need to download geopy
from geopy.geocoders import Nominatim
from pandas.io.json import json_normalize
import matplotlib.cm as cm
import matplotlib.colors as colors
from sklearn.cluster import KMeans
#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if youy need to download folium
import folium
import numpy as np

### Set up Foursquare defaults

In [10]:
client_id = "JEHUFR3S515TVIJDYY4UCOOARQKZFLLXCKMOCMHGOA1TQVDF"
client_secret = "Q44OSNI3XZVMMIPEANERXXUXJK5KJJZM5KCFHRN3UH3VXMKQ"
version = "20180605"
limit = 100

### Create function to explore each neighbourhood

In [22]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name) # because this method takes a while to run, this will help us see the process made during runtime
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            client_id, 
            client_secret, 
            version, 
            lat, 
            lng, 
            radius, 
            limit)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

### Purge data frame

Since we're using a free Foursquare lisence with limited API calls, let's shorten our dataframe to only contain boroughs who have "Toronto" in their name

In [12]:
# Purge rows that don't have "Toronto" in their name
df = df[df.Borough.str.contains("Toronto")]

# reset the index
df.reset_index(drop=True, inplace=True)

df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M5A,Downtown Toronto,"Harbourfront, Regent Park",43.65426,-79.360636
1,M5B,Downtown Toronto,"Ryerson, Garden District",43.657162,-79.378937
2,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
3,M4E,East Toronto,The Beaches,43.676357,-79.293031
4,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306


### Get venues

Use our getNearbyVenues and get the nearby venues for every neighborhood

In [23]:
venues_df = getNearbyVenues(df.Neighbourhood, df.Latitude, df.Longitude)
print('\n')
print("Got venues")

Harbourfront, Regent Park
Ryerson, Garden District
St. James Town
The Beaches
Berczy Park
Central Bay Street
Christie
Adelaide, King, Richmond
Dovercourt Village, Dufferin
Harbourfront East, Toronto Islands, Union Station
Little Portugal, Trinity
The Danforth West, Riverdale
Design Exchange, Toronto Dominion Centre
Brockton, Exhibition Place, Parkdale Village
The Beaches West, India Bazaar
Commerce Court, Victoria Hotel
Studio District
Lawrence Park
Roselawn
Davisville North
Forest Hill North, Forest Hill West
High Park, The Junction South
North Toronto West
The Annex, North Midtown, Yorkville
Parkdale, Roncesvalles
Davisville
Harbord, University of Toronto
Runnymede, Swansea
Moore Park, Summerhill East
Chinatown, Grange Park, Kensington Market
Deer Park, Forest Hill SE, Rathnelly, South Hill, Summerhill West
CN Tower, Bathurst Quay, Island airport, Harbourfront West, King and Spadina, Railway Lands, South Niagara
Rosedale
Stn A PO Boxes 25 The Esplanade
Cabbagetown, St. James Town
Fir

### Analyze each neighborhood

First, let's create a new data frame. This will inlude dummy values for the venue categories and the neighborhood name.

In [14]:
# create dummy values for the venue categories
toronto_onehot = pd.get_dummies(venues_df[["Venue Category"]], prefix="", prefix_sep="")

# There is a category called "Neighborhood"
# to prevent confusion, we'll change the name of this to "Neighborhood Store"
toronto_onehot.rename(columns={"Neighborhood": "Neighborhood Store"}, inplace=True)

# add neighborhood colum to dataframe
toronto_onehot.insert(loc=0, column="Neighborhood", value=venues_df.Neighborhood)

toronto_onehot.head()

Unnamed: 0,Neighborhood,Adult Boutique,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,...,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wings Joint,Women's Store,Yoga Studio
0,"Harbourfront, Regent Park",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,"Harbourfront, Regent Park",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,"Harbourfront, Regent Park",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,"Harbourfront, Regent Park",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,"Harbourfront, Regent Park",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


##### We'll be limiting each numberhood by their top 5 venues, so let's start with grouping by neighborhood and finding the mean for each category

In [15]:
toronto_grouped = toronto_onehot.groupby("Neighborhood").mean().reset_index()
toronto_grouped.head()

Unnamed: 0,Neighborhood,Adult Boutique,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,...,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wings Joint,Women's Store,Yoga Studio
0,"Adelaide, King, Richmond",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.04,...,0.0,0.0,0.0,0.01,0.0,0.0,0.01,0.0,0.01,0.0
1,Berczy Park,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"Brockton, Exhibition Place, Parkdale Village",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Business reply mail Processing Centre969 Eastern,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,"CN Tower, Bathurst Quay, Island airport, Harbo...",0.0,0.0,0.076923,0.076923,0.076923,0.153846,0.153846,0.153846,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


##### Let's print each neighborhood with their top 5 venues

In [16]:
num_top_venues = 5

for hood in toronto_grouped["Neighborhood"]:
    print("----"+hood+"----")
    temp = toronto_grouped[toronto_grouped["Neighborhood"] == hood].T.reset_index()
    temp.columns = ["venue", "freq"]
    temp = temp.iloc[1:]
    temp["freq"] = temp["freq"].astype(float)
    temp = temp.round({"freq": 2})
    print(temp.sort_values("freq", ascending = False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Adelaide, King, Richmond----
                 venue  freq
0          Coffee Shop  0.06
1                 Café  0.05
2      Thai Restaurant  0.04
3  American Restaurant  0.04
4           Steakhouse  0.04


----Berczy Park----
                venue  freq
0         Coffee Shop  0.07
1        Cocktail Bar  0.05
2          Restaurant  0.05
3  Italian Restaurant  0.04
4      Farmers Market  0.04


----Brockton, Exhibition Place, Parkdale Village----
                   venue  freq
0            Coffee Shop  0.14
1         Breakfast Spot  0.10
2                   Café  0.10
3  Performing Arts Venue  0.05
4     Falafel Restaurant  0.05


----Business reply mail Processing Centre969 Eastern----
           venue  freq
0    Pizza Place  0.06
1     Restaurant  0.06
2     Smoke Shop  0.06
3            Spa  0.06
4  Burrito Place  0.06


----CN Tower, Bathurst Quay, Island airport, Harbourfront West, King and Spadina, Railway Lands, South Niagara----
              venue  freq
0    Airport Lounge  0

##### Create a method to get each neighborhood's top venues

In [17]:
def get_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

#### Create our data table with top venues for each neighborhood

In [18]:
num_top_venues = 10

indicators = ["st", "nd", "rd"] # for printing 1st, 2nd, 3rd

# create columns according to number of top venues
columns = ["Neighborhood"]
for ind in np.arange(num_top_venues):
    try:
        columns.append("{}{} Most Common Venue".format(ind+1, indicators[ind]))
    except:
        columns.append("{}th Most Common Venue".format(ind+1))

# create new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted["Neighborhood"] = toronto_grouped["Neighborhood"]

for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = get_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)
    
neighborhoods_venues_sorted.head(10)

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,"Adelaide, King, Richmond",Coffee Shop,Café,Steakhouse,American Restaurant,Thai Restaurant,Bakery,Clothing Store,Asian Restaurant,Gym,Bar
1,Berczy Park,Coffee Shop,Cocktail Bar,Restaurant,Farmers Market,Bakery,Italian Restaurant,Steakhouse,Cheese Shop,Pub,Café
2,"Brockton, Exhibition Place, Parkdale Village",Coffee Shop,Café,Breakfast Spot,Gym,Italian Restaurant,Convenience Store,Pet Store,Grocery Store,Nightclub,Climbing Gym
3,Business reply mail Processing Centre969 Eastern,Comic Shop,Auto Workshop,Smoke Shop,Park,Light Rail Station,Spa,Farmers Market,Fast Food Restaurant,Brewery,Burrito Place
4,"CN Tower, Bathurst Quay, Island airport, Harbo...",Airport Lounge,Airport Service,Airport Terminal,Harbor / Marina,Boat or Ferry,Airport,Airport Food Court,Airport Gate,Boutique,Sculpture Garden
5,"Cabbagetown, St. James Town",Coffee Shop,Restaurant,Café,Pizza Place,Italian Restaurant,Pub,Indian Restaurant,Park,Bakery,Beer Store
6,Central Bay Street,Coffee Shop,Italian Restaurant,Sandwich Place,Bar,Ice Cream Shop,Café,Burger Joint,Bubble Tea Shop,Chinese Restaurant,Spa
7,"Chinatown, Grange Park, Kensington Market",Café,Bar,Vietnamese Restaurant,Vegetarian / Vegan Restaurant,Coffee Shop,Bakery,Mexican Restaurant,Chinese Restaurant,Dumpling Restaurant,Grocery Store
8,Christie,Grocery Store,Café,Park,Italian Restaurant,Coffee Shop,Nightclub,Restaurant,Diner,Baby Store,Convenience Store
9,Church and Wellesley,Sushi Restaurant,Coffee Shop,Japanese Restaurant,Gay Bar,Restaurant,Burger Joint,Pub,Gastropub,Fast Food Restaurant,Men's Store


## use k-means to cluster the neighborhoods into clusters

First, we'll create new dummy data, this time only using the top venues for each neighborhood

In [19]:
k = 10

# create new data frame with dummies for the top venues
toronto_clustering = pd.get_dummies(neighborhoods_venues_sorted.drop("Neighborhood", 1))

# build k-means model
means = KMeans(n_clusters=k, random_state=0).fit(toronto_clustering)

# check cluster labels generated for each row in the dataframe
means.labels_

array([1, 5, 9, 0, 0, 6, 1, 2, 1, 2, 5, 9, 4, 9, 5, 7, 1, 4, 2, 5, 1, 4,
       0, 1, 3, 1, 4, 3, 3, 9, 5, 1, 6, 2, 6, 8, 8, 2])

#### Create new data frame with merged data

This will be helpful for our graph below.

In [20]:
toronto_merged = df.drop(["Postcode", "Borough"], axis=1)

# add clusters
toronto_merged["Cluster Labels"] = means.labels_

# before merging, we have to change the spellings of one of our dataframes to they match
toronto_merged.rename(columns={"Neighbourhood":"Neighborhood"}, inplace=True)

# merge
toronto_merged = pd.merge(toronto_merged, neighborhoods_venues_sorted, on="Neighborhood", how="outer")

toronto_merged.head(10)

Unnamed: 0,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,"Harbourfront, Regent Park",43.65426,-79.360636,1,Coffee Shop,Café,Bakery,Park,Pub,Restaurant,Breakfast Spot,Mexican Restaurant,Theater,Chocolate Shop
1,"Ryerson, Garden District",43.657162,-79.378937,5,Coffee Shop,Clothing Store,Cosmetics Shop,Café,Middle Eastern Restaurant,Italian Restaurant,Tea Room,Bubble Tea Shop,Pizza Place,Japanese Restaurant
2,St. James Town,43.651494,-79.375418,9,Coffee Shop,Café,Hotel,Restaurant,Cosmetics Shop,Japanese Restaurant,Italian Restaurant,Cocktail Bar,Bakery,Gastropub
3,The Beaches,43.676357,-79.293031,0,Coffee Shop,Neighborhood Store,Park,Pub,Dessert Shop,Event Space,Ethiopian Restaurant,Electronics Store,Eastern European Restaurant,Dumpling Restaurant
4,Berczy Park,43.644771,-79.373306,0,Coffee Shop,Cocktail Bar,Restaurant,Farmers Market,Bakery,Italian Restaurant,Steakhouse,Cheese Shop,Pub,Café
5,Central Bay Street,43.657952,-79.387383,6,Coffee Shop,Italian Restaurant,Sandwich Place,Bar,Ice Cream Shop,Café,Burger Joint,Bubble Tea Shop,Chinese Restaurant,Spa
6,Christie,43.669542,-79.422564,1,Grocery Store,Café,Park,Italian Restaurant,Coffee Shop,Nightclub,Restaurant,Diner,Baby Store,Convenience Store
7,"Adelaide, King, Richmond",43.650571,-79.384568,2,Coffee Shop,Café,Steakhouse,American Restaurant,Thai Restaurant,Bakery,Clothing Store,Asian Restaurant,Gym,Bar
8,"Dovercourt Village, Dufferin",43.669005,-79.442259,1,Supermarket,Bakery,Pharmacy,Discount Store,Bank,Bar,Café,Art Gallery,Music Venue,Pool
9,"Harbourfront East, Toronto Islands, Union Station",43.640816,-79.381752,2,Coffee Shop,Hotel,Aquarium,Café,Pizza Place,Scenic Lookout,Restaurant,Bakery,Brewery,Italian Restaurant


## Let's graph it!

In [21]:
toronto_lat = 43.6532
toronto_long = -79.33832

# create map
map_clusters = folium.Map(location=[toronto_lat, toronto_long], zoom_start=11)

# set color scheme for clusters
x = np.arange(k)
ys = [i+x+(i*x)**2 for i in range(k)]
colors_array = cm.rainbow(np.linspace(0,1,len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged["Latitude"], toronto_merged["Longitude"], toronto_merged["Neighborhood"], toronto_merged["Cluster Labels"]):
    label = folium.Popup(str(poi) + "Cluster " + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius = 5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
    
map_clusters