# <font color="blue">Toronto Neighbourhoods</font>

## In this notebook, we will explore clusters of Toronto's neighbourhoods

### Import Libraries

In [1]:
import pandas as pd
import requests

print('Libraries imported.')

Libraries imported.


#### Load the dataframe using pandas

In [2]:
# assign url
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

# create dataframe from page
df = pd.read_html(url, flavor="bs4", header=0)[0]
df.tail()

Unnamed: 0,Postcode,Borough,Neighbourhood
284,M8Z,Etobicoke,Mimico NW
285,M8Z,Etobicoke,The Queensway West
286,M8Z,Etobicoke,Royal York South West
287,M8Z,Etobicoke,South of Bloor
288,M9Z,Not assigned,Not assigned


#### Clean the data

In [3]:
# remove rows where borough not assigned
df = df[df.Borough != 'Not assigned']

# merge neighbourhoods that share a postcode
df["Neighbourhood"] = df.groupby("Postcode")["Neighbourhood"].transform(lambda x: ', '.join(x))
df.drop_duplicates(inplace=True)

# if Neighbourhood is not assigned, give borough name to neighbourhood
df[(df.Neighbourhood == "Not assigned")] = df.Borough

# reset the index
df.reset_index(drop=True, inplace=True)

In [4]:
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Harbourfront, Regent Park"
3,M6A,North York,"Lawrence Heights, Lawrence Manor"
4,Queen's Park,Queen's Park,Queen's Park


### Display shape of data frame

In [5]:
df.shape

(103, 3)

## Get each borough's latitude and longitude

### Import csv with latitudes and longitudes

In [6]:
lat_lng_df = pd.read_csv("http://cocl.us/Geospatial_data")
lat_lng_df.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


### Add latitude and longitude to dataframe

In [7]:
# we need to rename the name of postal code in one of the columns so they match
lat_lng_df.rename(columns={"Postal Code":"Postcode"}, inplace=True)

# merge dataframes
df = pd.merge(df, lat_lng_df, on="Postcode")

In [8]:
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Harbourfront, Regent Park",43.65426,-79.360636
3,M6A,North York,"Lawrence Heights, Lawrence Manor",43.718518,-79.464763
4,M9A,Etobicoke,Islington Avenue,43.667856,-79.532242


## Explore and cluster the neighbourhoods

In [10]:
import json
import requests
#!conda install -c conda-forge geopy --yes # uncomment this line if you need to download geopy
from geopy.geocoders import Nominatim
from pandas.io.json import json_normalize
import matplotlib.cm as cm
import matplotlib.colors as colors
from sklearn.cluster import KMeans
#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if youy need to download folium
import folium

### Set up Foursquare defaults

In [11]:
client_id = "JEHUFR3S515TVIJDYY4UCOOARQKZFLLXCKMOCMHGOA1TQVDF"
client_secret = "Q44OSNI3XZVMMIPEANERXXUXJK5KJJZM5KCFHRN3UH3VXMKQ"
version = "20180605"
limit = 100

### Create function to explore each neighbourhood

In [24]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            client_id, 
            client_secret, 
            version, 
            lat, 
            lng, 
            radius, 
            limit)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

### Purge data frame

Since we're using a free Foursquare lisence with limited pings, let's shorten our dataframe to only contain boroughs who have "Toronto" in their name

In [25]:
# Purge rows that don't have "Toronto" in their name
df = df[df.Borough.str.contains("Toronto")]

# reset the index
df.reset_index(drop=True, inplace=True)

df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M5A,Downtown Toronto,"Harbourfront, Regent Park",43.65426,-79.360636
1,M5B,Downtown Toronto,"Ryerson, Garden District",43.657162,-79.378937
2,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
3,M4E,East Toronto,The Beaches,43.676357,-79.293031
4,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306


### Get venues

Use our getNearbyVenues and get the nearby venues for every neighborhood

In [26]:
venues_df = getNearbyVenues(df.Neighbourhood, df.Latitude, df.Longitude)
print("Got venues")

Got venues


In [31]:
venues_df.head()

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,"Harbourfront, Regent Park",43.65426,-79.360636,Roselle Desserts,43.653447,-79.362017,Bakery
1,"Harbourfront, Regent Park",43.65426,-79.360636,Tandem Coffee,43.653559,-79.361809,Coffee Shop
2,"Harbourfront, Regent Park",43.65426,-79.360636,Toronto Cooper Koo Family Cherry St YMCA Centre,43.653191,-79.357947,Gym / Fitness Center
3,"Harbourfront, Regent Park",43.65426,-79.360636,Morning Glory Cafe,43.653947,-79.361149,Breakfast Spot
4,"Harbourfront, Regent Park",43.65426,-79.360636,Body Blitz Spa East,43.654735,-79.359874,Spa


### Analyze each neighborhood

In [52]:
# create dummy values for the venue categories
toronto_onehot = pd.get_dummies(venues_df[["Venue Category"]], prefix="", prefix_sep="")

# There is a category called "Neighborhood"
# to prevent confusion, we'll change the name of this to "Neighborhood Store"
toronto_onehot.rename(columns={"Neighborhood": "Neighborhood Store"}, inplace=True)

# add neighborhood colum to dataframe
toronto_onehot.insert(loc=0, column="Neighborhood", value=venues_df.Neighborhood)

toronto_onehot.head()

Unnamed: 0,Neighborhood,Adult Boutique,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,...,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wings Joint,Women's Store,Yoga Studio
0,"Harbourfront, Regent Park",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,"Harbourfront, Regent Park",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,"Harbourfront, Regent Park",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,"Harbourfront, Regent Park",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,"Harbourfront, Regent Park",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
