# COURSERA Applied Data Science Capstone

## Week 3: Segmenting and Clustering Neighborhoods in Toronto

### Part 2

Import all the required libraries

In [1]:
# Added for Part 1 but used in Parts 2 & 3
 
try:
    from bs4 import BeautifulSoup
except ImportError:
    !conda install -c conda-forge beautifulsoup4 --yes # Only needed one time
    from bs4 import BeautifulSoup
    
#!conda install -c conda-forge beautifulsoup4 --yes # Only needed one time
#!conda install -c conda-forge geocoder --yes # Only needed one time
#from bs4 import BeautifulSoup

import pandas as pd
import requests


# Nothing addional for Part 2

# Added for Part 3
import numpy as np # library to handle data in a vectorized manner

try:
    from geopy.geocoders import Nominatim # convert an address into latitude and longitude values
except ImportError:
    !conda install -c conda-forge geopy --yes 
    from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe
    
# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

try:
    import folium # map rendering library
except ImportError:
    !conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
    import folium # map rendering library

print('Libraries imported.')

Libraries imported.


### Consolidated code from Part 1

In [2]:
#Scrape the following Wikipedia page, https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M , in order to obtain the data that is in the table of postal codes  
#Parse using BeautifulSoup
wikiPage = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
rawWikiPage = requests.get(wikiPage).text
parseWiki = BeautifulSoup(rawWikiPage, 'html.parser')

print('Wiki page scraped and parsed')

#Extract the table and put into Pandas dataframe  
#The dataframe will consist of three columns: PostalCode, Borough, and Neighborhood
postalCodesTable = []
for tr in parseWiki.tbody.find_all('tr'):
    postalCodesTable.append([ td.get_text().strip() for td in tr.find_all('td')])
rawWikiDF = pd.DataFrame(postalCodesTable, columns=['PostalCode', 'Borough', 'Neighborhood'])

#Keep only the cells that have an assigned borough. Drop records that are all "None" and ignore cells with a borough that is "Not assigned".
workingWikiDF = rawWikiDF.dropna()
workingWikiDF = workingWikiDF[workingWikiDF.Borough != "Not assigned"].reset_index(drop=True)


#Combine rows of neighborhoods that have the same postal code area.
workingWikiDF_groupNeighborhoods = workingWikiDF.groupby(["PostalCode", "Borough"], as_index=False).agg(lambda x: ", ".join(x))

#Checking the value of postal code M7A for next step
#print(workingWikiDF_groupNeighborhoods[workingWikiDF_groupNeighborhoods['PostalCode']=='M7A'])

#Assign the borough name as the neighborhood for all neighborhoods with "Not assigned" as neighborhood name.
for index, row in workingWikiDF_groupNeighborhoods.iterrows():
    if row["Neighborhood"] == "Not assigned":
        row["Neighborhood"] = row["Borough"]       
wikiDF = workingWikiDF_groupNeighborhoods

#Checking the value of postal code M7A to ensure last step worked
#print(wikiDF[wikiDF['PostalCode']=='M7A'])

# Size of the Data Frame
print('The size of the wikiDF data frame is:')
wikiDF.shape

Wiki page scraped and parsed
The size of the wikiDF data frame is:


(103, 3)

The Geocoder package was inconsistent so I chose to add Latitude and Longitude to the dataframe using the data in the .csv provided at  
http://cocl.us/Geospatial_data

### Consolidated code from Part 2

In [3]:
#Load the csv into a dataframe, and change the name of the 'Postal Code' column to 'PostalCode' to match wikiDF
geoCoord = pd.read_csv("Geospatial_Coordinates.csv")
# change the name of the 'Postal Code' column to 'PostalCode' to match wikiDF
geoCoord.rename(columns={'Postal Code':'PostalCode'}, inplace=True)
# Preview the first 5 lines of the loaded data 
#geoCoord.head()

#Join dataframes wikiDF and geoCoord to add the Lat and Long to the data set.
geoWikiDF = pd.merge(wikiDF, geoCoord, on='PostalCode', how='left')
geoWikiDF

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
...,...,...,...,...,...
98,M9N,York,Weston,43.706876,-79.518188
99,M9P,Etobicoke,Westmount,43.696319,-79.532242
100,M9R,Etobicoke,"Kingsview Village, Martin Grove Gardens, Richv...",43.688905,-79.554724
101,M9V,Etobicoke,"Albion Gardens, Beaumond Heights, Humbergate, ...",43.739416,-79.588437


### Code for Part 3 Starts here

As suggested, data set is reduced to those Boroughs with Toronto in the name

In [4]:
torontoDF = geoWikiDF[geoWikiDF.Borough.str.contains('Toronto')].reset_index(drop=True)
torontoDF

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M4E,East Toronto,The Beaches,43.676357,-79.293031
1,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188
2,M4L,East Toronto,"The Beaches West, India Bazaar",43.668999,-79.315572
3,M4M,East Toronto,Studio District,43.659526,-79.340923
4,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879
5,M4P,Central Toronto,Davisville North,43.712751,-79.390197
6,M4R,Central Toronto,North Toronto West,43.715383,-79.405678
7,M4S,Central Toronto,Davisville,43.704324,-79.38879
8,M4T,Central Toronto,"Moore Park, Summerhill East",43.689574,-79.38316
9,M4V,Central Toronto,"Deer Park, Forest Hill SE, Rathnelly, South Hi...",43.686412,-79.400049


In order to define an instance of the geocoder, we need to define a user_agent. We will name our agent t_explorer, as shown below.

In [5]:
address = 'Toronto'

geolocator = Nominatim(user_agent="t_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.653963, -79.387207.


#### Create a map of Toronto with neighborhoods superimposed on top.

In [6]:
# create map of Toronto using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=12)

# add markers to map
for lat, lng, borough, neighborhood in zip(torontoDF['Latitude'], torontoDF['Longitude'], torontoDF['Borough'], torontoDF['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

### Feel free to zoom into the above map, and click on each circle mark to reveal the name of the neighborhood(s) and its respective borough.

Define Foursquare Credentials and Version  
This will be removed when posted to GitHub

In [7]:
CLIENT_ID = '<ENTER CLIENT ID>' # your Foursquare ID
CLIENT_SECRET = '<ENTER CLIENT SECRET>' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: OPWEYYLWUW5A41QWB0F5ZC1KWZRY0RCYDOTIB1EYGRKF1MYR
CLIENT_SECRET:JZ2EGPAG2CCGTXZB3KIMZ2MJTUOKSMDBVIOLTVLDQXWNUUBL


#### Let's explore the first neighborhood in our dataframe.  
Get the neighborhood's name.

In [8]:
torontoDF.loc[0, 'Neighborhood']

'The Beaches'

Get the neighborhood's latitude and longitude values.

In [9]:
neighborhood_latitude = torontoDF.loc[0, 'Latitude'] # neighborhood latitude value
neighborhood_longitude = torontoDF.loc[0, 'Longitude'] # neighborhood longitude value

neighborhood_name = torontoDF.loc[0, 'Neighborhood'] # neighborhood name

print('Latitude and longitude values of {} are {}, {}.'.format(neighborhood_name, 
                                                               neighborhood_latitude, 
                                                               neighborhood_longitude))

Latitude and longitude values of The Beaches are 43.67635739999999, -79.2930312.


#### Now, let's get the top 100 venues that are in The Beaches within a radius of 500 meters.  
First, let's create the GET request URL.

In [10]:
# Limit of 100 venues and radius of 500 meters
LIMIT = 100
radius = 500

url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    neighborhood_latitude, 
    neighborhood_longitude, 
    radius, 
    LIMIT)
url # display URL

'https://api.foursquare.com/v2/venues/explore?&client_id=OPWEYYLWUW5A41QWB0F5ZC1KWZRY0RCYDOTIB1EYGRKF1MYR&client_secret=JZ2EGPAG2CCGTXZB3KIMZ2MJTUOKSMDBVIOLTVLDQXWNUUBL&v=20180605&ll=43.67635739999999,-79.2930312&radius=500&limit=100'

Send the GET request and examine the results

In [11]:
results = requests.get(url).json()
#results  #uncomment to see the results

Function that extracts the category of the venue

In [12]:
# function that extracts the category of the venue
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

Clean the json and structure it into a *pandas* dataframe.

In [13]:
venues = results['response']['groups'][0]['items']
    
nearby_venues = json_normalize(venues) # flatten JSON

# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues = nearby_venues.loc[:, filtered_columns]

# filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues.head()

Unnamed: 0,name,categories,lat,lng
0,Glen Manor Ravine,Trail,43.676821,-79.293942
1,The Big Carrot Natural Food Market,Health Food Store,43.678879,-79.297734
2,Grover Pub and Grub,Pub,43.679181,-79.297215
3,Glen Stewart Ravine,Other Great Outdoors,43.6763,-79.294784
4,Upper Beaches,Neighborhood,43.680563,-79.292869


In [14]:
print('{} venues were returned by Foursquare.'.format(nearby_venues.shape[0]))

5 venues were returned by Foursquare.


## Explore Neighborhoods in Toronto

#### A function to repeat the same process to all the neighborhoods in the selected Boroughs of Toronto

In [15]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

#### Run the above function on each neighborhood and create a new dataframe called *toronto_venues*.

In [16]:
toronto_venues = getNearbyVenues(names=torontoDF['Neighborhood'],
                                   latitudes=torontoDF['Latitude'],
                                   longitudes=torontoDF['Longitude']
                                  )

print(toronto_venues.shape)
toronto_venues.head()

The Beaches
The Danforth West, Riverdale
The Beaches West, India Bazaar
Studio District
Lawrence Park
Davisville North
North Toronto West
Davisville
Moore Park, Summerhill East
Deer Park, Forest Hill SE, Rathnelly, South Hill, Summerhill West
Rosedale
Cabbagetown, St. James Town
Church and Wellesley
Harbourfront, Regent Park
Ryerson, Garden District
St. James Town
Berczy Park
Central Bay Street
Adelaide, King, Richmond
Harbourfront East, Toronto Islands, Union Station
Design Exchange, Toronto Dominion Centre
Commerce Court, Victoria Hotel
Roselawn
Forest Hill North, Forest Hill West
The Annex, North Midtown, Yorkville
Harbord, University of Toronto
Chinatown, Grange Park, Kensington Market
CN Tower, Bathurst Quay, Island airport, Harbourfront West, King and Spadina, Railway Lands, South Niagara
Stn A PO Boxes 25 The Esplanade
First Canadian Place, Underground city
Christie
Dovercourt Village, Dufferin
Little Portugal, Trinity
Brockton, Exhibition Place, Parkdale Village
High Park, The 

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,The Beaches,43.676357,-79.293031,Glen Manor Ravine,43.676821,-79.293942,Trail
1,The Beaches,43.676357,-79.293031,The Big Carrot Natural Food Market,43.678879,-79.297734,Health Food Store
2,The Beaches,43.676357,-79.293031,Grover Pub and Grub,43.679181,-79.297215,Pub
3,The Beaches,43.676357,-79.293031,Glen Stewart Ravine,43.6763,-79.294784,Other Great Outdoors
4,The Beaches,43.676357,-79.293031,Upper Beaches,43.680563,-79.292869,Neighborhood


Let's check how many venues were returned for each neighborhood

In [17]:
toronto_venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"Adelaide, King, Richmond",100,100,100,100,100,100
Berczy Park,57,57,57,57,57,57
"Brockton, Exhibition Place, Parkdale Village",24,24,24,24,24,24
Business Reply Mail Processing Centre 969 Eastern,18,18,18,18,18,18
"CN Tower, Bathurst Quay, Island airport, Harbourfront West, King and Spadina, Railway Lands, South Niagara",14,14,14,14,14,14
"Cabbagetown, St. James Town",41,41,41,41,41,41
Central Bay Street,83,83,83,83,83,83
"Chinatown, Grange Park, Kensington Market",100,100,100,100,100,100
Christie,15,15,15,15,15,15
Church and Wellesley,88,88,88,88,88,88


#### The number of unique categories that can be curated from all the returned venues

In [18]:
print('There are {} uniques categories.'.format(len(toronto_venues['Venue Category'].unique())))

There are 235 uniques categories.


## Analyze Each Neighborhood

Perform one hot encoding in order to simplify analysis

In [19]:
# one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
toronto_onehot['Neighborhood'] = toronto_venues['Neighborhood'] 

toronto_onehot

Unnamed: 0,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,Aquarium,...,Thrift / Vintage Store,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wings Joint,Yoga Studio
0,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1701,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1702,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1703,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1704,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


#### Group rows by neighborhood and take the mean of the frequency of occurrence of each category

In [20]:
toronto_grouped = toronto_onehot.groupby('Neighborhood').mean().reset_index()
toronto_grouped

Unnamed: 0,Neighborhood,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,...,Thrift / Vintage Store,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wings Joint,Yoga Studio
0,"Adelaide, King, Richmond",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.04,0.0,...,0.0,0.0,0.0,0.0,0.02,0.0,0.0,0.01,0.0,0.0
1,Berczy Park,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.017544,0.0,0.0,0.0,0.0,0.0
2,"Brockton, Exhibition Place, Parkdale Village",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.041667
3,Business Reply Mail Processing Centre 969 Eastern,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.055556
4,"CN Tower, Bathurst Quay, Island airport, Harbo...",0.0,0.071429,0.071429,0.071429,0.071429,0.142857,0.142857,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,"Cabbagetown, St. James Town",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,Central Bay Street,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.012048,0.0,...,0.0,0.0,0.0,0.0,0.012048,0.0,0.0,0.012048,0.0,0.012048
7,"Chinatown, Grange Park, Kensington Market",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.01,0.01,0.0,0.0,0.06,0.0,0.04,0.01,0.0,0.0
8,Christie,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,Church and Wellesley,0.011364,0.0,0.0,0.0,0.0,0.0,0.0,0.022727,0.0,...,0.0,0.0,0.0,0.0,0.0,0.011364,0.011364,0.0,0.011364,0.011364


#### Print each neighborhood along with the top 5 most common venues

In [21]:
num_top_venues = 5

for hood in toronto_grouped['Neighborhood']:
    print("----"+hood+"----")
    temp = toronto_grouped[toronto_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Adelaide, King, Richmond----
                 venue  freq
0          Coffee Shop  0.08
1                 Café  0.05
2           Steakhouse  0.04
3                  Bar  0.04
4  American Restaurant  0.04


----Berczy Park----
                venue  freq
0         Coffee Shop  0.07
1        Cocktail Bar  0.05
2  Seafood Restaurant  0.04
3         Cheese Shop  0.04
4          Steakhouse  0.04


----Brockton, Exhibition Place, Parkdale Village----
                   venue  freq
0            Coffee Shop  0.08
1         Breakfast Spot  0.08
2                   Café  0.08
3  Performing Arts Venue  0.08
4            Yoga Studio  0.04


----Business Reply Mail Processing Centre 969 Eastern----
                venue  freq
0         Yoga Studio  0.06
1  Light Rail Station  0.06
2                Park  0.06
3          Comic Shop  0.06
4          Restaurant  0.06


----CN Tower, Bathurst Quay, Island airport, Harbourfront West, King and Spadina, Railway Lands, South Niagara----
                v

#### Put that into a *pandas* dataframe

A function to sort the venues in descending order.

In [22]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

Create the new dataframe and display the top 10 venues for each neighborhood.

In [23]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = toronto_grouped['Neighborhood']

for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,"Adelaide, King, Richmond",Coffee Shop,Café,American Restaurant,Steakhouse,Thai Restaurant,Bar,Asian Restaurant,Restaurant,Hotel,Burger Joint
1,Berczy Park,Coffee Shop,Cocktail Bar,Bakery,Cheese Shop,Café,Farmers Market,Beer Bar,Steakhouse,Seafood Restaurant,Italian Restaurant
2,"Brockton, Exhibition Place, Parkdale Village",Coffee Shop,Breakfast Spot,Performing Arts Venue,Café,Climbing Gym,Stadium,Burrito Place,Sandwich Place,Restaurant,Caribbean Restaurant
3,Business Reply Mail Processing Centre 969 Eastern,Yoga Studio,Spa,Garden Center,Garden,Light Rail Station,Fast Food Restaurant,Farmers Market,Comic Shop,Park,Pizza Place
4,"CN Tower, Bathurst Quay, Island airport, Harbo...",Airport Terminal,Airport Service,Harbor / Marina,Coffee Shop,Sculpture Garden,Boat or Ferry,Bar,Boutique,Airport Food Court,Airport Gate
5,"Cabbagetown, St. James Town",Coffee Shop,Pizza Place,Restaurant,Café,Pub,Italian Restaurant,Bakery,Bank,Market,Diner
6,Central Bay Street,Coffee Shop,Café,Italian Restaurant,Ice Cream Shop,Sandwich Place,Burger Joint,Fried Chicken Joint,Bar,Gym / Fitness Center,Bubble Tea Shop
7,"Chinatown, Grange Park, Kensington Market",Café,Vegetarian / Vegan Restaurant,Chinese Restaurant,Bar,Vietnamese Restaurant,Bakery,Mexican Restaurant,Dumpling Restaurant,Coffee Shop,Farmers Market
8,Christie,Grocery Store,Café,Park,Convenience Store,Nightclub,Italian Restaurant,Baby Store,Restaurant,Diner,Coffee Shop
9,Church and Wellesley,Coffee Shop,Sushi Restaurant,Japanese Restaurant,Restaurant,Gay Bar,Gym,Gastropub,Mediterranean Restaurant,Men's Store,Hotel


## 4. Cluster Neighborhoods

Run *k*-means to cluster the neighborhood into 4 clusters.  
4 was chosen because we narrowed the boroughs down to the 4 that had Toronto in their names and I want to see if the Neighborhoods cluster similar to their boroughs

In [24]:
# set number of clusters
kclusters = 4

toronto_grouped_clustering = toronto_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int32)

Create a new dataframe that includes the cluster as well as the top 10 venues for each neighborhood.

In [25]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

toronto_merged = torontoDF

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
toronto_merged = toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

toronto_merged # check the last columns!

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M4E,East Toronto,The Beaches,43.676357,-79.293031,0,Health Food Store,Other Great Outdoors,Trail,Pub,Dim Sum Restaurant,Falafel Restaurant,Event Space,Ethiopian Restaurant,Electronics Store,Eastern European Restaurant
1,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188,0,Greek Restaurant,Coffee Shop,Ice Cream Shop,Italian Restaurant,Furniture / Home Store,Yoga Studio,Pizza Place,Bookstore,Brewery,Bubble Tea Shop
2,M4L,East Toronto,"The Beaches West, India Bazaar",43.668999,-79.315572,0,Park,Sandwich Place,Pet Store,Ice Cream Shop,Burger Joint,Liquor Store,Burrito Place,Fast Food Restaurant,Fish & Chips Shop,Steakhouse
3,M4M,East Toronto,Studio District,43.659526,-79.340923,0,Café,Coffee Shop,Bakery,Italian Restaurant,American Restaurant,Stationery Store,Fish Market,Bookstore,Brewery,Seafood Restaurant
4,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879,0,Park,Lawyer,Bus Line,Swim School,Yoga Studio,Discount Store,Farmers Market,Falafel Restaurant,Event Space,Ethiopian Restaurant
5,M4P,Central Toronto,Davisville North,43.712751,-79.390197,0,Pizza Place,Convenience Store,Sandwich Place,Food & Drink Shop,Clothing Store,Hotel,Park,Breakfast Spot,Gym,Event Space
6,M4R,Central Toronto,North Toronto West,43.715383,-79.405678,0,Clothing Store,Sporting Goods Shop,Coffee Shop,Yoga Studio,Dessert Shop,Spa,Burger Joint,Salon / Barbershop,Mexican Restaurant,Café
7,M4S,Central Toronto,Davisville,43.704324,-79.38879,0,Sandwich Place,Dessert Shop,Pizza Place,Gym,Coffee Shop,Café,Italian Restaurant,Sushi Restaurant,Gourmet Shop,Fried Chicken Joint
8,M4T,Central Toronto,"Moore Park, Summerhill East",43.689574,-79.38316,1,Playground,Gym,Summer Camp,Yoga Studio,Diner,Farmers Market,Falafel Restaurant,Event Space,Ethiopian Restaurant,Electronics Store
9,M4V,Central Toronto,"Deer Park, Forest Hill SE, Rathnelly, South Hi...",43.686412,-79.400049,0,Coffee Shop,Pub,Light Rail Station,Sports Bar,Supermarket,Sushi Restaurant,Bagel Shop,Liquor Store,Fried Chicken Joint,Restaurant


Visualize the resulting clusters

In [26]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=12)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighborhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

## 5. Examine Clusters

Examination of each cluster to determine the discriminating venue categories that distinguish each cluster. Based on the defining categories, a name can be assigned to each cluster. 

#### Cluster 1

In [27]:
cluster1 = toronto_merged.loc[toronto_merged['Cluster Labels'] == 0, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]
cluster1

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,East Toronto,0,Health Food Store,Other Great Outdoors,Trail,Pub,Dim Sum Restaurant,Falafel Restaurant,Event Space,Ethiopian Restaurant,Electronics Store,Eastern European Restaurant
1,East Toronto,0,Greek Restaurant,Coffee Shop,Ice Cream Shop,Italian Restaurant,Furniture / Home Store,Yoga Studio,Pizza Place,Bookstore,Brewery,Bubble Tea Shop
2,East Toronto,0,Park,Sandwich Place,Pet Store,Ice Cream Shop,Burger Joint,Liquor Store,Burrito Place,Fast Food Restaurant,Fish & Chips Shop,Steakhouse
3,East Toronto,0,Café,Coffee Shop,Bakery,Italian Restaurant,American Restaurant,Stationery Store,Fish Market,Bookstore,Brewery,Seafood Restaurant
4,Central Toronto,0,Park,Lawyer,Bus Line,Swim School,Yoga Studio,Discount Store,Farmers Market,Falafel Restaurant,Event Space,Ethiopian Restaurant
5,Central Toronto,0,Pizza Place,Convenience Store,Sandwich Place,Food & Drink Shop,Clothing Store,Hotel,Park,Breakfast Spot,Gym,Event Space
6,Central Toronto,0,Clothing Store,Sporting Goods Shop,Coffee Shop,Yoga Studio,Dessert Shop,Spa,Burger Joint,Salon / Barbershop,Mexican Restaurant,Café
7,Central Toronto,0,Sandwich Place,Dessert Shop,Pizza Place,Gym,Coffee Shop,Café,Italian Restaurant,Sushi Restaurant,Gourmet Shop,Fried Chicken Joint
9,Central Toronto,0,Coffee Shop,Pub,Light Rail Station,Sports Bar,Supermarket,Sushi Restaurant,Bagel Shop,Liquor Store,Fried Chicken Joint,Restaurant
11,Downtown Toronto,0,Coffee Shop,Pizza Place,Restaurant,Café,Pub,Italian Restaurant,Bakery,Bank,Market,Diner


In [28]:
cluster1_Top = cluster1.groupby(['1st Most Common Venue'])['Borough'].agg({"code_count": len}).sort_values("code_count", ascending=False).head(5).reset_index()
cluster1_Top

is deprecated and will be removed in a future version. Use                 named aggregation instead.

    >>> grouper.agg(name_1=func_1, name_2=func_2)

  """Entry point for launching an IPython kernel.


Unnamed: 0,1st Most Common Venue,code_count
0,Coffee Shop,14
1,Café,5
2,Park,3
3,Sandwich Place,2
4,Airport Terminal,1


In [29]:
name = cluster1_Top['1st Most Common Venue'][0]
print('The name of Cluster 1 is " %s "' % (name))

The name of Cluster 1 is " Coffee Shop "


#### Cluster 2

In [30]:
cluster2 = toronto_merged.loc[toronto_merged['Cluster Labels'] == 1, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]
cluster2

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
8,Central Toronto,1,Playground,Gym,Summer Camp,Yoga Studio,Diner,Farmers Market,Falafel Restaurant,Event Space,Ethiopian Restaurant,Electronics Store


In [31]:
cluster2_Top = cluster2.groupby(['1st Most Common Venue'])['Borough'].agg({"code_count": len}).sort_values("code_count", ascending=False).head(5).reset_index()
cluster2_Top

is deprecated and will be removed in a future version. Use                 named aggregation instead.

    >>> grouper.agg(name_1=func_1, name_2=func_2)

  """Entry point for launching an IPython kernel.


Unnamed: 0,1st Most Common Venue,code_count
0,Playground,1


In [32]:
name = cluster2_Top['1st Most Common Venue'][0]
print('The name of Cluster 2 is " %s "' % (name))

The name of Cluster 2 is " Playground "


#### Cluster 3

In [33]:
cluster3 = toronto_merged.loc[toronto_merged['Cluster Labels'] == 2, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]
cluster3

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
22,Central Toronto,2,Garden,Yoga Studio,Discount Store,Fast Food Restaurant,Farmers Market,Falafel Restaurant,Event Space,Ethiopian Restaurant,Electronics Store,Eastern European Restaurant


In [34]:
cluster3_Top = cluster3.groupby(['1st Most Common Venue'])['Borough'].agg({"code_count": len}).sort_values("code_count", ascending=False).head(5).reset_index()
cluster3_Top

is deprecated and will be removed in a future version. Use                 named aggregation instead.

    >>> grouper.agg(name_1=func_1, name_2=func_2)

  """Entry point for launching an IPython kernel.


Unnamed: 0,1st Most Common Venue,code_count
0,Garden,1


In [35]:
name = cluster3_Top['1st Most Common Venue'][0]
print('The name of Cluster 3 is " %s "' % (name))

The name of Cluster 3 is " Garden "


#### Cluster 4

In [36]:
cluster4 = toronto_merged.loc[toronto_merged['Cluster Labels'] == 3, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]
cluster4

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
10,Downtown Toronto,3,Park,Playground,Trail,Building,Yoga Studio,Diner,Falafel Restaurant,Event Space,Ethiopian Restaurant,Electronics Store
23,Central Toronto,3,Park,Trail,Jewelry Store,Sushi Restaurant,Yoga Studio,Discount Store,Farmers Market,Falafel Restaurant,Event Space,Ethiopian Restaurant


In [37]:
cluster4_Top = cluster4.groupby(['1st Most Common Venue'])['Borough'].agg({"code_count": len}).sort_values("code_count", ascending=False).head(5).reset_index()
cluster4_Top

is deprecated and will be removed in a future version. Use                 named aggregation instead.

    >>> grouper.agg(name_1=func_1, name_2=func_2)

  """Entry point for launching an IPython kernel.


Unnamed: 0,1st Most Common Venue,code_count
0,Park,2


In [38]:
name = cluster4_Top['1st Most Common Venue'][0]
print('The name of Cluster 4 is " %s "' % (name))

The name of Cluster 4 is " Park "
