# My take in the coursera assigment 'Segmenting and Clustering Neighboorhoods in Toronto' from the Applied Data Science Capstone course

In [1]:
import pandas as pd

## Part 1 (Recap) - Scrape the Neighbourhood data off of wikipedia and clean it up

See Parts 2 & 3 below!


### Read the data into a pandas dataframe

In [2]:
wiki_url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

Simply employ the pandas `read_html()` option to scrape the tabular data from the wikipedia page.
This method scrapes every tabular content into a list of tabular objects. We are interested in the first table on the website, e.g., the zip codes of Toronto

In [3]:
tables = pd.read_html(wiki_url)
zipcode_df=tables[0]

### Clean the data

1. Delete all rows for which the Borough is 'Not assigned'
2. Combine neighbourhoods with the same postcode
3. Replace the Neighbourhood names that are 'Not assigned' with the borough name

In [4]:
zipcode_df = zipcode_df[zipcode_df.Borough != 'Not assigned']
zipcode_df = zipcode_df.groupby(['Postcode', 'Borough'])['Neighborhood'].apply(','.join).reset_index()
zipcode_df.Neighborhood[zipcode_df.Neighborhood == 'Not assigned'] = zipcode_df.Borough

## Part 2 - combine the dataframe from Part 1 with geolocation data

### This part is commented out, because it does not work. So, i downloaded the .csv file with Tor

In [5]:
gsc_file = 'Geospatial_Coordinates.csv' # geospatial coordinates file
gsc_pd = pd.read_csv(gsc_file, index_col = 0) # Load the .csv file

In [6]:
zipcode_df.set_index('Postcode', inplace = True)

The dataframes can be concatenated using the pd.concat() function:

In [7]:
postcode_with_gsc_pd = pd.concat([zipcode_df, gsc_pd], axis = 1) 
postcode_with_gsc_pd.reset_index(inplace = True)
postcode_with_gsc_pd.rename(columns={"index": "Postcode"}, inplace = True)
postcode_with_gsc_pd.head()

Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge,Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood,Morningside,West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


## Part 3 Segmeting and Clustering

#### Use geopy library to get the latitude and longitude values of Toronto.

In [8]:
from geopy.geocoders import Nominatim
address = 'Toronto, Ontario, Canada'

geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinates of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinates of Toronto are 43.653963, -79.387207.


 Import the dependencies that are needed.

In [9]:
# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

import folium # map rendering library

import requests # library to handle requests

Restrict the data to Old Toronto

In [10]:
toronto_dt_data = postcode_with_gsc_pd[postcode_with_gsc_pd['Borough'].str.contains('Toronto')].reset_index(drop = True)
toronto_dt_data.head()

Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude
0,M4E,East Toronto,The Beaches,43.676357,-79.293031
1,M4K,East Toronto,"The Danforth West,Riverdale",43.679557,-79.352188
2,M4L,East Toronto,"The Beaches West,India Bazaar",43.668999,-79.315572
3,M4M,East Toronto,Studio District,43.659526,-79.340923
4,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879


In [11]:
# create map of Toronto with the latitude and longitude values (from above)
map_downtown_toronto = folium.Map(location=[latitude, longitude], zoom_start=11)

# add markers to map
for lat, lng, borough, neighborhood in zip(toronto_dt_data['Latitude'], toronto_dt_data['Longitude'], toronto_dt_data['Borough'], toronto_dt_data['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_downtown_toronto)  
    
map_downtown_toronto

In [12]:
CLIENT_ID = '' # your Foursquare ID
CLIENT_SECRET = '' # your Foursquare Secret
VERSION = '20191226'


Further Foursquare request parameters:

In [13]:
radius = 500
LIMIT = 100

## 3(b) Explore Neighborhoods in downtown Toronto

Use the get nearby venues function from the Segmenting and Clustering of Neighborhoods in Manhatten:

In [14]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    i = 0
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(i, name)
        i +=1
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

Let getNearbyVenues() run on every Toronto downtown neighborhood and collect the results:

In [15]:
toronto_dt_venues = getNearbyVenues(names=toronto_dt_data['Neighborhood'],
                                    latitudes=toronto_dt_data['Latitude'], 
                                    longitudes=toronto_dt_data['Longitude'])

0 The Beaches
1 The Danforth West,Riverdale
2 The Beaches West,India Bazaar
3 Studio District
4 Lawrence Park
5 Davisville North
6 North Toronto West
7 Davisville
8 Moore Park,Summerhill East
9 Deer Park,Forest Hill SE,Rathnelly,South Hill,Summerhill West
10 Rosedale
11 Cabbagetown,St. James Town
12 Church and Wellesley
13 Harbourfront
14 Ryerson,Garden District
15 St. James Town
16 Berczy Park
17 Central Bay Street
18 Adelaide,King,Richmond
19 Harbourfront East,Toronto Islands,Union Station
20 Design Exchange,Toronto Dominion Centre
21 Commerce Court,Victoria Hotel
22 Roselawn
23 Forest Hill North,Forest Hill West
24 The Annex,North Midtown,Yorkville
25 Harbord,University of Toronto
26 Chinatown,Grange Park,Kensington Market
27 CN Tower,Bathurst Quay,Island airport,Harbourfront West,King and Spadina,Railway Lands,South Niagara
28 Stn A PO Boxes 25 The Esplanade
29 First Canadian Place,Underground city
30 Christie
31 Dovercourt Village,Dufferin
32 Little Portugal,Trinity
33 Brockton,Ex

In [16]:
toronto_dt_venues

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,The Beaches,43.676357,-79.293031,Glen Manor Ravine,43.676821,-79.293942,Trail
1,The Beaches,43.676357,-79.293031,The Big Carrot Natural Food Market,43.678879,-79.297734,Health Food Store
2,The Beaches,43.676357,-79.293031,Grover Pub and Grub,43.679181,-79.297215,Pub
3,The Beaches,43.676357,-79.293031,Domino's Pizza,43.679058,-79.297382,Pizza Place
4,The Beaches,43.676357,-79.293031,Upper Beaches,43.680563,-79.292869,Neighborhood
5,"The Danforth West,Riverdale",43.679557,-79.352188,Pantheon,43.677621,-79.351434,Greek Restaurant
6,"The Danforth West,Riverdale",43.679557,-79.352188,Dolce Gelato,43.677773,-79.351187,Ice Cream Shop
7,"The Danforth West,Riverdale",43.679557,-79.352188,MenEssentials,43.677820,-79.351265,Cosmetics Shop
8,"The Danforth West,Riverdale",43.679557,-79.352188,Cafe Fiorentina,43.677743,-79.350115,Italian Restaurant
9,"The Danforth West,Riverdale",43.679557,-79.352188,Mezes,43.677962,-79.350196,Greek Restaurant


The size of the resulting dataframe:


In [17]:
print(toronto_dt_venues.shape)

(1676, 7)


Let's check how many venues were returned for each neighborhood (just display the first 5 rows)

In [18]:
toronto_dt_venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"Adelaide,King,Richmond",100,100,100,100,100,100
Berczy Park,56,56,56,56,56,56
"Brockton,Exhibition Place,Parkdale Village",24,24,24,24,24,24
Business Reply Mail Processing Centre 969 Eastern,17,17,17,17,17,17
"CN Tower,Bathurst Quay,Island airport,Harbourfront West,King and Spadina,Railway Lands,South Niagara",15,15,15,15,15,15
"Cabbagetown,St. James Town",46,46,46,46,46,46
Central Bay Street,83,83,83,83,83,83
"Chinatown,Grange Park,Kensington Market",92,92,92,92,92,92
Christie,17,17,17,17,17,17
Church and Wellesley,86,86,86,86,86,86


Let's find out how many unique categories can be curated from all the returned venues

In [19]:
print('There are {} uniques categories.'.format(len(toronto_dt_venues['Venue Category'].unique())))

There are 230 uniques categories.


## 3(c) Analyze Each Neighborhood

In [20]:
# one hot encoding
toronto_onehot = pd.get_dummies(toronto_dt_venues[['Venue Category']], prefix="", prefix_sep="")

In [21]:
# add neighborhood column back to dataframe
toronto_onehot['Neighborhood'] = toronto_dt_venues['Neighborhood'] 

In [22]:
# move neighborhood column to the first column
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]

In [23]:
toronto_onehot.shape

(1676, 230)

In [24]:
toronto_onehot.head()

Unnamed: 0,Yoga Studio,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,...,Theme Restaurant,Thrift / Vintage Store,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wings Joint
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


#### Next, let's group rows by neighborhood and by taking the mean of the frequency of occurrence of each category

In [25]:
toronto_grouped = toronto_onehot.groupby('Neighborhood').mean().reset_index()
toronto_grouped.shape

(38, 230)

In [26]:
toronto_grouped

Unnamed: 0,Neighborhood,Yoga Studio,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,...,Theme Restaurant,Thrift / Vintage Store,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wings Joint
0,"Adelaide,King,Richmond",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.02,...,0.0,0.0,0.0,0.0,0.0,0.02,0.0,0.0,0.01,0.0
1,Berczy Park,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.017857,0.0,0.0,0.0,0.0
2,"Brockton,Exhibition Place,Parkdale Village",0.083333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Business Reply Mail Processing Centre 969 Eastern,0.058824,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,"CN Tower,Bathurst Quay,Island airport,Harbourf...",0.0,0.0,0.066667,0.066667,0.066667,0.133333,0.066667,0.133333,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,"Cabbagetown,St. James Town",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,Central Bay Street,0.012048,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.012048,...,0.0,0.0,0.0,0.0,0.0,0.012048,0.0,0.0,0.012048,0.0
7,"Chinatown,Grange Park,Kensington Market",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.043478,0.0,0.054348,0.01087,0.0
8,Christie,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,Church and Wellesley,0.023256,0.011628,0.0,0.0,0.0,0.0,0.0,0.0,0.011628,...,0.011628,0.0,0.0,0.0,0.0,0.0,0.0,0.011628,0.0,0.011628


#### What are the top 5 most common venues in each neighborhood?

In [27]:
num_top_venues = 5

In [28]:
for hood in toronto_grouped['Neighborhood']:
    print("----"+hood+"----")
    temp = toronto_grouped[toronto_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Adelaide,King,Richmond----
             venue  freq
0      Coffee Shop  0.08
1             Café  0.05
2       Steakhouse  0.04
3  Thai Restaurant  0.03
4       Restaurant  0.03


----Berczy Park----
                venue  freq
0         Coffee Shop  0.09
1        Cocktail Bar  0.05
2            Beer Bar  0.04
3  Seafood Restaurant  0.04
4          Steakhouse  0.04


----Brockton,Exhibition Place,Parkdale Village----
            venue  freq
0            Café  0.12
1     Yoga Studio  0.08
2  Breakfast Spot  0.08
3     Coffee Shop  0.08
4          Bakery  0.04


----Business Reply Mail Processing Centre 969 Eastern----
                venue  freq
0  Light Rail Station  0.12
1         Yoga Studio  0.06
2       Auto Workshop  0.06
3          Comic Shop  0.06
4         Pizza Place  0.06


----CN Tower,Bathurst Quay,Island airport,Harbourfront West,King and Spadina,Railway Lands,South Niagara----
              venue  freq
0    Airport Lounge  0.13
1  Airport Terminal  0.13
2             P

#### Create a new *pandas* dataframe from this data

let's (again) take the function from the lectorue notebook that sorts the venues in descending order:

In [29]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [30]:
#Now let's create the new dataframe and display the top 10 venues for each neighborhood.
num_top_venues = 10

In [31]:
import numpy as np
indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

In [32]:
# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = toronto_grouped['Neighborhood']

In [33]:
for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

In [34]:
neighborhoods_venues_sorted

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,"Adelaide,King,Richmond",Coffee Shop,Café,Steakhouse,Bar,Salad Place,Bakery,Burger Joint,Restaurant,Thai Restaurant,Sushi Restaurant
1,Berczy Park,Coffee Shop,Cocktail Bar,Café,Steakhouse,Farmers Market,Cheese Shop,Beer Bar,Bakery,Seafood Restaurant,Hotel
2,"Brockton,Exhibition Place,Parkdale Village",Café,Yoga Studio,Breakfast Spot,Coffee Shop,Gym,Pet Store,Performing Arts Venue,Music Venue,Italian Restaurant,Intersection
3,Business Reply Mail Processing Centre 969 Eastern,Light Rail Station,Yoga Studio,Brewery,Burrito Place,Auto Workshop,Fast Food Restaurant,Farmers Market,Spa,Restaurant,Pizza Place
4,"CN Tower,Bathurst Quay,Island airport,Harbourf...",Airport Terminal,Airport Lounge,Boat or Ferry,Boutique,Rental Car Location,Plane,Coffee Shop,Harbor / Marina,Sculpture Garden,Airport Service
5,"Cabbagetown,St. James Town",Coffee Shop,Chinese Restaurant,Restaurant,Pub,Café,Italian Restaurant,Bakery,Pizza Place,Park,Breakfast Spot
6,Central Bay Street,Coffee Shop,Sandwich Place,Italian Restaurant,Ice Cream Shop,Japanese Restaurant,Burger Joint,Café,Spa,Chinese Restaurant,Middle Eastern Restaurant
7,"Chinatown,Grange Park,Kensington Market",Café,Chinese Restaurant,Vietnamese Restaurant,Dumpling Restaurant,Vegetarian / Vegan Restaurant,Coffee Shop,Bar,Bakery,Mexican Restaurant,Donut Shop
8,Christie,Grocery Store,Café,Park,Italian Restaurant,Baby Store,Athletics & Sports,Diner,Candy Store,Restaurant,Convenience Store
9,Church and Wellesley,Coffee Shop,Sushi Restaurant,Japanese Restaurant,Gay Bar,Restaurant,Yoga Studio,Pub,Gastropub,Bubble Tea Shop,Burger Joint


## 3(d) Let's start the actual clustering

Run *k*-means to cluster the neighborhood into 5 clusters.

In [35]:
# set number of clusters
kclusters = 5

toronto_grouped_clustering = toronto_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:100] 

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0,
       4, 0, 1, 0, 0, 3, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [36]:
type(kmeans.labels_)

numpy.ndarray

Let's create a new dataframe that includes the cluster as well as the top 10 venues for each neighborhood.

In [37]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

toronto_merged = toronto_dt_data

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
toronto_merged = toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

toronto_merged # check the last columns!

Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M4E,East Toronto,The Beaches,43.676357,-79.293031,0.0,Health Food Store,Pizza Place,Pub,Trail,Discount Store,Department Store,Dessert Shop,Dim Sum Restaurant,Diner,Wings Joint
1,M4K,East Toronto,"The Danforth West,Riverdale",43.679557,-79.352188,0.0,Greek Restaurant,Coffee Shop,Ice Cream Shop,Restaurant,Furniture / Home Store,Bookstore,Italian Restaurant,Pub,Pizza Place,Liquor Store
2,M4L,East Toronto,"The Beaches West,India Bazaar",43.668999,-79.315572,0.0,Sushi Restaurant,Liquor Store,Fish & Chips Shop,Pub,Ice Cream Shop,Fast Food Restaurant,Burrito Place,Burger Joint,Italian Restaurant,Pizza Place
3,M4M,East Toronto,Studio District,43.659526,-79.340923,0.0,Café,Coffee Shop,Gastropub,Bakery,Italian Restaurant,American Restaurant,Yoga Studio,Comfort Food Restaurant,Sandwich Place,Brewery
4,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879,4.0,Bus Line,Swim School,Park,Colombian Restaurant,College Rec Center,Ethiopian Restaurant,Electronics Store,Eastern European Restaurant,Dumpling Restaurant,Donut Shop
5,M4P,Central Toronto,Davisville North,43.712751,-79.390197,0.0,Park,Clothing Store,Hotel,Sandwich Place,Convenience Store,Food & Drink Shop,Breakfast Spot,Gym,Doner Restaurant,Department Store
6,M4R,Central Toronto,North Toronto West,43.715383,-79.405678,0.0,Clothing Store,Coffee Shop,Sporting Goods Shop,Health & Beauty Service,Diner,Dessert Shop,Mexican Restaurant,Gym / Fitness Center,Bagel Shop,Chinese Restaurant
7,M4S,Central Toronto,Davisville,43.704324,-79.38879,0.0,Pizza Place,Sandwich Place,Dessert Shop,Italian Restaurant,Coffee Shop,Café,Toy / Game Store,Gym,Sushi Restaurant,Indian Restaurant
8,M4T,Central Toronto,"Moore Park,Summerhill East",43.689574,-79.38316,1.0,Restaurant,Playground,Dance Studio,Electronics Store,Eastern European Restaurant,Dumpling Restaurant,Donut Shop,Doner Restaurant,Dog Run,Discount Store
9,M4V,Central Toronto,"Deer Park,Forest Hill SE,Rathnelly,South Hill,...",43.686412,-79.400049,0.0,Coffee Shop,Pub,Restaurant,Sushi Restaurant,Bagel Shop,Sports Bar,Fried Chicken Joint,Pizza Place,American Restaurant,Liquor Store


In [38]:
toronto_merged = toronto_dt_data.drop([38])

In [39]:
toronto_merged.tail()

Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude
33,M6K,West Toronto,"Brockton,Exhibition Place,Parkdale Village",43.636847,-79.428191
34,M6P,West Toronto,"High Park,The Junction South",43.661608,-79.464763
35,M6R,West Toronto,"Parkdale,Roncesvalles",43.64896,-79.456325
36,M6S,West Toronto,"Runnymede,Swansea",43.651571,-79.48445
37,M7Y,East Toronto,Business Reply Mail Processing Centre 969 Eastern,43.662744,-79.321558


In [40]:
# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
toronto_merged = toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

In [41]:
toronto_merged.head() # check the last columns!

Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M4E,East Toronto,The Beaches,43.676357,-79.293031,0,Health Food Store,Pizza Place,Pub,Trail,Discount Store,Department Store,Dessert Shop,Dim Sum Restaurant,Diner,Wings Joint
1,M4K,East Toronto,"The Danforth West,Riverdale",43.679557,-79.352188,0,Greek Restaurant,Coffee Shop,Ice Cream Shop,Restaurant,Furniture / Home Store,Bookstore,Italian Restaurant,Pub,Pizza Place,Liquor Store
2,M4L,East Toronto,"The Beaches West,India Bazaar",43.668999,-79.315572,0,Sushi Restaurant,Liquor Store,Fish & Chips Shop,Pub,Ice Cream Shop,Fast Food Restaurant,Burrito Place,Burger Joint,Italian Restaurant,Pizza Place
3,M4M,East Toronto,Studio District,43.659526,-79.340923,0,Café,Coffee Shop,Gastropub,Bakery,Italian Restaurant,American Restaurant,Yoga Studio,Comfort Food Restaurant,Sandwich Place,Brewery
4,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879,4,Bus Line,Swim School,Park,Colombian Restaurant,College Rec Center,Ethiopian Restaurant,Electronics Store,Eastern European Restaurant,Dumpling Restaurant,Donut Shop


In [42]:
toronto_merged.tail()

Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
33,M6K,West Toronto,"Brockton,Exhibition Place,Parkdale Village",43.636847,-79.428191,0,Café,Yoga Studio,Breakfast Spot,Coffee Shop,Gym,Pet Store,Performing Arts Venue,Music Venue,Italian Restaurant,Intersection
34,M6P,West Toronto,"High Park,The Junction South",43.661608,-79.464763,0,Bar,Thai Restaurant,Café,Mexican Restaurant,Flea Market,Bakery,Speakeasy,Italian Restaurant,Arts & Crafts Store,Fast Food Restaurant
35,M6R,West Toronto,"Parkdale,Roncesvalles",43.64896,-79.456325,0,Gift Shop,Bookstore,Cuban Restaurant,Dessert Shop,Dog Run,Bar,Movie Theater,Italian Restaurant,Breakfast Spot,Eastern European Restaurant
36,M6S,West Toronto,"Runnymede,Swansea",43.651571,-79.48445,0,Café,Coffee Shop,Sushi Restaurant,Italian Restaurant,Pizza Place,Vegetarian / Vegan Restaurant,Gourmet Shop,Spa,Dessert Shop,Bar
37,M7Y,East Toronto,Business Reply Mail Processing Centre 969 Eastern,43.662744,-79.321558,0,Light Rail Station,Yoga Studio,Brewery,Burrito Place,Auto Workshop,Fast Food Restaurant,Farmers Market,Spa,Restaurant,Pizza Place


Finally, let's visualize the resulting clusters

In [43]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

In [44]:
# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

In [45]:
# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighborhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

## 3(e) Examine the clusters

In [46]:
def analyze_cluster(cluster_data, cutoff):
    
    freq_dict = {}
    advanced_freq_dict = {}
    
    sorted_tuples = []
    sorted_advanced_tuples = []

    for col in cluster_data.columns:
        if 'Neighborhood' in col:
            continue
        else: 
            increment = ''.join(c for c in col if c.isdigit())
            increment = (10 - int(increment))/10. + 1
        uniques = cluster_data[col].unique()
        for element in uniques:
            if element not in freq_dict.keys():
                freq_dict[element] = 1
                advanced_freq_dict[element] = increment
            else:
                freq_dict[element] += 1
                advanced_freq_dict[element] += increment
            
    #print the stuff out        
    sorted_tuples = sorted(freq_dict.items(), key=lambda x:x[1], reverse = True)
    sorted_advanced_tuples = sorted(advanced_freq_dict.items(), key=lambda x:x[1], reverse = True)
    for i, tuple in enumerate(sorted_tuples):
        if tuple[1] > cutoff:
            print(f"{tuple[0]}: {tuple[1]} - adv: {sorted_advanced_tuples[i][1]:.1f}")
    

#### Cluster 1

In [47]:
cluster1 = toronto_merged.loc[toronto_merged['Cluster Labels'] == 0, toronto_merged.columns[[2] + list(range(6, toronto_merged.shape[1]))]]
cluster1.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,The Beaches,Health Food Store,Pizza Place,Pub,Trail,Discount Store,Department Store,Dessert Shop,Dim Sum Restaurant,Diner,Wings Joint
1,"The Danforth West,Riverdale",Greek Restaurant,Coffee Shop,Ice Cream Shop,Restaurant,Furniture / Home Store,Bookstore,Italian Restaurant,Pub,Pizza Place,Liquor Store
2,"The Beaches West,India Bazaar",Sushi Restaurant,Liquor Store,Fish & Chips Shop,Pub,Ice Cream Shop,Fast Food Restaurant,Burrito Place,Burger Joint,Italian Restaurant,Pizza Place
3,Studio District,Café,Coffee Shop,Gastropub,Bakery,Italian Restaurant,American Restaurant,Yoga Studio,Comfort Food Restaurant,Sandwich Place,Brewery
5,Davisville North,Park,Clothing Store,Hotel,Sandwich Place,Convenience Store,Food & Drink Shop,Breakfast Spot,Gym,Doner Restaurant,Department Store


In [48]:
analyze_cluster(cluster1, 4)

Italian Restaurant: 8 - adv: 11.6
Bakery: 8 - adv: 11.2
Café: 7 - adv: 11.2
Pizza Place: 7 - adv: 10.8
Coffee Shop: 7 - adv: 10.3
Bar: 7 - adv: 9.9
Restaurant: 7 - adv: 9.8
Sushi Restaurant: 6 - adv: 9.1
Hotel: 6 - adv: 8.3
Park: 5 - adv: 8.1
Pub: 5 - adv: 7.6
Dessert Shop: 5 - adv: 7.1
Steakhouse: 5 - adv: 7.1
Breakfast Spot: 5 - adv: 6.3
Diner: 5 - adv: 6.3


#### Cluster 2

In [49]:
cluster2 = toronto_merged.loc[toronto_merged['Cluster Labels'] == 1, 
                                toronto_merged.columns[[2] + list(range(6, toronto_merged.shape[1]))]]
cluster2

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
8,"Moore Park,Summerhill East",Restaurant,Playground,Dance Studio,Electronics Store,Eastern European Restaurant,Dumpling Restaurant,Donut Shop,Doner Restaurant,Dog Run,Discount Store


In [50]:
analyze_cluster(cluster2, 0)

Restaurant: 1 - adv: 1.9
Playground: 1 - adv: 1.8
Dance Studio: 1 - adv: 1.7
Electronics Store: 1 - adv: 1.6
Eastern European Restaurant: 1 - adv: 1.5
Dumpling Restaurant: 1 - adv: 1.4
Donut Shop: 1 - adv: 1.3
Doner Restaurant: 1 - adv: 1.2
Dog Run: 1 - adv: 1.1
Discount Store: 1 - adv: 1.0


#### Cluster 3

In [51]:
cluster3 = toronto_merged.loc[toronto_merged['Cluster Labels'] == 2, 
                                toronto_merged.columns[[2] + list(range(6, toronto_merged.shape[1]))]]
cluster3

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
22,Roselawn,Garden,Wings Joint,Deli / Bodega,Ethiopian Restaurant,Electronics Store,Eastern European Restaurant,Dumpling Restaurant,Donut Shop,Doner Restaurant,Dog Run


In [52]:
analyze_cluster(cluster3, 0)

Garden: 1 - adv: 1.9
Wings Joint: 1 - adv: 1.8
Deli / Bodega: 1 - adv: 1.7
Ethiopian Restaurant: 1 - adv: 1.6
Electronics Store: 1 - adv: 1.5
Eastern European Restaurant: 1 - adv: 1.4
Dumpling Restaurant: 1 - adv: 1.3
Donut Shop: 1 - adv: 1.2
Doner Restaurant: 1 - adv: 1.1
Dog Run: 1 - adv: 1.0


#### Cluster 4

In [53]:
cluster4 = toronto_merged.loc[toronto_merged['Cluster Labels'] == 3, 
                                toronto_merged.columns[[2] + list(range(6, toronto_merged.shape[1]))]]
cluster4

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
10,Rosedale,Park,Trail,Playground,Dance Studio,Electronics Store,Eastern European Restaurant,Dumpling Restaurant,Donut Shop,Doner Restaurant,Dog Run
23,"Forest Hill North,Forest Hill West",Park,Jewelry Store,Trail,Sushi Restaurant,Wings Joint,Deli / Bodega,Eastern European Restaurant,Dumpling Restaurant,Donut Shop,Doner Restaurant


In [54]:
analyze_cluster(cluster4, 0)

Trail: 2 - adv: 3.5
Eastern European Restaurant: 2 - adv: 2.7
Dumpling Restaurant: 2 - adv: 2.5
Donut Shop: 2 - adv: 2.3
Doner Restaurant: 2 - adv: 2.1
Park: 1 - adv: 1.9
Jewelry Store: 1 - adv: 1.8
Playground: 1 - adv: 1.7
Dance Studio: 1 - adv: 1.6
Sushi Restaurant: 1 - adv: 1.6
Electronics Store: 1 - adv: 1.5
Wings Joint: 1 - adv: 1.5
Deli / Bodega: 1 - adv: 1.4
Dog Run: 1 - adv: 1.0


#### Cluster 5

In [55]:
cluster5 = toronto_merged.loc[toronto_merged['Cluster Labels'] == 4, 
                                toronto_merged.columns[[2] + list(range(6, toronto_merged.shape[1]))]]
cluster5

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
4,Lawrence Park,Bus Line,Swim School,Park,Colombian Restaurant,College Rec Center,Ethiopian Restaurant,Electronics Store,Eastern European Restaurant,Dumpling Restaurant,Donut Shop


In [56]:
analyze_cluster(cluster5, 0)

Bus Line: 1 - adv: 1.9
Swim School: 1 - adv: 1.8
Park: 1 - adv: 1.7
Colombian Restaurant: 1 - adv: 1.6
College Rec Center: 1 - adv: 1.5
Ethiopian Restaurant: 1 - adv: 1.4
Electronics Store: 1 - adv: 1.3
Eastern European Restaurant: 1 - adv: 1.2
Dumpling Restaurant: 1 - adv: 1.1
Donut Shop: 1 - adv: 1.0
