### Import df from previous notebook

In [1]:
import pandas as pd
df = pd.read_csv('df.csv',index_col=0)
df

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge,Malvern",43.811525,-79.195517
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union",43.785730,-79.158750
2,M1E,Scarborough,"Guildwood,Morningside,West Hill",43.765690,-79.175256
3,M1G,Scarborough,Woburn,43.768359,-79.217590
4,M1H,Scarborough,Cedarbrae,43.769688,-79.239440
5,M1J,Scarborough,Scarborough Village,43.743125,-79.231750
6,M1K,Scarborough,"East Birchmount Park,Ionview,Kennedy Park",43.726245,-79.263670
7,M1L,Scarborough,"Clairlea,Golden Mile,Oakridge",43.713133,-79.285055
8,M1M,Scarborough,"Cliffcrest,Cliffside,Scarborough Village West",43.723575,-79.234976
9,M1N,Scarborough,"Birch Cliff,Cliffside West",43.696665,-79.260163


### Use geopy library to get the latitude and longitude values of Toronto.

In [2]:
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

In [3]:
address = 'Toronto, CA'

geolocator = Nominatim(user_agent="toronto_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.653963, -79.387207.


### Import folium

In [4]:
#import folium
!conda install -c conda-forge folium=0.5.0 --yes
import folium

Solving environment: done

## Package Plan ##

  environment location: /opt/conda/envs/Python36

  added / updated specs: 
    - folium=0.5.0


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    folium-0.5.0               |             py_0          45 KB  conda-forge
    branca-0.3.1               |             py_0          25 KB  conda-forge
    openssl-1.1.1c             |       h516909a_0         2.1 MB  conda-forge
    altair-3.1.0               |           py36_0         724 KB  conda-forge
    vincent-0.4.4              |             py_1          28 KB  conda-forge
    certifi-2019.6.16          |           py36_1         149 KB  conda-forge
    ca-certificates-2019.6.16  |       hecc5488_0         145 KB  conda-forge
    ------------------------------------------------------------
                                           Total:         3.2 MB

The following NEW packages will be 

### Create a map of Toronto with neighborhoods superimposed on top.

In [5]:
# create map of New York using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(df['Latitude'], df['Longitude'], df['Borough'], df['Neighbourhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

### Lets limit the analysis to 'Downtown Toronto'

In [6]:
df_downtown = df[df.Borough=='Downtown Toronto'].reset_index(drop=True)
df_downtown

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M4W,Downtown Toronto,Rosedale,43.68194,-79.378474
1,M4X,Downtown Toronto,"Cabbagetown,St. James Town",43.66816,-79.366602
2,M4Y,Downtown Toronto,Church and Wellesley,43.666585,-79.381302
3,M5A,Downtown Toronto,"Harbourfront,Regent Park",43.65512,-79.36264
4,M5B,Downtown Toronto,"Ryerson,Garden District",43.657363,-79.37818
5,M5C,Downtown Toronto,St. James Town,43.65121,-79.375481
6,M5E,Downtown Toronto,Berczy Park,43.64516,-79.373675
7,M5G,Downtown Toronto,Central Bay Street,43.656091,-79.38493
8,M5H,Downtown Toronto,"Adelaide,King,Richmond",43.649515,-79.382503
9,M5J,Downtown Toronto,"Harbourfront East,Toronto Islands,Union Station",43.62347,-79.391507


### Lets get coordinates for Downtown Toronto

In [7]:
address = 'Downtown Toronto, CA'

geolocator = Nominatim(user_agent="toronto_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Downtown Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Downtown Toronto are 43.655115, -79.380219.


### Let's visualize Downtown Toronto and its neighbourhoods 

In [12]:
# create map of Manhattan using latitude and longitude values
map_downtown = folium.Map(location=[latitude, longitude], zoom_start=13)

# add markers to map
for lat, lng, label in zip(df_downtown['Latitude'], df_downtown['Longitude'], df_downtown['Neighbourhood']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_downtown)  
    
map_downtown

### Set up variables for Foursquare usage

In [22]:
CLIENT_ID = 'F35ANHGVOAPSUJONBKYCORPYO0HZDPK2TEE0VDJ010Q2DG1K' # your Foursquare ID
CLIENT_SECRET = 'O3WZAT45SFHWDEWPHWBH1XLWQA533BPUZGB1UNOWT5JV4JAI' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version
LIMIT = 30 # I will only work with the 30 closest venues

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

### Create function to get nearby venues in Downtown Toronto

In [23]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
        
    return(nearby_venues)

### Run the function for each neighbourhood in Downtown Toronto

In [24]:
downtown_venues = getNearbyVenues(names=df_downtown['Neighbourhood'],
                                   latitudes=df_downtown['Latitude'],
                                   longitudes=df_downtown['Longitude']
                                  )

Rosedale
Cabbagetown,St. James Town
Church and Wellesley
Harbourfront,Regent Park
Ryerson,Garden District
St. James Town
Berczy Park
Central Bay Street
Adelaide,King,Richmond
Harbourfront East,Toronto Islands,Union Station
Design Exchange,Toronto Dominion Centre
Commerce Court,Victoria Hotel
Harbord,University of Toronto
Chinatown,Grange Park,Kensington Market
CN Tower,Bathurst Quay,Island airport,Harbourfront West,King and Spadina,Railway Lands,South Niagara
Stn A PO Boxes 25 The Esplanade
First Canadian Place,Underground city
Christie


### Check the new dataframe 

In [25]:
downtown_venues

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Rosedale,43.681940,-79.378474,TD Canada Trust,43.686094,-79.376549,Bank
1,Rosedale,43.681940,-79.378474,Rosedale Park,43.682328,-79.378934,Playground
2,Rosedale,43.681940,-79.378474,Whitney Park,43.682036,-79.373788,Park
3,Rosedale,43.681940,-79.378474,Mooredale House,43.678631,-79.380091,Building
4,"Cabbagetown,St. James Town",43.668160,-79.366602,F'Amelia,43.667536,-79.368613,Italian Restaurant
5,"Cabbagetown,St. James Town",43.668160,-79.366602,Cranberries,43.667843,-79.369407,Diner
6,"Cabbagetown,St. James Town",43.668160,-79.366602,Butter Chicken Factory,43.667072,-79.369184,Indian Restaurant
7,"Cabbagetown,St. James Town",43.668160,-79.366602,Kingyo Toronto,43.665895,-79.368415,Japanese Restaurant
8,"Cabbagetown,St. James Town",43.668160,-79.366602,Murgatroid,43.667381,-79.369311,Restaurant
9,"Cabbagetown,St. James Town",43.668160,-79.366602,Merryberry Cafe + Bistro,43.666630,-79.368792,Café


### Let's check # of venues by neighbourhood

In [27]:
downtown_venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"Adelaide,King,Richmond",30,30,30,30,30,30
Berczy Park,30,30,30,30,30,30
"CN Tower,Bathurst Quay,Island airport,Harbourfront West,King and Spadina,Railway Lands,South Niagara",30,30,30,30,30,30
"Cabbagetown,St. James Town",30,30,30,30,30,30
Central Bay Street,30,30,30,30,30,30
"Chinatown,Grange Park,Kensington Market",30,30,30,30,30,30
Christie,9,9,9,9,9,9
Church and Wellesley,30,30,30,30,30,30
"Commerce Court,Victoria Hotel",30,30,30,30,30,30
"Design Exchange,Toronto Dominion Centre",30,30,30,30,30,30


### Let's see how many unique categories we have

In [28]:
print('There are {} unique categories.'.format(len(downtown_venues['Venue Category'].unique())))

There are 124 unique categories.


In [38]:
downtown_venues['Neighborhood']

0                                   Rosedale
1                                   Rosedale
2                                   Rosedale
3                                   Rosedale
4                 Cabbagetown,St. James Town
5                 Cabbagetown,St. James Town
6                 Cabbagetown,St. James Town
7                 Cabbagetown,St. James Town
8                 Cabbagetown,St. James Town
9                 Cabbagetown,St. James Town
10                Cabbagetown,St. James Town
11                Cabbagetown,St. James Town
12                Cabbagetown,St. James Town
13                Cabbagetown,St. James Town
14                Cabbagetown,St. James Town
15                Cabbagetown,St. James Town
16                Cabbagetown,St. James Town
17                Cabbagetown,St. James Town
18                Cabbagetown,St. James Town
19                Cabbagetown,St. James Town
20                Cabbagetown,St. James Town
21                Cabbagetown,St. James Town
22        

### Use onehot encoding to create yes/no values for occurance of a category in each row

In [60]:
# one hot encoding
downtown_onehot = pd.get_dummies(downtown_venues[['Venue Category']], prefix="", prefix_sep="")

# move Neighborhood column to be the first
downtown_onehot['Neighborhood'] = downtown_venues['Neighborhood']
downtown_onehot = downtown_onehot[ ['Neighborhood'] + [ col for col in downtown_onehot.columns if col != 'Neighborhood' ] ]

downtown_onehot.head(11)

Unnamed: 0,Neighborhood,American Restaurant,Art Gallery,Art Museum,Arts & Crafts Store,Asian Restaurant,BBQ Joint,Baby Store,Bakery,Bank,...,Tea Room,Thai Restaurant,Theater,Theme Restaurant,Thrift / Vintage Store,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Yoga Studio
0,Rosedale,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1,Rosedale,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Rosedale,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Rosedale,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,"Cabbagetown,St. James Town",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,"Cabbagetown,St. James Town",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,"Cabbagetown,St. James Town",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,"Cabbagetown,St. James Town",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,"Cabbagetown,St. James Town",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,"Cabbagetown,St. James Town",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Let's group rows by neighborhood and by taking the mean of the requency of occurance of each category

In [61]:
downtown_grouped = downtown_onehot.groupby('Neighborhood').mean().reset_index()
downtown_grouped

Unnamed: 0,Neighborhood,American Restaurant,Art Gallery,Art Museum,Arts & Crafts Store,Asian Restaurant,BBQ Joint,Baby Store,Bakery,Bank,...,Tea Room,Thai Restaurant,Theater,Theme Restaurant,Thrift / Vintage Store,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Yoga Studio
0,"Adelaide,King,Richmond",0.066667,0.0,0.0,0.0,0.033333,0.0,0.0,0.033333,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.033333,0.0,0.0,0.0
1,Berczy Park,0.0,0.033333,0.0,0.0,0.0,0.0,0.0,0.033333,0.0,...,0.033333,0.033333,0.0,0.0,0.0,0.0,0.033333,0.0,0.0,0.0
2,"CN Tower,Bathurst Quay,Island airport,Harbourf...",0.0,0.0,0.0,0.0,0.033333,0.0,0.0,0.033333,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.033333
3,"Cabbagetown,St. James Town",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.066667,0.033333,...,0.0,0.033333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Central Bay Street,0.033333,0.0,0.033333,0.0,0.0,0.0,0.0,0.0,0.0,...,0.066667,0.033333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,"Chinatown,Grange Park,Kensington Market",0.0,0.033333,0.0,0.033333,0.0,0.0,0.0,0.033333,0.0,...,0.033333,0.0,0.0,0.0,0.033333,0.0,0.0,0.0,0.066667,0.0
6,Christie,0.0,0.0,0.0,0.0,0.0,0.0,0.111111,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,Church and Wellesley,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.033333,0.033333,0.0,0.033333,0.0,0.0,0.0,0.0,0.033333,0.0
8,"Commerce Court,Victoria Hotel",0.033333,0.033333,0.0,0.0,0.0,0.0,0.0,0.033333,0.0,...,0.033333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,"Design Exchange,Toronto Dominion Centre",0.033333,0.033333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.033333,0.0,0.0,0.0,0.0


### Create a dataframe with the 5 most frequent venue category in the vicinity of the neighborhood

In [64]:
#write function

def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

#run it for the dataframe
import numpy as np

num_top_venues = 5

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = downtown_grouped['Neighborhood']

for ind in np.arange(downtown_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(downtown_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head(25)

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
0,"Adelaide,King,Richmond",Café,Coffee Shop,American Restaurant,Steakhouse,Gluten-free Restaurant
1,Berczy Park,Cocktail Bar,Café,Farmers Market,Seafood Restaurant,Beer Bar
2,"CN Tower,Bathurst Quay,Island airport,Harbourf...",Italian Restaurant,Restaurant,Park,Yoga Studio,Pub
3,"Cabbagetown,St. James Town",Coffee Shop,Café,Restaurant,Italian Restaurant,Bakery
4,Central Bay Street,Coffee Shop,Tea Room,Bubble Tea Shop,Clothing Store,Ramen Restaurant
5,"Chinatown,Grange Park,Kensington Market",Café,Vietnamese Restaurant,Cocktail Bar,Mexican Restaurant,Arts & Crafts Store
6,Christie,Café,Grocery Store,Coffee Shop,Playground,Baby Store
7,Church and Wellesley,Coffee Shop,Men's Store,Gay Bar,Burger Joint,Mexican Restaurant
8,"Commerce Court,Victoria Hotel",Café,Coffee Shop,Restaurant,Gastropub,Hotel
9,"Design Exchange,Toronto Dominion Centre",Coffee Shop,Restaurant,Café,Deli / Bodega,Hotel


### Cluster

In [80]:
# import k-means from clustering stage
from sklearn.cluster import KMeans

# set number of clusters
kclusters = 5

downtown_grouped_clustering = downtown_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(downtown_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:25]

array([0, 0, 3, 3, 3, 0, 4, 3, 3, 3, 0, 3, 2, 3, 1, 0, 3, 0], dtype=int32)

### Creating new dataframe including clusters

In [81]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

ValueError: cannot insert Cluster Labels, already exists

In [84]:
# Increased clusters from 3 to 5 and want to overwrite old clusters with new (can not insert as they already exist)
neighborhoods_venues_sorted['Cluster Labels'] = kmeans.labels_

In [85]:
downtown_merged = df_downtown

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
downtown_merged = downtown_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighbourhood')

downtown_merged.head() # check the last columns!

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
0,M4W,Downtown Toronto,Rosedale,43.68194,-79.378474,1,Park,Bank,Playground,Building,Electronics Store
1,M4X,Downtown Toronto,"Cabbagetown,St. James Town",43.66816,-79.366602,3,Coffee Shop,Café,Restaurant,Italian Restaurant,Bakery
2,M4Y,Downtown Toronto,Church and Wellesley,43.666585,-79.381302,3,Coffee Shop,Men's Store,Gay Bar,Burger Joint,Mexican Restaurant
3,M5A,Downtown Toronto,"Harbourfront,Regent Park",43.65512,-79.36264,3,Coffee Shop,Breakfast Spot,Restaurant,Gym / Fitness Center,Italian Restaurant
4,M5B,Downtown Toronto,"Ryerson,Garden District",43.657363,-79.37818,0,Coffee Shop,Café,Ramen Restaurant,Clothing Store,Fast Food Restaurant


### Create map

In [87]:
# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=13)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(downtown_merged['Latitude'], downtown_merged['Longitude'], downtown_merged['Neighbourhood'], downtown_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

### As expected the outliers (further from the rest of the neighborhoods) got in a different category.
### The majority of neighborhoods fall in either cluster 0 or cluster 3