In [1]:
!conda install -c conda-forge geopy --yes 
!conda install -c conda-forge folium=0.5.0 --yes

print('Folium installed')

Solving environment: done


  current version: 4.5.11
  latest version: 4.8.1

Please update conda by running

    $ conda update -n base -c defaults conda



## Package Plan ##

  environment location: /home/jupyterlab/conda/envs/python

  added / updated specs: 
    - geopy


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    certifi-2019.11.28         |           py36_0         149 KB  conda-forge
    scikit-learn-0.20.1        |   py36h22eb022_0         5.7 MB
    liblapack-3.8.0            |      11_openblas          10 KB  conda-forge
    liblapacke-3.8.0           |      11_openblas          10 KB  conda-forge
    geographiclib-1.50         |             py_0          34 KB  conda-forge
    libopenblas-0.3.6          |       h5a2b251_2         7.7 MB
    scipy-1.4.1                |   py36h921218d_0        18.9 MB  conda-forge
    libcblas-3.8.0             |      11_openblas        

In [2]:
import requests # library to handle requests
import pandas as pd # library for data analsysis
import numpy as np # library to handle data in a vectorized manner
import random # library for random number generation
import array as arr

# libraries for displaying images
from IPython.display import Image 
from IPython.core.display import HTML 
    
# tranforming json file into a pandas dataframe library
from pandas.io.json import json_normalize

import matplotlib.cm as cm
import matplotlib.colors as colors

from geopy.geocoders import Nominatim # module to convert an address into latitude and longitude values
import folium # plotting library

### Steps
1. get the Toranto data, and filter boroughs that contain the word Toronto
2. for each neighbourhood, find the venues around, with the type of venues or venue catagory
3. 

#### Read the toronto data generated from the previous section

In [3]:
td_raw = pd.read_csv('toranto_neigh_latlong.csv', index_col=0)

In [4]:
#get neighbourhoods in the 'Toranto' borough
td = td_raw[td_raw['Borough'].str.contains('Toronto')]

In [5]:
td.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
4,M5A,Downtown Toronto,Harbourfront,43.65426,-79.360636
7,M7A,Downtown Toronto,Queen's Park,43.662301,-79.389494
16,M5B,Downtown Toronto,"Ryerson, Garden District",43.657162,-79.378937
17,M5B,Downtown Toronto,"Ryerson, Garden District",43.657162,-79.378937
33,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418


In [6]:
#reset the index numbering
td.reset_index(drop=True)

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M5A,Downtown Toronto,Harbourfront,43.654260,-79.360636
1,M7A,Downtown Toronto,Queen's Park,43.662301,-79.389494
2,M5B,Downtown Toronto,"Ryerson, Garden District",43.657162,-79.378937
3,M5B,Downtown Toronto,"Ryerson, Garden District",43.657162,-79.378937
4,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
...,...,...,...,...,...
69,M4X,Downtown Toronto,"Cabbagetown, St. James Town",43.667967,-79.367675
70,M5X,Downtown Toronto,"First Canadian Place, Underground city",43.648429,-79.382280
71,M5X,Downtown Toronto,"First Canadian Place, Underground city",43.648429,-79.382280
72,M4Y,Downtown Toronto,Church and Wellesley,43.665860,-79.383160


#### Get nearby places from FourSquare

In [7]:
#for a given lat and long for the neighbourhood, find the near by places

In [8]:
# @hidden_cell
#credentials from Foursquare
CLIENT_ID='VX13V3SLZUC3QSTZEB10QYJYV2YHN1UJ40UQWFL5AGPS2T3N'
CLIENT_SECRET='MZSBCYJZBNKMSHA1GW4H1FWILQZKIFWOXTZKTVNQW4DZJLQ1'

In [9]:
# assign key values 
VERSION = '20180604'
LIMIT = 10
creds = [CLIENT_ID,CLIENT_SECRET,VERSION]

In [10]:
# @hidden_cell
# self: use the json structure to loop through and put it into dataframe
# for ex: to get the category name of the first venue - ['response']['groups'][0]['items'][0]['venue']['categories'][0]['name']
# where ever there are more than one elements, for ex: beach can be categorized under water bodies and resorts etc., that would be
# in a list and can be accessed using []

#### Function to loop through each borough and identify the venues around it

In [11]:
def explore_borough(df, creds, radius, limit):
    
    clientid = creds[0]
    clientsec = creds[1]
    version = creds[2]
    
    venuelevelres = []
    venue_info = []
    
    df_venue_info = pd.DataFrame()
    
    for index, row in df.iterrows():
        postcode = row['Postcode']
        neighbourhood = row['Neighbourhood']
        borough = row['Borough']
        lat = row['Latitude']
        lng = row['Longitude']
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            clientid, 
            clientsec, 
            version, 
            lat, 
            lng, 
            radius, 
            limit)
        response = requests.get(url).json()
        # get the details at the venue level from the response.
        # item level information,in turn will have category level information which will be looped through
        
        # we need the result at category level. so first get the venue level list and then loop through the list to get the category level info
        # results['response']['groups'][0]['items'][0]['venue']['categories'][0]['name']
        venuelevelres = response['response']['groups'][0]['items']
        for v in venuelevelres:
            venuename = v['venue']['name']
            venuelat = v['venue']['location']['lat']
            venuelng = v['venue']['location']['lng']
            venuecat = v['venue']['categories'][0]['name']
            
            #append the current venue details to the list. finally this will have all the venues for a boroguh
            venue_info.append([neighbourhood, postcode, borough, lat, lng, venuename, venuelat, venuelng, venuecat])
        
    df_venue_info = pd.DataFrame(venue_info, columns = ['neighbourhood','postcode','borough','lat','lng','venue','vlat','vlng','vcat'])
    return df_venue_info

In [12]:
df_venue = explore_borough(td, creds, 500, 100)

In [13]:
#use this to save queries to Foursquare
df_venue = pd.read_csv('venue_list.csv', index_col=0)

In [14]:
df_venue.shape

(3226, 9)

In [15]:
df_venue.head()

Unnamed: 0,neighbourhood,postcode,borough,lat,lng,venue,vlat,vlng,vcat
0,Harbourfront,M5A,Downtown Toronto,43.65426,-79.360636,Roselle Desserts,43.653447,-79.362017,Bakery
1,Harbourfront,M5A,Downtown Toronto,43.65426,-79.360636,Tandem Coffee,43.653559,-79.361809,Coffee Shop
2,Harbourfront,M5A,Downtown Toronto,43.65426,-79.360636,Cooper Koo Family YMCA,43.653191,-79.357947,Gym / Fitness Center
3,Harbourfront,M5A,Downtown Toronto,43.65426,-79.360636,Body Blitz Spa East,43.654735,-79.359874,Spa
4,Harbourfront,M5A,Downtown Toronto,43.65426,-79.360636,Impact Kitchen,43.656369,-79.35698,Restaurant


In [22]:
df_venue.to_csv('venue_list.csv')

#### We have collected the venues around the neighbourhoods. now, lets cluster them.
##### to cluster them, we will use the category of the venues. based on that, we will fit KMeans model to cluster the venues

#### Clustering using KMeans

In [23]:
df_clust_base = df_venue[['neighbourhood','vcat']]
df_clust_base.set_index('neighbourhood', drop=True, inplace=True)
df_clust_base.head()

Unnamed: 0_level_0,vcat
neighbourhood,Unnamed: 1_level_1
Harbourfront,Bakery
Harbourfront,Coffee Shop
Harbourfront,Gym / Fitness Center
Harbourfront,Spa
Harbourfront,Restaurant


In [24]:
df_clust_morevenues = df_clust_base.groupby(by=['neighbourhood']).count()


In [25]:
df_clust_morevenues.sort_values(by=['vcat'], ascending=False, inplace=True)
df_clust_morevenues.head(10)

Unnamed: 0_level_0,vcat
neighbourhood,Unnamed: 1_level_1
"Adelaide, King, Richmond",300
"Harbourfront East, Toronto Islands, Union Station",300
"Chinatown, Grange Park, Kensington Market",252
"Design Exchange, Toronto Dominion Centre",200
"Ryerson, Garden District",200
"Commerce Court, Victoria Hotel",200
"First Canadian Place, Underground city",200
"CN Tower, Bathurst Quay, Island airport, Harbourfront West, King and Spadina, Railway Lands, South Niagara",126
"Little Portugal, Trinity",108
St. James Town,100


#### The above result shows how many 'venues' are around each neighbourhoood

### Lets work on Clustering

In [26]:
df_clust_ohe = pd.get_dummies(df_clust_base['vcat'])

In [27]:
df_clust_ohe.head()

Unnamed: 0_level_0,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,Aquarium,...,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wine Shop,Women's Store,Yoga Studio
neighbourhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Harbourfront,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Harbourfront,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Harbourfront,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Harbourfront,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Harbourfront,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [28]:
df_clust_ohe_mn = df_clust_ohe.groupby(by='neighbourhood').mean()

In [29]:
df_clust_ohe_mn.head()

Unnamed: 0_level_0,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,Aquarium,...,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wine Shop,Women's Store,Yoga Studio
neighbourhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"Adelaide, King, Richmond",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.02,0.0,0.0,...,0.0,0.0,0.0,0.02,0.0,0.0,0.01,0.0,0.01,0.0
Berczy Park,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.018182,0.0,0.0,0.0,0.0,0.0,0.0
"Brockton, Exhibition Place, Parkdale Village",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Business Reply Mail Processing Centre 969 Eastern,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.058824
"CN Tower, Bathurst Quay, Island airport, Harbourfront West, King and Spadina, Railway Lands, South Niagara",0.0,0.055556,0.055556,0.055556,0.111111,0.166667,0.111111,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [30]:
df_clust_neigh = pd.DataFrame()
df_clust_neigh['Neighbourhood'] = df_clust_ohe_mn.index
df_clust_neigh.head()

Unnamed: 0,Neighbourhood
0,"Adelaide, King, Richmond"
1,Berczy Park
2,"Brockton, Exhibition Place, Parkdale Village"
3,Business Reply Mail Processing Centre 969 Eastern
4,"CN Tower, Bathurst Quay, Island airport, Harbo..."


### Clustering using KMeans
#### Lets try with various k values and see how the clusturing is doing

In [31]:
from sklearn.cluster import KMeans
k=8

KMC = KMeans(n_clusters=k, random_state=1).fit(df_clust_ohe_mn)


In [32]:
KMC.labels_

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 5, 1, 1, 1, 1, 2, 1, 1, 1, 1,
       7, 1, 3, 1, 1, 1, 4, 6, 1, 1, 1, 1, 1, 1, 0, 1, 1], dtype=int32)

#### after various k values, k=7 seem to be optimal so lets stick to that

##### We have got the clustered labels. lets plug it with the neighbourhoods

In [33]:
df_clust_neigh['cluster']=KMC.labels_
df_clust_neigh.set_index('Neighbourhood', drop=True, inplace=True)
df_clust_neigh.head()

Unnamed: 0_level_0,cluster
Neighbourhood,Unnamed: 1_level_1
"Adelaide, King, Richmond",1
Berczy Park,1
"Brockton, Exhibition Place, Parkdale Village",1
Business Reply Mail Processing Centre 969 Eastern,1
"CN Tower, Bathurst Quay, Island airport, Harbourfront West, King and Spadina, Railway Lands, South Niagara",1


In [34]:
df_td = td
df_td.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
4,M5A,Downtown Toronto,Harbourfront,43.65426,-79.360636
7,M7A,Downtown Toronto,Queen's Park,43.662301,-79.389494
16,M5B,Downtown Toronto,"Ryerson, Garden District",43.657162,-79.378937
17,M5B,Downtown Toronto,"Ryerson, Garden District",43.657162,-79.378937
33,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418


#### merge the clustered values with the initial data source

In [35]:
df_venue_clustered = td.join(df_clust_neigh['cluster'], on='Neighbourhood')

In [36]:
df_venue_clustered.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude,cluster
4,M5A,Downtown Toronto,Harbourfront,43.65426,-79.360636,1
7,M7A,Downtown Toronto,Queen's Park,43.662301,-79.389494,1
16,M5B,Downtown Toronto,"Ryerson, Garden District",43.657162,-79.378937,1
17,M5B,Downtown Toronto,"Ryerson, Garden District",43.657162,-79.378937,1
33,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418,1


In [37]:
df_venue_clustered.shape

(74, 6)

### Lets look at what are the top categories are by neighbourhoods

In [40]:
df_clust_ohe_mn.head()

Unnamed: 0_level_0,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,Aquarium,...,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wine Shop,Women's Store,Yoga Studio
neighbourhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"Adelaide, King, Richmond",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.02,0.0,0.0,...,0.0,0.0,0.0,0.02,0.0,0.0,0.01,0.0,0.01,0.0
Berczy Park,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.018182,0.0,0.0,0.0,0.0,0.0,0.0
"Brockton, Exhibition Place, Parkdale Village",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Business Reply Mail Processing Centre 969 Eastern,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.058824
"CN Tower, Bathurst Quay, Island airport, Harbourfront West, King and Spadina, Railway Lands, South Niagara",0.0,0.055556,0.055556,0.055556,0.111111,0.166667,0.111111,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [41]:
df_venue.head()


Unnamed: 0,neighbourhood,postcode,borough,lat,lng,venue,vlat,vlng,vcat
0,Harbourfront,M5A,Downtown Toronto,43.65426,-79.360636,Roselle Desserts,43.653447,-79.362017,Bakery
1,Harbourfront,M5A,Downtown Toronto,43.65426,-79.360636,Tandem Coffee,43.653559,-79.361809,Coffee Shop
2,Harbourfront,M5A,Downtown Toronto,43.65426,-79.360636,Cooper Koo Family YMCA,43.653191,-79.357947,Gym / Fitness Center
3,Harbourfront,M5A,Downtown Toronto,43.65426,-79.360636,Body Blitz Spa East,43.654735,-79.359874,Spa
4,Harbourfront,M5A,Downtown Toronto,43.65426,-79.360636,Impact Kitchen,43.656369,-79.35698,Restaurant


In [121]:
ranklimit=1
grouped_full = pd.DataFrame()
#for index, row in df_venue[['neighbourhood','vcat','venue']].iterrows():
for i in df_venue.neighbourhood.unique():
    df_temp = df_venue[['neighbourhood','vcat','venue']][df_venue['neighbourhood']==i]
    grouped = df_temp.groupby(['neighbourhood','vcat']).count()
    grouped['rank'] = grouped['venue'].rank(method='first', ascending=False)
    grouped = grouped[grouped['rank']<=ranklimit].sort_values(by='rank', ascending=True)
    grouped_full = grouped_full.append(grouped)

grouped_full.rename(columns={'venue':'count'}, inplace=True)
grouped_full.reset_index(['neighbourhood','vcat'], inplace=True)
grouped_full

Unnamed: 0,neighbourhood,vcat,count,rank
0,Harbourfront,Coffee Shop,9,1.0
1,Queen's Park,Coffee Shop,9,1.0
2,"Ryerson, Garden District",Coffee Shop,16,1.0
3,St. James Town,Coffee Shop,8,1.0
4,The Beaches,Health Food Store,1,1.0
5,Berczy Park,Coffee Shop,4,1.0
6,Central Bay Street,Coffee Shop,12,1.0
7,Christie,Grocery Store,4,1.0
8,"Adelaide, King, Richmond",Coffee Shop,21,1.0
9,"Dovercourt Village, Dufferin",Bakery,4,1.0


In [122]:
grouped_full.vcat.unique()

array(['Coffee Shop', 'Health Food Store', 'Grocery Store', 'Bakery',
       'Bar', 'Greek Restaurant', 'Breakfast Spot', 'Park', 'Café',
       'Bus Line', 'Garden', 'Jewelry Store', 'Clothing Store',
       'Gift Shop', 'Dessert Shop', 'Gym', 'Airport Service',
       'Light Rail Station'], dtype=object)

### looks like Coffee Shops and Cafes are the top places around most of the nieghbourhoods

### Lets put the neighbourhoods on the map with clusturing

In [124]:
# starting lat and lng
latitude = 43.651070 
longitude = -79.347015 
print('Toranto lat and lng : {}, {}.'.format(latitude, longitude))

Toranto lat and lng : 43.65107, -79.347015.


In [125]:
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=12)

# set color scheme for the clusters
x = np.arange(k)
ys = [i + x + (i*x)**2 for i in range(k)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
palette = [colors.rgb2hex(i) for i in colors_array]

for index, row in df_venue_clustered.iterrows():
    lat = row['Latitude']
    lng = row['Longitude']
    cluster = row['cluster']
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        color=palette[cluster],
        fill=False,
        fill_color=palette[cluster],
        fill_opacity=0.7).add_to(map_clusters)

map_clusters