# Segmenting and Clustering Neighborhoods in Toronto

In [86]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
from opencage.geocoder import OpenCageGeocode

In [60]:
# df = pd.read_html('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M')[0]
df=pd.read_csv('mmp.csv')
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


Only process the cells that have an assigned borough. Ignore cells with a borough that is Not assigned.

In [40]:
df1=df[df.Borough!='Not assigned']
df1.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor


More than one neighborhood can exist in one postal code area. For example, in the table on the Wikipedia page, you will notice that M5A is listed twice and has two neighborhoods: Harbourfront and Regent Park. These two rows will be combined into one row with the neighborhoods separated with a comma.

In [99]:
df2=df1.groupby(['Postcode','Borough'])['Neighbourhood'].apply(','.join).reset_index()
df2.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


If a cell has a borough but a Not assigned neighborhood, then the neighborhood will be the same as the borough. 

In [100]:
df2.Neighbourhood.loc[df2.Neighbourhood=='Not assigned']=df2.Borough
len(df2[df2.Neighbourhood=='Not assigned'])

0

In the last cell of your notebook, use the .shape method to print the number of rows of your dataframe.

In [101]:
df2.shape

(103, 3)

In [102]:
# because I'm based in China and the google api has connection problem. I switch to another geocoder api
key = '20b19d9ca52c4af8b4ee86a53b693aa6'
geocoder = OpenCageGeocode(key)
adict={}
import time
start = time.time()

for i in df2.Postcode:
    address = i + ', ' + 'Toronto, Ontario, Canada'
    location = geocoder.geocode(address)
    latlng = location[0]['geometry']
    adict[i]=latlng
    print(i,latlng,str(time.time()-start)+'s')
adict

KeyboardInterrupt: 

In [106]:
latlng=pd.DataFrame.from_dict(adict,orient='index').reset_index()
latlng.columns=['Postcode','latitude','longitude']




In [108]:
df3=pd.merge(df2,latlng,how='left', on=['Postcode'])
df3.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,latitude,longitude
0,M1B,Scarborough,"Rouge,Malvern",43.653963,-79.387207
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union",43.653963,-79.387207
2,M1E,Scarborough,"Guildwood,Morningside,West Hill",43.7678,-79.1866
3,M1G,Scarborough,Woburn,43.765717,-79.221898
4,M1H,Scarborough,Cedarbrae,43.7686,-79.2389


In [109]:
df3.to_csv('toronto_geo.csv')

In [116]:
import numpy as np
import folium

In [118]:
df_toronto=df3.dropna()
toronto_latitude = 43.6532; toronto_longitude = -79.3832
map_toronto = folium.Map(location = [toronto_latitude, toronto_longitude], zoom_start = 10.7)

# add markers to map
for lat, lng, borough, neighborhood in zip(df_toronto['latitude'], df_toronto['longitude'], df_toronto['Borough'], df_toronto['Neighbourhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7).add_to(map_toronto)  
    

map_toronto

In [121]:
CLIENT_ID = 'WGY32GYMOR10DO0A4VJZ3YDJML5H4PFBONGBH0SNCCNF2QR0'
CLIENT_SECRET= 'LIUOD1RYNQI5QA1AVDK302Z5ENCRIBZNAMY2GYHORMV1RWAB'
VERSION = '20191201'

In [120]:
df_toronto.Borough.value_counts()

North York          24
Downtown Toronto    19
Scarborough         17
Etobicoke           11
Central Toronto      9
West Toronto         6
York                 5
East Toronto         5
East York            5
Mississauga          1
Name: Borough, dtype: int64

In [183]:
ny_data = df_toronto[df_toronto['Borough'] == 'North York'].reset_index(drop=True)
ny_data.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,latitude,longitude
0,M2H,North York,Hillcrest Village,43.8015,-79.3577
1,M2J,North York,"Fairview,Henry Farm,Oriole",43.779772,-79.366185
2,M2K,North York,Bayview Village,43.7797,-79.3813
3,M2L,North York,"Silver Hills,York Mills",43.7547,-79.3764
4,M2M,North York,"Newtonbrook,Willowdale",43.785962,-79.416031


### Create a map of North York and its neighbourhoods

In [126]:
location = geocoder.geocode('North York, Toronto, Ontario, Canada')
address_ny = 'North York,Toronto'
latitude_ny = location[0]['geometry']['lat']
longitude_ny = location[0]['geometry']['lng']
print('The geograpical coordinate of North York are {}, {}.'.format(latitude_ny, longitude_ny))

The geograpical coordinate of North York are 43.7708175, -79.4132998.


In [228]:
map_ny = folium.Map(location=[latitude_ny, longitude_ny], zoom_start=12)

# add markers to map
for lat, lng, label in zip(ny_data['latitude'], ny_data['longitude'], ny_data['Neighbourhood']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='red',
        fill=True,
        fill_color='red',
        fill_opacity=0.5).add_to(map_ny)  
    
map_ny

### Get the top 100 venues in the neighborhood 'Newtonbrook,Willowdale', from North York

In [132]:
neighborhood_latitude = ny_data.loc[4, 'latitude'] # neighbourhood latitude value
neighborhood_longitude = ny_data.loc[4, 'longitude'] # neighbourhood longitude value
neighborhood_name = ny_data.loc[4, 'Neighbourhood'] # neighbourhood name

print('Latitude and longitude values of "{}" are {}, {}.'.format(neighborhood_name, 
                                                               neighborhood_latitude, 
                                                               neighborhood_longitude))

Latitude and longitude values of "Newtonbrook,Willowdale" are 43.7859621, -79.4160308.


In [136]:
import requests
LIMIT = 100
radius = 500
url = 'https://api.foursquare.com/v2/venues/explore?client_id={}&client_secret={}&ll={},{}&v={}&radius={}&limit={}'.format(CLIENT_ID, CLIENT_SECRET, latitude_ny, longitude_ny, VERSION, radius, LIMIT)

In [137]:
results = requests.get(url).json()
results

{'meta': {'code': 200, 'requestId': '5dfa3db9aba2972743c1e29c'},
 'response': {'suggestedFilters': {'header': 'Tap to show:',
   'filters': [{'name': 'Open now', 'key': 'openNow'}]},
  'headerLocation': 'Willowdale',
  'headerFullLocation': 'Willowdale, Toronto',
  'headerLocationGranularity': 'neighborhood',
  'totalResults': 51,
  'suggestedBounds': {'ne': {'lat': 43.7753175045, 'lng': -79.40707971454917},
   'sw': {'lat': 43.7663174955, 'lng': -79.41951988545084}},
  'groups': [{'type': 'Recommended Places',
    'name': 'recommended',
    'items': [{'reasons': {'count': 0,
       'items': [{'summary': 'This spot is popular',
         'type': 'general',
         'reasonName': 'globalInteractionReason'}]},
      'venue': {'id': '563d44fccd1044ad67a744fb',
       'name': "The Captain's Boil",
       'location': {'address': '5313 Yonge St',
        'lat': 43.773255217045026,
        'lng': -79.41380541792645,
        'labeledLatLngs': [{'label': 'display',
          'lat': 43.7732552170

In [146]:
import json 
from pandas.io.json import json_normalize

def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
#         print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    return(nearby_venues)

In [147]:
ny_venues = getNearbyVenues(names=ny_data['Neighbourhood'],
                                   latitudes=ny_data['latitude'],
                                   longitudes=ny_data['longitude']
                                  )
ny_venues.head()

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Hillcrest Village,43.8015,-79.3577,Woodbrooke Estate,43.802067,-79.354347,Residential Building (Apartment / Condo)
1,Hillcrest Village,43.8015,-79.3577,McNicoll Park,43.798994,-79.35284,Park
2,"Fairview,Henry Farm,Oriole",43.779772,-79.366185,Jerusalem Restaurant,43.778863,-79.364273,Middle Eastern Restaurant
3,"Fairview,Henry Farm,Oriole",43.779772,-79.366185,Bow Thai,43.778729,-79.364065,Thai Restaurant
4,"Fairview,Henry Farm,Oriole",43.779772,-79.366185,Shoppers Drug Mart,43.778896,-79.364603,Pharmacy


In [143]:
ny_venues.groupby('Neighborhood')['Venue'].count()

Neighborhood
Bathurst Manor,Downsview North,Wilson Heights    21
Bayview Village                                   4
Bedford Park,Lawrence Manor East                 25
CFB Toronto,Downsview East                        4
Don Mills North                                   4
Downsview Central                                 1
Downsview Northwest                              19
Downsview West                                    2
Downsview,North Park,Upwood Park                  1
Emery,Humberlea                                   6
Fairview,Henry Farm,Oriole                       12
Flemingdon Park,Don Mills South                  42
Glencairn                                         8
Hillcrest Village                                 2
Humber Summit                                     3
Lawrence Heights,Lawrence Manor                  73
Newtonbrook,Willowdale                           18
Northwood Park,York University                    9
Parkwoods                                        76

In [145]:
ny_venues['Venue Category'].unique()

array(['Residential Building (Apartment / Condo)', 'Park',
       'Middle Eastern Restaurant', 'Thai Restaurant', 'Pharmacy',
       'Coffee Shop', 'Pizza Place', 'Wine Shop', 'Convenience Store',
       'Trail', 'Dog Run', 'Flower Shop', 'Pool', 'Cafeteria',
       'Korean Restaurant', 'Café', 'Bank', 'Japanese Restaurant',
       'Sandwich Place', 'Indian Restaurant', 'Electronics Store',
       'Hot Dog Joint', 'Fast Food Restaurant', 'Train Station',
       'Ski Chalet', 'Bubble Tea Shop', 'Creperie', 'Burrito Place',
       'Ramen Restaurant', 'Food Stand', 'Sushi Restaurant', 'Juice Bar',
       'Wings Joint', 'Udon Restaurant', 'Dessert Shop', 'Playground',
       'Restaurant', 'Chinese Restaurant', 'Dumpling Restaurant',
       'Vietnamese Restaurant', 'Grocery Store', 'Gym', 'Salad Place',
       'Fried Chicken Joint', 'Cosmetics Shop', 'Noodle House',
       'Karaoke Bar', 'Neighborhood', 'Poke Place', 'Tea Room',
       'Ice Cream Shop', 'Hotel', 'Plaza', 'Gym / Fitness Cent

In [170]:
# one hot encoding
ny_onehot = pd.get_dummies(ny_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
ny_onehot['Neighborhood'] = ny_venues['Neighborhood'] 

# move neighborhood column to the first column
columns = ny_onehot.columns.values.tolist()
columns.sort(key = 'Neighborhood'.__eq__)
ny_onehot = ny_onehot[columns]

ny_onehot.head()

Unnamed: 0,Accessories Store,Airport,American Restaurant,Art Gallery,Arts & Crafts Store,Asian Restaurant,Bakery,Bank,Bar,Baseball Field,...,Train Station,Udon Restaurant,University,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Shop,Wings Joint,Women's Store,Neighborhood
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Hillcrest Village
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Hillcrest Village
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,"Fairview,Henry Farm,Oriole"
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,"Fairview,Henry Farm,Oriole"
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,"Fairview,Henry Farm,Oriole"


In [171]:
ny_grouped = ny_onehot.groupby('Neighborhood').mean().reset_index()
ny_grouped.head()

Unnamed: 0,Neighborhood,Accessories Store,Airport,American Restaurant,Art Gallery,Arts & Crafts Store,Asian Restaurant,Bakery,Bank,Bar,...,Trail,Train Station,Udon Restaurant,University,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Shop,Wings Joint,Women's Store
0,"Bathurst Manor,Downsview North,Wilson Heights",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.047619,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Bayview Village,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"Bedford Park,Lawrence Manor East",0.0,0.0,0.04,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,"CFB Toronto,Downsview East",0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Don Mills North,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# top venues

In [172]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [173]:
num_top_venues = 8

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = ny_grouped['Neighborhood']

for ind in np.arange(ny_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(ny_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue
0,"Bathurst Manor,Downsview North,Wilson Heights",Park,Coffee Shop,Middle Eastern Restaurant,Bridal Shop,Pharmacy,Diner,Community Center,Restaurant
1,Bayview Village,Trail,Dog Run,Flower Shop,Park,Fast Food Restaurant,Fried Chicken Joint,French Restaurant,Food Stand
2,"Bedford Park,Lawrence Manor East",Italian Restaurant,Coffee Shop,Sushi Restaurant,Pizza Place,Cosmetics Shop,Pub,Fast Food Restaurant,Comfort Food Restaurant
3,"CFB Toronto,Downsview East",Coffee Shop,Airport,Food Court,Shoe Store,Fast Food Restaurant,Frozen Yogurt Shop,Fried Chicken Joint,French Restaurant
4,Don Mills North,Park,Pool,Basketball Court,Furniture / Home Store,Women's Store,Fried Chicken Joint,French Restaurant,Food Stand
5,Downsview Central,Baseball Field,Women's Store,Gastropub,Furniture / Home Store,Frozen Yogurt Shop,Fried Chicken Joint,French Restaurant,Food Stand
6,Downsview Northwest,Grocery Store,Discount Store,Pizza Place,Shopping Mall,Liquor Store,Fried Chicken Joint,Gas Station,Sandwich Place
7,Downsview West,Pool,Child Care Service,Women's Store,Flower Shop,Frozen Yogurt Shop,Fried Chicken Joint,French Restaurant,Food Stand
8,"Downsview,North Park,Upwood Park",Bakery,Women's Store,Flower Shop,Furniture / Home Store,Frozen Yogurt Shop,Fried Chicken Joint,French Restaurant,Food Stand
9,"Emery,Humberlea",Coffee Shop,Latin American Restaurant,Café,Grocery Store,Nightclub,Discount Store,Donut Shop,Dumpling Restaurant


# Kmeans to cluster the neighborhood into 3

In [191]:
# import k-means from clustering stage
from sklearn.cluster import KMeans
# set number of clusters
kclusters = 4

ny_grouped_clustering = ny_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=1).fit(ny_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_
#len(kmeans.labels_)#=16

array([0, 0, 0, 0, 0, 3, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0], dtype=int32)

In [192]:
ny_merged = ny_data

# add clustering labels
ny_merged['Cluster Labels'] = kmeans.labels_

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
ny_merged = ny_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighbourhood')

ny_merged.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,latitude,longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue
0,M2H,North York,Hillcrest Village,43.8015,-79.3577,0,Residential Building (Apartment / Condo),Park,Women's Store,Fast Food Restaurant,Fried Chicken Joint,French Restaurant,Food Stand,Food Court
1,M2J,North York,"Fairview,Henry Farm,Oriole",43.779772,-79.366185,0,Park,Pharmacy,Wine Shop,Convenience Store,Middle Eastern Restaurant,Pizza Place,Coffee Shop,Thai Restaurant
2,M2K,North York,Bayview Village,43.7797,-79.3813,0,Trail,Dog Run,Flower Shop,Park,Fast Food Restaurant,Fried Chicken Joint,French Restaurant,Food Stand
3,M2L,North York,"Silver Hills,York Mills",43.7547,-79.3764,0,Pool,Cafeteria,Women's Store,Flower Shop,Frozen Yogurt Shop,Fried Chicken Joint,French Restaurant,Food Stand
4,M2M,North York,"Newtonbrook,Willowdale",43.785962,-79.416031,0,Korean Restaurant,Coffee Shop,Convenience Store,Café,Indian Restaurant,Hot Dog Joint,Fast Food Restaurant,Electronics Store


# Visualization

In [229]:
# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# create map
map_clusters = folium.Map(location = [latitude_ny, longitude_ny], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i+x+(i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
color = [colors.rgb2hex(i) for i in colors_array]
color=['red','green','']
# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(ny_merged['latitude'], ny_merged['longitude'], ny_merged['Neighbourhood'], ny_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=color[cluster-1],
        fill=True,
        fill_color=color[cluster-1],
        fill_opacity=1).add_to(map_clusters)
       
map_clusters