<center>

## Segmenting and Clustering Neighbourhoods in Toronto
___

##### Install and Import Libraries

In [1]:
import pandas as pd

import matplotlib as mtp

import numpy as np

import requests 

from pandas.io.json import json_normalize

#!conda install -c conda-forge geopy --yes
from geopy.geocoders import Nominatim
#!conda install -c conda-forge folium=0.5.0 --yes
import folium

import matplotlib.cm as cm
import matplotlib.colors as colors

#!conda install -c conda-forge folium=0.5.0 --yes
import folium

from sklearn.cluster import KMeans

print('Libraries imported.')

Libraries imported.


##### Import Data from Wikipedia :

In [2]:
data = pd.read_html('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M', header = 0)

df = pd.DataFrame(data[0])
df.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


In [3]:
df.shape

(180, 3)

##### Cleaning the table from 'Not assigned' values

In [4]:
df= df[df.Borough != 'Not assigned']
df.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [5]:
df.shape

(103, 3)

##### Group neighbourdhoods and boroughs with identical postal codes

In [6]:
df = df.groupby(['Postal Code', 'Borough', 'Neighbourhood']).agg({'Postal Code':lambda x: ', '.join(tuple(x.tolist())),

                                     'Neighbourhood':lambda x: ', '.join(tuple(x.tolist()))}
                                   )
df.head(5)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Postal Code,Neighbourhood
Postal Code,Borough,Neighbourhood,Unnamed: 3_level_1,Unnamed: 4_level_1
M1B,Scarborough,"Malvern, Rouge",M1B,"Malvern, Rouge"
M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",M1C,"Rouge Hill, Port Union, Highland Creek"
M1E,Scarborough,"Guildwood, Morningside, West Hill",M1E,"Guildwood, Morningside, West Hill"
M1G,Scarborough,Woburn,M1G,Woburn
M1H,Scarborough,Cedarbrae,M1H,Cedarbrae


##### Merging neighbourdhoods and boroughs with identical postal codes

In [7]:
df = df.groupby('Borough').agg({'Postal Code':'first', 
                             'Neighbourhood': ', '.join 
                              }).reset_index()
df.head(20)

Unnamed: 0,Borough,Postal Code,Neighbourhood
0,Central Toronto,M4N,"Lawrence Park, Davisville North, North Toronto..."
1,Downtown Toronto,M4W,"Rosedale, St. James Town, Cabbagetown, Church ..."
2,East Toronto,M4E,"The Beaches, The Danforth West, Riverdale, Ind..."
3,East York,M4B,"Parkview Hill, Woodbine Gardens, Woodbine Heig..."
4,Etobicoke,M8V,"New Toronto, Mimico South, Humber Bay Shores, ..."
5,Mississauga,M7R,Canada Post Gateway Processing Centre
6,North York,M2H,"Hillcrest Village, Fairview, Henry Farm, Oriol..."
7,Scarborough,M1B,"Malvern, Rouge, Rouge Hill, Port Union, Highla..."
8,West Toronto,M6H,"Dufferin, Dovercourt Village, Little Portugal,..."
9,York,M6C,"Humewood-Cedarvale, Caledonia-Fairbanks, Del R..."


##### Change order label list

In [8]:
columnsList=["Postal Code","Borough", "Neighbourhood"]
df=df.reindex(columns=columnsList)
df.head(20)

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M4N,Central Toronto,"Lawrence Park, Davisville North, North Toronto..."
1,M4W,Downtown Toronto,"Rosedale, St. James Town, Cabbagetown, Church ..."
2,M4E,East Toronto,"The Beaches, The Danforth West, Riverdale, Ind..."
3,M4B,East York,"Parkview Hill, Woodbine Gardens, Woodbine Heig..."
4,M8V,Etobicoke,"New Toronto, Mimico South, Humber Bay Shores, ..."
5,M7R,Mississauga,Canada Post Gateway Processing Centre
6,M2H,North York,"Hillcrest Village, Fairview, Henry Farm, Oriol..."
7,M1B,Scarborough,"Malvern, Rouge, Rouge Hill, Port Union, Highla..."
8,M6H,West Toronto,"Dufferin, Dovercourt Village, Little Portugal,..."
9,M6C,York,"Humewood-Cedarvale, Caledonia-Fairbanks, Del R..."


###### Getting geodata for Toronto

In [9]:
df_geo_coordinate = pd.read_csv('Geospatial_Coordinates.csv')
df_geo_coordinate.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [10]:
df_geo_coordinate.shape

(103, 3)

In [11]:
df_geo_coordinate.rename(columns={'Postal Code':'PostalCode'},inplace=True)
df_geo_coordinate.head()

Unnamed: 0,PostalCode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [12]:
df.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M4N,Central Toronto,"Lawrence Park, Davisville North, North Toronto..."
1,M4W,Downtown Toronto,"Rosedale, St. James Town, Cabbagetown, Church ..."
2,M4E,East Toronto,"The Beaches, The Danforth West, Riverdale, Ind..."
3,M4B,East York,"Parkview Hill, Woodbine Gardens, Woodbine Heig..."
4,M8V,Etobicoke,"New Toronto, Mimico South, Humber Bay Shores, ..."


In [13]:
df_merged = df.join(df_geo_coordinate.set_index('PostalCode'), on='Postal Code')
df_merged.columns = ['PostalCode', 'Borough', 'Neighborhood', 'Latitude', 'Longitude']

df_merged

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M4N,Central Toronto,"Lawrence Park, Davisville North, North Toronto...",43.72802,-79.38879
1,M4W,Downtown Toronto,"Rosedale, St. James Town, Cabbagetown, Church ...",43.679563,-79.377529
2,M4E,East Toronto,"The Beaches, The Danforth West, Riverdale, Ind...",43.676357,-79.293031
3,M4B,East York,"Parkview Hill, Woodbine Gardens, Woodbine Heig...",43.706397,-79.309937
4,M8V,Etobicoke,"New Toronto, Mimico South, Humber Bay Shores, ...",43.605647,-79.501321
5,M7R,Mississauga,Canada Post Gateway Processing Centre,43.636966,-79.615819
6,M2H,North York,"Hillcrest Village, Fairview, Henry Farm, Oriol...",43.803762,-79.363452
7,M1B,Scarborough,"Malvern, Rouge, Rouge Hill, Port Union, Highla...",43.806686,-79.194353
8,M6H,West Toronto,"Dufferin, Dovercourt Village, Little Portugal,...",43.669005,-79.442259
9,M6C,York,"Humewood-Cedarvale, Caledonia-Fairbanks, Del R...",43.693781,-79.428191


##### Clustering and segmentation

In [14]:
address = 'Toronto, Ontario'
geolocator = Nominatim(user_agent="toronto_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.6534817, -79.3839347.


In [15]:
# create map of New York using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, Neighborhood in zip(df_merged['Latitude'], df_merged['Longitude'], df_merged['Borough'], df_merged['Neighborhood']):
    label = '{}, {}'.format(Neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

##### Using a neighborhood to explore Foursquare API

In [33]:
neighborhood_latitude = df_merged.loc[8, 'Latitude'] # neighborhood latitude value
neighborhood_longitude = df_merged.loc[8, 'Longitude'] # neighborhood longitude value

neighborhood_name = df_merged.loc[8, 'Neighborhood'] # neighborhood name

borough_name = df_merged.loc[8, 'Borough'] # Borough name

print('Latitude and longitude values of {} at {} are {}, {}.'.format(neighborhood_name,
                                                               borough_name,
                                                               neighborhood_latitude, 
                                                               neighborhood_longitude))

Latitude and longitude values of Dufferin, Dovercourt Village, Little Portugal, Trinity, Brockton, Parkdale Village, Exhibition Place, High Park, The Junction South, Parkdale, Roncesvalles, Runnymede, Swansea at West Toronto are 43.66900510000001, -79.4422593.


##### Define credentials

In [34]:
CLIENT_ID = 'JSPX5JW3JS0IMMPZFV4RC3OKYMNMNMGUJUETGTDCVD2XEMA2' # your Foursquare ID
CLIENT_SECRET = 'INY1M5ETAYK2FYYBFLCRVYV0M2XLBKMAR2S3C4WGDUFLIJ2H' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version
LIMIT = 100 # A default Foursquare API limit value

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: JSPX5JW3JS0IMMPZFV4RC3OKYMNMNMGUJUETGTDCVD2XEMA2
CLIENT_SECRET:INY1M5ETAYK2FYYBFLCRVYV0M2XLBKMAR2S3C4WGDUFLIJ2H


In [35]:
LIMIT = 100
radius = 500
url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID,
    CLIENT_SECRET, 
    VERSION, 
    neighborhood_latitude, 
    neighborhood_longitude, 
    radius, 
    LIMIT)
results = requests.get(url).json()
results

{'meta': {'code': 200, 'requestId': '60191eff7b18381149a6da26'},
 'response': {'headerLocation': 'Davenport',
  'headerFullLocation': 'Davenport, Toronto',
  'headerLocationGranularity': 'neighborhood',
  'totalResults': 17,
  'suggestedBounds': {'ne': {'lat': 43.67350510450001,
    'lng': -79.43604977526607},
   'sw': {'lat': 43.664505095500004, 'lng': -79.44846882473394}},
  'groups': [{'type': 'Recommended Places',
    'name': 'recommended',
    'items': [{'reasons': {'count': 0,
       'items': [{'summary': 'This spot is popular',
         'type': 'general',
         'reasonName': 'globalInteractionReason'}]},
      'venue': {'id': '5753753b498eeb535c53aed5',
       'name': 'The Greater Good Bar',
       'location': {'address': '229 Geary St',
        'crossStreet': 'at Dufferin St',
        'lat': 43.669409,
        'lng': -79.439267,
        'labeledLatLngs': [{'label': 'display',
          'lat': 43.669409,
          'lng': -79.439267}],
        'distance': 245,
        'postalC

##### Order Categories as NY ex

In [36]:
## get category as New York excerise
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

In [37]:
venues = results['response']['groups'][0]['items']
    
nearby_venues = json_normalize(venues) # flatten JSON

# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues =nearby_venues.loc[:, filtered_columns]

# filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues.head()

  nearby_venues = json_normalize(venues) # flatten JSON


Unnamed: 0,name,categories,lat,lng
0,The Greater Good Bar,Bar,43.669409,-79.439267
1,Parallel,Middle Eastern Restaurant,43.669516,-79.438728
2,Blood Brothers Brewing,Brewery,43.669944,-79.436533
3,FreshCo,Grocery Store,43.667918,-79.440754
4,Happy Bakery & Pastries,Bakery,43.66705,-79.441791


In [38]:
print('{} venues were returned by Foursquare.'.format(nearby_venues.shape[0]))

17 venues were returned by Foursquare.


In [39]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)


##### Check Now Boroughs and create a map

In [43]:
list_boroughs = df_merged['Borough'].unique()
list_boroughs

array(['Central Toronto', 'Downtown Toronto', 'East Toronto', 'East York',
       'Etobicoke', 'Mississauga', 'North York', 'Scarborough',
       'West Toronto', 'York'], dtype=object)

In [44]:
def borough_loc(list_of_places):
    for place in list_of_places:
        address = (place + ", Ontario, Canada")
        geolocator = Nominatim(user_agent="TO_explorer")
        location = geolocator.geocode(address)
        latitude = location.latitude
        longitude = location.longitude
        print('{''}, {}, {},'.format(place,latitude,longitude))

borough_loc(list_boroughs)

Central Toronto, 43.6534817, -79.3839347,
Downtown Toronto, 43.6563221, -79.3809161,
East Toronto, 43.626243, -79.396962,
East York, 43.699971000000005, -79.33251996261595,
Etobicoke, 43.6435559, -79.5656326,
Mississauga, 43.5896231, -79.6443879,
North York, 43.7543263, -79.44911696639593,
Scarborough, 43.7729744, -79.2576479,
West Toronto, 43.6534817, -79.3839347,
York, 43.6896191, -79.479188,


In [53]:
import numpy as np

boroughs = ['Scarborough', 43.773077, -79.257774,
'North York', 43.7708175, -79.4132998,
'East York', 43.6913391, -79.3278212,
'East Toronto', 43.653963, -79.387207,
'Central Toronto', 43.653963, -79.387207,
'Downtown Toronto', 43.655115, -79.380219,
'York', 44.0007518, -79.4372217,
'West Toronto', 43.653963, -79.387207,
"Queen's Park", 43.6599803, -79.3903686,
'Mississauga', 43.590338, -79.645729,
'Etobicoke', 43.6435559, -79.5656326]

boroughs_df = pd.DataFrame(np.array(boroughs).reshape(11,3), columns = ["Borough","Latitude","Longitude"])

boroughs_df

Unnamed: 0,Borough,Latitude,Longitude
0,Scarborough,43.773077,-79.257774
1,North York,43.7708175,-79.4132998
2,East York,43.6913391,-79.3278212
3,East Toronto,43.653963,-79.387207
4,Central Toronto,43.653963,-79.387207
5,Downtown Toronto,43.655115,-79.380219
6,York,44.0007518,-79.4372217
7,West Toronto,43.653963,-79.387207
8,Queen's Park,43.6599803,-79.3903686
9,Mississauga,43.590338,-79.645729


In [55]:
boroughs_df.dtypes

boroughs_df["Latitude"] = pd.to_numeric(boroughs_df["Latitude"])
boroughs_df["Longitude"] = pd.to_numeric(boroughs_df["Longitude"])

In [56]:
# create a map of Toronto
map_toronto_boroughs = folium.Map(location = [43.653963, -79.387207], zoom_start = 10)

#add neighborhood markers to the Toronto map
for lat, long, bor in zip(boroughs_df['Latitude'], boroughs_df['Longitude'], 
                                 boroughs_df['Borough']):
    label = '{}'.format(bor)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, long],
        radius = 7, 
        popup = label,
        color = 'red',
        fill = True,
        fill_color = 'white',
        fill_opacity = 0.7,
        parse_html = False).add_to(map_toronto_boroughs)
        
map_toronto_boroughs

##### Venues into DataFrame

In [66]:
   def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?client_id={}&client_secret={}&ll={},{}&v={}&radius={}&limit={}'.format(
                CLIENT_ID, 
                CLIENT_SECRET, 
                lat, 
                lng, 
                VERSION, 
                radius, 
                LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [75]:
DT_venues = getNearbyVenues(names=df_merged['Neighborhood'],
                                   latitudes=df_merged['Latitude'],
                                   longitudes=df_merged['Longitude']
                                  )

Lawrence Park, Davisville North, North Toronto West, Lawrence Park, Davisville, Moore Park, Summerhill East, Summerhill West, Rathnelly, South Hill, Forest Hill SE, Deer Park, Roselawn, Forest Hill North & West, Forest Hill Road Park, The Annex, North Midtown, Yorkville
Rosedale, St. James Town, Cabbagetown, Church and Wellesley, Regent Park, Harbourfront, Garden District, Ryerson, St. James Town, Berczy Park, Central Bay Street, Richmond, Adelaide, King, Harbourfront East, Union Station, Toronto Islands, Toronto Dominion Centre, Design Exchange, Commerce Court, Victoria Hotel, University of Toronto, Harbord, Kensington Market, Chinatown, Grange Park, CN Tower, King and Spadina, Railway Lands, Harbourfront West, Bathurst Quay, South Niagara, Island airport, Stn A PO Boxes, First Canadian Place, Underground city, Christie, Queen's Park, Ontario Provincial Government
The Beaches, The Danforth West, Riverdale, India Bazaar, The Beaches West, Studio District, Business reply mail Processing

(80, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,"Lawrence Park, Davisville North, North Toronto...",43.72802,-79.38879,Lawrence Park Ravine,43.726963,-79.394382,Park
1,"Lawrence Park, Davisville North, North Toronto...",43.72802,-79.38879,Zodiac Swim School,43.728532,-79.38286,Swim School
2,"Lawrence Park, Davisville North, North Toronto...",43.72802,-79.38879,TTC Bus #162 - Lawrence-Donway,43.728026,-79.382805,Bus Line
3,"Rosedale, St. James Town, Cabbagetown, Church ...",43.679563,-79.377529,Rosedale Park,43.682328,-79.378934,Playground
4,"Rosedale, St. James Town, Cabbagetown, Church ...",43.679563,-79.377529,Whitney Park,43.682036,-79.373788,Park


##### Here we have the venues for each group of neighborhood

In [72]:
DT_venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Canada Post Gateway Processing Centre,14,14,14,14,14,14
"Dufferin, Dovercourt Village, Little Portugal, Trinity, Brockton, Parkdale Village, Exhibition Place, High Park, The Junction South, Parkdale, Roncesvalles, Runnymede, Swansea",17,17,17,17,17,17
"Hillcrest Village, Fairview, Henry Farm, Oriole, Bayview Village, York Mills, Silver Hills, Willowdale, Newtonbrook, Willowdale, Willowdale East, York Mills West, Willowdale, Willowdale West, Parkwoods, Don Mills, Don Mills, Bathurst Manor, Wilson Heights, Downsview North, Northwood Park, York University, Downsview, Downsview, Downsview, Downsview, Victoria Village, Bedford Park, Lawrence Manor East, Lawrence Manor, Lawrence Heights, Glencairn, North Park, Maple Leaf Park, Upwood Park, Humber Summit, Humberlea, Emery",5,5,5,5,5,5
"Humewood-Cedarvale, Caledonia-Fairbanks, Del Ray, Mount Dennis, Keelsdale and Silverthorn, Runnymede, The Junction North, Weston",4,4,4,4,4,4
"Lawrence Park, Davisville North, North Toronto West, Lawrence Park, Davisville, Moore Park, Summerhill East, Summerhill West, Rathnelly, South Hill, Forest Hill SE, Deer Park, Roselawn, Forest Hill North & West, Forest Hill Road Park, The Annex, North Midtown, Yorkville",3,3,3,3,3,3
"Malvern, Rouge, Rouge Hill, Port Union, Highland Creek, Guildwood, Morningside, West Hill, Woburn, Cedarbrae, Scarborough Village, Kennedy Park, Ionview, East Birchmount Park, Golden Mile, Clairlea, Oakridge, Cliffside, Cliffcrest, Scarborough Village West, Birch Cliff, Cliffside West, Dorset Park, Wexford Heights, Scarborough Town Centre, Wexford, Maryvale, Agincourt, Clarks Corners, Tam O'Shanter, Sullivan, Milliken, Agincourt North, Steeles East, L'Amoreaux East, Steeles West, L'Amoreaux West, Upper Rouge",2,2,2,2,2,2
"New Toronto, Mimico South, Humber Bay Shores, Alderwood, Long Branch, The Kingsway, Montgomery Road, Old Mill North, Old Mill South, King's Mill Park, Sunnylea, Humber Bay, Mimico NE, The Queensway East, Royal York South East, Kingsway Park South East, Mimico NW, The Queensway West, South of Bloor, Kingsway Park South West, Royal York South West, Islington Avenue, Humber Valley Village, West Deane Park, Princess Gardens, Martin Grove, Islington, Cloverdale, Eringate, Bloordale Gardens, Old Burnhamthorpe, Markland Wood, Westmount, Kingsview Village, St. Phillips, Martin Grove Gardens, Richview Gardens, South Steeles, Silverstone, Humbergate, Jamestown, Mount Olive, Beaumond Heights, Thistletown, Albion Gardens, Northwest, West Humber - Clairville",16,16,16,16,16,16
"Parkview Hill, Woodbine Gardens, Woodbine Heights, Leaside, Thorncliffe Park, East Toronto, Broadview North (Old East York)",10,10,10,10,10,10
"Rosedale, St. James Town, Cabbagetown, Church and Wellesley, Regent Park, Harbourfront, Garden District, Ryerson, St. James Town, Berczy Park, Central Bay Street, Richmond, Adelaide, King, Harbourfront East, Union Station, Toronto Islands, Toronto Dominion Centre, Design Exchange, Commerce Court, Victoria Hotel, University of Toronto, Harbord, Kensington Market, Chinatown, Grange Park, CN Tower, King and Spadina, Railway Lands, Harbourfront West, Bathurst Quay, South Niagara, Island airport, Stn A PO Boxes, First Canadian Place, Underground city, Christie, Queen's Park, Ontario Provincial Government",4,4,4,4,4,4
"The Beaches, The Danforth West, Riverdale, India Bazaar, The Beaches West, Studio District, Business reply mail Processing Centre, South Central Letter Processing Plant Toronto",5,5,5,5,5,5


In [76]:
print('There are {} uniques categories.'.format(len(DT_venues['Venue Category'].unique())))

There are 47 uniques categories.


##### Lets analyze each neighborhood venue

In [77]:
# one hot encoding
DT_onehot = pd.get_dummies(DT_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
DT_onehot['Neighborhood'] = DT_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [DT_onehot.columns[-1]] + list(DT_onehot.columns[:-1])
DT_onehot = DT_onehot[fixed_columns]

DT_onehot.head()

Unnamed: 0,Trail,American Restaurant,Athletics & Sports,Bakery,Bank,Bar,Breakfast Spot,Brewery,Burrito Place,Bus Line,...,Pizza Place,Playground,Pool,Print Shop,Pub,Restaurant,Sandwich Place,Seafood Restaurant,Supermarket,Swim School
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [78]:
DT_onehot.shape

(80, 47)

##### Grouping up rows by neighborhood and by taking means of the frequency of occurrence of each category

In [79]:
DT_grouped = DT_onehot.groupby('Neighborhood').mean().reset_index()
DT_grouped

Unnamed: 0,Neighborhood,Trail,American Restaurant,Athletics & Sports,Bakery,Bank,Bar,Breakfast Spot,Brewery,Burrito Place,...,Pizza Place,Playground,Pool,Print Shop,Pub,Restaurant,Sandwich Place,Seafood Restaurant,Supermarket,Swim School
0,Canada Post Gateway Processing Centre,0.0,0.071429,0.0,0.0,0.0,0.0,0.0,0.0,0.071429,...,0.0,0.0,0.0,0.0,0.0,0.0,0.071429,0.0,0.0,0.0
1,"Dufferin, Dovercourt Village, Little Portugal,...",0.0,0.0,0.058824,0.117647,0.058824,0.058824,0.0,0.058824,0.0,...,0.0,0.0,0.058824,0.0,0.0,0.0,0.0,0.0,0.058824,0.0
2,"Hillcrest Village, Fairview, Henry Farm, Oriol...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,"Humewood-Cedarvale, Caledonia-Fairbanks, Del R...",0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,"Lawrence Park, Davisville North, North Toronto...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.333333
5,"Malvern, Rouge, Rouge Hill, Port Union, Highla...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0
6,"New Toronto, Mimico South, Humber Bay Shores, ...",0.0,0.0625,0.0,0.0625,0.0,0.0,0.0,0.0,0.0,...,0.0625,0.0,0.0,0.0,0.0,0.0625,0.0,0.0625,0.0,0.0
7,"Parkview Hill, Woodbine Gardens, Woodbine Heig...",0.0,0.0,0.1,0.0,0.1,0.0,0.1,0.0,0.0,...,0.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,"Rosedale, St. James Town, Cabbagetown, Church ...",0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,"The Beaches, The Danforth West, Riverdale, Ind...",0.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.2,0.0,0.0,0.0,0.0,0.0


##### Now lets print each neighborhood with their three top venues

In [80]:
num_top_venues = 3

for hood in DT_grouped['Neighborhood']:
    print("----"+hood+"----")
    temp = DT_grouped[DT_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Canada Post Gateway Processing Centre----
          venue  freq
0   Coffee Shop  0.21
1         Hotel  0.14
2  Intersection  0.07


----Dufferin, Dovercourt Village, Little Portugal, Trinity, Brockton, Parkdale Village, Exhibition Place, High Park, The Junction South, Parkdale, Roncesvalles, Runnymede, Swansea----
          venue  freq
0        Bakery  0.12
1      Pharmacy  0.12
2  Liquor Store  0.06


----Hillcrest Village, Fairview, Henry Farm, Oriole, Bayview Village, York Mills, Silver Hills, Willowdale, Newtonbrook, Willowdale, Willowdale East, York Mills West, Willowdale, Willowdale West, Parkwoods, Don Mills, Don Mills, Bathurst Manor, Wilson Heights, Downsview North, Northwood Park, York University, Downsview, Downsview, Downsview, Downsview, Victoria Village, Bedford Park, Lawrence Manor East, Lawrence Manor, Lawrence Heights, Glencairn, North Park, Maple Leaf Park, Upwood Park, Humber Summit, Humberlea, Emery----
                      venue  freq
0                   Dog R

##### Creating a dataframe with all we've seen

In [82]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [83]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = DT_grouped['Neighborhood']

for ind in np.arange(DT_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(DT_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Canada Post Gateway Processing Centre,Coffee Shop,Hotel,Gym,Burrito Place,Food Truck,Intersection,Mediterranean Restaurant,Middle Eastern Restaurant,Fried Chicken Joint,Sandwich Place
1,"Dufferin, Dovercourt Village, Little Portugal,...",Bakery,Pharmacy,Music Venue,Café,Liquor Store,Middle Eastern Restaurant,Supermarket,Park,Pet Store,Brewery
2,"Hillcrest Village, Fairview, Henry Farm, Oriol...",Golf Course,Pool,Fast Food Restaurant,Mediterranean Restaurant,Dog Run,Café,Gastropub,Fried Chicken Joint,Food Truck,Flea Market
3,"Humewood-Cedarvale, Caledonia-Fairbanks, Del R...",Trail,Hockey Arena,Field,Playground,Café,Gastropub,Fried Chicken Joint,Food Truck,Flea Market,Fast Food Restaurant
4,"Lawrence Park, Davisville North, North Toronto...",Swim School,Park,Bus Line,Gastropub,Fried Chicken Joint,Food Truck,Flea Market,Field,Fast Food Restaurant,Dog Run
5,"Malvern, Rouge, Rouge Hill, Port Union, Highla...",Print Shop,Fast Food Restaurant,Swim School,Business Service,Gastropub,Fried Chicken Joint,Food Truck,Flea Market,Field,Dog Run
6,"New Toronto, Mimico South, Humber Bay Shores, ...",Coffee Shop,Gym,Business Service,Café,Mexican Restaurant,Fried Chicken Joint,Pet Store,Pharmacy,Pizza Place,Fast Food Restaurant
7,"Parkview Hill, Woodbine Gardens, Woodbine Heig...",Pizza Place,Gastropub,Athletics & Sports,Gym / Fitness Center,Bank,Breakfast Spot,Flea Market,Pharmacy,Intersection,Coffee Shop
8,"Rosedale, St. James Town, Cabbagetown, Church ...",Park,Playground,Trail,Athletics & Sports,Café,Gastropub,Fried Chicken Joint,Food Truck,Flea Market,Field
9,"The Beaches, The Danforth West, Riverdale, Ind...",Trail,Health Food Store,Pub,Coffee Shop,Café,Gastropub,Fried Chicken Joint,Food Truck,Flea Market,Field


##### Clustering neighbourhoods in downtown Toronto

In [84]:
# run K-means to cluster the neighborhoods into 5 clusters

# set number of clusters
kclusters = 5

DT_grouped_clustering = DT_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(DT_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10]

array([1, 1, 1, 0, 4, 2, 1, 1, 3, 1])

In [88]:
# add labels
neighborhoods_venues_sorted.insert(0, 'Clusters', kmeans.labels_)

DT_merge = df_merged

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
DT_merge = DT_merge.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

DT_merge.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Clusters,Cluster,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M4N,Central Toronto,"Lawrence Park, Davisville North, North Toronto...",43.72802,-79.38879,4,4,Swim School,Park,Bus Line,Gastropub,Fried Chicken Joint,Food Truck,Flea Market,Field,Fast Food Restaurant,Dog Run
1,M4W,Downtown Toronto,"Rosedale, St. James Town, Cabbagetown, Church ...",43.679563,-79.377529,3,3,Park,Playground,Trail,Athletics & Sports,Café,Gastropub,Fried Chicken Joint,Food Truck,Flea Market,Field
2,M4E,East Toronto,"The Beaches, The Danforth West, Riverdale, Ind...",43.676357,-79.293031,1,1,Trail,Health Food Store,Pub,Coffee Shop,Café,Gastropub,Fried Chicken Joint,Food Truck,Flea Market,Field
3,M4B,East York,"Parkview Hill, Woodbine Gardens, Woodbine Heig...",43.706397,-79.309937,1,1,Pizza Place,Gastropub,Athletics & Sports,Gym / Fitness Center,Bank,Breakfast Spot,Flea Market,Pharmacy,Intersection,Coffee Shop
4,M8V,Etobicoke,"New Toronto, Mimico South, Humber Bay Shores, ...",43.605647,-79.501321,1,1,Coffee Shop,Gym,Business Service,Café,Mexican Restaurant,Fried Chicken Joint,Pet Store,Pharmacy,Pizza Place,Fast Food Restaurant


##### Checking the Clusters

# #1

In [89]:
DT_merge.loc[DT_merge['Cluster'] == 0, DT_merge.columns[[2] + list(range(5, DT_merge.shape[1]))]]

Unnamed: 0,Neighborhood,Clusters,Cluster,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
9,"Humewood-Cedarvale, Caledonia-Fairbanks, Del R...",0,0,Trail,Hockey Arena,Field,Playground,Café,Gastropub,Fried Chicken Joint,Food Truck,Flea Market,Fast Food Restaurant


# #2

In [90]:
DT_merge.loc[DT_merge['Cluster'] == 1, DT_merge.columns[[2] + list(range(5, DT_merge.shape[1]))]]

Unnamed: 0,Neighborhood,Clusters,Cluster,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
2,"The Beaches, The Danforth West, Riverdale, Ind...",1,1,Trail,Health Food Store,Pub,Coffee Shop,Café,Gastropub,Fried Chicken Joint,Food Truck,Flea Market,Field
3,"Parkview Hill, Woodbine Gardens, Woodbine Heig...",1,1,Pizza Place,Gastropub,Athletics & Sports,Gym / Fitness Center,Bank,Breakfast Spot,Flea Market,Pharmacy,Intersection,Coffee Shop
4,"New Toronto, Mimico South, Humber Bay Shores, ...",1,1,Coffee Shop,Gym,Business Service,Café,Mexican Restaurant,Fried Chicken Joint,Pet Store,Pharmacy,Pizza Place,Fast Food Restaurant
5,Canada Post Gateway Processing Centre,1,1,Coffee Shop,Hotel,Gym,Burrito Place,Food Truck,Intersection,Mediterranean Restaurant,Middle Eastern Restaurant,Fried Chicken Joint,Sandwich Place
6,"Hillcrest Village, Fairview, Henry Farm, Oriol...",1,1,Golf Course,Pool,Fast Food Restaurant,Mediterranean Restaurant,Dog Run,Café,Gastropub,Fried Chicken Joint,Food Truck,Flea Market
8,"Dufferin, Dovercourt Village, Little Portugal,...",1,1,Bakery,Pharmacy,Music Venue,Café,Liquor Store,Middle Eastern Restaurant,Supermarket,Park,Pet Store,Brewery


# #3

In [92]:
DT_merge.loc[DT_merge['Cluster'] == 2, DT_merge.columns[[2] + list(range(5, DT_merge.shape[1]))]]

Unnamed: 0,Neighborhood,Clusters,Cluster,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
7,"Malvern, Rouge, Rouge Hill, Port Union, Highla...",2,2,Print Shop,Fast Food Restaurant,Swim School,Business Service,Gastropub,Fried Chicken Joint,Food Truck,Flea Market,Field,Dog Run


# #4

In [93]:
DT_merge.loc[DT_merge['Cluster'] == 3, DT_merge.columns[[2] + list(range(5, DT_merge.shape[1]))]]

Unnamed: 0,Neighborhood,Clusters,Cluster,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
1,"Rosedale, St. James Town, Cabbagetown, Church ...",3,3,Park,Playground,Trail,Athletics & Sports,Café,Gastropub,Fried Chicken Joint,Food Truck,Flea Market,Field


# #5

In [94]:
DT_merge.loc[DT_merge['Cluster'] == 4, DT_merge.columns[[2] + list(range(5, DT_merge.shape[1]))]]

Unnamed: 0,Neighborhood,Clusters,Cluster,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,"Lawrence Park, Davisville North, North Toronto...",4,4,Swim School,Park,Bus Line,Gastropub,Fried Chicken Joint,Food Truck,Flea Market,Field,Fast Food Restaurant,Dog Run


____

##### That was all folks! :)