# Segmenting and Clustering Neighborhoods in Toronto

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import json, lxml
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values
import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe
# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors
# import k-means from clustering stage
from sklearn.cluster import KMeans
# import folium # map rendering library
from bs4 import BeautifulSoup
import warnings
warnings.filterwarnings('ignore')

In [2]:
try:
    import folium
except:
    !pip install folium
    import folium

## Part 1: Reading the data

In [3]:
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
source = requests.get(url).text
soup = BeautifulSoup(source)

table_data = soup.find('div', class_='mw-parser-output')
table = table_data.table.tbody

columns = ['PostalCode', 'Borough', 'Neighbourhood']
data = dict({key:[]*len(columns) for key in columns})

for row in table.find_all('tr'):
    for i,column in zip(row.find_all('td'),columns):
        i = i.text
        i = i.replace('\n', '')
        data[column].append(i)

df = pd.DataFrame.from_dict(data=data)[columns]
print(df.shape)
df.head()

(288, 3)


Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


* Data contains some rows where value are not assigned.
* Dropping the row where Borough is Not assigned __(only)__.

In [4]:
df = df[df['Borough'] != 'Not assigned'].reset_index(drop = True)
print('After dropping rows where borough is "Not assigned", Shape is: ',df.shape)
print('Number of rows where Neighbourhood is "Not assigned" but borough has value: ', 
      df[df['Neighbourhood'] == 'Not assigned'].shape[0])

After dropping rows where borough is "Not assigned", Shape is:  (211, 3)
Number of rows where Neighbourhood is "Not assigned" but borough has value:  1


* There is only one row where Neighbourhood is Not assigned but Borough is assigned.
* Where Borough is not 'Not assigned' (means there a borugh) and Neighbourhood is 'Not assigned', borough will be the neighbourhood.

In [5]:
p, b, n = [], [], []
for postcode, borough, neigh in zip(df['PostalCode'], df['Borough'], df['Neighbourhood']):
    p.append(postcode)
    b.append(borough)
    if neigh == 'Not assigned':
        n.append(borough)
    else:
        n.append(neigh)

df = pd.DataFrame({'PostalCode': p, 'Borough': b, 'Neighbourhood':n})[columns]
print(df.shape)
df.head()

(211, 3)


Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights


* 77 rows has been dropped because there was no assigned value.
* Before data has 288 rows, now data has 211 rows.

**Merging the rows** 
* when Postcode and Borough are same but Neighbourhoods are different, then neighbourds will be seperated by ','.
* So Postcode and Borough pair can appear only once in the data, so we'll make a dictionary with (Postcode, Borough) as keys and the values will be corresponding neighbourhood/s.
* Creating a dataframe using keys and values of dictionary

In [6]:
postcodes = df['PostalCode'].values
boroughs = df['Borough'].values
neighs = df['Neighbourhood'].values

#create a dictionary with keys as Postcode and Borough, keys of dictioaries are unique
dic = dict({(key1,key2): [] for key1, key2 in zip(postcodes, boroughs)})
print('Number of keys in the dictionary are: ', len(dic.keys()))

#filling the values of keys of dictionary
for postcode, borough, neigh in zip(postcodes,boroughs, neighs):
    key = (postcode, borough)
    dic[key].append(neigh)

df = pd.DataFrame(columns = ['Postal Code', 'Borough', 'Neighbourhood'])
for key, value in dic.items():
    postcode, borough, neig = key[0], key[1], value
    neig = ', '.join(neig)
    df = df.append({'Postal Code': postcode,
                     'Borough': borough,
                     'Neighbourhood': neig}, ignore_index = True)
print('Shape of final data is: ', df.shape)
df.head(10)

Number of keys in the dictionary are:  103
Shape of final data is:  (103, 3)


Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M4J,East York,East Toronto
1,M3B,North York,Don Mills North
2,M1V,Scarborough,"Agincourt North, L'Amoreaux East, Milliken, St..."
3,M5N,Central Toronto,Roselawn
4,M4P,Central Toronto,Davisville North
5,M2R,North York,Willowdale West
6,M4K,East Toronto,"The Danforth West, Riverdale"
7,M8Z,Etobicoke,"Kingsway Park South West, Mimico NW, The Queen..."
8,M4L,East Toronto,"The Beaches West, India Bazaar"
9,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park"


## Part 2: Getting the Latitude and Longitude of each Postal_code

In [None]:
# import geocoder # import geocoder
# lats, lons = [], []
# count = 0
# for postal_code in df['PostalCode'].values:
#     # initialize your variable to None
#     lat_lng_coords = None
#     # loop until you get the coordinates
#     while(lat_lng_coords is None):
#         g = geocoder.google('{}, Toronto, Ontario'.format(postal_code))
#         lat_lng_coords = g.latlng
#         lats.append(lat_lng_coords[0])
#         lons.append(lat_lng_coords[1])

In [7]:
# download the data of latitude and longitude: link provided by >>>>coursera
# !wget http://cocl.us/Geospatial_data
try:
    df['Latitude'] = lats
    df['Longitude'] = lons
except:
    latlon = pd.read_csv('Geospatial_data')
    df = pd.merge(df, latlon, how= 'inner', on = 'Postal Code')
    
print(df.shape)
df.head(10)

(103, 5)


Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M4J,East York,East Toronto,43.685347,-79.338106
1,M3B,North York,Don Mills North,43.745906,-79.352188
2,M1V,Scarborough,"Agincourt North, L'Amoreaux East, Milliken, St...",43.815252,-79.284577
3,M5N,Central Toronto,Roselawn,43.711695,-79.416936
4,M4P,Central Toronto,Davisville North,43.712751,-79.390197
5,M2R,North York,Willowdale West,43.782736,-79.442259
6,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188
7,M8Z,Etobicoke,"Kingsway Park South West, Mimico NW, The Queen...",43.628841,-79.520999
8,M4L,East Toronto,"The Beaches West, India Bazaar",43.668999,-79.315572
9,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park",43.727929,-79.262029


## Part 3: Exploring and Clustering 

* Create a map of Toronto with neighbourhood superimposed on it

In [8]:
address = 'Toronto, Ontario'
geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of New York City are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of New York City are 43.653963, -79.387207.


In [9]:
# create map of New York using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(df['Latitude'], df['Longitude'], df['Borough'], df['Neighbourhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

In [10]:
print('Toatl number of Borough = ', len(df['Borough'].unique()))

Toatl number of Borough =  11


* Let's explore one borough == Downtown Toronto, then we'll do the same for other boroughs

In [11]:
downtown_toronto = df[df['Borough'] == 'Downtown Toronto'].reset_index(drop=True)
print(downtown_toronto.shape)
downtown_toronto.head()

(18, 5)


Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M4W,Downtown Toronto,Rosedale,43.679563,-79.377529
1,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383
2,M5B,Downtown Toronto,"Ryerson, Garden District",43.657162,-79.378937
3,M5S,Downtown Toronto,"Harbord, University of Toronto",43.662696,-79.400049
4,M5H,Downtown Toronto,"Adelaide, King, Richmond",43.650571,-79.384568


* Let's visualize the neighbourhood of Downtown Toronto in map

In [12]:
address = 'Downtown Toronto ,Toronto, Ontario'
geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude

# create map of New York using latitude and longitude values
map_dwontown = folium.Map(location=[latitude, longitude], zoom_start= 11)

# add markers to map
for lat, lng, borough, neighborhood in zip(downtown_toronto['Latitude'], downtown_toronto['Longitude'], 
                                           downtown_toronto['Borough'], downtown_toronto['Neighbourhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_dwontown)  
    
map_dwontown

## Explore about the venues using Forsquare API

__Let's explore about venues first Neighbourhood of Downtown Toronto in our Dataframe__

In [13]:
#definig the latitude and longitude using above dataframe
lat = downtown_toronto.loc[0, 'Latitude'] # neighborhood latitude value
lon = downtown_toronto.loc[0, 'Longitude'] # neighborhood longitude value

neighborhood_name = downtown_toronto.loc[0, 'Neighbourhood'] # neighborhood name
print('Latitude and longitude values of {} are {}, {}.'.format(neighborhood_name, lat, lon))

CLIENT_ID = 'your_client_id' # your Foursquare ID
CLIENT_SECRET = 'your_client_secret' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

LIMIT = 100
radius =1000
url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, CLIENT_SECRET, VERSION, lat,lon, radius, LIMIT)

# gettig the venues data form Forsquare API in json format
results = requests.get(url).json()
results

Latitude and longitude values of Rosedale are 43.6795626, -79.37752940000001.


{'meta': {'code': 200, 'requestId': '5d1a67b3429bfc0025d2db05'},
 'response': {'groups': [{'items': [{'reasons': {'count': 0,
       'items': [{'reasonName': 'globalInteractionReason',
         'summary': 'This spot is popular',
         'type': 'general'}]},
      'referralId': 'e-0-4adcb343f964a520e32e21e3-0',
      'venue': {'categories': [{'icon': {'prefix': 'https://ss3.4sqi.net/img/categories_v2/shops/food_grocery_',
          'suffix': '.png'},
         'id': '4bf58dd8d48988d118951735',
         'name': 'Grocery Store',
         'pluralName': 'Grocery Stores',
         'primary': True,
         'shortName': 'Grocery Store'}],
       'id': '4adcb343f964a520e32e21e3',
       'location': {'address': '446 Summerhill Ave',
        'cc': 'CA',
        'city': 'Toronto',
        'country': 'Canada',
        'crossStreet': 'btwn. MacLennan Ave. and Glen Rd.',
        'distance': 764,
        'formattedAddress': ['446 Summerhill Ave (btwn. MacLennan Ave. and Glen Rd.)',
         'Toronto

* By lookig at the json output, we can see that *items* key and we get useful inforamtion using results['respnse']['groups'][0]['items']

In [14]:
venues = results['response']['groups'][0]['items']
venues_df = json_normalize(venues) # flatten JSON
venues_df.head(2)

Unnamed: 0,reasons.count,reasons.items,referralId,venue.categories,venue.id,venue.location.address,venue.location.cc,venue.location.city,venue.location.country,venue.location.crossStreet,...,venue.location.formattedAddress,venue.location.labeledLatLngs,venue.location.lat,venue.location.lng,venue.location.neighborhood,venue.location.postalCode,venue.location.state,venue.name,venue.photos.count,venue.photos.groups
0,0,"[{'reasonName': 'globalInteractionReason', 'su...",e-0-4adcb343f964a520e32e21e3-0,"[{'pluralName': 'Grocery Stores', 'id': '4bf58...",4adcb343f964a520e32e21e3,446 Summerhill Ave,CA,Toronto,Canada,btwn. MacLennan Ave. and Glen Rd.,...,[446 Summerhill Ave (btwn. MacLennan Ave. and ...,"[{'lat': 43.68626482142425, 'lng': -79.3754582...",43.686265,-79.375458,,M4W 2E4,ON,Summerhill Market,0,[]
1,0,"[{'reasonName': 'globalInteractionReason', 'su...",e-0-4ba5156cf964a520b6da38e3-1,"[{'pluralName': 'Athletics & Sports', 'id': '4...",4ba5156cf964a520b6da38e3,44 Price St.,CA,Toronto,Canada,at Yonge St.,...,"[44 Price St. (at Yonge St.), Toronto ON, Canada]","[{'lat': 43.68066694338894, 'lng': -79.3885588...",43.680667,-79.388559,,,ON,Toronto Lawn Tennis Club,0,[]


* We can see there are many details about the venues near first neighbourhood of Downtown Toronto.
* But we're interested in only the venue name, categories, loaction (latitde and longitude).
* Let's take only those columns which are needed.

In [15]:
# filter columns
cols = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
venues_df = venues_df.loc[:, cols]
# to show whole text in columns in dataframe python: https://stackoverflow.com/a/25352191
pd.set_option("display.max_colwidth" , -1)
venues_df.head()

Unnamed: 0,venue.name,venue.categories,venue.location.lat,venue.location.lng
0,Summerhill Market,"[{'pluralName': 'Grocery Stores', 'id': '4bf58dd8d48988d118951735', 'primary': True, 'name': 'Grocery Store', 'shortName': 'Grocery Store', 'icon': {'suffix': '.png', 'prefix': 'https://ss3.4sqi.net/img/categories_v2/shops/food_grocery_'}}]",43.686265,-79.375458
1,Toronto Lawn Tennis Club,"[{'pluralName': 'Athletics & Sports', 'id': '4f4528bc4b90abdf24c9de85', 'primary': True, 'name': 'Athletics & Sports', 'shortName': 'Athletics & Sports', 'icon': {'suffix': '.png', 'prefix': 'https://ss3.4sqi.net/img/categories_v2/shops/sports_outdoors_'}}]",43.680667,-79.388559
2,Black Camel,"[{'pluralName': 'BBQ Joints', 'id': '4bf58dd8d48988d1df931735', 'primary': True, 'name': 'BBQ Joint', 'shortName': 'BBQ', 'icon': {'suffix': '.png', 'prefix': 'https://ss3.4sqi.net/img/categories_v2/food/bbqalt_'}}]",43.677016,-79.389367
3,Tinuno,"[{'pluralName': 'Filipino Restaurants', 'id': '4eb1bd1c3b7b55596b4a748f', 'primary': True, 'name': 'Filipino Restaurant', 'shortName': 'Filipino', 'icon': {'suffix': '.png', 'prefix': 'https://ss3.4sqi.net/img/categories_v2/food/filipino_'}}]",43.671281,-79.37492
4,Pie Squared,"[{'pluralName': 'Pie Shops', 'id': '52e81612bcbc57f1066b7a0a', 'primary': True, 'name': 'Pie Shop', 'shortName': 'Pie Shop', 'icon': {'suffix': '.png', 'prefix': 'https://ss3.4sqi.net/img/categories_v2/food/pieshop_'}}]",43.672143,-79.377856


In [16]:
# reset set option dataframe: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.reset_option.html
pd.reset_option('display.max_colwidth')

* Name, location (lat and lng) looks good but we still need to etract the catgories.
* Categories is in venue.categories[0]['name']

In [17]:
venues_df['venue.categories'] = venues_df.apply(lambda x: x['venue.categories'][0]['name'], axis=1)
venues_df.head()

Unnamed: 0,venue.name,venue.categories,venue.location.lat,venue.location.lng
0,Summerhill Market,Grocery Store,43.686265,-79.375458
1,Toronto Lawn Tennis Club,Athletics & Sports,43.680667,-79.388559
2,Black Camel,BBQ Joint,43.677016,-79.389367
3,Tinuno,Filipino Restaurant,43.671281,-79.37492
4,Pie Squared,Pie Shop,43.672143,-79.377856


* Now every look good except the names of column let's change the name of columns

In [18]:
# clean columns
venues_df.columns = [col.split(".")[-1] for col in venues_df.columns]
print('{} Venues are returned for: {}'.format(venues_df.shape[0], neighborhood_name))
venues_df.head()

25 Venues are returned for: Rosedale


Unnamed: 0,name,categories,lat,lng
0,Summerhill Market,Grocery Store,43.686265,-79.375458
1,Toronto Lawn Tennis Club,Athletics & Sports,43.680667,-79.388559
2,Black Camel,BBQ Joint,43.677016,-79.389367
3,Tinuno,Filipino Restaurant,43.671281,-79.37492
4,Pie Squared,Pie Shop,43.672143,-79.377856


## Getting the venues for other neighbourhood of Downtown Torento

* Defining a function to get the deatails of each neighbourhood of Downtown Toronto

In [19]:
def get_near_by_venues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'\
        .format(CLIENT_ID, CLIENT_SECRET, VERSION, lat, lng, radius, LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(name, lat, lng, 
                             v['venue']['name'], v['venue']['location']['lat'], v['venue']['location']['lng'],
                             v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue in venues_list for item in venue])
    nearby_venues.columns = ['Neighbourhood','Neighbourhood Latitude', 'Neighbourhood Longitude', 
                             'Venue', 'Venue Latitude', 'Venue Longitude', 'Venue Category']
    
    return nearby_venues

In [20]:
print('Finding the near by venues of: ')
donwntown_venues = get_near_by_venues(names=downtown_toronto['Neighbourhood'],latitudes=downtown_toronto['Latitude'],
                                   longitudes=downtown_toronto['Longitude'])

Finding the near by venues of: 


In [21]:
print(donwntown_venues.shape)
donwntown_venues.head()

(1288, 7)


Unnamed: 0,Neighbourhood,Neighbourhood Latitude,Neighbourhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Rosedale,43.679563,-79.377529,Mooredale House,43.678631,-79.380091,Building
1,Rosedale,43.679563,-79.377529,Rosedale Park,43.682328,-79.378934,Playground
2,Rosedale,43.679563,-79.377529,Whitney Park,43.682036,-79.373788,Park
3,Rosedale,43.679563,-79.377529,Alex Murray Parkette,43.6783,-79.382773,Park
4,Rosedale,43.679563,-79.377529,Milkman's Lane,43.676352,-79.373842,Trail


In [22]:
print('There are {} uniques categories.'.format(len(donwntown_venues['Venue Category'].unique())))
print('\n\nVenues returned for each neighbourhood: ')
donwntown_venues.groupby('Neighbourhood')['Venue'].count()

There are 209 uniques categories.


Venues returned for each neighbourhood: 


Neighbourhood
Adelaide, King, Richmond                                                                                      100
Berczy Park                                                                                                    55
CN Tower, Bathurst Quay, Island airport, Harbourfront West, King and Spadina, Railway Lands, South Niagara     16
Cabbagetown, St. James Town                                                                                    46
Central Bay Street                                                                                             88
Chinatown, Grange Park, Kensington Market                                                                     100
Christie                                                                                                       15
Church and Wellesley                                                                                           87
Commerce Court, Victoria Hotel                                            

### Let's analyze each neighbourhood

In [23]:
# one hot encoding of venue categories columns:https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.get_dummies.html
downtown_onehot = pd.get_dummies(donwntown_venues[['Venue Category']], prefix= "", prefix_sep= " ")

# # add neighborhood column back to dataframe
downtown_onehot['Neighbourhood'] = donwntown_venues['Neighbourhood'] 
# move neighborhood column to the first column
fixed_columns = [downtown_onehot.columns[-1]] + list(downtown_onehot.columns[:-1])
downtown_onehot = downtown_onehot[fixed_columns]
print(downtown_onehot.shape)
downtown_onehot.head()

(1288, 210)


Unnamed: 0,Neighbourhood,Accessories Store,Adult Boutique,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,...,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Wine Bar,Wings Joint,Women's Store,Yoga Studio
0,Rosedale,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Rosedale,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Rosedale,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Rosedale,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Rosedale,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0


**Let's group rows by neighborhood and by taking the mean of the frequency of occurrence of each category**

In [24]:
downtown_grouped = downtown_onehot.groupby('Neighbourhood').mean().reset_index()
print(downtown_grouped.shape)
downtown_grouped.head()

(18, 210)


Unnamed: 0,Neighbourhood,Accessories Store,Adult Boutique,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,...,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Wine Bar,Wings Joint,Women's Store,Yoga Studio
0,"Adelaide, King, Richmond",0.01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.01,0.0,0.0,0.0,0.01,0.0,0.0,0.0
1,Berczy Park,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.018182,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"CN Tower, Bathurst Quay, Island airport, Harbo...",0.0,0.0,0.0,0.0625,0.0625,0.0625,0.125,0.125,0.125,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,"Cabbagetown, St. James Town",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Central Bay Street,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.011364,0.0,0.011364,0.0,0.011364,0.0,0.0,0.011364


**print each neighborhood along with the top 5 most common venues**

In [25]:
num_top_venues = 5

for hood in downtown_grouped['Neighbourhood']:
    print("----"+hood+"----")
    temp = downtown_grouped[downtown_grouped['Neighbourhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    
    #first rows is not usefull it has only Neighbourhood and Neighbourhood-name, so drop this row
    temp = temp.iloc[1:]
    temp['freq'] = round(temp['freq'].astype(float),2)# converting into float type and # taking round values
    temp = temp.sort_values('freq', ascending=False).reset_index(drop=True) # sorting the dataframe by 'freq' in decreasing order
    print(temp.head(num_top_venues))
    print('\n')

----Adelaide, King, Richmond----
                  venue  freq
0           Coffee Shop  0.06
1                  Café  0.05
2            Steakhouse  0.04
3   American Restaurant  0.04
4                   Bar  0.04


----Berczy Park----
                 venue  freq
0          Coffee Shop  0.09
1         Cocktail Bar  0.05
2   Italian Restaurant  0.04
3                 Café  0.04
4          Cheese Shop  0.04


----CN Tower, Bathurst Quay, Island airport, Harbourfront West, King and Spadina, Railway Lands, South Niagara----
               venue  freq
0     Airport Lounge  0.12
1    Airport Service  0.12
2   Airport Terminal  0.12
3      Boat or Ferry  0.06
4    Harbor / Marina  0.06


----Cabbagetown, St. James Town----
                 venue  freq
0          Coffee Shop  0.09
1           Restaurant  0.07
2                  Pub  0.04
3   Italian Restaurant  0.04
4                 Café  0.04


----Central Bay Street----
                        venue  freq
0                 Coffee Shop  0.16

### Let's put Top venues into a pandas dataframe


In [27]:
def return_most_common_venues(row, num_top_venues):
    row = row.iloc[1:]
    row_sorted = row.sort_values(ascending=False)
    
    return row_sorted.index.values[0:num_top_venues]

In [28]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighbourhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
venues_sorted = pd.DataFrame(columns=columns)
venues_sorted['Neighbourhood'] = downtown_grouped['Neighbourhood']

for ind in np.arange(downtown_grouped.shape[0]):
    venues_sorted.iloc[ind, 1:] = return_most_common_venues(downtown_grouped.iloc[ind, :], num_top_venues)

venues_sorted.head()

Unnamed: 0,Neighbourhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,"Adelaide, King, Richmond",Coffee Shop,Café,Steakhouse,Bar,American Restaurant,Thai Restaurant,Bakery,Gym,Cosmetics Shop,Hotel
1,Berczy Park,Coffee Shop,Cocktail Bar,Seafood Restaurant,Bakery,Beer Bar,Steakhouse,Italian Restaurant,Cheese Shop,Café,Farmers Market
2,"CN Tower, Bathurst Quay, Island airport, Harbo...",Airport Service,Airport Terminal,Airport Lounge,Boutique,Plane,Harbor / Marina,Coffee Shop,Bar,Sculpture Garden,Airport Gate
3,"Cabbagetown, St. James Town",Coffee Shop,Restaurant,Park,Pub,Café,Bakery,Italian Restaurant,Pizza Place,Breakfast Spot,Butcher
4,Central Bay Street,Coffee Shop,Café,Italian Restaurant,Middle Eastern Restaurant,Sandwich Place,Burger Joint,Sushi Restaurant,Bubble Tea Shop,Bar,Bakery


### Clustering Neighbours

In [29]:
# I'm considering number of clusters as 5.
k = 5

X = downtown_grouped.drop('Neighbourhood', axis = 1)

# run k-means clustering
kmeans = KMeans(n_clusters = k, random_state=0)
kmeans.fit(X)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=5, n_init=10, n_jobs=1, precompute_distances='auto',
    random_state=0, tol=0.0001, verbose=0)

In [30]:
# add clustering labels
venues_sorted['Cluster_Labels']=  kmeans.labels_

downtown_toronto_merged = downtown_toronto
# merge top venues_sorted with toronto_data
downtown_toronto_merged = downtown_toronto_merged.join(venues_sorted.set_index('Neighbourhood'), on='Neighbourhood')

downtown_toronto_merged.head() # check the last columns!

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue,Cluster_Labels
0,M4W,Downtown Toronto,Rosedale,43.679563,-79.377529,Park,Playground,Trail,Building,Yoga Studio,Dessert Shop,Ethiopian Restaurant,Electronics Store,Eastern European Restaurant,Dumpling Restaurant,1
1,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383,Coffee Shop,Café,Italian Restaurant,Middle Eastern Restaurant,Sandwich Place,Burger Joint,Sushi Restaurant,Bubble Tea Shop,Bar,Bakery,0
2,M5B,Downtown Toronto,"Ryerson, Garden District",43.657162,-79.378937,Coffee Shop,Clothing Store,Café,Cosmetics Shop,Middle Eastern Restaurant,Theater,Ramen Restaurant,Diner,Pizza Place,Bubble Tea Shop,0
3,M5S,Downtown Toronto,"Harbord, University of Toronto",43.662696,-79.400049,Café,Bookstore,Restaurant,Bar,Japanese Restaurant,Bakery,Italian Restaurant,Chinese Restaurant,Nightclub,Beer Store,2
4,M5H,Downtown Toronto,"Adelaide, King, Richmond",43.650571,-79.384568,Coffee Shop,Café,Steakhouse,Bar,American Restaurant,Thai Restaurant,Bakery,Gym,Cosmetics Shop,Hotel,0


In [31]:
# create map
map_clusterd = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(k)
ys = [i + x + (i*x)**2 for i in range(k)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(downtown_toronto_merged['Latitude'], downtown_toronto_merged['Longitude'],
                                  downtown_toronto_merged['Neighbourhood'], downtown_toronto_merged['Cluster_Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusterd)
       
map_clusterd

## Let's do for other borough

**Below Fuction can be used to cluster and find top venues of neighbourhoods of a borough of Toronto**
* Clusters can be visualized in the map.

In [32]:
def explore_boriugh(b, n, cluster_k):
    new_df = df[df['Borough'] == b].reset_index(drop = True)
    print(new_df.shape)

    address = b+' ,Toronto, Ontario'
    geolocator = Nominatim(user_agent="ny_explorer")
    location = geolocator.geocode(address)
    latitude = location.latitude
    longitude = location.longitude

    venues =  get_near_by_venues(names = new_df['Neighbourhood'],latitudes = new_df['Latitude'], longitudes = new_df['Longitude'])

    onehot_df = pd.get_dummies(venues[['Venue Category']], prefix= "", prefix_sep= " ")

    # # add neighborhood column back to dataframe
    onehot_df['Neighbourhood'] = new_df['Neighbourhood']
    # move neighborhood column to the first column
    fixed_columns = [onehot_df.columns[-1]] + list(onehot_df.columns[:-1])
    onehot_df = onehot_df[fixed_columns]
    onehot_df_grouped = onehot_df.groupby('Neighbourhood').mean().reset_index()

    num_top_venues = 10

    indicators = ['st', 'nd', 'rd']

    # create columns according to number of top venues
    columns = ['Neighbourhood']
    for ind in np.arange(num_top_venues):
        try:
            columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
        except:
            columns.append('{}th Most Common Venue'.format(ind+1))

    # create a new dataframe
    venues_sorted = pd.DataFrame(columns=columns)
    venues_sorted['Neighbourhood'] = onehot_df_grouped['Neighbourhood']

    for ind in np.arange(onehot_df_grouped.shape[0]):
        venues_sorted.iloc[ind, 1:] = return_most_common_venues(onehot_df_grouped.iloc[ind, :], num_top_venues)

    k = cluster_k
    X = onehot_df_grouped.drop('Neighbourhood', axis = 1)

    # run k-means clustering
    kmeans = KMeans(n_clusters = k, random_state=0)
    kmeans.fit(X)

    # add clustering labels
    venues_sorted['Cluster_Labels']=  kmeans.labels_

    merged_df = new_df
    # merge top venues_sorted with toronto_data
    merged_df = merged_df.join(venues_sorted.set_index('Neighbourhood'), on='Neighbourhood')
    
    # create map
    borough_map = folium.Map(location=[latitude, longitude], zoom_start=11)

    # add markers to the map
    for lat, lon, poi, cluster in zip(merged_df['Latitude'], merged_df['Longitude'], merged_df['Neighbourhood'], merged_df['Cluster_Labels']):
        label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
        folium.CircleMarker(
            [lat, lon],
            radius=5,
            popup=label,
            color=rainbow[cluster-1],
            fill=True,
            fill_color=rainbow[cluster-1],
            fill_opacity=0.7).add_to(borough_map)

    return borough_map, merged_df

#### Use function 'explore_boriugh' to explore any Borough of Torornto

In [33]:
map1, data = explore_boriugh(b = 'Scarborough', n = 10, cluster_k = 3)

(17, 5)


In [34]:
map1

In [35]:
data.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue,Cluster_Labels
0,M1V,Scarborough,"Agincourt North, L'Amoreaux East, Milliken, St...",43.815252,-79.284577,Park,Vietnamese Restaurant,Coffee Shop,Grocery Store,General Entertainment,Fried Chicken Joint,Fast Food Restaurant,Electronics Store,Discount Store,Department Store,0
1,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park",43.727929,-79.262029,Asian Restaurant,Vietnamese Restaurant,Coffee Shop,Grocery Store,General Entertainment,Fried Chicken Joint,Fast Food Restaurant,Electronics Store,Discount Store,Department Store,0
2,M1P,Scarborough,"Dorset Park, Scarborough Town Centre, Wexford ...",43.75741,-79.273304,Playground,Vietnamese Restaurant,Chinese Restaurant,Grocery Store,General Entertainment,Fried Chicken Joint,Fast Food Restaurant,Electronics Store,Discount Store,Department Store,2
3,M1J,Scarborough,Scarborough Village,43.744734,-79.239476,Department Store,Vietnamese Restaurant,Indian Restaurant,Grocery Store,General Entertainment,Fried Chicken Joint,Fast Food Restaurant,Electronics Store,Discount Store,Convenience Store,0
4,M1R,Scarborough,"Maryvale, Wexford",43.750072,-79.295849,Coffee Shop,Vietnamese Restaurant,Indian Restaurant,Grocery Store,General Entertainment,Fried Chicken Joint,Fast Food Restaurant,Electronics Store,Discount Store,Department Store,0
