# Segmenting and Clustering Neighborhoods in Toronto

# Part 1

In [1]:
import pandas as pd #import pandas
import numpy as np #import numpy

In [2]:
!conda install --yes -c anaconda beautifulsoup4 #install BeautifulSoup to extract html files

from urllib.request import urlopen #used to open URLs
from bs4 import BeautifulSoup

Collecting package metadata (current_repodata.json): done
Solving environment: done

# All requested packages already installed.



### Step 1:
Specify the URL containing the dataset and pass it to urlopen() to get the html of the page.

In [3]:
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M" #url of List of postal codes
html = urlopen(url)

### Step 2:
Create a Beautiful Soup object for the html. The Beautiful Soup package is used to parse the html, that is, take the raw html text and break it into Python objects. The second argument 'lxml' is the html parser


In [4]:
!pip install lxml #Install lxml then restart the kernel and comment the install command
soup = BeautifulSoup(html, 'lxml') #
type(soup)




bs4.BeautifulSoup

In [5]:
# Get the title
title = soup.title
print(title)

<title>List of postal codes of Canada: M - Wikipedia</title>


In [6]:
# Print out the text
text = soup.get_text()
#print(soup.text)

### Step 3: Access the table
Find the list of tables in the website. Since there are two tables, we are interested in the first table.

In [7]:
tables = soup.find_all('table')
table = tables[:1]

In [8]:
rows = soup.table.find_all('tr') 
print(rows[:10])

[<tr>
<th>Postal Code
</th>
<th>Borough
</th>
<th>Neighborhood
</th></tr>, <tr>
<td>M1A
</td>
<td>Not assigned
</td>
<td>Not assigned
</td></tr>, <tr>
<td>M2A
</td>
<td>Not assigned
</td>
<td>Not assigned
</td></tr>, <tr>
<td>M3A
</td>
<td>North York
</td>
<td>Parkwoods
</td></tr>, <tr>
<td>M4A
</td>
<td>North York
</td>
<td>Victoria Village
</td></tr>, <tr>
<td>M5A
</td>
<td>Downtown Toronto
</td>
<td>Regent Park, Harbourfront
</td></tr>, <tr>
<td>M6A
</td>
<td>North York
</td>
<td>Lawrence Manor, Lawrence Heights
</td></tr>, <tr>
<td>M7A
</td>
<td>Downtown Toronto
</td>
<td>Queen's Park, Ontario Provincial Government
</td></tr>, <tr>
<td>M8A
</td>
<td>Not assigned
</td>
<td>Not assigned
</td></tr>, <tr>
<td>M9A
</td>
<td>Etobicoke
</td>
<td>Islington Avenue, Humber Valley Village
</td></tr>]


Get all table rows in list form first and then convert that list into a dataframe. Below is a for loop that iterates through table rows and prints out the cells of the rows. Use BeautifulSoup object to remove the html tags and apply get_text() function to extract the texts within the html tags.

In [9]:
#import re
list_rows = []
for row in rows:
    cells = row.find_all('td')
    str_cells = str(cells)
    text = BeautifulSoup(str_cells, 'lxml').get_text()
    list_rows.append(text)
print(text)
type(text)

[M9Z
, Not assigned
, Not assigned
]


str

In [10]:
df = pd.DataFrame(list_rows)
df.head(10)

Unnamed: 0,0
0,[]
1,"[M1A\n, Not assigned\n, Not assigned\n]"
2,"[M2A\n, Not assigned\n, Not assigned\n]"
3,"[M3A\n, North York\n, Parkwoods\n]"
4,"[M4A\n, North York\n, Victoria Village\n]"
5,"[M5A\n, Downtown Toronto\n, Regent Park, Harbo..."
6,"[M6A\n, North York\n, Lawrence Manor, Lawrence..."
7,"[M7A\n, Downtown Toronto\n, Queen's Park, Onta..."
8,"[M8A\n, Not assigned\n, Not assigned\n]"
9,"[M9A\n, Etobicoke\n, Islington Avenue, Humber ..."


Each row in the dataframe is separated by comma. Since a Neighborhood may contain multiple cities separated by comma, we use \n to split each row in to three columns instead of comma.

In [11]:
df1 = df[0].str.split('\n', expand=True)
df1.head(10)

Unnamed: 0,0,1,2,3
0,[],,,
1,[M1A,", Not assigned",", Not assigned",]
2,[M2A,", Not assigned",", Not assigned",]
3,[M3A,", North York",", Parkwoods",]
4,[M4A,", North York",", Victoria Village",]
5,[M5A,", Downtown Toronto",", Regent Park, Harbourfront",]
6,[M6A,", North York",", Lawrence Manor, Lawrence Heights",]
7,[M7A,", Downtown Toronto",", Queen's Park, Ontario Provincial Government",]
8,[M8A,", Not assigned",", Not assigned",]
9,[M9A,", Etobicoke",", Islington Avenue, Humber Valley Village",]


In [12]:
df1[0] = df1[0].str.strip('[') #remove [ mark from the first row
df1.head(10)

Unnamed: 0,0,1,2,3
0,],,,
1,M1A,", Not assigned",", Not assigned",]
2,M2A,", Not assigned",", Not assigned",]
3,M3A,", North York",", Parkwoods",]
4,M4A,", North York",", Victoria Village",]
5,M5A,", Downtown Toronto",", Regent Park, Harbourfront",]
6,M6A,", North York",", Lawrence Manor, Lawrence Heights",]
7,M7A,", Downtown Toronto",", Queen's Park, Ontario Provincial Government",]
8,M8A,", Not assigned",", Not assigned",]
9,M9A,", Etobicoke",", Islington Avenue, Humber Valley Village",]


In [13]:
col_labels = soup.table.find_all('th') # Find the column names

Extract the column names using BeautifulSoup object and get_text() function, then convert to a dataframe.

In [14]:
all_header = []
col_str = str(col_labels)
cleantext2 = BeautifulSoup(col_str, "lxml").get_text()
all_header.append(cleantext2)
print(all_header)

['[Postal Code\n, Borough\n, Neighborhood\n]']


In [15]:
df2 = pd.DataFrame(all_header)
df2.head()

Unnamed: 0,0
0,"[Postal Code\n, Borough\n, Neighborhood\n]"


In [16]:
df3 = df2[0].str.split(',', expand=True) #split the column names using comma 
df3.head()

Unnamed: 0,0,1,2
0,[Postal Code\n,Borough\n,Neighborhood\n]


In [17]:
frames = [df3, df1]

df4 = pd.concat(frames) # concatnate the column names with the data
df4.head(10)

Unnamed: 0,0,1,2,3
0,[Postal Code\n,Borough\n,Neighborhood\n],
0,],,,
1,M1A,", Not assigned",", Not assigned",]
2,M2A,", Not assigned",", Not assigned",]
3,M3A,", North York",", Parkwoods",]
4,M4A,", North York",", Victoria Village",]
5,M5A,", Downtown Toronto",", Regent Park, Harbourfront",]
6,M6A,", North York",", Lawrence Manor, Lawrence Heights",]
7,M7A,", Downtown Toronto",", Queen's Park, Ontario Provincial Government",]
8,M8A,", Not assigned",", Not assigned",]


In [18]:
df5 = df4.rename(columns=df4.iloc[0])
df5.head()

Unnamed: 0,[Postal Code\n,Borough\n,Neighborhood\n],NaN
0,[Postal Code\n,Borough\n,Neighborhood\n],
0,],,,
1,M1A,", Not assigned",", Not assigned",]
2,M2A,", Not assigned",", Not assigned",]
3,M3A,", North York",", Parkwoods",]


In [19]:
df7 = df5.drop(df5.index[0]) #remove the first row which is a duplicated value for the column names
df7.head()

Unnamed: 0,[Postal Code\n,Borough\n,Neighborhood\n],NaN
1,M1A,", Not assigned",", Not assigned",]
2,M2A,", Not assigned",", Not assigned",]
3,M3A,", North York",", Parkwoods",]
4,M4A,", North York",", Victoria Village",]
5,M5A,", Downtown Toronto",", Regent Park, Harbourfront",]


In [20]:
df7.columns = df7.columns.str.replace('\n', '') #replace the line break with empty value
df7.columns = df7.columns.str.replace('[', '')  #replace [ with empty value
df7.columns = df7.columns.str.replace(']', '')  #replace ] with empty value
df7.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,NaN
1,M1A,", Not assigned",", Not assigned",]
2,M2A,", Not assigned",", Not assigned",]
3,M3A,", North York",", Parkwoods",]
4,M4A,", North York",", Victoria Village",]
5,M5A,", Downtown Toronto",", Regent Park, Harbourfront",]


In [21]:
df7.rename(columns={'Postal Code': 'PostalCode', ' Borough': 'Borough', ' Neighborhood':'Neighborhood'},inplace=True) #correct name of columns according to the required names
df7.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


Unnamed: 0,PostalCode,Borough,Neighborhood,NaN
1,M1A,", Not assigned",", Not assigned",]
2,M2A,", Not assigned",", Not assigned",]
3,M3A,", North York",", Parkwoods",]
4,M4A,", North York",", Victoria Village",]
5,M5A,", Downtown Toronto",", Regent Park, Harbourfront",]


In [22]:
df7 = df7[['PostalCode', 'Borough', 'Neighborhood']] #remove the last column which we are not interested
df7.head()


Unnamed: 0,PostalCode,Borough,Neighborhood
1,M1A,", Not assigned",", Not assigned"
2,M2A,", Not assigned",", Not assigned"
3,M3A,", North York",", Parkwoods"
4,M4A,", North York",", Victoria Village"
5,M5A,", Downtown Toronto",", Regent Park, Harbourfront"


In [23]:
df7['Borough'] = df7['Borough'].str.strip(',') # strip comma from Borough column
df7.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
1,M1A,Not assigned,", Not assigned"
2,M2A,Not assigned,", Not assigned"
3,M3A,North York,", Parkwoods"
4,M4A,North York,", Victoria Village"
5,M5A,Downtown Toronto,", Regent Park, Harbourfront"


In [24]:
#Strip the first comma from Neighborhood
df7['Neighborhood'] = df7['Neighborhood'].str[1:] #Since we get extra column at the first position of each value we remove comma

#Strip the space in the front
df7['Borough'] = df7['Borough'].str[1:] #remove Extra space which is at the first position
df7.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
1,M1A,Not assigned,Not assigned
2,M2A,Not assigned,Not assigned
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,"Regent Park, Harbourfront"


In [25]:
df7.head(10)

Unnamed: 0,PostalCode,Borough,Neighborhood
1,M1A,Not assigned,Not assigned
2,M2A,Not assigned,Not assigned
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,"Regent Park, Harbourfront"
6,M6A,North York,"Lawrence Manor, Lawrence Heights"
7,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
8,M8A,Not assigned,Not assigned
9,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
10,M1B,Scarborough,"Malvern, Rouge"


Remove all rows which have Not assigned values in the Borough column

In [26]:
df7.drop(df7[df7['Borough'] == 'Not assigned'].index, inplace = True)
df7.head(10)
#df7.drop(idxN, inplace = True)

Unnamed: 0,PostalCode,Borough,Neighborhood
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,"Regent Park, Harbourfront"
6,M6A,North York,"Lawrence Manor, Lawrence Heights"
7,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
9,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
10,M1B,Scarborough,"Malvern, Rouge"
12,M3B,North York,Don Mills
13,M4B,East York,"Parkview Hill, Woodbine Gardens"
14,M5B,Downtown Toronto,"Garden District, Ryerson"


Reset the index to start with 0

In [27]:
df7.reset_index(drop = True, inplace = True)

In [28]:

df7.head(10)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
6,M1B,Scarborough,"Malvern, Rouge"
7,M3B,North York,Don Mills
8,M4B,East York,"Parkview Hill, Woodbine Gardens"
9,M5B,Downtown Toronto,"Garden District, Ryerson"


In [29]:
#check if Neighborhood has 'Not assigned' value
df7[df7['Neighborhood'] == 'Not assigned'].index

Int64Index([], dtype='int64')

Since we have 'Not assigned' value in Neighborhood, we don't need to replace with the corresponding Borough value

In [30]:
df7.shape

(103, 3)

# Part 2

Access the csv file which contains the geographical coordinates of each postal code from the link:
<a href = 'http://cocl.us/Geospatial_data'> Postal geographical coordinates </a>

In [31]:
geoLoc = pd.read_csv('http://cocl.us/Geospatial_data')
geoLoc.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


Rename the column **Postal Code** to comply with the original data frame

In [32]:
geoLoc.rename(columns = {'Postal Code': 'PostalCode'}, inplace = True)
geoLoc.head()

Unnamed: 0,PostalCode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


Merge the two dataframes based on their **PostalCode** (inner merge): for merging details click here: <a href = 'https://realpython.com/pandas-merge-join-and-concat/'> RealPython</a>

In [33]:
postLoc = pd.merge(df7, geoLoc)
postLoc.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494


In [34]:
postLoc.shape

(103, 5)

In [35]:
Downtown_Toronto = postLoc[postLoc['Borough'] == 'Downtown Toronto'].reset_index(drop = True)
Downtown_Toronto

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
1,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
2,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
3,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
4,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306
5,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383
6,M6G,Downtown Toronto,Christie,43.669542,-79.422564
7,M5H,Downtown Toronto,"Richmond, Adelaide, King",43.650571,-79.384568
8,M5J,Downtown Toronto,"Harbourfront East, Union Station, Toronto Isl...",43.640816,-79.381752
9,M5K,Downtown Toronto,"Toronto Dominion Centre, Design Exchange",43.647177,-79.381576


# Part 3

### Import libraries

In [36]:
!conda install -c conda-forge geopy --yes
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

#!conda install -c conda-forge folium=0.5.0 --yes 
import folium # map rendering library

Collecting package metadata (current_repodata.json): done
Solving environment: done

## Package Plan ##

  environment location: /home/jupyterlab/conda/envs/python

  added / updated specs:
    - geopy


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    ca-certificates-2020.6.20  |       hecda079_0         145 KB  conda-forge
    certifi-2020.6.20          |   py36h9f0ad1d_0         151 KB  conda-forge
    geographiclib-1.50         |             py_0          34 KB  conda-forge
    geopy-1.22.0               |     pyh9f0ad1d_0          63 KB  conda-forge
    ------------------------------------------------------------
                                           Total:         393 KB

The following NEW packages will be INSTALLED:

  geographiclib      conda-forge/noarch::geographiclib-1.50-py_0
  geopy              conda-forge/noarch::geopy-1.22.0-pyh9f0ad1d_0

The following packages will b

Lets find the geographical Location of Downtown Toronto

In [37]:
address = 'Downtown Toronto'

geolocator = Nominatim(user_agent="Toronto_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Downtown Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Downtown Toronto are 43.6541737, -79.38081164513409.


### Create a Map of Downtown Toronto with neighbors superimposed on it

In [38]:
# create map of New York using latitude and longitude values
map_downtown_Toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(Downtown_Toronto['Latitude'], Downtown_Toronto['Longitude'], Downtown_Toronto['Borough'], Downtown_Toronto['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_downtown_Toronto)  
    
map_downtown_Toronto

Lets start utilizing the Foursquare API to explore the neighborhoods and segment them.

### Define Foursquare API credentials

In [39]:
CLIENT_ID = 'TYTATKL2Z01PHMKDVJUN32V1OUQHYUMKQVJFAR11NVOR4R12' # your Foursquare ID
CLIENT_SECRET = '5EZLKAHX3SAKU45TTZACCOQYW0ZDUFT4G30AECZFMS32CVJ0' # your Foursquare Secret
VERSION = '20200623' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: TYTATKL2Z01PHMKDVJUN32V1OUQHYUMKQVJFAR11NVOR4R12
CLIENT_SECRET:5EZLKAHX3SAKU45TTZACCOQYW0ZDUFT4G30AECZFMS32CVJ0


## Explore Neighborhoods in Downtown Toronto

#### Lets create a function to explore the top 100 popular venues that are in each Neighborhood within 500 meter radius by sending GET requests and examining them.

In [40]:
LIMIT = 100

In [41]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
       # print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [42]:

Downtown_toronto_venues = getNearbyVenues(names=Downtown_Toronto['PostalCode'],
                                   latitudes=Downtown_Toronto['Latitude'],
                                   longitudes=Downtown_Toronto['Longitude']
                                  )


### Lets check the size of resulting dataframe

In [43]:
print(Downtown_toronto_venues.shape)
Downtown_toronto_venues.head()

(1218, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,M5A,43.65426,-79.360636,Roselle Desserts,43.653447,-79.362017,Bakery
1,M5A,43.65426,-79.360636,Tandem Coffee,43.653559,-79.361809,Coffee Shop
2,M5A,43.65426,-79.360636,Cooper Koo Family YMCA,43.653249,-79.358008,Distribution Center
3,M5A,43.65426,-79.360636,Body Blitz Spa East,43.654735,-79.359874,Spa
4,M5A,43.65426,-79.360636,Dominion Pub and Kitchen,43.656919,-79.358967,Pub


### Check how many venues were returned for each neighborhood

In [44]:
Downtown_toronto_venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
M4W,4,4,4,4,4,4
M4X,46,46,46,46,46,46
M4Y,75,75,75,75,75,75
M5A,44,44,44,44,44,44
M5B,100,100,100,100,100,100
M5C,78,78,78,78,78,78
M5E,56,56,56,56,56,56
M5G,64,64,64,64,64,64
M5H,92,92,92,92,92,92
M5J,100,100,100,100,100,100


### Let's find out how many unique venue categories are curated from all the returned venues 

In [45]:
print('There are {} uniques categories.'.format(len(Downtown_toronto_venues['Venue Category'].unique())))

There are 213 uniques categories.


## Analyze each Neighborhood

### Let's create one hot encoding for the dataframe

In [46]:
# one hot encoding
Downtown_toronto_onehot = pd.get_dummies(Downtown_toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
Downtown_toronto_onehot['Neighborhood'] = Downtown_toronto_venues['Neighborhood'] 

# move neighborhood column to the first column
#cols = list(Downtown_toronto_onehot.columns)
#fixed_columns = [cols[-1]] + cols[:-1]
#Downtown_toronto_onehot = Downtown_toronto_onehot[fixed_columns]

Downtown_toronto_onehot.head()

Unnamed: 0,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,Aquarium,...,Theme Restaurant,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wine Shop,Women's Store,Yoga Studio
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [47]:
Neighborhood_col = Downtown_toronto_onehot['Neighborhood']
Downtown_toronto_onehot.drop(labels=['Neighborhood'], axis=1, inplace = True)
Downtown_toronto_onehot.insert(0, 'Neighborhood', Neighborhood_col)
Downtown_toronto_onehot.head()

Unnamed: 0,Neighborhood,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,...,Theme Restaurant,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wine Shop,Women's Store,Yoga Studio
0,M5A,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,M5A,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,M5A,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,M5A,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,M5A,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [48]:
ind = np.argwhere(Downtown_toronto_onehot.columns.isin(['Neighborhood'])).ravel()
ind

array([0])

#### Next, let's group rows by neighborhood and by taking the mean of the frequency of occurrence of each category

In [49]:
Downtown_toronto_grouped = Downtown_toronto_onehot.groupby('Neighborhood').mean().reset_index()
Downtown_toronto_grouped

Unnamed: 0,Neighborhood,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,...,Theme Restaurant,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wine Shop,Women's Store,Yoga Studio
0,M4W,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,M4X,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.021739,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,M4Y,0.013333,0.0,0.0,0.0,0.0,0.0,0.0,0.013333,0.0,...,0.013333,0.0,0.0,0.0,0.0,0.0,0.0,0.013333,0.0,0.026667
3,M5A,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.022727,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.022727
4,M5B,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.01,0.01,0.01,0.0,0.0,0.0
5,M5C,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.038462,0.0,...,0.0,0.0,0.0,0.012821,0.0,0.0,0.012821,0.0,0.0,0.0
6,M5E,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.017857,0.0,0.0,0.0,0.0,0.0,0.0
7,M5G,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.015625,0.0,0.0,0.015625,0.0,0.0,0.015625
8,M5H,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.01087,0.0,...,0.0,0.0,0.0,0.01087,0.0,0.0,0.0,0.0,0.01087,0.0
9,M5J,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.01,0.01,0.0,0.0,0.01,0.0,0.0,0.0


### Create the new dataframe and display the top 10 venues for each neighborhood.

Let's first create a function to sort the venues in descending order

In [50]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [51]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = Downtown_toronto_grouped['Neighborhood']

for ind in np.arange(Downtown_toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(Downtown_toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M4W,Park,Playground,Trail,Cosmetics Shop,Dog Run,Distribution Center,Discount Store,Diner,Dim Sum Restaurant,Dessert Shop
1,M4X,Café,Coffee Shop,Restaurant,Pub,Pizza Place,Italian Restaurant,Bakery,Park,Grocery Store,Office
2,M4Y,Coffee Shop,Sushi Restaurant,Japanese Restaurant,Restaurant,Gay Bar,Yoga Studio,Men's Store,Mediterranean Restaurant,Hotel,Pub
3,M5A,Coffee Shop,Bakery,Pub,Park,Theater,Breakfast Spot,Café,Restaurant,Spa,Distribution Center
4,M5B,Clothing Store,Coffee Shop,Japanese Restaurant,Italian Restaurant,Middle Eastern Restaurant,Café,Bubble Tea Shop,Cosmetics Shop,Diner,Lingerie Store


In [52]:
Downtown_Toronto.shape

(19, 5)

## Cluster Neighborhoods

### K-means to cluster the neighborhoods within 4 clusters

In [53]:
# set number of clusters
kclusters = 4

Downtown_toronto_grouped_clustering = Downtown_toronto_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(Downtown_toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:20] 

array([2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 1, 0],
      dtype=int32)

In [54]:
neighborhoods_venues_sorted.tail()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
14,M5V,Airport Service,Airport Lounge,Airport Terminal,Plane,Harbor / Marina,Boat or Ferry,Rental Car Location,Boutique,Sculpture Garden,Airport
15,M5W,Coffee Shop,Café,Italian Restaurant,Seafood Restaurant,Restaurant,Cocktail Bar,Japanese Restaurant,Hotel,Beer Bar,Breakfast Spot
16,M5X,Coffee Shop,Café,Restaurant,Hotel,Gym,American Restaurant,Asian Restaurant,Seafood Restaurant,Salad Place,Steakhouse
17,M6G,Grocery Store,Café,Park,Coffee Shop,Candy Store,Italian Restaurant,Diner,Restaurant,Athletics & Sports,Baby Store
18,M7A,Coffee Shop,Diner,Sushi Restaurant,Discount Store,Smoothie Shop,Beer Bar,Italian Restaurant,Sculpture Garden,Sandwich Place,Distribution Center


In [55]:
Downtown_Toronto.drop('Neighborhood', axis = 1, inplace = True)
Downtown_Toronto.rename(columns = {'PostalCode': 'Neighborhood'}, inplace = True)

Let's create a new dataframe that includes the cluster as well as the top 10 venues for each neighborhood.

In [59]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

Downtown_toronto_merged = Downtown_Toronto
# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
Downtown_toronto_merged = Downtown_toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

#Downtown_merged = Downtown_toronto_merged.set_index('Neighborhood').join(neighborhoods_venues_sorted.set_index('Neighborhood'))
#Downtown_merged = pd.merge(Downtown_toronto_merged, neighborhoods_venues_sorted)


Downtown_toronto_merged.head() # check the last columns!

Unnamed: 0,Neighborhood,Borough,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M5A,Downtown Toronto,43.65426,-79.360636,0,Coffee Shop,Bakery,Pub,Park,Theater,Breakfast Spot,Café,Restaurant,Spa,Distribution Center
1,M7A,Downtown Toronto,43.662301,-79.389494,0,Coffee Shop,Diner,Sushi Restaurant,Discount Store,Smoothie Shop,Beer Bar,Italian Restaurant,Sculpture Garden,Sandwich Place,Distribution Center
2,M5B,Downtown Toronto,43.657162,-79.378937,0,Clothing Store,Coffee Shop,Japanese Restaurant,Italian Restaurant,Middle Eastern Restaurant,Café,Bubble Tea Shop,Cosmetics Shop,Diner,Lingerie Store
3,M5C,Downtown Toronto,43.651494,-79.375418,0,Café,Coffee Shop,Restaurant,American Restaurant,Gastropub,Cocktail Bar,Clothing Store,Moroccan Restaurant,Cosmetics Shop,Lingerie Store
4,M5E,Downtown Toronto,43.644771,-79.373306,0,Coffee Shop,Cocktail Bar,Restaurant,Cheese Shop,Seafood Restaurant,Beer Bar,Bakery,Café,Irish Pub,Diner


### Let's visualize the resulting cluster

In [60]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(Downtown_toronto_merged['Latitude'], Downtown_toronto_merged['Longitude'], Downtown_toronto_merged['Neighborhood'], Downtown_toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

## Examine Clusters
Now, we examine each cluster and determine the discriminating venue categories that distinguish each cluster.

In [61]:
Cluster1 = Downtown_toronto_merged.loc[Downtown_toronto_merged['Cluster Labels'] == 0, Downtown_toronto_merged.columns[[1] + list(range(4, Downtown_toronto_merged.shape[1]))]]
Cluster1

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Downtown Toronto,0,Coffee Shop,Bakery,Pub,Park,Theater,Breakfast Spot,Café,Restaurant,Spa,Distribution Center
1,Downtown Toronto,0,Coffee Shop,Diner,Sushi Restaurant,Discount Store,Smoothie Shop,Beer Bar,Italian Restaurant,Sculpture Garden,Sandwich Place,Distribution Center
2,Downtown Toronto,0,Clothing Store,Coffee Shop,Japanese Restaurant,Italian Restaurant,Middle Eastern Restaurant,Café,Bubble Tea Shop,Cosmetics Shop,Diner,Lingerie Store
3,Downtown Toronto,0,Café,Coffee Shop,Restaurant,American Restaurant,Gastropub,Cocktail Bar,Clothing Store,Moroccan Restaurant,Cosmetics Shop,Lingerie Store
4,Downtown Toronto,0,Coffee Shop,Cocktail Bar,Restaurant,Cheese Shop,Seafood Restaurant,Beer Bar,Bakery,Café,Irish Pub,Diner
5,Downtown Toronto,0,Coffee Shop,Sandwich Place,Italian Restaurant,Café,Japanese Restaurant,Salad Place,Thai Restaurant,Burger Joint,Bubble Tea Shop,Department Store
7,Downtown Toronto,0,Coffee Shop,Café,Restaurant,Gym,Hotel,Thai Restaurant,Deli / Bodega,Concert Hall,Bookstore,Steakhouse
8,Downtown Toronto,0,Coffee Shop,Aquarium,Café,Hotel,Restaurant,Sporting Goods Shop,Brewery,Fried Chicken Joint,Scenic Lookout,Park
9,Downtown Toronto,0,Coffee Shop,Hotel,Café,Restaurant,Salad Place,Seafood Restaurant,Japanese Restaurant,Italian Restaurant,American Restaurant,Beer Bar
10,Downtown Toronto,0,Coffee Shop,Restaurant,Café,Hotel,Gym,American Restaurant,Japanese Restaurant,Italian Restaurant,Seafood Restaurant,Cocktail Bar


In [62]:
Cluster2 = Downtown_toronto_merged.loc[Downtown_toronto_merged['Cluster Labels'] == 1, Downtown_toronto_merged.columns[[1] + list(range(4, Downtown_toronto_merged.shape[1]))]]
Cluster2

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
6,Downtown Toronto,1,Grocery Store,Café,Park,Coffee Shop,Candy Store,Italian Restaurant,Diner,Restaurant,Athletics & Sports,Baby Store


In [63]:
Cluster3 = Downtown_toronto_merged.loc[Downtown_toronto_merged['Cluster Labels'] == 2, Downtown_toronto_merged.columns[[1] + list(range(4, Downtown_toronto_merged.shape[1]))]]
Cluster3

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
14,Downtown Toronto,2,Park,Playground,Trail,Cosmetics Shop,Dog Run,Distribution Center,Discount Store,Diner,Dim Sum Restaurant,Dessert Shop


In [64]:
Cluster4 = Downtown_toronto_merged.loc[Downtown_toronto_merged['Cluster Labels'] == 3, Downtown_toronto_merged.columns[[1] + list(range(4, Downtown_toronto_merged.shape[1]))]]
Cluster4

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
13,Downtown Toronto,3,Airport Service,Airport Lounge,Airport Terminal,Plane,Harbor / Marina,Boat or Ferry,Rental Car Location,Boutique,Sculpture Garden,Airport


## Conclusion Analysis
As we can see from the Downtown Toronto clustering output, Cluster 1 are mostly coffee shops, Cluster 2 Grocery, Cluster 3 Park, and Cluster 4 Airport Services.