# Week 3 Assignment Clustering

## Import Library

In [1]:
# Import Libraries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import json, lxml
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values
import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe
# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors
# import k-means from clustering stage
from sklearn.cluster import KMeans
# import folium # map rendering library
from bs4 import BeautifulSoup
import warnings
warnings.filterwarnings('ignore')
import folium

## Part - 1 Data Import and Cleaning

### Import using pandas or Beautiful Soup

In [2]:
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# Easy way with pandas
dfs = pd.read_html(url)
#dfs[0].shape

In [3]:
#using beatifulsoup
source = requests.get(url).text
soup = BeautifulSoup(source)

#print(soup.prettify())
table = soup.find('table')
table_rows = table.find_all('tr')
#table_rows

#create features
columns = ['PostalCode', 'Borough', 'Neighbourhood']
data = dict({key:[]*len(columns) for key in columns})

for tr in table_rows:
    for i,column in zip(tr.find_all('td'), columns):
        i = i.text
        i = i.replace('\n','')
        data[column].append(i)

dfraw = pd.DataFrame.from_dict(data=data)[columns]
print(dfraw.shape)
dfraw.head(10)

(287, 3)


Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor
7,M7A,Queen's Park,Not assigned
8,M8A,Not assigned,Not assigned
9,M9A,Downtown Toronto,Queen's Park


### Cleaning Data
* Dataframe still contain 'Not Assigned' Borough
* Cleaning Dataframe 
    - Drop 'Not assigned' Borough
    - Replace 'Not assigned' Neigbhbourhood with Borough valus

In [4]:
# Remove and replace
dfraw = dfraw[dfraw['Borough'] != 'Not assigned'].reset_index(drop = True)
print('After cheaning, Shape is: ',dfraw.shape)
print('Number of rows where Neighbourhood is "Not assigned" but borough has value: ', 
      dfraw[dfraw['Neighbourhood'] == 'Not assigned'].shape[0])

a,b,c = [],[],[]
for i in range(0,len(dfraw)):
    a.append(dfraw['PostalCode'][i])
    b.append(dfraw['Borough'][i])
    if dfraw['Neighbourhood'][i] == 'Not assigned':
        c.append(dfraw['Borough'][i])
    else:
        c.append(dfraw['Neighbourhood'][i])

# Rewrite Dataframe after cleaning not assigned
dfcl1 = pd.DataFrame({'PostalCode':a,'Borough' : b, 'Neighbourhood' : c})[columns]

dfcl1.head(15)

After cheaning, Shape is:  (210, 3)
Number of rows where Neighbourhood is "Not assigned" but borough has value:  1


Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M6A,North York,Lawrence Heights
4,M6A,North York,Lawrence Manor
5,M7A,Queen's Park,Queen's Park
6,M9A,Downtown Toronto,Queen's Park
7,M1B,Scarborough,Rouge
8,M1B,Scarborough,Malvern
9,M3B,North York,Don Mills North


* Group based on Postal Code and Borough

In [5]:
# Group by postal code and boroug

postcodes = dfcl1['PostalCode'].values
boroughs = dfcl1['Borough'].values
neighs = dfcl1['Neighbourhood'].values


#create a dictionary with keys as Postcode and Borough, keys of dictioaries are unique
dic = dict({(key1,key2): [] for key1, key2 in zip(postcodes, boroughs)})
print('Number of keys in the dictionary are: ', len(dic.keys()))

#filling the values of keys of dictionary
for postcode, borough, neigh in zip(postcodes,boroughs, neighs):
    key = (postcode, borough)
    dic[key].append(neigh)

df = pd.DataFrame(columns = ['Postal Code', 'Borough', 'Neighbourhood'])
for key, value in dic.items():
    postcode, borough, neig = key[0], key[1], value
    neig = ', '.join(neig)
    df = df.append({'Postal Code': postcode,
                     'Borough': borough,
                     'Neighbourhood': neig}, ignore_index = True)
print('Shape of final data is: ', df.shape)
df.head(15)

Number of keys in the dictionary are:  103
Shape of final data is:  (103, 3)


Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M6A,North York,"Lawrence Heights, Lawrence Manor"
4,M7A,Queen's Park,Queen's Park
5,M9A,Downtown Toronto,Queen's Park
6,M1B,Scarborough,"Rouge, Malvern"
7,M3B,North York,Don Mills North
8,M4B,East York,"Woodbine Gardens, Parkview Hill"
9,M5B,Downtown Toronto,"Ryerson, Garden District"


### Importing Toronto Postal Code
Due to `geocoder` keep return `None`, Data from csv is used. Link provided broken, another student provide data on their Github. Data retrieved by webcrawling on link provided by other student

In [6]:
url2 = 'https://github.com/Juankboards/toronto_neighborhood_clustering/blob/master/Geospatial_Coordinates.csv'

dfs = pd.read_html(url2)

dfPC = dfs[0]
dfPC.drop('Unnamed: 0', axis = 1, inplace = True)
dfPC.to_csv('C:\\Users\\admin\\Documents\\Kintha\\Python\\Capstone\\PostalCode.csv', index=False)
dfPC.shape
dfPC

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476
...,...,...,...
98,M9N,43.706876,-79.518188
99,M9P,43.696319,-79.532242
100,M9R,43.688905,-79.554724
101,M9V,43.739416,-79.588437


## Part 2 - Merging two sets of DataFrame

In [7]:
df = pd.merge(df, dfPC, how='inner', on=['Postal Code'])
df

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,Harbourfront,43.654260,-79.360636
3,M6A,North York,"Lawrence Heights, Lawrence Manor",43.718518,-79.464763
4,M7A,Queen's Park,Queen's Park,43.662301,-79.389494
...,...,...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North",43.653654,-79.506944
99,M4Y,Downtown Toronto,Church and Wellesley,43.665860,-79.383160
100,M7Y,East Toronto,Business Reply Mail Processing Centre 969 Eastern,43.662744,-79.321558
101,M8Y,Etobicoke,"Humber Bay, King's Mill Park, Kingsway Park So...",43.636258,-79.498509


## Part 3 - Exploring Toronto and Clustering

In [8]:
address = 'Toronto, TO'

geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.653963, -79.387207.


Create Map of Toronto Neighbourhood

In [9]:
# create map of Toronto using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(df['Latitude'], df['Longitude'], df['Borough'], df['Neighbourhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7).add_to(map_toronto)  
        #,
        #parse_html=True).add_to(map_toronto)  
    
map_toronto

Counting number of Borough

In [10]:
df['Borough'].value_counts()

North York          24
Downtown Toronto    19
Scarborough         17
Etobicoke           11
Central Toronto      9
West Toronto         6
York                 5
East Toronto         5
East York            5
Queen's Park         1
Mississauga          1
Name: Borough, dtype: int64

Lets explore the largest number of Toronto Neighbourhood, in a Borough

In [11]:
ny = df[df['Borough'] == 'North York'].reset_index(drop=True)
ny.head(10)

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M6A,North York,"Lawrence Heights, Lawrence Manor",43.718518,-79.464763
3,M3B,North York,Don Mills North,43.745906,-79.352188
4,M6B,North York,Glencairn,43.709577,-79.445073
5,M3C,North York,"Flemingdon Park, Don Mills South",43.7259,-79.340923
6,M2H,North York,Hillcrest Village,43.803762,-79.363452
7,M3H,North York,"Bathurst Manor, Downsview North, Wilson Heights",43.754328,-79.442259
8,M2J,North York,"Fairview, Henry Farm, Oriole",43.778517,-79.346556
9,M3J,North York,"Northwood Park, York University",43.76798,-79.487262


Lets Visualize North York

In [12]:
# create map of North York using latitude and longitude values
address = 'North York, TO'

geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

map_NY = folium.Map(location=[latitude, longitude], zoom_start=12)

# add markers to map
for lat, lng, borough, neighborhood in zip(ny['Latitude'], ny['Longitude'], ny['Borough'], ny['Neighbourhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7).add_to(map_NY)  
        #,
        #parse_html=True).add_to(map_toronto)  
    
map_NY

The geograpical coordinate of Toronto are 43.7543263, -79.4491169663959.


Lets Explore North York

In [13]:
CLIENT_ID = 'BXLLGXFP5RTLUTYT4NG1GHZ40EXKGBL4HKAIPLLC1RERZ04O' # your Foursquare ID
CLIENT_SECRET = 'ZH4OTECGFUJMRK4JG2C22THKR53WSYXHY44HSVEV1IAGRNYL' # your Foursquare Secret
VERSION = '20190605' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: BXLLGXFP5RTLUTYT4NG1GHZ40EXKGBL4HKAIPLLC1RERZ04O
CLIENT_SECRET:ZH4OTECGFUJMRK4JG2C22THKR53WSYXHY44HSVEV1IAGRNYL


Choose one of the Neighbourhood

In [14]:
ny.loc[2,'Neighbourhood'].split(',')[0]

'Lawrence Heights'

Get Neighbourhood Coordinate. Because the selected postal code consist of two neighbourhood. Select one only and define new coordinate with geopy

In [15]:
address = '{}'.format(ny.loc[2,'Neighbourhood'].split(',')[0])

geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
LH_latitude = location.latitude
LH_longitude = location.longitude
print('The geograpical coordinate of {} are {}, {}.'.format(ny.loc[2,'Neighbourhood'].split(',')[0],latitude, longitude))

The geograpical coordinate of Lawrence Heights are 43.7543263, -79.4491169663959.


#### Now, let's get the top 100 venues that are in Lawrence Height within a radius of 500 meters.

In [16]:
LIMIT = 100 # limit of number of venues returned by Foursquare API
radius = 500 # define radius
# create URL
url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    LH_latitude, 
    LH_longitude, 
    radius, 
    LIMIT)
url # display URL

'https://api.foursquare.com/v2/venues/explore?&client_id=BXLLGXFP5RTLUTYT4NG1GHZ40EXKGBL4HKAIPLLC1RERZ04O&client_secret=ZH4OTECGFUJMRK4JG2C22THKR53WSYXHY44HSVEV1IAGRNYL&v=20190605&ll=43.7227784,-79.4509332&radius=500&limit=100'

Send the GET request and examine the resutls

In [17]:
results = requests.get(url).json()
results

{'meta': {'code': 200, 'requestId': '5e03d83d216785001bd974d4'},
 'response': {'suggestedFilters': {'header': 'Tap to show:',
   'filters': [{'name': 'Open now', 'key': 'openNow'}]},
  'headerLocation': 'Toronto',
  'headerFullLocation': 'Toronto',
  'headerLocationGranularity': 'city',
  'totalResults': 92,
  'suggestedBounds': {'ne': {'lat': 43.727278404500005,
    'lng': -79.4447181044291},
   'sw': {'lat': 43.7182783955, 'lng': -79.45714829557089}},
  'groups': [{'type': 'Recommended Places',
    'name': 'recommended',
    'items': [{'reasons': {'count': 0,
       'items': [{'summary': 'This spot is popular',
         'type': 'general',
         'reasonName': 'globalInteractionReason'}]},
      'venue': {'id': '50a856a5e4b0bfc9165ef55c',
       'name': 'Ted Baker London',
       'location': {'address': '3401 Dufferin St.',
        'crossStreet': 'in Yorkdale Shopping Centre',
        'lat': 43.72451893248142,
        'lng': -79.45270961233477,
        'labeledLatLngs': [{'label': '

By looking at the json output, we can see that items key and we get useful information using results['response']['groups'][0]['items']

In [18]:
# function that extracts the category of the venue
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

Now we are ready to clean the json and structure it into a *pandas* dataframe.

In [19]:
venues = results['response']['groups'][0]['items']
    
nearby_venues = json_normalize(venues) # flatten JSON

# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues =nearby_venues.loc[:, filtered_columns]

# filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

print('{} venues were returned by Foursquare.'.format(nearby_venues.shape[0]))
nearby_venues.head()

92 venues were returned by Foursquare.


Unnamed: 0,name,categories,lat,lng
0,Ted Baker London,Clothing Store,43.724519,-79.45271
1,Holt Renfrew,Clothing Store,43.724625,-79.451664
2,Apple Yorkdale,Electronics Store,43.724262,-79.453103
3,The Lego Store,Toy / Game Store,43.725146,-79.452974
4,Tiffany & Co.,Jewelry Store,43.724858,-79.452096


In [20]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighbourhood', 
                  'Neighbourhood Latitude', 
                  'Neighbourhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [21]:
# type your answer here

ny_venues = getNearbyVenues(names=ny['Neighbourhood'],
                                   latitudes=ny['Latitude'],
                                   longitudes=ny['Longitude']
                                  )



Parkwoods
Victoria Village
Lawrence Heights, Lawrence Manor
Don Mills North
Glencairn
Flemingdon Park, Don Mills South
Hillcrest Village
Bathurst Manor, Downsview North, Wilson Heights
Fairview, Henry Farm, Oriole
Northwood Park, York University
Bayview Village
CFB Toronto, Downsview East
Silver Hills, York Mills
Downsview West
Downsview, North Park, Upwood Park
Humber Summit
Newtonbrook, Willowdale
Downsview Central
Bedford Park, Lawrence Manor East
Emery, Humberlea
Willowdale South
Downsview Northwest
York Mills West
Willowdale West


In [22]:
print(ny_venues.shape)
ny_venues.head()


(242, 7)


Unnamed: 0,Neighbourhood,Neighbourhood Latitude,Neighbourhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Parkwoods,43.753259,-79.329656,Brookbanks Park,43.751976,-79.33214,Park
1,Parkwoods,43.753259,-79.329656,Variety Store,43.751974,-79.333114,Food & Drink Shop
2,Victoria Village,43.725882,-79.315572,Victoria Village Arena,43.723481,-79.315635,Hockey Arena
3,Victoria Village,43.725882,-79.315572,Tim Hortons,43.725517,-79.313103,Coffee Shop
4,Victoria Village,43.725882,-79.315572,Portugril,43.725819,-79.312785,Portuguese Restaurant


In [23]:
print('Venues returned for each neighbourhood: ')
ny_venues.groupby('Neighbourhood')['Venue'].count()

Venues returned for each neighbourhood: 


Neighbourhood
Bathurst Manor, Downsview North, Wilson Heights    20
Bayview Village                                     4
Bedford Park, Lawrence Manor East                  23
CFB Toronto, Downsview East                         2
Don Mills North                                     6
Downsview Central                                   4
Downsview Northwest                                 6
Downsview West                                      6
Downsview, North Park, Upwood Park                  4
Emery, Humberlea                                    1
Fairview, Henry Farm, Oriole                       60
Flemingdon Park, Don Mills South                   22
Glencairn                                           4
Hillcrest Village                                   5
Humber Summit                                       2
Lawrence Heights, Lawrence Manor                   13
Northwood Park, York University                     6
Parkwoods                                           2
Silver Hills, 

Lets analysed each neighbourhood

In [24]:
# one hot encoding
ny_onehot = pd.get_dummies(ny_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
ny_onehot['Neighbourhood'] = ny_venues['Neighbourhood'] 

# move neighborhood column to the first column
fixed_columns = [ny_onehot.columns[-1]] + list(ny_onehot.columns[:-1])
ny_onehot = ny_onehot[fixed_columns]

ny_onehot.head(15)

Unnamed: 0,Neighbourhood,Accessories Store,Airport,American Restaurant,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,Bakery,Bank,Bar,...,Sushi Restaurant,Tea Room,Thai Restaurant,Theater,Toy / Game Store,Video Game Store,Video Store,Vietnamese Restaurant,Wings Joint,Women's Store
0,Parkwoods,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Parkwoods,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Victoria Village,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Victoria Village,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Victoria Village,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,Victoria Village,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,Victoria Village,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,"Lawrence Heights, Lawrence Manor",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,"Lawrence Heights, Lawrence Manor",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,"Lawrence Heights, Lawrence Manor",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


And let's examine the new dataframe size.

In [25]:
ny_grouped = ny_onehot.groupby('Neighbourhood').mean().reset_index()
ny_grouped

Unnamed: 0,Neighbourhood,Accessories Store,Airport,American Restaurant,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,Bakery,Bank,Bar,...,Sushi Restaurant,Tea Room,Thai Restaurant,Theater,Toy / Game Store,Video Game Store,Video Store,Vietnamese Restaurant,Wings Joint,Women's Store
0,"Bathurst Manor, Downsview North, Wilson Heights",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.05,0.0,...,0.05,0.0,0.0,0.0,0.0,0.0,0.05,0.0,0.0,0.0
1,Bayview Village,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"Bedford Park, Lawrence Manor East",0.0,0.0,0.043478,0.0,0.0,0.0,0.0,0.0,0.0,...,0.043478,0.0,0.043478,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,"CFB Toronto, Downsview East",0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Don Mills North,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,Downsview Central,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,Downsview Northwest,0.0,0.0,0.0,0.0,0.0,0.166667,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,Downsview West,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.166667,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,"Downsview, North Park, Upwood Park",0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,"Emery, Humberlea",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### Let's confirm the new size

In [26]:
ny_grouped.shape

(23, 103)

## Part 3 - Clustering Based on top 5 most common venues

#### Let's print each neighborhood along with the top 5 most common venues

In [27]:
num_top_venues = 5

for hood in ny_grouped['Neighbourhood']:
    print("----"+hood+"----")
    temp = ny_grouped[ny_grouped['Neighbourhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Bathurst Manor, Downsview North, Wilson Heights----
                venue  freq
0         Coffee Shop  0.10
1  Frozen Yogurt Shop  0.05
2         Bridal Shop  0.05
3            Pharmacy  0.05
4       Deli / Bodega  0.05


----Bayview Village----
                 venue  freq
0   Chinese Restaurant  0.25
1                 Bank  0.25
2                 Café  0.25
3  Japanese Restaurant  0.25
4    Accessories Store  0.00


----Bedford Park, Lawrence Manor East----
                  venue  freq
0             Juice Bar  0.09
1           Coffee Shop  0.09
2  Fast Food Restaurant  0.09
3    Italian Restaurant  0.09
4                   Pub  0.04


----CFB Toronto, Downsview East----
         venue  freq
0      Airport   0.5
1         Park   0.5
2  Pizza Place   0.0
3     Pharmacy   0.0
4    Pet Store   0.0


----Don Mills North----
                  venue  freq
0  Gym / Fitness Center  0.17
1  Caribbean Restaurant  0.17
2   Japanese Restaurant  0.17
3        Baseball Field  0.17
4      Baske

#### Let's put that into a *pandas* dataframe

First, let's write a function to sort the venues in descending order.

In [28]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

Now let's create the new dataframe and display the top 10 venues for each neighborhood.

In [29]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighbourhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighbourhoods_venues_sorted = pd.DataFrame(columns=columns)
neighbourhoods_venues_sorted['Neighbourhood'] = ny_grouped['Neighbourhood']

for ind in np.arange(ny_grouped.shape[0]):
    neighbourhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(ny_grouped.iloc[ind, :], num_top_venues)

neighbourhoods_venues_sorted.head()

Unnamed: 0,Neighbourhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,"Bathurst Manor, Downsview North, Wilson Heights",Coffee Shop,Gas Station,Shopping Mall,Middle Eastern Restaurant,Pharmacy,Pizza Place,Deli / Bodega,Bridal Shop,Diner,Restaurant
1,Bayview Village,Chinese Restaurant,Café,Bank,Japanese Restaurant,Women's Store,Dog Run,Construction & Landscaping,Convenience Store,Cosmetics Shop,Deli / Bodega
2,"Bedford Park, Lawrence Manor East",Fast Food Restaurant,Italian Restaurant,Juice Bar,Coffee Shop,Grocery Store,Sandwich Place,Greek Restaurant,Indian Restaurant,Liquor Store,Comfort Food Restaurant
3,"CFB Toronto, Downsview East",Airport,Park,Women's Store,Dog Run,Concert Hall,Construction & Landscaping,Convenience Store,Cosmetics Shop,Deli / Bodega,Department Store
4,Don Mills North,Japanese Restaurant,Gym / Fitness Center,Caribbean Restaurant,Café,Baseball Field,Basketball Court,Electronics Store,Convenience Store,Cosmetics Shop,Deli / Bodega


Run *k*-means to cluster the neighborhood into 5 clusters.

In [30]:
# set number of clusters
kclusters = 5

ny_grouped_clustering = ny_grouped.drop('Neighbourhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(ny_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([1, 1, 1, 4, 1, 1, 1, 1, 1, 2])

Let's create a new dataframe that includes the cluster as well as the top 10 venues for each neighborhood.

In [31]:
# add clustering labels
neighbourhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

ny_merged = ny

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
ny_merged = ny_merged.join(neighbourhoods_venues_sorted.set_index('Neighbourhood'), on='Neighbourhood')

ny_merged.head() # check the last columns!

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M3A,North York,Parkwoods,43.753259,-79.329656,0.0,Park,Food & Drink Shop,Women's Store,Dog Run,Concert Hall,Construction & Landscaping,Convenience Store,Cosmetics Shop,Deli / Bodega,Department Store
1,M4A,North York,Victoria Village,43.725882,-79.315572,1.0,Pizza Place,Hockey Arena,Portuguese Restaurant,Intersection,Coffee Shop,Women's Store,Discount Store,Concert Hall,Construction & Landscaping,Convenience Store
2,M6A,North York,"Lawrence Heights, Lawrence Manor",43.718518,-79.464763,1.0,Clothing Store,Furniture / Home Store,Women's Store,Miscellaneous Shop,Athletics & Sports,Boutique,Coffee Shop,Gift Shop,Accessories Store,Vietnamese Restaurant
3,M3B,North York,Don Mills North,43.745906,-79.352188,1.0,Japanese Restaurant,Gym / Fitness Center,Caribbean Restaurant,Café,Baseball Field,Basketball Court,Electronics Store,Convenience Store,Cosmetics Shop,Deli / Bodega
4,M6B,North York,Glencairn,43.709577,-79.445073,1.0,Japanese Restaurant,Pub,Metro Station,Park,Frozen Yogurt Shop,Dim Sum Restaurant,Comfort Food Restaurant,Concert Hall,Construction & Landscaping,Gas Station


Finally, let's visualize the resulting clusters

In [32]:
ny_merged.dropna(axis=0, how='any', inplace=True)
ny_merged['Cluster Labels']

0     0.0
1     1.0
2     1.0
3     1.0
4     1.0
5     1.0
6     1.0
7     1.0
8     1.0
9     1.0
10    1.0
11    4.0
12    0.0
13    1.0
14    1.0
15    3.0
17    1.0
18    1.0
19    2.0
20    1.0
21    1.0
22    1.0
23    1.0
Name: Cluster Labels, dtype: float64

In [33]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(ny_merged['Latitude'], ny_merged['Longitude'], ny_merged['Neighbourhood'],ny_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[int(cluster)-1],
        fill=True,
        fill_color=rainbow[int(cluster)-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

### Examine Clusters
#### Cluster 1

In [34]:
ny_merged.loc[ny_merged['Cluster Labels'] == 0, ny_merged.columns[[1] + list(range(5, ny_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,North York,0.0,Park,Food & Drink Shop,Women's Store,Dog Run,Concert Hall,Construction & Landscaping,Convenience Store,Cosmetics Shop,Deli / Bodega,Department Store
12,North York,0.0,Park,Cafeteria,Women's Store,Coffee Shop,Concert Hall,Construction & Landscaping,Convenience Store,Cosmetics Shop,Deli / Bodega,Department Store


#### Cluster 2

In [35]:
ny_merged.loc[ny_merged['Cluster Labels'] == 1, ny_merged.columns[[1] + list(range(5, ny_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
1,North York,1.0,Pizza Place,Hockey Arena,Portuguese Restaurant,Intersection,Coffee Shop,Women's Store,Discount Store,Concert Hall,Construction & Landscaping,Convenience Store
2,North York,1.0,Clothing Store,Furniture / Home Store,Women's Store,Miscellaneous Shop,Athletics & Sports,Boutique,Coffee Shop,Gift Shop,Accessories Store,Vietnamese Restaurant
3,North York,1.0,Japanese Restaurant,Gym / Fitness Center,Caribbean Restaurant,Café,Baseball Field,Basketball Court,Electronics Store,Convenience Store,Cosmetics Shop,Deli / Bodega
4,North York,1.0,Japanese Restaurant,Pub,Metro Station,Park,Frozen Yogurt Shop,Dim Sum Restaurant,Comfort Food Restaurant,Concert Hall,Construction & Landscaping,Gas Station
5,North York,1.0,Beer Store,Coffee Shop,Gym,Asian Restaurant,Sandwich Place,Clothing Store,Chinese Restaurant,Café,Dim Sum Restaurant,Japanese Restaurant
6,North York,1.0,Golf Course,Fast Food Restaurant,Pool,Mediterranean Restaurant,Dog Run,Frozen Yogurt Shop,Dim Sum Restaurant,Concert Hall,Gift Shop,Construction & Landscaping
7,North York,1.0,Coffee Shop,Gas Station,Shopping Mall,Middle Eastern Restaurant,Pharmacy,Pizza Place,Deli / Bodega,Bridal Shop,Diner,Restaurant
8,North York,1.0,Clothing Store,Fast Food Restaurant,Coffee Shop,Japanese Restaurant,Women's Store,Juice Bar,Food Court,Bakery,Movie Theater,Burger Joint
9,North York,1.0,Coffee Shop,Miscellaneous Shop,Caribbean Restaurant,Metro Station,Bar,Massage Studio,Discount Store,Construction & Landscaping,Convenience Store,Cosmetics Shop
10,North York,1.0,Chinese Restaurant,Café,Bank,Japanese Restaurant,Women's Store,Dog Run,Construction & Landscaping,Convenience Store,Cosmetics Shop,Deli / Bodega


#### Cluster 3

In [36]:
ny_merged.loc[ny_merged['Cluster Labels'] == 2, ny_merged.columns[[1] + list(range(5, ny_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
19,North York,2.0,Baseball Field,Women's Store,Electronics Store,Concert Hall,Construction & Landscaping,Convenience Store,Cosmetics Shop,Deli / Bodega,Department Store,Dim Sum Restaurant


#### Cluster 4

In [37]:
ny_merged.loc[ny_merged['Cluster Labels'] == 3, ny_merged.columns[[1] + list(range(5, ny_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
15,North York,3.0,Empanada Restaurant,Shopping Mall,Women's Store,Discount Store,Comfort Food Restaurant,Concert Hall,Construction & Landscaping,Convenience Store,Cosmetics Shop,Deli / Bodega


#### Cluster 5

In [38]:
ny_merged.loc[ny_merged['Cluster Labels'] == 4, ny_merged.columns[[1] + list(range(5, ny_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
11,North York,4.0,Airport,Park,Women's Store,Dog Run,Concert Hall,Construction & Landscaping,Convenience Store,Cosmetics Shop,Deli / Bodega,Department Store
