# Coursera Capstone Project Part 3
This notebook is used to cluster the neighborhoods from Toronto into 7 clusters

## 1.Import the libraries and the data

In [18]:
import json

import folium
import numpy as np
import pandas as pd
import requests
from geopy.geocoders import Nominatim

# `pandas.io.json.json_normalize` has been deprecated since pandas 1.0 in
# favour of the top-level `pandas.json_normalize`; try the supported name
# first and only fall back to the legacy path on old installations.
try:
    from pandas import json_normalize
except ImportError:
    from pandas.io.json import json_normalize

# Load the neighborhood table and drop the 'Unnamed: 0' column
# (presumably a leftover index written when the CSV was saved).
df_toronto = pd.read_csv('toronto.csv')
df_toronto = df_toronto.drop('Unnamed: 0', axis=1)

In [64]:
# (rows, columns) of the loaded neighborhood table
df_toronto.shape

(103, 5)

## 2.Create the Map with Toronto Neighborhoods

First we use Nominatim to get the latitude and longitude of Toronto

In [14]:
# Geocode the city name to obtain the centre point used by the maps below.
adress = "Toronto, Canada"
geolocator = Nominatim(user_agent = "toronto_explorer")
# An explicit timeout keeps the cell from hanging on a slow geocoding service.
local = geolocator.geocode(adress, timeout=10)
# geocode() returns None when nothing matches; fail with a clear message
# instead of an AttributeError on the attribute access below.
if local is None:
    raise ValueError("Nominatim returned no result for {!r}".format(adress))
latitude = local.latitude
longitude = local.longitude
print(latitude,longitude)

43.653963 -79.387207


Then, we create a map of Toronto with its neighborhoods

In [24]:
# Base map centred on the geocoded city coordinates.
toronto = folium.Map(location=[latitude, longitude], zoom_start=11)

# Add one blue circle per row of df_toronto; the popup reads "Borough:Neighborhood".
for lat, lng, borough, name in zip(
        df_toronto['Latitude'],
        df_toronto['Longitude'],
        df_toronto['Borough'],
        df_toronto['Neighboorhood']):
    popup = folium.Popup("{}:{}".format(borough, name), parse_html=True)
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=popup,
        color='blue',
        fill=True,
        fill_opacity=0.7,
        parse_html=False)
    marker.add_to(toronto)

# Last expression of the cell: renders the map.
toronto

## 3. Using the Foursquare API to explore each Neighborhood

First we set the parameters for using the API

In [None]:
# Foursquare_Developer.json is a file on the local machine that holds the
# client id and client secret for the Foursquare API, so the credentials
# never appear in the notebook itself.
with open('Foursquare_Developer.json') as fs:
    credentials = json.load(fs)
CLIENT_ID, CLIENT_SECRET = credentials["Client ID"], credentials["Client SECRET"]

# Request settings.
VERSION = '20180605'  # sent as the `v` query parameter
# NOTE(review): RADIUS is not referenced by the visible code; getNearbyVenues
# is called without `radius` and therefore uses its own default of 700.
RADIUS = 600
LIMIT = 150           # sent as the `limit` query parameter

The function below makes requests to the API to get the nearby venues for each given
latitude and longitude, and returns a dataframe with the data.

In [41]:
def getNearbyVenues(postals, names, latitudes, longitudes, radius= 700):
    """Query the Foursquare `venues/explore` endpoint once per location.

    Parameters
    ----------
    postals, names, latitudes, longitudes : iterables
        Parallel sequences describing each location; they are consumed
        together with ``zip`` so iteration stops at the shortest one.
    radius : number, default 700
        Value sent as the ``radius`` query parameter.

    Returns
    -------
    pandas.DataFrame
        One row per returned venue with the columns 'Postal Code',
        'Neighborhood', 'Neighborhood Latitude', 'Neighborhood Longitude',
        'Venue', 'Venue Latitude', 'Venue Longitude' and 'Venue Category'.
        The frame has these columns even when no venue was found.

    Raises
    ------
    requests.HTTPError
        If the API answers with an HTTP error status.

    Notes
    -----
    Reads the module-level ``CLIENT_ID``, ``CLIENT_SECRET``, ``VERSION`` and
    ``LIMIT`` values.
    """
    columns = ['Postal Code',
               'Neighborhood',
               'Neighborhood Latitude',
               'Neighborhood Longitude',
               'Venue',
               'Venue Latitude',
               'Venue Longitude',
               'Venue Category']
    endpoint = 'https://api.foursquare.com/v2/venues/explore'

    rows = []
    for postal, name, lat, lng in zip(postals, names, latitudes, longitudes):
        # Let requests build and escape the query string.
        params = {
            'client_id': CLIENT_ID,
            'client_secret': CLIENT_SECRET,
            'v': VERSION,
            'll': '{},{}'.format(lat, lng),
            'radius': radius,
            'limit': LIMIT,
        }

        # A timeout avoids hanging forever on one location; raise_for_status
        # surfaces API errors instead of a confusing KeyError further down.
        response = requests.get(endpoint, params=params, timeout=30)
        response.raise_for_status()

        # A location without recommendations may come back with no groups.
        groups = response.json().get('response', {}).get('groups') or []
        results = groups[0].get('items', []) if groups else []

        # Keep only the relevant information for each nearby venue.
        for item in results:
            venue = item['venue']
            # Some venues carry no category; record None rather than crash.
            categories = venue.get('categories') or []
            rows.append((
                postal,
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                categories[0]['name'] if categories else None))

    # Passing `columns` here keeps the schema stable even when `rows` is
    # empty (assigning 8 column names to an empty frame raises ValueError).
    nearby_venues = pd.DataFrame(rows, columns=columns)

    return(nearby_venues)

We call the function above for every neighborhood in our dataframe to get the venues

In [42]:
# Fetch the venues around every row of df_toronto. `radius` is not passed,
# so the function's own default applies.
toronto_venue = getNearbyVenues(
    postals=df_toronto['PostalCode'],
    names=df_toronto['Neighboorhood'],
    latitudes=df_toronto['Latitude'],
    longitudes=df_toronto['Longitude'],
)
toronto_venue.head()

Parkwoods
Victoria Village
Harbourfront, Regent Park
Lawrence Heights, Lawrence Manor
Queen's Park
Islington Avenue
Rouge, Malvern
Don Mills North
Woodbine Gardens, Parkview Hill
Ryerson, Garden District
Glencairn
Cloverdale, Islington, Martin Grove, Princess Gardens, West Deane Park
Highland Creek, Rouge Hill, Port Union
Flemingdon Park, Don Mills South
Woodbine Heights
St. James Town
Humewood-Cedarvale
Bloordale Gardens, Eringate, Markland Wood, Old Burnhamthorpe
Guildwood, Morningside, West Hill
The Beaches
Berczy Park
Caledonia-Fairbanks
Woburn
Leaside
Central Bay Street
Christie
Cedarbrae
Hillcrest Village
Bathurst Manor, Downsview North, Wilson Heights
Thorncliffe Park
Adelaide, King, Richmond
Dovercourt Village, Dufferin
Scarborough Village
Fairview, Henry Farm, Oriole
Northwood Park, York University
East Toronto
Harbourfront East, Toronto Islands, Union Station
Little Portugal, Trinity
East Birchmount Park, Ionview, Kennedy Park
Bayview Village
CFB Toronto, Downsview East
The D

Let's see how many venues we got

In [43]:
# Non-null value count of every column per postal code, i.e. the number of
# venue rows returned for each postal code.
toronto_venue.groupby('Postal Code').count()

Unnamed: 0_level_0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Postal Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
M1B,6,6,6,6,6,6,6
M1C,4,4,4,4,4,4,4
M1E,20,20,20,20,20,20,20
M1G,6,6,6,6,6,6,6
M1H,18,18,18,18,18,18,18
M1J,6,6,6,6,6,6,6
M1K,14,14,14,14,14,14,14
M1L,16,16,16,16,16,16,16
M1M,4,4,4,4,4,4,4
M1N,9,9,9,9,9,9,9


## 4. Let's Explore the Neighborhoods

Let's make one column per Venue Category

In [58]:
# One indicator column per venue category; the empty prefix keeps the bare
# category name as the column label.
toronto_onehot = pd.get_dummies(
    toronto_venue['Venue Category'],
    prefix="",
    prefix_sep="",
)

# Append the neighborhood name, then rotate that last column to the front.
toronto_onehot['Neighbourhood'] = toronto_venue['Neighborhood']
column_order = list(toronto_onehot.columns)
toronto_onehot = toronto_onehot[column_order[-1:] + column_order[:-1]]
toronto_onehot.head()

Unnamed: 0,Neighbourhood,Accessories Store,Adult Boutique,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,...,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio
0,Parkwoods,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Parkwoods,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Parkwoods,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Parkwoods,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Parkwoods,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Let's group the rows by neighborhood, taking the mean of each one-hot column, which gives the frequency of each venue category in that neighborhood

In [61]:
# Mean of every indicator column per neighborhood; as_index=False keeps
# 'Neighbourhood' as a regular first column instead of the index.
toronto_grouped = toronto_onehot.groupby('Neighbourhood', as_index=False).mean()
toronto_grouped.head()

Unnamed: 0,Neighbourhood,Accessories Store,Adult Boutique,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,...,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio
0,"Adelaide, King, Richmond",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.01,0.0,0.0,0.0,0.0,0.01,0.0,0.0,0.0,0.0
1,Agincourt,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"Agincourt North, L'Amoreaux East, Milliken, St...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,"Albion Gardens, Beaumond Heights, Humbergate, ...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,"Alderwood, Long Branch",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [63]:
# (distinct neighborhood names with venues, 1 name column + one column per venue category)
toronto_grouped.shape

(102, 321)

The function below sorts a row by the frequency of each venue category and returns the top 'n' categories.

In [65]:
def return_most_common_venues(row, num_top_venues):
    """Return the labels of the highest-valued entries of ``row``.

    The first entry of ``row`` is skipped (the caller passes rows whose
    first field is the neighborhood name); the remaining entries are sorted
    in descending order and the index labels of the first
    ``num_top_venues`` of them are returned as an array.
    """
    ranked = row.iloc[1:].sort_values(ascending=False)
    return ranked.index[:num_top_venues].values

Let's create a new dataframe with the 10 most popular venues of each Neighborhood.

In [77]:
num_top_venues = 10


def _ordinal(n):
    """Return ``n`` with its English ordinal suffix (1st, 2nd, 3rd, 4th, 11th, 21st, ...)."""
    # 11, 12 and 13 take 'th' even though they end in 1, 2 and 3.
    if 11 <= n % 100 <= 13:
        suffix = 'th'
    else:
        suffix = {1: 'st', 2: 'nd', 3: 'rd'}.get(n % 10, 'th')
    return '{}{}'.format(n, suffix)


# Column labels: 'Neighborhood' followed by '1st Most Common Venue', ...
# Built from an explicit suffix rule instead of catching an IndexError with
# a bare `except`, which would also hide unrelated errors.
columns = ['Neighborhood'] + [
    '{} Most Common Venue'.format(_ordinal(rank))
    for rank in range(1, num_top_venues + 1)
]

# Rank the categories of every neighborhood once and build the frame in one
# go rather than assigning into it cell by cell.
top_venues = [
    return_most_common_venues(row, num_top_venues)
    for _, row in toronto_grouped.iterrows()
]
neighborhoods_venues_sorted = pd.DataFrame(
    top_venues, columns=columns[1:], index=toronto_grouped.index
)
neighborhoods_venues_sorted.insert(0, 'Neighborhood', toronto_grouped['Neighbourhood'])

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,"Adelaide, King, Richmond",Coffee Shop,Café,Thai Restaurant,American Restaurant,Steakhouse,Theater,Sushi Restaurant,Bar,Gastropub,Restaurant
1,Agincourt,Skating Rink,Badminton Court,Coffee Shop,Breakfast Spot,Pool Hall,Shanghai Restaurant,Lounge,Motorcycle Shop,Yoga Studio,Discount Store
2,"Agincourt North, L'Amoreaux East, Milliken, St...",Pharmacy,Pizza Place,Fast Food Restaurant,Chinese Restaurant,Noodle House,BBQ Joint,Gym,Park,Malay Restaurant,Shop & Service
3,"Albion Gardens, Beaumond Heights, Humbergate, ...",Grocery Store,Pizza Place,Hardware Store,Sandwich Place,Liquor Store,Beer Store,Fried Chicken Joint,Fast Food Restaurant,Pharmacy,Empanada Restaurant
4,"Alderwood, Long Branch",Pizza Place,Coffee Shop,Gas Station,Pharmacy,Gym,Sandwich Place,Pool,Athletics & Sports,Pub,Convenience Store


## 5. Let's Cluster the Neighborhoods Using Kmeans

We are going to use the K-Means algorithm to cluster our neighborhoods

In [69]:
from sklearn.cluster import KMeans

# Number of clusters to fit.
k = 7

# Feature matrix for clustering: the per-neighborhood category frequencies
# without the name column.
toronto_clusters = toronto_grouped.drop(columns='Neighbourhood')

In [72]:
# Fit K-Means on the frequency matrix. `n_init` is pinned to 10 (the
# historical scikit-learn default) because newer releases changed the
# default, which would otherwise alter the labels and/or emit a
# FutureWarning depending on the installed version. `random_state` fixes
# the centroid initialisation so the run is reproducible.
kmeans = KMeans(n_clusters=k, n_init=10, random_state=0).fit(toronto_clusters)
kmeans.labels_

array([2, 2, 1, 0, 0, 2, 2, 2, 2, 1, 1, 2, 1, 5, 2, 2, 1, 2, 2, 2, 2, 2,
       2, 2, 1, 1, 0, 2, 2, 1, 2, 2, 2, 2, 1, 1, 1, 0, 0, 1, 2, 2, 6, 2,
       2, 2, 5, 0, 1, 2, 2, 2, 2, 1, 1, 1, 6, 1, 2, 1, 2, 2, 1, 2, 1, 2,
       2, 1, 5, 5, 2, 4, 2, 2, 1, 2, 5, 1, 3, 2, 2, 3, 5, 2, 2, 2, 2, 2,
       2, 2, 0, 2, 2, 2, 0, 1, 2, 0, 5, 1, 1, 5], dtype=int32)

In [78]:
# DataFrame.insert raises if the column already exists, so drop any label
# column left over from a previous run; this keeps the cell re-runnable.
if 'Cluster Labels' in neighborhoods_venues_sorted.columns:
    neighborhoods_venues_sorted = neighborhoods_venues_sorted.drop(columns='Cluster Labels')

# Put the fitted label of each neighborhood in the first column. The rows
# are in the same order as toronto_grouped, which the model was fitted on.
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)
neighborhoods_venues_sorted.head()

Unnamed: 0,Cluster Labels,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,2,"Adelaide, King, Richmond",Coffee Shop,Café,Thai Restaurant,American Restaurant,Steakhouse,Theater,Sushi Restaurant,Bar,Gastropub,Restaurant
1,2,Agincourt,Skating Rink,Badminton Court,Coffee Shop,Breakfast Spot,Pool Hall,Shanghai Restaurant,Lounge,Motorcycle Shop,Yoga Studio,Discount Store
2,1,"Agincourt North, L'Amoreaux East, Milliken, St...",Pharmacy,Pizza Place,Fast Food Restaurant,Chinese Restaurant,Noodle House,BBQ Joint,Gym,Park,Malay Restaurant,Shop & Service
3,0,"Albion Gardens, Beaumond Heights, Humbergate, ...",Grocery Store,Pizza Place,Hardware Store,Sandwich Place,Liquor Store,Beer Store,Fried Chicken Joint,Fast Food Restaurant,Pharmacy,Empanada Restaurant
4,0,"Alderwood, Long Branch",Pizza Place,Coffee Shop,Gas Station,Pharmacy,Gym,Sandwich Place,Pool,Athletics & Sports,Pub,Convenience Store


In [80]:
# Attach the cluster label and top venues to every row of df_toronto by
# matching its 'Neighboorhood' column against the 'Neighborhood' values of
# the venue table. Rows without a match get NaN, which is why the labels
# show up as floats.
venues_by_neighborhood = neighborhoods_venues_sorted.set_index('Neighborhood')
toronto_merged = df_toronto.join(venues_by_neighborhood, on='Neighboorhood')

toronto_merged.head()

Unnamed: 0,PostalCode,Borough,Neighboorhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M3A,North York,Parkwoods,43.753259,-79.329656,1.0,Pool,Fast Food Restaurant,Pet Store,Park,Food & Drink Shop,Yoga Studio,Doner Restaurant,Dessert Shop,Dim Sum Restaurant,Diner
1,M4A,North York,Victoria Village,43.725882,-79.315572,2.0,Sporting Goods Shop,Hockey Arena,Coffee Shop,Portuguese Restaurant,Park,Café,Playground,Greek Restaurant,Discount Store,Deli / Bodega
2,M5A,Downtown Toronto,"Harbourfront, Regent Park",43.65426,-79.360636,2.0,Coffee Shop,Café,Restaurant,Park,Bakery,Pub,Theater,Mexican Restaurant,Dance Studio,Breakfast Spot
3,M6A,North York,"Lawrence Heights, Lawrence Manor",43.718518,-79.464763,2.0,Clothing Store,Vietnamese Restaurant,Coffee Shop,Furniture / Home Store,Accessories Store,Cheese Shop,Bowling Alley,Boutique,Fast Food Restaurant,Miscellaneous Shop
4,M7A,Queen's Park,Queen's Park,43.662301,-79.389494,2.0,Coffee Shop,Sandwich Place,Italian Restaurant,Gym,Café,Yoga Studio,Gastropub,Falafel Restaurant,Bubble Tea Shop,Burger Joint


#### Let's explore our clusters

#### Cluster 0 

In [87]:
# Postal code, borough and neighborhood of every row labelled cluster 0.
toronto_merged.loc[toronto_merged['Cluster Labels'].eq(0.0), toronto_merged.columns[:3]]

Unnamed: 0,PostalCode,Borough,Neighboorhood
10,M6B,North York,Glencairn
11,M9B,Etobicoke,"Cloverdale, Islington, Martin Grove, Princess ..."
46,M3L,North York,Downsview West
60,M3N,North York,Downsview Northwest
63,M6N,York,"The Junction North, Runnymede"
70,M9P,Etobicoke,Westmount
72,M2R,North York,Willowdale West
89,M9V,Etobicoke,"Albion Gardens, Beaumond Heights, Humbergate, ..."
93,M8W,Etobicoke,"Alderwood, Long Branch"


#### Cluster 1

In [88]:
# Postal code, borough and neighborhood of every row labelled cluster 1.
toronto_merged.loc[toronto_merged['Cluster Labels'].eq(1.0), toronto_merged.columns[:3]]

Unnamed: 0,PostalCode,Borough,Neighboorhood
0,M3A,North York,Parkwoods
5,M9A,Etobicoke,Islington Avenue
8,M4B,East York,"Woodbine Gardens, Parkview Hill"
12,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
14,M4C,East York,Woodbine Heights
17,M9C,Etobicoke,"Bloordale Gardens, Eringate, Markland Wood, Ol..."
18,M1E,Scarborough,"Guildwood, Morningside, West Hill"
21,M6E,York,Caledonia-Fairbanks
27,M2H,North York,Hillcrest Village
31,M6H,West Toronto,"Dovercourt Village, Dufferin"


#### Cluster 2

In [89]:
# Postal code, borough and neighborhood of every row labelled cluster 2.
toronto_merged.loc[toronto_merged['Cluster Labels'].eq(2.0), toronto_merged.columns[:3]]

Unnamed: 0,PostalCode,Borough,Neighboorhood
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Harbourfront, Regent Park"
3,M6A,North York,"Lawrence Heights, Lawrence Manor"
4,M7A,Queen's Park,Queen's Park
7,M3B,North York,Don Mills North
9,M5B,Downtown Toronto,"Ryerson, Garden District"
13,M3C,North York,"Flemingdon Park, Don Mills South"
15,M5C,Downtown Toronto,St. James Town
16,M6C,York,Humewood-Cedarvale
19,M4E,East Toronto,The Beaches


#### Cluster 3

In [90]:
# Postal code, borough and neighborhood of every row labelled cluster 3.
toronto_merged.loc[toronto_merged['Cluster Labels'].eq(3.0), toronto_merged.columns[:3]]

Unnamed: 0,PostalCode,Borough,Neighboorhood
6,M1B,Scarborough,"Rouge, Malvern"
32,M1J,Scarborough,Scarborough Village


#### Cluster 4

In [91]:
# Postal code, borough and neighborhood of every row labelled cluster 4.
toronto_merged.loc[toronto_merged['Cluster Labels'].eq(4.0), toronto_merged.columns[:3]]

Unnamed: 0,PostalCode,Borough,Neighboorhood
94,M9W,Etobicoke,Northwest


#### Cluster 5

In [92]:
# Postal code, borough and neighborhood of every row labelled cluster 5.
toronto_merged.loc[toronto_merged['Cluster Labels'].eq(5.0), toronto_merged.columns[:3]]

Unnamed: 0,PostalCode,Borough,Neighboorhood
22,M1G,Scarborough,Woburn
40,M3K,North York,"CFB Toronto, Downsview East"
45,M2L,North York,"Silver Hills, York Mills"
52,M2M,North York,"Newtonbrook, Willowdale"
66,M2P,North York,York Mills West
68,M5P,Central Toronto,"Forest Hill North, Forest Hill West"
83,M4T,Central Toronto,"Moore Park, Summerhill East"
91,M4W,Downtown Toronto,Rosedale


#### Cluster 6

In [93]:
# Postal code, borough and neighborhood of every row labelled cluster 6.
toronto_merged.loc[toronto_merged['Cluster Labels'].eq(6.0), toronto_merged.columns[:3]]

Unnamed: 0,PostalCode,Borough,Neighboorhood
57,M9M,North York,"Emery, Humberlea"
101,M8Y,Etobicoke,"Humber Bay, King's Mill Park, Kingsway Park So..."


## 6.Map

Creating a map with the labeled neighborhoods.

In [128]:
import matplotlib.cm as cm
import matplotlib.colors as colors

# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters: k evenly spaced rainbow colors
rainbow = [colors.rgb2hex(rgba) for rgba in cm.rainbow(np.linspace(0, 1, k))]
# Neutral color for rows that received no cluster label. Previously these
# were mapped to -1 and indexed rainbow[-2], i.e. the same color as
# cluster 6, making them indistinguishable from a real cluster.
unclustered_color = '#808080'

# add markers to the map
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighboorhood'], toronto_merged['Cluster Labels']):
    if np.isnan(cluster):
        # No label for this row (NaN after the join): keep the -1 marker in
        # the popup text but draw it in the neutral color.
        cluster = -1
        marker_color = unclustered_color
    else:
        # Same color assignment as before for labelled rows; cluster 0
        # wraps around to the last palette entry via index -1.
        marker_color = rainbow[int(cluster - 1)]
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=marker_color,
        fill=True,
        fill_color=marker_color,
        fill_opacity=0.7).add_to(map_clusters)

map_clusters