# Coursera Capstone 
## Segmentation and clustering neighborhoods in toronto // Part 3

In [32]:
import pandas as pd 
import numpy as np 
import matplotlib.cm as cm
import matplotlib.colors as colors
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from geopy.geocoders import Nominatim
import folium
import json 
import requests 
from collections import deque

### Read old data 

In [2]:
data = pd.read_csv("FullData.csv")
data.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494


In [3]:
print('The dataframe has {} boroughs and {} neighborhoods.'.format(
        len(data['Borough'].unique()),
        data.shape[0]
    )
)

The dataframe has 10 boroughs and 103 neighborhoods.


##### Get the latitude and longitude values 

In [46]:
address = 'Toronto, Canada'

geolocator = Nominatim()
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The coordinates of city Toronto are {}, {}.'.format(latitude, longitude))

The coordinates of city Toronto are 43.6534817, -79.3839347.


  This is separate from the ipykernel package so we can avoid doing imports until


### Create map of Toronto using latitude and longitude values

In [48]:
mapToronto = folium.Map(location=[latitude, longitude], zoom_start=10)

neighborhoods = data

# add markers to map
for lat, lng, borough, neighborhood in zip(neighborhoods['Latitude'],
                                           neighborhoods['Longitude'],
                                           neighborhoods['Borough'],
                                           neighborhoods['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7).add_to(mapToronto)  
    
mapToronto


#### Define Foursquare Credentials and Version

In [49]:
CLIENT_ID = 'URHPUSEWESRNWDYSI3SEPC2RYQRSXJCSUZK2OED4NZUUFQCD' 
CLIENT_SECRET = 'O1RDWQU1AGZ4BUIZVCJEE5FWC411PVSLSFB5MXTH4CVJEXA2' 
VERSION = '20180605' # Foursquare API version

### Explore Neighborhoods 

In [50]:
def getNearbyVenues(names, latitudes, longitudes, radius=500, LIMIT = 100):
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])
    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    return(nearby_venues)

In [51]:
Toronto_venues = getNearbyVenues(data.Neighborhood,
                            data.Latitude,
                            data.Longitude)

In [52]:
print(Toronto_venues.shape)
Toronto_venues.head()

(2173, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Parkwoods,43.753259,-79.329656,Brookbanks Park,43.751976,-79.33214,Park
1,Parkwoods,43.753259,-79.329656,Variety Store,43.751974,-79.333114,Food & Drink Shop
2,Victoria Village,43.725882,-79.315572,Victoria Village Arena,43.723481,-79.315635,Hockey Arena
3,Victoria Village,43.725882,-79.315572,Tim Hortons,43.725517,-79.313103,Coffee Shop
4,Victoria Village,43.725882,-79.315572,Portugril,43.725819,-79.312785,Portuguese Restaurant


In [53]:
Toronto_venues.groupby("Neighborhood").Venue.count().sort_values(ascending=False).head()

Neighborhood
Toronto Dominion Centre, Design Exchange             100
Commerce Court, Victoria Hotel                       100
First Canadian Place, Underground city               100
Garden District, Ryerson                             100
Harbourfront East, Union Station, Toronto Islands    100
Name: Venue, dtype: int64

In [54]:
print('There are {} uniques categories.'.format(len(Toronto_venues['Venue Category'].unique())))

There are 267 uniques categories.


### Analyze Each Neighborhood

In [57]:
Toronto_encoded = pd.get_dummies(Toronto_venues["Venue Category"],
                             prefix = "",
                             prefix_sep = "")

Toronto_encoded["Neighborhood"] = Toronto_venues["Neighborhood"]


nindex = list(Toronto_encoded.columns).index("Neighborhood")
cols = deque(Toronto_encoded.columns)
cols.rotate(-nindex)
cols = list(cols)
Toronto_encoded = Toronto_encoded[cols]

Toronto_encoded.head()

Unnamed: 0,Neighborhood,New American Restaurant,Nightclub,Noodle House,Office,Opera House,Optical Shop,Organic Grocery,Other Great Outdoors,Outdoor Sculpture,...,Miscellaneous Shop,Mobile Phone Shop,Modern European Restaurant,Molecular Gastronomy Restaurant,Monument / Landmark,Moroccan Restaurant,Motel,Movie Theater,Museum,Music Venue
0,Parkwoods,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Parkwoods,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Victoria Village,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Victoria Village,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Victoria Village,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [58]:
Toronto_encoded.shape

(2173, 267)

In [59]:
#Find average number of venue categories per neighborhood
Toronto_grouped = Toronto_encoded.groupby('Neighborhood').mean().reset_index()
Toronto_grouped.head()

Unnamed: 0,Neighborhood,New American Restaurant,Nightclub,Noodle House,Office,Opera House,Optical Shop,Organic Grocery,Other Great Outdoors,Outdoor Sculpture,...,Miscellaneous Shop,Mobile Phone Shop,Modern European Restaurant,Molecular Gastronomy Restaurant,Monument / Landmark,Moroccan Restaurant,Motel,Movie Theater,Museum,Music Venue
0,Agincourt,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,"Alderwood, Long Branch",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"Bathurst Manor, Wilson Heights, Downsview North",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Bayview Village,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,"Bedford Park, Lawrence Manor East",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [60]:
Toronto_grouped.shape

(95, 267)

Let's write a function to sort the venues in descending order.

In [28]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    return row_categories_sorted.index.values[0:num_top_venues]

Now let's create the new dataframe and display the top 10 venues for each neighborhood.

In [61]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = Toronto_grouped['Neighborhood']

for ind in np.arange(Toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(Toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Agincourt,Lounge,Breakfast Spot,Skating Rink,Latin American Restaurant,Music Venue,Antique Shop,Airport Service,Airport Terminal,American Restaurant,Aquarium
1,"Alderwood, Long Branch",Pizza Place,Pool,Pharmacy,Sandwich Place,Dance Studio,Athletics & Sports,Pub,Gym,Skating Rink,Coffee Shop
2,"Bathurst Manor, Wilson Heights, Downsview North",Bank,Coffee Shop,Ice Cream Shop,Pizza Place,Video Store,Gas Station,Sushi Restaurant,Supermarket,Diner,Shopping Mall
3,Bayview Village,Bank,Chinese Restaurant,Café,Japanese Restaurant,Art Gallery,Airport Service,Airport Terminal,American Restaurant,Antique Shop,Aquarium
4,"Bedford Park, Lawrence Manor East",Sandwich Place,Italian Restaurant,Coffee Shop,Restaurant,Comfort Food Restaurant,Thai Restaurant,Café,Sushi Restaurant,Fast Food Restaurant,Greek Restaurant


Run *k*-means to cluster the neighborhood into 5 clusters.

In [64]:
Toronto_grouped_clustering = Toronto_grouped.drop('Neighborhood', 1)

# set number of clusters
kclusters = 5

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(Toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
print(kmeans.labels_[0:10])
print(kmeans.labels_.shape)

[1 1 1 1 1 1 1 1 1 1]
(95,)


In [65]:
Toronto_grouped["Cluster Labels"] = kmeans.labels_

In [66]:
# add clustering labels
Toronto_combined = data.merge(Toronto_grouped, left_on = "Neighborhood", right_on = "Neighborhood", how = "outer")

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
Toronto_combined = Toronto_combined.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

Toronto_combined["Cluster Labels"] = Toronto_combined["Cluster Labels"].fillna(5).astype("int")

Toronto_combined.head() # check the last columns!


Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude,New American Restaurant,Nightclub,Noodle House,Office,Opera House,...,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M3A,North York,Parkwoods,43.753259,-79.329656,0.0,0.0,0.0,0.0,0.0,...,Food & Drink Shop,Park,Music Venue,Airport Food Court,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,Aquarium
1,M4A,North York,Victoria Village,43.725882,-79.315572,0.0,0.0,0.0,0.0,0.0,...,Coffee Shop,Hockey Arena,Financial or Legal Service,Portuguese Restaurant,Intersection,Auto Garage,Athletics & Sports,Asian Restaurant,Arts & Crafts Store,Art Gallery
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636,0.0,0.0,0.0,0.0,0.0,...,Coffee Shop,Pub,Bakery,Park,Breakfast Spot,Mexican Restaurant,Theater,Café,Electronics Store,Historic Site
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763,0.0,0.0,0.0,0.0,0.0,...,Furniture / Home Store,Clothing Store,Miscellaneous Shop,Accessories Store,Boutique,Women's Store,Event Space,Gift Shop,Vietnamese Restaurant,Coffee Shop
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494,0.0,0.0,0.0,0.0,0.0,...,Coffee Shop,Diner,Theater,College Auditorium,Distribution Center,Arts & Crafts Store,Fried Chicken Joint,Sandwich Place,Bank,Bar


Let's visualize the resulting clusters

In [68]:
# now create map
mapClusters = folium.Map(location=[latitude, longitude], zoom_start=11)

kclusters = kclusters + 1

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i+x+(i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(Toronto_combined['Latitude'],
                                  Toronto_combined['Longitude'],
                                  Toronto_combined['Neighborhood'],
                                  Toronto_combined['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(mapClusters)
       
mapClusters

### Examine Clusters

#### Cluster 1

In [69]:
Toronto_combined.loc[Toronto_combined['Cluster Labels'] == 0, 
                     "1st Most Common Venue":"10th Most Common Venue"].head()

Unnamed: 0,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
12,Golf Course,Music Venue,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,Aquarium,Airport


#### Cluster 2

In [70]:
Toronto_combined.loc[Toronto_combined['Cluster Labels'] == 1, 
                     "1st Most Common Venue":"10th Most Common Venue"].head()

Unnamed: 0,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
1,Coffee Shop,Hockey Arena,Financial or Legal Service,Portuguese Restaurant,Intersection,Auto Garage,Athletics & Sports,Asian Restaurant,Arts & Crafts Store,Art Gallery
2,Coffee Shop,Pub,Bakery,Park,Breakfast Spot,Mexican Restaurant,Theater,Café,Electronics Store,Historic Site
3,Furniture / Home Store,Clothing Store,Miscellaneous Shop,Accessories Store,Boutique,Women's Store,Event Space,Gift Shop,Vietnamese Restaurant,Coffee Shop
4,Coffee Shop,Diner,Theater,College Auditorium,Distribution Center,Arts & Crafts Store,Fried Chicken Joint,Sandwich Place,Bank,Bar
6,Construction & Landscaping,Fast Food Restaurant,Aquarium,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,Music Venue,Bagel Shop


#### Cluster 3

In [71]:
Toronto_combined.loc[Toronto_combined['Cluster Labels'] == 2, 
                     "1st Most Common Venue":"10th Most Common Venue"].head()

Unnamed: 0,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Food & Drink Shop,Park,Music Venue,Airport Food Court,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,Aquarium
21,Park,Market,Women's Store,Music Venue,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,Aquarium
65,Park,Music Venue,Art Gallery,Airport Service,Airport Terminal,American Restaurant,Antique Shop,Aquarium,Arts & Crafts Store,Airport Gate
91,Park,Trail,Playground,Airport Gate,Airport Service,Airport Terminal,American Restaurant,Antique Shop,Aquarium,Art Gallery


#### Cluster 4

In [72]:
Toronto_combined.loc[Toronto_combined['Cluster Labels'] == 3, 
                     "1st Most Common Venue":"10th Most Common Venue"].head()

Unnamed: 0,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
13,Bar,Music Venue,Airport Gate,BBQ Joint,Auto Workshop,Auto Garage,Athletics & Sports,Asian Restaurant,Arts & Crafts Store,Art Gallery


#### Cluster 5

In [73]:
Toronto_combined.loc[Toronto_combined['Cluster Labels'] == 4, 
                     "1st Most Common Venue":"10th Most Common Venue"].head()

Unnamed: 0,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
58,Baseball Field,Music Venue,Airport Gate,BBQ Joint,Auto Workshop,Auto Garage,Athletics & Sports,Asian Restaurant,Arts & Crafts Store,Art Gallery
101,Baseball Field,Music Venue,Airport Gate,BBQ Joint,Auto Workshop,Auto Garage,Athletics & Sports,Asian Restaurant,Arts & Crafts Store,Art Gallery


### End part 3