<h1 align="center">Applied Data Science Capstone</h1> 
<h3 align="center">Assignment 2 (Part 3)</h3>
<h4 align="center">Segmenting and Clustering Neighborhoods in Toronto</h4>

In [32]:
# Install dependencies
# !pip install geopy
# !pip install folium

In [33]:
# Importing necessary libraries
import numpy as np
import pandas as pd
from pandas.io.json import json_normalize
from sklearn.cluster import KMeans
import json
from geopy.geocoders import Nominatim # Address -> (Latitude,Longitude) 
import requests
import folium
print('All Libraries imported!')

All Libraries imported!


In [34]:
url = "https://raw.githubusercontent.com/SoumyadeepB/Coursera_Capstone/master/df_toronto_loc.csv"
neighborhoods = pd.read_csv(url,index_col=0)
neighborhoods.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park , Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor , Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park , Ontario Provincial Government",43.662301,-79.389494


#### Find how many boroughs and neighborhoods

In [35]:
print("Boroughs: ",len(neighborhoods['Borough'].unique()))
print("Neighborhoods: ",neighborhoods.shape[0])

Boroughs:  10
Neighborhoods:  103


#### Using the geopy library to get the co-ordinates(latitude and longitude) of Toronto

In [36]:
address = 'Toronto, Canada'
geolocator = Nominatim(user_agent="Toronto")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('Co-ordinates of Toronto:  ({}, {})'.format(latitude, longitude))

Co-ordinates of Toronto:  (43.6534817, -79.3839347)


#### Visualizing the neighbourhoods and boroughs of Toronto

In [37]:
# create map of Toronto using latitude and longitude values
toronto_map = folium.Map(location=[latitude, longitude], zoom_start=11)

# add markers to map
for lat, lng, borough, neighborhood in zip(neighborhoods['Latitude'], neighborhoods['Longitude'], neighborhoods['Borough'], neighborhoods['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='black',
        fill=True,
        fill_color='yellow',
        fill_opacity=0.5,
        parse_html=False).add_to(toronto_map)  
    
toronto_map

#### Using Foursquare API to get data regarding the nearby locations

In [62]:
CLIENT_ID = '' #Foursquare ID
CLIENT_SECRET = '' #Foursquare Secret
VERSION = '20180605' # Foursquare API version
LIMIT = 50

### Explore Neighborhoods in Toronto


In [46]:
def getVenues(names, latitudes, longitudes, radius=300):
    venue_list=[]
    results=[]
    
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(venue_list)
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, CLIENT_SECRET, VERSION,lat, lng, radius, LIMIT)
            
        # GET request
        req =  requests.get(url).json()
        print(req)
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        if results:
            venue_list.append([(name,v['venue']['name'], v['venue']['location']['lat'], v['venue']['location']['lng'], v['venue']['categories'][0]['name']) for v in results])

    
    venues = pd.DataFrame([item for venue_list in venue_list for item in venue_list])
    venues.columns = ['Neighborhood','Venue', 'Latitude', 'Longitude', 'Category']
    
    return(venues)

#### Create a DataFrame to contain all nearby venues 

In [None]:
neighborhood_venues = getVenues(names=neighborhoods['Neighborhood'],latitudes=neighborhoods['Latitude'],longitudes=neighborhoods['Longitude'])

In [48]:
neighborhood_venues.head(10)

Unnamed: 0,Neighborhood,Venue,Latitude,Longitude,Category
0,Parkwoods,Brookbanks Park,43.751976,-79.33214,Park
1,Victoria Village,Victoria Village Arena,43.723481,-79.315635,Hockey Arena
2,Victoria Village,Tim Hortons,43.725517,-79.313103,Coffee Shop
3,Victoria Village,Portugril,43.725819,-79.312785,Portuguese Restaurant
4,Victoria Village,Eglinton Ave E & Sloane Ave/Bermondsey Rd,43.726086,-79.31362,Intersection
5,Victoria Village,Pizza Nova,43.725824,-79.31286,Pizza Place
6,"Regent Park , Harbourfront",Roselle Desserts,43.653447,-79.362017,Bakery
7,"Regent Park , Harbourfront",Tandem Coffee,43.653559,-79.361809,Coffee Shop
8,"Regent Park , Harbourfront",Cooper Koo Family YMCA,43.653249,-79.358008,Distribution Center
9,"Regent Park , Harbourfront",Body Blitz Spa East,43.654735,-79.359874,Spa


#### Number of venues returned for each neighborhood:

In [49]:
neighborhood_venues.groupby('Neighborhood').count()['Venue'].to_frame()

Unnamed: 0_level_0,Venue
Neighborhood,Unnamed: 1_level_1
Agincourt,2
"Alderwood , Long Branch",5
"Bathurst Manor , Wilson Heights , Downsview North",14
"Bedford Park , Lawrence Manor East",14
Berczy Park,7
...,...
Weston,2
Willowdale,1
Woburn,1
Woodbine Heights,1


#### Number of Unique categories:

In [50]:
print('The venues have {} uniques categories.'.format(len(neighborhood_venues['Category'].unique())))

The venues have 204 uniques categories.


##  Analyzing each Neighborhood in Toronto

In [51]:
# Categories are one-hot encoded
onehot = pd.get_dummies(neighborhood_venues[['Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
onehot['Neighborhood'] = neighborhood_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [onehot.columns[-1]] + list(onehot.columns[:-1])
onehot = onehot[fixed_columns]

onehot.head()

Unnamed: 0,Yoga Studio,Accessories Store,Adult Boutique,Airport Food Court,Airport Lounge,Airport Terminal,American Restaurant,Arepa Restaurant,Art Gallery,Arts & Crafts Store,...,Trail,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wine Shop,Wings Joint,Women's Store
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [52]:
# Examine the new data frame
onehot.shape

(1038, 204)

#### Grouping rows by neighborhood

In [53]:
grp_neighborhoods = onehot.groupby('Neighborhood').mean().reset_index()
grp_neighborhoods

Unnamed: 0,Neighborhood,Yoga Studio,Accessories Store,Adult Boutique,Airport Food Court,Airport Lounge,Airport Terminal,American Restaurant,Arepa Restaurant,Art Gallery,...,Trail,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wine Shop,Wings Joint,Women's Store
0,Agincourt,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
1,"Alderwood , Long Branch",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
2,"Bathurst Manor , Wilson Heights , Downsview No...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.071429,0.0,0.0,0.0,0.0,0.0,0.0
3,"Bedford Park , Lawrence Manor East",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
4,Berczy Park,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
76,Weston,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
77,Willowdale,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
78,Woburn,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
79,Woodbine Heights,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0


In [54]:
# Function to return most common venues:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

Displaying the top 10 venues for each neighborhood:

In [55]:
num_top_venues = 10

# create columns according to number of top venues
columns = ['Neighborhood']
for idx in np.arange(num_top_venues):
    columns.append('Common Venue {}'.format(idx+1))

    
# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = grp_neighborhoods['Neighborhood']

for ind in np.arange(grp_neighborhoods.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(grp_neighborhoods.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted

Unnamed: 0,Neighborhood,Common Venue 1,Common Venue 2,Common Venue 3,Common Venue 4,Common Venue 5,Common Venue 6,Common Venue 7,Common Venue 8,Common Venue 9,Common Venue 10
0,Agincourt,Latin American Restaurant,Breakfast Spot,Distribution Center,Fast Food Restaurant,Farmers Market,Falafel Restaurant,Event Space,Ethiopian Restaurant,Electronics Store,Dumpling Restaurant
1,"Alderwood , Long Branch",Gym,Pharmacy,Coffee Shop,Pizza Place,Pub,Women's Store,Dim Sum Restaurant,Event Space,Ethiopian Restaurant,Electronics Store
2,"Bathurst Manor , Wilson Heights , Downsview No...",Coffee Shop,Middle Eastern Restaurant,Fried Chicken Joint,Shopping Mall,Sandwich Place,Ice Cream Shop,Video Store,Supermarket,Restaurant,Gas Station
3,"Bedford Park , Lawrence Manor East",Sandwich Place,Italian Restaurant,Coffee Shop,Comfort Food Restaurant,Thai Restaurant,Juice Bar,Fast Food Restaurant,Restaurant,Sushi Restaurant,Pub
4,Berczy Park,Concert Hall,Liquor Store,Coffee Shop,Italian Restaurant,Beer Bar,Restaurant,Breakfast Spot,Dog Run,Farmers Market,Falafel Restaurant
...,...,...,...,...,...,...,...,...,...,...,...
76,Weston,Park,Diner,Farmers Market,Falafel Restaurant,Event Space,Ethiopian Restaurant,Electronics Store,Dumpling Restaurant,Donut Shop,Dog Run
77,Willowdale,Coffee Shop,Women's Store,Diner,Farmers Market,Falafel Restaurant,Event Space,Ethiopian Restaurant,Electronics Store,Dumpling Restaurant,Donut Shop
78,Woburn,Korean Restaurant,Women's Store,Fish & Chips Shop,Fast Food Restaurant,Farmers Market,Falafel Restaurant,Event Space,Ethiopian Restaurant,Electronics Store,Dumpling Restaurant
79,Woodbine Heights,Beer Store,Women's Store,Fish & Chips Shop,Fast Food Restaurant,Farmers Market,Falafel Restaurant,Event Space,Ethiopian Restaurant,Electronics Store,Dumpling Restaurant


## Clustering Neighborhoods
Using *k*-means to cluster the neighborhood into 6 clusters.

In [56]:
# set number of clusters
kclusters = 6
grp_neighborhoods_clustered = grp_neighborhoods.drop('Neighborhood', 1)
# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(grp_neighborhoods_clustered)

In [57]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'ClusterLabel', kmeans.labels_)
merged = neighborhoods
# merge grp_neighborhoods with toronto_data to add latitude/longitude for each neighborhood
merged = merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')
merged= merged.dropna(subset=['ClusterLabel'])


### Obtaining the number of neighborhoods in each Cluster

In [58]:
clusterCounts=merged['ClusterLabel'].value_counts().to_frame().reset_index()
clusterCounts.columns=['ClusterLabel','Count']
clusterCounts['ClusterLabel']=clusterCounts['ClusterLabel'].astype(int)
clusterCounts

Unnamed: 0,ClusterLabel,Count
0,3,71
1,2,7
2,1,4
3,0,2
4,4,1
5,5,1


### Visualizing the resulting clusters


In [60]:
colormap=['cyan','red','blue','yellow','green','orange']
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=12)

markers_colors = []
for lat, lon, poi, cluster in zip(merged['Latitude'], merged['Longitude'], merged['Neighborhood'], merged['ClusterLabel']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker([lat, lon], radius=6, popup=label, color='black',fill=True,fill_color=colormap[int(cluster)],fill_opacity=0.7).add_to(map_clusters)
       
map_clusters