## In this final capstone project, the aim is to cluster neighbourhoods based on their residential profile. To do this, first I load in all the necessary libraries 

I also set the Version and ID to get request from Foursquare API further down

In [23]:
import numpy as np # library to handle data in a vectorized manner
import pandas as pd # library for data analsysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
import json # library to handle JSON files

#!conda install -c conda-forge geopy --yes 
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests

from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

print('Libraries imported.')


CLIENT_ID = 'TTTYSUWFUMJXP00B4DT3JJU0RB4GZ5C25DIEK3EJ2QBMHWK0' # your Foursquare ID
CLIENT_SECRET = 'XU5LCC4PNFPR0JBA41A45ZHHHTAVTTNX1VJUZINIY1X2SF0E' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

Libraries imported.


## First I load in the borough and neighbourhood data from NYC

In [19]:
!wget -q -O 'newyork_data.json' https://cocl.us/new_york_dataset
print('Data downloaded!')

with open('newyork_data.json') as json_data:
    newyork_data = json.load(json_data)
    
neighborhoods_data = newyork_data['features']



Data downloaded!


## Next, I create a pandas dataframe and fill it with borough, neighbourhood, latitude and longitude

In [21]:
column_names = ['Borough', 'Neighborhood', 'Latitude', 'Longitude']

neighborhoods = pd.DataFrame(columns=column_names)

for data in neighborhoods_data:
    borough = neighborhood_name = data['properties']['borough'] 
    neighborhood_name = data['properties']['name']
        
    neighborhood_latlon = data['geometry']['coordinates']
    neighborhood_lat = neighborhood_latlon[1]
    neighborhood_lon = neighborhood_latlon[0]
    
    neighborhoods = neighborhoods.append({'Borough': borough,
                                          'Neighborhood': neighborhood_name,
                                          'Latitude': neighborhood_lat,
                                          'Longitude': neighborhood_lon}, ignore_index=True)

In [22]:
neighborhoods.head()

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude
0,Bronx,Wakefield,40.894705,-73.847201
1,Bronx,Co-op City,40.874294,-73.829939
2,Bronx,Eastchester,40.887556,-73.827806
3,Bronx,Fieldston,40.895437,-73.905643
4,Bronx,Riverdale,40.890834,-73.912585


## In the next 2 cells, the important data is loaded from Foursquare API

First I create a function to create a URL which I will need to call multiple times to create instances for every category ID's. The category ID's are all residential building types ranging from hotels to appartments to trailer parks

In [24]:
def createURL(endpoint, CLIENT_ID, CLIENT_SECRET, VERSION, lat, lng, radius, categoryId, limit):
    url = '{}&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&categoryId={}&limit={}'.format(
            endpoint,
            CLIENT_ID,
            CLIENT_SECRET,
            VERSION,
            lat,
            lng,
            radius,
            categoryId,
            limit
        )
    return url

endpoint = 'https://api.foursquare.com/v2/venues/search?'
categoryIds = ['5032891291d4c4b30a586d68', '4bf58dd8d48988d103941735', '4f2a210c4b9023bd5841ed28',
               '4d954b06a243a5684965b473', '52f2ab2ebcbc57f1066b8b55', '4bf58dd8d48988d1fa931735',
               '4bf58dd8d48988d1f8931735', '4bf58dd8d48988d1ee931735', '56aa371be4b08b9a8d5734e1',
              ]
categoryId = ','.join(categoryIds)
limit = 50

In [27]:
radius = 1000
def getNearbyResidentialInfo(neighborhoods, latitudes, longitudes, radius):
    endpoint = 'https://api.foursquare.com/v2/venues/search?'
    venues_list = []   
    
    for hood_name, lat, lng in zip(neighborhoods, latitudes, longitudes):
    
        url = createURL(endpoint, CLIENT_ID, CLIENT_SECRET, VERSION, lat, lng, radius, categoryId, limit)
        
        results = requests.get(url).json()['response']['venues']

        for item in results:
            venue_name = item['name']
            venue_category = item['categories'][0]['name']
            venue_lat = item['location']['lat']
            venue_lng = item['location']['lng']
            try:
                venue_city = item['location']['city']
            except:
                venue_city = 'N/A'
            venue_state = item['location']['state']
            
            venues_list.append([(hood_name,
                                lat,
                                lng,
                                venue_name,
                                venue_category,
                                venue_lat,
                                venue_lng,
                                venue_city,
                                venue_state
                               )])
            
    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood',
                             'Neighborhood Latitude',
                             'Neighborhood Longitude',
                             'Venue Name',
                             'Venue Category',
                             'Venue Latitude',
                             'Venue Longitude',
                             'Venue City',
                             'Venue State'
                            ]
    return nearby_venues

prelim_venue_data = getNearbyResidentialInfo(neighborhoods['Neighborhood'],
                                         neighborhoods['Latitude'],
                                         neighborhoods['Longitude'],
                                         radius
                                        )

In [34]:
print(prelim_venue_data.shape)
prelim_venue_data.groupby('Venue Category')['Venue Category'].count().sort_values(ascending=False)

(8917, 9)


Venue Category
Residential Building (Apartment / Condo)    6261
Hotel                                       1288
Housing Development                          639
Assisted Living                              226
Bed & Breakfast                               81
Building                                      76
Resort                                        65
Motel                                         59
Hostel                                        58
Medical Center                                20
College Residence Hall                        16
Boarding House                                11
Gym                                            8
Trailer Park                                   8
Coworking Space                                8
Other Great Outdoors                           8
Office                                         7
Garden                                         7
Hotel Pool                                     6
Vacation Rental                                6
Pool 

## After we have sourced the data from the Foursquare API, it needs to be cleaned.

In the cell below I get rid of all categories that are not related to residential buildings, or that do not have a city or neighbourhood assigned to them. The residential categories used for the clustering are 'Residential Building (Apartment / Condo)', 'Hotel', 'Housing Development', 'Assisted Living', 'Bed & Breakfast', 'Motel',
                            'Hostel', 'College Residence Hall', 'Trailer Park', 'Vacation Rental'

In [64]:
n_unique = len(prelim_venue_data['Venue Category'].unique())
print(f'There are {n_unique} unique venue categories in this dataframe')

ny_venue_data = prelim_venue_data[(prelim_venue_data['Venue State'] == "New York") | (prelim_venue_data['Venue State'] == "NY")]
ny_venue_data['Venue State'].replace(to_replace="New York", value="NY", inplace=True)
delta = prelim_venue_data.shape[0] - ny_venue_data.shape[0]
print(f'{delta} entries were removed from the preliminary dataset based on "Venue State"')

ny_venue_data.head(5)
ny_venue_data_with_city = ny_venue_data[(ny_venue_data['Venue City'] != "N/A")]
delta = ny_venue_data.shape[0] - ny_venue_data_with_city.shape[0]
print(f'{delta} entries were removed based on "Venue City"')

residential_related_categories = ['Residential Building (Apartment / Condo)', 'Hotel', 'Housing Development', 'Assisted Living', 'Bed & Breakfast', 'Motel',
                            'Hostel', 'College Residence Hall', 'Trailer Park', 'Vacation Rental']
ny_residential_venues = ny_venue_data_with_city[ny_venue_data_with_city['Venue Category'].isin(residential_related_categories)]
delta = ny_venue_data_with_city.shape[0] - ny_residential_venues.shape[0]
print(f'{delta} entries were removed based on "Venue Category" not being related to residential areas')
print(ny_residential_venues['Venue Category'].unique())
ny_residential_venues.head(5)

There are 43 unique venue categories in this dataframe
9 entries were removed from the preliminary dataset based on "Venue State"
505 entries were removed based on "Venue City"
250 entries were removed based on "Venue Category" not being related to residential areas
['Housing Development' 'Bed & Breakfast'
 'Residential Building (Apartment / Condo)' 'Hotel' 'Assisted Living'
 'Motel' 'Hostel' 'Vacation Rental' 'College Residence Hall'
 'Trailer Park']


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue Name,Venue Category,Venue Latitude,Venue Longitude,Venue City,Venue State
1,Wakefield,40.894705,-73.847201,Edenwald Houses - NYCHA,Housing Development,40.886606,-73.842116,Bronx,NY
2,Wakefield,40.894705,-73.847201,The Chamber,Bed & Breakfast,40.89156,-73.844467,Bronx,NY
3,Wakefield,40.894705,-73.847201,741 Hunts Point Avenue,Residential Building (Apartment / Condo),40.889555,-73.843807,Bronx,NY
5,Wakefield,40.894705,-73.847201,Andre Camp David (Edendwald Houses Southside),Housing Development,40.885182,-73.844488,Bronx,NY
6,Wakefield,40.894705,-73.847201,East 238th Street,Residential Building (Apartment / Condo),40.899167,-73.856684,Bronx,NY


## Now that our dataset is cleansed, we can start counting the amount of residential types per neighbourhood

In [63]:

ny_venue_category_onehot = pd.get_dummies(ny_residential_venues[['Venue Category']], prefix="", prefix_sep="")

ny_venue_category_onehot['Neighborhood'] = ny_residential_venues['Neighborhood'] 

fixed_columns = [ny_venue_category_onehot.columns[-1]] + list(ny_venue_category_onehot.columns[:-1])
ny_venue_category_onehot = ny_venue_category_onehot[fixed_columns]

print(ny_venue_category_onehot.shape)
venue_counts = ny_venue_category_onehot.groupby('Neighborhood').sum()
venue_counts.head(10)

(8153, 11)


Unnamed: 0_level_0,Assisted Living,Bed & Breakfast,College Residence Hall,Hostel,Hotel,Housing Development,Motel,Residential Building (Apartment / Condo),Trailer Park,Vacation Rental
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Allerton,4,2,0,0,0,13,1,26,0,0
Arden Heights,0,0,0,0,0,2,0,0,0,0
Arlington,0,0,0,0,0,2,0,1,0,0
Arrochar,0,0,0,0,1,0,0,5,0,0
Arverne,0,1,0,0,1,4,0,8,0,2
Astoria,0,0,0,0,1,2,0,44,0,0
Astoria Heights,0,0,0,2,4,0,1,12,0,0
Auburndale,3,0,0,0,0,1,0,6,0,0
Bath Beach,2,0,0,0,1,0,0,35,0,0
Battery Park City,0,0,0,0,28,0,0,18,0,0


## Here we take the venues per neighbourhood and divide them by the total, this way we get the proportion of a certain residential building type per neighbourhood

In [49]:
venue_totals = {}
for category in residential_related_categories:
    venue_totals[category] = venue_counts[category].sum()

venue_totals



venue_mean = pd.DataFrame()
for category, total in venue_totals.items():
    venue_mean[category] = venue_counts[category].apply(lambda x: x / total)
venue_mean = venue_mean.reindex(sorted(venue_mean.columns), axis=1).reset_index()
venue_mean.head(5)

Unnamed: 0,Neighborhood,Assisted Living,Bed & Breakfast,College Residence Hall,Hostel,Hotel,Housing Development,Motel,Residential Building (Apartment / Condo),Trailer Park,Vacation Rental
0,Allerton,0.018519,0.027778,0.0,0.0,0.0,0.020767,0.018182,0.004459,0.0,0.0
1,Arden Heights,0.0,0.0,0.0,0.0,0.0,0.003195,0.0,0.0,0.0,0.0
2,Arlington,0.0,0.0,0.0,0.0,0.0,0.003195,0.0,0.000171,0.0,0.0
3,Arrochar,0.0,0.0,0.0,0.0,0.000784,0.0,0.0,0.000857,0.0,0.0
4,Arverne,0.0,0.013889,0.0,0.0,0.000784,0.00639,0.0,0.001372,0.0,0.333333


## Now that we have the proportion of each neighbourhood residential building per neighbourhood, we can make a top 5 for each neighbourhood

In [50]:
def return_top_venue_categories(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]



num_top_venues = 5

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Top Venue Category'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Top Venue Category'.format(ind+1))

neighborhoods_top_venue_categories = pd.DataFrame(columns=columns)
neighborhoods_top_venue_categories['Neighborhood'] = venue_mean['Neighborhood']

for ind in np.arange(venue_mean.shape[0]):
    neighborhoods_top_venue_categories.iloc[ind, 1:] = return_top_venue_categories(venue_mean.iloc[ind, :], num_top_venues)

neighborhoods_top_venue_categories.head(5)

Unnamed: 0,Neighborhood,1st Top Venue Category,2nd Top Venue Category,3rd Top Venue Category,4th Top Venue Category,5th Top Venue Category
0,Allerton,Bed & Breakfast,Housing Development,Assisted Living,Motel,Residential Building (Apartment / Condo)
1,Arden Heights,Housing Development,Vacation Rental,Trailer Park,Residential Building (Apartment / Condo),Motel
2,Arlington,Housing Development,Residential Building (Apartment / Condo),Vacation Rental,Trailer Park,Motel
3,Arrochar,Residential Building (Apartment / Condo),Hotel,Vacation Rental,Trailer Park,Motel
4,Arverne,Vacation Rental,Bed & Breakfast,Housing Development,Residential Building (Apartment / Condo),Hotel



## Next we cluster the data based on mean residential types.


In [55]:
kclusters = 10

venue_grouped_clustering = venue_mean.drop('Neighborhood', 1)
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(venue_grouped_clustering)

neighborhoods_top_venue_categories.insert(1,'Cluster Labels', kmeans.labels_)

ny_neighborhood_residential_profile = neighborhoods.drop(columns=['Borough'])

ny_neighborhood_residential_profile = ny_neighborhood_residential_profile.join(neighborhoods_top_venue_categories.set_index('Neighborhood'), on='Neighborhood').dropna(axis=0)

ny_neighborhood_residential_profile.head()

Unnamed: 0,Neighborhood,Latitude,Longitude,Cluster Labels,1st Top Venue Category,2nd Top Venue Category,3rd Top Venue Category,4th Top Venue Category,5th Top Venue Category
0,Wakefield,40.894705,-73.847201,0.0,Bed & Breakfast,Housing Development,Residential Building (Apartment / Condo),Vacation Rental,Trailer Park
1,Co-op City,40.874294,-73.829939,5.0,Assisted Living,Residential Building (Apartment / Condo),Hotel,Vacation Rental,Trailer Park
2,Eastchester,40.887556,-73.827806,5.0,Assisted Living,Hotel,Residential Building (Apartment / Condo),Vacation Rental,Trailer Park
3,Fieldston,40.895437,-73.905643,8.0,Motel,Assisted Living,Residential Building (Apartment / Condo),Housing Development,Vacation Rental
4,Riverdale,40.890834,-73.912585,5.0,Assisted Living,Residential Building (Apartment / Condo),Vacation Rental,Trailer Park,Motel


## Down below is the most interesting part of the project! a visualisation of the clusters on a map of NYC
## The image will not load on github, so the visualisation will be available in the report

In [69]:
latitude = 40.730610
longitude = -73.935242
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=10)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(ny_neighborhood_residential_profile['Latitude'], ny_neighborhood_residential_profile['Longitude'], ny_neighborhood_residential_profile['Neighborhood'], ny_neighborhood_residential_profile['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[int(cluster)-1],
        fill=True,
        fill_color=rainbow[int(cluster)-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters


## Lastly we can analyse the clusters, this will be expanded on in the report

In [58]:
for cluster in range(0, kclusters):
    print(f'Cluster {cluster}:')
    print(ny_neighborhood_residential_profile.loc[ny_neighborhood_residential_profile['Cluster Labels'] == cluster, ny_neighborhood_residential_profile.columns[[0] + list(range(4, ny_neighborhood_residential_profile.shape[1]))]], end="\n\n\n")

Cluster 0:
          Neighborhood 1st Top Venue Category  \
0            Wakefield        Bed & Breakfast   
11      Pelham Parkway        Bed & Breakfast   
30         Parkchester        Bed & Breakfast   
35      Spuyten Duyvil        Bed & Breakfast   
44           Unionport        Bed & Breakfast   
46           Bay Ridge        Bed & Breakfast   
48         Sunset Park        Bed & Breakfast   
56       East Flatbush        Bed & Breakfast   
57          Kensington        Bed & Breakfast   
58     Windsor Terrace        Bed & Breakfast   
62            Bushwick        Bed & Breakfast   
99       Fort Hamilton        Bed & Breakfast   
137      Richmond Hill        Bed & Breakfast   
141      East Elmhurst        Bed & Breakfast   
143          Ridgewood        Bed & Breakfast   
155          Glen Oaks        Bed & Breakfast   
171      Broad Channel        Bed & Breakfast   
178     Rockaway Beach        Bed & Breakfast   
181        Floral Park        Bed & Breakfast   
191      