The library imports and the data-downloading code are based on the IBM course material 'Segmenting and Clustering Neighborhoods in New York City' (https://www.coursera.org/learn/applied-data-science-capstone/ungradedLti/f0QY7/segmenting-and-clustering-neighborhoods-in-new-york-city). I reuse these parts of the code because I am doing the related homework and exploring further questions based on that lab; I am not using the code for any other purpose.

In [3]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analysis
# lift the pandas display limits on columns and rows (None = no limit),
# so displayed dataframes are not truncated
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle HTTP requests
# NOTE(review): newer pandas releases expose this function as pandas.json_normalize;
# this legacy import path may be unavailable there -- confirm against the installed version.
from pandas.io.json import json_normalize # transform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium

print('Libraries imported.')

# Load the New York neighborhood file. JSON is UTF-8 by specification, so the
# encoding is stated explicitly instead of relying on the platform default.
with open('newyork_data.json', encoding='utf-8') as json_data:
    newyork_data = json.load(json_data)

# Each entry of 'features' describes one neighborhood.
neighborhoods_data = newyork_data['features']

# define the dataframe columns
column_names = ['Borough', 'Neighborhood', 'Latitude', 'Longitude']

# Collect one record per neighborhood and build the dataframe in a single step.
# DataFrame.append copied the whole frame on every iteration and is no longer
# available in pandas 2.x.
neighborhood_rows = []
for data in neighborhoods_data:
    borough = data['properties']['borough']
    neighborhood_name = data['properties']['name']

    # the coordinates are stored as [longitude, latitude]
    neighborhood_latlon = data['geometry']['coordinates']
    neighborhood_lat = neighborhood_latlon[1]
    neighborhood_lon = neighborhood_latlon[0]

    neighborhood_rows.append({'Borough': borough,
                              'Neighborhood': neighborhood_name,
                              'Latitude': neighborhood_lat,
                              'Longitude': neighborhood_lon})

# instantiate the dataframe (column order fixed by column_names)
neighborhoods = pd.DataFrame(neighborhood_rows, columns=column_names)

import os

# Foursquare credentials.
# The environment variables FOURSQUARE_CLIENT_ID / FOURSQUARE_CLIENT_SECRET take
# precedence when set; the literals below are only a fallback so the notebook
# keeps running unchanged.
# NOTE(review): real credentials should not live in source control -- rotate these
# keys and drop the literal fallbacks once the environment variables are in place.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID') or 'ZFMJWYCKCXEOT5LN1RB22PKC5VHNOXRS0ABIT3Y3IXKUPJOX' # your Foursquare ID
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET') or 'RABCP05LZHUHMV5DNVGBQYHAWAY3LPQPBISJ1NLNQWJ0X2EH' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID:' + CLIENT_ID)
# Do not echo the secret into the saved notebook output; show only its length as asterisks.
print('CLIENT_SECRET:' + '*' * len(CLIENT_SECRET))

We focus on restaurants in Manhattan

In [12]:
# Search term used for the Foursquare venue queries.
QUERY = 'restaurant'

# Keep only the Manhattan rows and renumber them from zero.
is_manhattan = neighborhoods['Borough'] == 'Manhattan'
manhattan_data = neighborhoods.loc[is_manhattan].reset_index(drop=True)
manhattan_data.head()

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude
0,Manhattan,Marble Hill,40.876551,-73.91066
1,Manhattan,Chinatown,40.715618,-73.994279
2,Manhattan,Washington Heights,40.851903,-73.9369
3,Manhattan,Inwood,40.867684,-73.92121
4,Manhattan,Hamilton Heights,40.823604,-73.949688


Define functions and calculate diversity, density, price and quality metrics for each neighborhood

In [19]:
# function that extracts the category of the venue
def get_category_type(row):
    """Return the name of the first category listed for a venue row.

    Parameters
    ----------
    row : mapping-like (e.g. a dataframe row or a dict)
        Must provide the venue's category list under the key 'categories'
        or, failing that, 'venue.categories'.

    Returns
    -------
    The 'name' entry of the first category, or None when the category data
    is empty or missing (for example None/NaN instead of a list).

    Raises
    ------
    KeyError
        If neither 'categories' nor 'venue.categories' is present.
    """
    try:
        categories_list = row['categories']
    except KeyError:
        # fall back to the prefixed column name
        categories_list = row['venue.categories']

    # Anything that is not a non-empty list/tuple carries no usable category.
    if not isinstance(categories_list, (list, tuple)) or not categories_list:
        return None
    return categories_list[0]['name']


def getNearbyVenues(names, latitudes, longitudes, radius=500, LIMIT = 100):
    """Collect density, diversity, price and quality metrics per neighborhood.

    For every (name, lat, lng) triple the Foursquare 'explore' endpoint is
    queried with the module-level QUERY, and the venue with the smallest
    reported distance is then looked up through the venue-details endpoint.

    Parameters
    ----------
    names, latitudes, longitudes : iterables of equal length
        Neighborhood labels and their coordinates; iterated in lockstep.
    radius : int, default 500
        Passed to the API as the 'radius' parameter.
    LIMIT : int, default 100
        Passed to the API as the 'limit' parameter.

    Returns
    -------
    pandas.DataFrame
        One row per successfully processed neighborhood with the columns
        'index' (the neighborhood name), 'Density' (number of venues returned),
        'Diversity' (number of distinct first-category names among them),
        'Price' (price tier of the closest venue) and 'Quality' (like count of
        the closest venue). Neighborhoods whose requests fail or whose
        responses lack the expected fields are reported and left out.
    """
    request_timeout = 30  # seconds; keeps a stalled connection from hanging the loop
    metric_columns = ['Density', 'Diversity', 'Price', 'Quality']

    # Credentials shared by both endpoints.
    auth_params = {'client_id': CLIENT_ID,
                   'client_secret': CLIENT_SECRET,
                   'v': VERSION}

    # neighborhood name -> metrics; a dict keeps one entry per name, like a label-indexed row
    summary_rows = {}

    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)

        try:
            # Let requests build and encode the query string.
            explore_params = dict(auth_params,
                                  query=QUERY,
                                  ll='{},{}'.format(lat, lng),
                                  radius=radius,
                                  limit=LIMIT)
            results = requests.get('https://api.foursquare.com/v2/venues/explore',
                                   params=explore_params,
                                   timeout=request_timeout).json()

            venues = results['response']['groups'][0]['items']

            nearby_venues = json_normalize(venues) # flatten JSON

            density_count = len(nearby_venues)

            # filter columns (raises KeyError when no venue was returned)
            filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat',
                                'venue.location.lng', 'venue.id', 'venue.location.distance']
            nearby_venues = nearby_venues.loc[:, filtered_columns].copy()

            # reduce each row's category list to its first category name
            nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

            # clean columns: keep only the last dotted component
            nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

            # number of distinct categories (missing values are not counted)
            diversity_count = nearby_venues['categories'].nunique()

            # the venue closest to the neighborhood coordinates stands in for the neighborhood
            restaurant_id = nearby_venues.sort_values('distance')['id'].iloc[0]

            details = requests.get('https://api.foursquare.com/v2/venues/{}'.format(restaurant_id),
                                   params=auth_params,
                                   timeout=request_timeout).json()
            venue_details = details['response']['venue']

            price_level = venue_details['price']['tier']
            quality_level = venue_details['likes']['count']
        except (requests.RequestException, KeyError, IndexError, TypeError, ValueError) as err:
            # Best effort: a neighborhood that cannot be evaluated is skipped,
            # but the reason is reported instead of being silently dropped.
            print('  skipped {}: {!r}'.format(name, err))
            continue

        summary_rows[name] = {'Density': density_count,
                              'Diversity': diversity_count,
                              'Price': price_level,
                              'Quality': quality_level}

    # Build the frame once instead of growing it cell by cell.
    venues_summary = pd.DataFrame(list(summary_rows.values()),
                                  index=list(summary_rows.keys()),
                                  columns=metric_columns,
                                  dtype=float)

    return venues_summary.reset_index()

# Compute the metrics for every Manhattan neighborhood.
result = getNearbyVenues(
    names=manhattan_data['Neighborhood'],
    latitudes=manhattan_data['Latitude'],
    longitudes=manhattan_data['Longitude'],
)

Marble Hill
Chinatown
Washington Heights
Inwood
Hamilton Heights
Manhattanville
Central Harlem
East Harlem
Upper East Side
Yorkville
Lenox Hill
Roosevelt Island
Upper West Side
Lincoln Square
Clinton
Midtown
Murray Hill
Chelsea
Greenwich Village
East Village
Lower East Side
Tribeca
Little Italy
Soho
West Village
Manhattan Valley
Morningside Heights
Gramercy
Battery Park City
Financial District
Carnegie Hill
Noho
Civic Center
Midtown South
Sutton Place
Turtle Bay
Tudor City
Stuyvesant Town
Flatiron
Hudson Yards


Scale different metrics and get the final score

In [32]:
# Index the raw metrics by neighborhood name.
result = result.set_index('index')

# Scale each metric column by its own maximum.
column_maxima = result.max(axis = 0)
df_summary = result.div(column_maxima, axis = 'columns')

# Overall score: density, diversity and quality count in favour, price counts against.
df_summary['Score'] = (
    df_summary['Density']
    .add(df_summary['Diversity'])
    .add(df_summary['Quality'])
    .sub(df_summary['Price'])
)

Which neighborhood has the most restaurants?

In [38]:
# Show the first rows after sorting by scaled Density, highest first.
df_summary.sort_values('Density', ascending = False).head()

Unnamed: 0_level_0,Density,Diversity,Price,Quality,Score
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Chelsea,1.0,0.914894,0.75,0.004638,1.169532
Midtown,1.0,0.851064,0.5,0.204082,1.555145
Financial District,1.0,0.851064,0.25,0.000928,1.601991
Midtown South,1.0,0.702128,0.25,0.004638,1.456766
West Village,1.0,0.851064,0.75,0.650278,1.751342


Which neighborhood has the most different kinds of restaurants?

In [41]:
# Show the first rows after sorting by scaled Diversity, highest first.
df_summary.sort_values('Diversity', ascending = False).head()

Unnamed: 0_level_0,Density,Diversity,Price,Quality,Score
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
East Village,1.0,1.0,0.5,1.0,2.5
Murray Hill,1.0,0.957447,1.0,0.050093,1.00754
Chelsea,1.0,0.914894,0.75,0.004638,1.169532
Clinton,1.0,0.893617,0.25,0.000928,1.644545
Flatiron,1.0,0.893617,0.75,0.349722,1.493339


Which neighborhood has the cheapest food?

In [42]:
# Show the first rows after sorting by scaled Price, lowest (cheapest) first.
df_summary.sort_values('Price', ascending = True).head()

Unnamed: 0_level_0,Density,Diversity,Price,Quality,Score
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Marble Hill,0.15,0.234043,0.25,0.0,0.134043
Stuyvesant Town,0.05,0.106383,0.25,0.029685,-0.063932
Tudor City,0.81,0.659574,0.25,0.009276,1.228851
Turtle Bay,0.86,0.659574,0.25,0.022263,1.291838
Midtown South,1.0,0.702128,0.25,0.004638,1.456766


Which neighborhood has the restaurant with the best quality?

In [43]:
# Show the first rows after sorting by scaled Quality, highest first.
df_summary.sort_values('Quality', ascending = False).head()

Unnamed: 0_level_0,Density,Diversity,Price,Quality,Score
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
East Village,1.0,1.0,0.5,1.0,2.5
Noho,1.0,0.893617,0.5,0.80334,2.196957
Little Italy,1.0,0.808511,0.5,0.71243,2.020941
West Village,1.0,0.851064,0.75,0.650278,1.751342
Greenwich Village,1.0,0.851064,0.75,0.469388,1.570452


Which neighborhood has the best restaurants in general?

In [45]:
# Show the first rows after sorting by the combined Score, highest first.
df_summary.sort_values('Score', ascending = False).head()

Unnamed: 0_level_0,Density,Diversity,Price,Quality,Score
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
East Village,1.0,1.0,0.5,1.0,2.5
Noho,1.0,0.893617,0.5,0.80334,2.196957
Little Italy,1.0,0.808511,0.5,0.71243,2.020941
West Village,1.0,0.851064,0.75,0.650278,1.751342
Soho,1.0,0.765957,0.5,0.456401,1.722358


Finally, we use the K-Means algorithm to find the neighborhoods that are similar to East Village

In [51]:
# number of clusters to fit
kclusters = 6

# Work on a copy ordered by Score so df_summary itself stays untouched.
df_final = df_summary.sort_values('Score', ascending = False).copy()

# Cluster on the four scaled metrics only; Score is derived from them and is left out.
# The column is dropped by keyword: passing the axis positionally (drop('Score', 1))
# is rejected by current pandas releases.
cluster_features = df_final.drop(columns='Score')

# run k-means clustering (random_state fixed for reproducible labels)
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(cluster_features)

# put the cluster label of each neighborhood in the first column
df_final.insert(0, 'Cluster_Labels', kmeans.labels_)

In [52]:
# Display the clustered neighborhoods, ordered by Score (highest first).
df_final

Unnamed: 0_level_0,Cluster_Labels,Density,Diversity,Price,Quality,Score
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
East Village,0,1.0,1.0,0.5,1.0,2.5
Noho,0,1.0,0.893617,0.5,0.80334,2.196957
Little Italy,0,1.0,0.808511,0.5,0.71243,2.020941
West Village,5,1.0,0.851064,0.75,0.650278,1.751342
Soho,5,1.0,0.765957,0.5,0.456401,1.722358
Chinatown,3,1.0,0.787234,0.25,0.179963,1.717197
Clinton,3,1.0,0.893617,0.25,0.000928,1.644545
Financial District,3,1.0,0.851064,0.25,0.000928,1.601991
Greenwich Village,5,1.0,0.851064,0.75,0.469388,1.570452
Midtown,3,1.0,0.851064,0.5,0.204082,1.555145
