# Segmentation and Clustering the Neighborhoods in New York City

## 1. Before we start to collect the data and exploring it, let's download all the dependencies that we will need.

In [1]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analsysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

print('Libraries imported.')

Collecting package metadata (current_repodata.json): done
Solving environment: done

# All requested packages already installed.

Collecting package metadata (current_repodata.json): done
Solving environment: done

# All requested packages already installed.

Libraries imported.


## 2. Download and read the dataset of the neighborhoods in New York City and transform it into a *pandas* dataframe

#### Download and open the New York City data in JSON format

In [2]:
!wget -q -O 'newyork_data.json' https://cocl.us/new_york_dataset
print('Data downloaded!')

with open('newyork_data.json') as json_data:
    newyork_data = json.load(json_data)

Data downloaded!


#### Transform the New York City data into dataframe

In [3]:
# define the dataframe columns
column_names = ['Borough', 'Neighborhood', 'Latitude', 'Longitude'] 

# instantiate the dataframe
neighborhoods = pd.DataFrame(columns=column_names)

neighborhoods_data = newyork_data['features']
for data in neighborhoods_data:
    borough = neighborhood_name = data['properties']['borough'] 
    neighborhood_name = data['properties']['name']
        
    neighborhood_latlon = data['geometry']['coordinates']
    neighborhood_lat = neighborhood_latlon[1]
    neighborhood_lon = neighborhood_latlon[0]
    
    neighborhoods = neighborhoods.append({'Borough': borough,
                                          'Neighborhood': neighborhood_name,
                                          'Latitude': neighborhood_lat,
                                          'Longitude': neighborhood_lon}, ignore_index=True)

#### Manhattan is selected for this project in the New York City

In [4]:
manhattan_data = neighborhoods[neighborhoods['Borough'] == 'Manhattan'].reset_index(drop=True)
manhattan_data.head()

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude
0,Manhattan,Marble Hill,40.876551,-73.91066
1,Manhattan,Chinatown,40.715618,-73.994279
2,Manhattan,Washington Heights,40.851903,-73.9369
3,Manhattan,Inwood,40.867684,-73.92121
4,Manhattan,Hamilton Heights,40.823604,-73.949688


## 3. Use the Foursquare API to explore neighbourhoods in Manhattan, New York City

#### Use geopy library to get the latitude and longitude values of Manhattan, New York City, then visualize Manhattan the neighbourhoods in it by creating a map using **Folium**

In [5]:
address = 'Manhattan, NY'

geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Manhattan are {}, {}.'.format(latitude, longitude))

# create map of Manhattan using latitude and longitude values
map_manhattan = folium.Map(location=[latitude, longitude], zoom_start=11)

# add markers to map
for lat, lng, label in zip(manhattan_data['Latitude'], manhattan_data['Longitude'], manhattan_data['Neighborhood']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_manhattan)  
    
map_manhattan

The geograpical coordinate of Manhattan are 40.7896239, -73.9598939.


##### **PLEASE VIEW THE VISUALIZED MAP BY DROP THE GITHUB LINK TO THIS .ipynb FILE INTO https://nbviewer.jupyter.org/**

#### Define Foursquare Credentials and Version.

In [6]:
CLIENT_ID = 'QR24W0AJYVEDYGP3OAWVWXEGVBW1X2NYSY5ZM0WW34I10AF2' # your Foursquare ID
CLIENT_SECRET = 'MMSVAVCUUGDAQKKD3R2HLDI4OMUEUCZVGUCCGUREXKY34WTZ' # your Foursquare Secret
VERSION = '20200517' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: QR24W0AJYVEDYGP3OAWVWXEGVBW1X2NYSY5ZM0WW34I10AF2
CLIENT_SECRET:MMSVAVCUUGDAQKKD3R2HLDI4OMUEUCZVGUCCGUREXKY34WTZ


#### Explore the neighborhoods in Manhattan. Create GET request URL, send the GET request, then clean and structure json into a new dataframe

In [7]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
        
        LIMIT = 100 # limit of number of venues returned by Foursquare API
        radius = 500 # define radius
        
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

manhattan_venues = getNearbyVenues(names=manhattan_data['Neighborhood'],
                                   latitudes=manhattan_data['Latitude'],
                                   longitudes=manhattan_data['Longitude']
                                  )

Marble Hill
Chinatown
Washington Heights
Inwood
Hamilton Heights
Manhattanville
Central Harlem
East Harlem
Upper East Side
Yorkville
Lenox Hill
Roosevelt Island
Upper West Side
Lincoln Square
Clinton
Midtown
Murray Hill
Chelsea
Greenwich Village
East Village
Lower East Side
Tribeca
Little Italy
Soho
West Village
Manhattan Valley
Morningside Heights
Gramercy
Battery Park City
Financial District
Carnegie Hill
Noho
Civic Center
Midtown South
Sutton Place
Turtle Bay
Tudor City
Stuyvesant Town
Flatiron
Hudson Yards


#### Show the size of the resulting dataframe and the unique categories which can be curated from all the returned venues

In [8]:
print(manhattan_venues.shape)
manhattan_venues.groupby('Neighborhood').count()
print('There are {} uniques categories.'.format(len(manhattan_venues['Venue Category'].unique())))

(3062, 7)
There are 332 uniques categories.


## 4. Analyze the Neighborhoods in Manhattan

In [9]:
# one hot encoding
manhattan_onehot = pd.get_dummies(manhattan_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
manhattan_onehot['Neighborhood'] = manhattan_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [manhattan_onehot.columns[-1]] + list(manhattan_onehot.columns[:-1])
manhattan_onehot = manhattan_onehot[fixed_columns]


# group rows by neighborhood and by taking the mean of the frequency of occurrence of each category
manhattan_grouped = manhattan_onehot.groupby('Neighborhood').mean().reset_index()


# write a function to sort the venues in descending order
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]


# create the new dataframe and display the top 10 venues for each neighborhood
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = manhattan_grouped['Neighborhood']

for ind in np.arange(manhattan_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(manhattan_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Battery Park City,Park,Coffee Shop,Hotel,Memorial Site,Gym,Boat or Ferry,Gourmet Shop,Food Court,Shopping Mall,Wine Shop
1,Carnegie Hill,Coffee Shop,Pizza Place,Yoga Studio,Gym,Wine Shop,Bar,Bookstore,Café,Japanese Restaurant,Grocery Store
2,Central Harlem,African Restaurant,Fried Chicken Joint,American Restaurant,Bar,Cosmetics Shop,French Restaurant,Seafood Restaurant,Chinese Restaurant,Café,Tapas Restaurant
3,Chelsea,Art Gallery,Coffee Shop,Café,Ice Cream Shop,American Restaurant,Market,Seafood Restaurant,Boutique,Cupcake Shop,Cycle Studio
4,Chinatown,Chinese Restaurant,Cocktail Bar,Bubble Tea Shop,Bakery,Coffee Shop,Salon / Barbershop,Vietnamese Restaurant,Optical Shop,American Restaurant,Spa


## 5. Clustering the Neighborhoods and visualize the clusters on map

#### Run *k*-means to cluster the neighborhood into 5 clusters.

In [10]:
# set number of clusters
kclusters = 5

manhattan_grouped_clustering = manhattan_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(manhattan_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([2, 1, 0, 2, 0, 2, 2, 0, 0, 2], dtype=int32)

#### Let's create a new dataframe that includes the cluster as well as the top 10 venues for each neighborhood.

In [11]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

manhattan_merged = manhattan_data

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
manhattan_merged = manhattan_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

manhattan_merged.head() # check the last columns!

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Manhattan,Marble Hill,40.876551,-73.91066,2,Sandwich Place,Coffee Shop,Gym,Ice Cream Shop,Department Store,Pharmacy,Diner,Discount Store,Donut Shop,Bank
1,Manhattan,Chinatown,40.715618,-73.994279,0,Chinese Restaurant,Cocktail Bar,Bubble Tea Shop,Bakery,Coffee Shop,Salon / Barbershop,Vietnamese Restaurant,Optical Shop,American Restaurant,Spa
2,Manhattan,Washington Heights,40.851903,-73.9369,1,Café,Bakery,Pizza Place,Mobile Phone Shop,Chinese Restaurant,Grocery Store,Latin American Restaurant,Donut Shop,Sandwich Place,Supplement Shop
3,Manhattan,Inwood,40.867684,-73.92121,1,Mexican Restaurant,Café,Lounge,Pizza Place,Restaurant,Deli / Bodega,Chinese Restaurant,Spanish Restaurant,Caribbean Restaurant,Park
4,Manhattan,Hamilton Heights,40.823604,-73.949688,1,Pizza Place,Coffee Shop,Café,Deli / Bodega,Mexican Restaurant,Cocktail Bar,Indian Restaurant,Sushi Restaurant,Park,Yoga Studio


#### Finally, let's visualize the resulting clusters

In [12]:
# create map
map_clusters_MH = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(manhattan_merged['Latitude'], manhattan_merged['Longitude'], manhattan_merged['Neighborhood'], manhattan_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters_MH)
       
map_clusters_MH

##### **PLEASE VIEW THE VISUALIZED MAP BY DROP THE GITHUB LINK TO THIS .ipynb FILE INTO https://nbviewer.jupyter.org/**

## Examine Clusters

#### Cluster 0 (Red) Miscellaneous/Night Life

In [20]:
manhattan_merged.loc[manhattan_merged['Cluster Labels'] == 0, manhattan_merged.columns[[1] + list(range(5, manhattan_merged.shape[1]))]]

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
1,Chinatown,Chinese Restaurant,Cocktail Bar,Bubble Tea Shop,Bakery,Coffee Shop,Salon / Barbershop,Vietnamese Restaurant,Optical Shop,American Restaurant,Spa
6,Central Harlem,African Restaurant,Fried Chicken Joint,American Restaurant,Bar,Cosmetics Shop,French Restaurant,Seafood Restaurant,Chinese Restaurant,Café,Tapas Restaurant
7,East Harlem,Mexican Restaurant,Bakery,Latin American Restaurant,Thai Restaurant,Deli / Bodega,Restaurant,Beer Bar,French Restaurant,Liquor Store,Steakhouse
19,East Village,Cocktail Bar,Pizza Place,Mexican Restaurant,Coffee Shop,Bar,Juice Bar,Wine Bar,Japanese Restaurant,Ramen Restaurant,Speakeasy
20,Lower East Side,Chinese Restaurant,Park,Cocktail Bar,Café,Ramen Restaurant,Art Gallery,Pet Café,Performing Arts Venue,Pharmacy,Ice Cream Shop
22,Little Italy,Bubble Tea Shop,Chinese Restaurant,Bakery,Spa,Mediterranean Restaurant,Italian Restaurant,Thai Restaurant,Hotel,Ice Cream Shop,Coffee Shop


#### Cluster 1 (Purple) Gastronomy

In [14]:
manhattan_merged.loc[manhattan_merged['Cluster Labels'] == 1, manhattan_merged.columns[[1] + list(range(5, manhattan_merged.shape[1]))]]

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
2,Washington Heights,Café,Bakery,Pizza Place,Mobile Phone Shop,Chinese Restaurant,Grocery Store,Latin American Restaurant,Donut Shop,Sandwich Place,Supplement Shop
3,Inwood,Mexican Restaurant,Café,Lounge,Pizza Place,Restaurant,Deli / Bodega,Chinese Restaurant,Spanish Restaurant,Caribbean Restaurant,Park
4,Hamilton Heights,Pizza Place,Coffee Shop,Café,Deli / Bodega,Mexican Restaurant,Cocktail Bar,Indian Restaurant,Sushi Restaurant,Park,Yoga Studio
5,Manhattanville,Coffee Shop,Seafood Restaurant,Italian Restaurant,Deli / Bodega,Park,Sushi Restaurant,Mexican Restaurant,Café,Bike Trail,Boutique
9,Yorkville,Italian Restaurant,Coffee Shop,Gym,Sushi Restaurant,Bar,Deli / Bodega,Mexican Restaurant,Japanese Restaurant,Diner,Wine Shop
10,Lenox Hill,Coffee Shop,Italian Restaurant,Pizza Place,Café,Sushi Restaurant,Cocktail Bar,Gym / Fitness Center,Gym,Burger Joint,Salad Place
12,Upper West Side,Italian Restaurant,Coffee Shop,Dessert Shop,Bar,Middle Eastern Restaurant,Bookstore,Sushi Restaurant,Mediterranean Restaurant,Pizza Place,Bakery
18,Greenwich Village,Italian Restaurant,Coffee Shop,Café,Gym,Indian Restaurant,Ice Cream Shop,Bakery,Comedy Club,Clothing Store,Gourmet Shop
23,Soho,Italian Restaurant,Mediterranean Restaurant,Coffee Shop,Sandwich Place,French Restaurant,Clothing Store,Gym,Bakery,Café,Spa
25,Manhattan Valley,Coffee Shop,Spa,Pizza Place,Bar,Mexican Restaurant,Grocery Store,Playground,Park,Noodle House,Latin American Restaurant


#### Cluster 2 (Blue) Entertainment & Accommodation

In [15]:
manhattan_merged.loc[manhattan_merged['Cluster Labels'] == 2, manhattan_merged.columns[[1] + list(range(5, manhattan_merged.shape[1]))]]

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Marble Hill,Sandwich Place,Coffee Shop,Gym,Ice Cream Shop,Department Store,Pharmacy,Diner,Discount Store,Donut Shop,Bank
8,Upper East Side,Italian Restaurant,Bakery,Gym / Fitness Center,Spa,American Restaurant,Exhibit,Hotel,Juice Bar,Yoga Studio,Wine Shop
13,Lincoln Square,Italian Restaurant,Café,Plaza,Gym / Fitness Center,Theater,Performing Arts Venue,Concert Hall,Wine Shop,French Restaurant,Grocery Store
14,Clinton,Theater,Coffee Shop,Gym / Fitness Center,Hotel,Gym,Italian Restaurant,Wine Shop,Sandwich Place,Spa,American Restaurant
15,Midtown,Coffee Shop,Hotel,Theater,Pizza Place,Cuban Restaurant,Bakery,Japanese Restaurant,Clothing Store,Cosmetics Shop,Spa
16,Murray Hill,Hotel,Sandwich Place,Coffee Shop,Gym / Fitness Center,Steakhouse,Japanese Restaurant,Pizza Place,Chinese Restaurant,Indian Restaurant,Juice Bar
17,Chelsea,Art Gallery,Coffee Shop,Café,Ice Cream Shop,American Restaurant,Market,Seafood Restaurant,Boutique,Cupcake Shop,Cycle Studio
21,Tribeca,Italian Restaurant,Park,Wine Bar,Café,Spa,Hotel,Coffee Shop,Bakery,Steakhouse,Greek Restaurant
24,West Village,Italian Restaurant,American Restaurant,Wine Bar,New American Restaurant,Coffee Shop,Jazz Club,Bakery,Park,Cocktail Bar,Pizza Place
26,Morningside Heights,Park,Coffee Shop,Bookstore,American Restaurant,Deli / Bodega,Burger Joint,Sandwich Place,Tennis Court,New American Restaurant,Supermarket


#### Cluster 3 (Green) Nature

In [16]:
manhattan_merged.loc[manhattan_merged['Cluster Labels'] == 3, manhattan_merged.columns[[1] + list(range(5, manhattan_merged.shape[1]))]]

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
11,Roosevelt Island,Playground,Park,Gym,Dry Cleaner,Greek Restaurant,Coffee Shop,Outdoors & Recreation,Sandwich Place,Scenic Lookout,School


#### Cluster 4 (Yellow) Sport Hub

In [17]:
manhattan_merged.loc[manhattan_merged['Cluster Labels'] == 4, manhattan_merged.columns[[1] + list(range(5, manhattan_merged.shape[1]))]]

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
37,Stuyvesant Town,Park,Playground,Gas Station,Gym,Gym / Fitness Center,Baseball Field,Cocktail Bar,Harbor / Marina,German Restaurant,Bistro


# Thank you for your review!