# __Choosing a Neighborhood__

In [31]:
import json
import folium
import requests
import numpy as np
import pandas as pd
import matplotlib.cm as cm
import matplotlib.colors as colors
from sklearn.cluster import KMeans
from geopy.geocoders import Nominatim
from pandas.io.json import json_normalize

In [32]:
# To show all rows and columns
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

The list of neighborhoods within a given radius was found using the online tool at [Freeaptools.com.](https://www.freemaptools.com/find-cities-and-towns-inside-radius.htm) It made it easy to export the following .csv file, which contains all the cities and towns within a 10 mile radius of East Fishkill, NY, USA.

In [33]:
df = pd.read_csv('Neighborhoods.csv')

In [34]:
df.head()

Unnamed: 0,Hopewell Junction
0,Hillside Lake
1,East Fishkill
2,Fishkill Plains
3,Lomala
4,Swartoutville


##### There are two things we need to do to this dataframe. One, change the column header, and two, append ", NY" to each row. This is for the input to geocoder to get the latitude and longitude of each neighborhood.

In [35]:
df.rename(columns = {'Hopewell Junction' : 'Neighborhood'}, inplace = True)
df.head()

Unnamed: 0,Neighborhood
0,Hillside Lake
1,East Fishkill
2,Fishkill Plains
3,Lomala
4,Swartoutville


##### We'll append ", NY" as we read each row from the dataframe into geocoder.

In [36]:
# Create columns in the dataframe for the latitude and longitude
df['Latitude'] = ''
df['Longitude'] = ''

# Create the geocoder object
geolocator = Nominatim(user_agent="ny_explorer")

# Loop through neighborhoods, requesting the latitude and longitude for each one
for index, neighborhood in df.iterrows():
    address = '{}, NY'.format(neighborhood[0])    
    location = geolocator.geocode(address)
    df.iloc[index]['Latitude'] = location.latitude
    df.iloc[index]['Longitude'] = location.longitude

df.head()

Unnamed: 0,Neighborhood,Latitude,Longitude
0,Hillside Lake,41.6185,-73.7935
1,East Fishkill,41.5533,-73.7956
2,Fishkill Plains,41.6072,-73.8302
3,Lomala,41.5598,-73.8412
4,Swartoutville,41.5634,-73.849


### Create initial Folium map:

In [38]:
# Latitude and longitude of new workplace
latitude = 41.58
longitude = -73.81


# Create Folium object
map = folium.Map(location=[latitude, longitude], zoom_start=7)

# Add markers to map
for lat, lng, label in zip(df['Latitude'], df['Longitude'], df['Neighborhood']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map)  
    
map

##### If we zoom out, we see we have some outliers. We'll identify these by eye and remove them so they don't interfere with the classification.

In [8]:
df.shape

(95, 3)

In [39]:
outliers = ['Colonial Heights', 'Chelsea', 'Crown Heights', 'Milton', 'Clove', 'Berkshire Terrace']

for index, neighborhood in df.iterrows():
    for outlier in outliers:
        if neighborhood[0] == outlier:
            df.drop(index, inplace = True)

df.shape

(89, 3)

 ##### This means we have 89 neighborhoods to choose from. That's a lot.

In [10]:
# Now we'll redraw the map without the outliers
# Recreate Folium object
map = folium.Map(location=[latitude, longitude], zoom_start=10.4)

# Add markers to map
for lat, lng, label in zip(df['Latitude'], df['Longitude'], df['Neighborhood']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map)  
    
map

### Using Foursquare

In [11]:
# Foursquare credentials
CLIENT_ID = '4V4LEON5PH4AQBIAYOZ0OZRGNLS35NC1LKAFAALX5YQ0PLW4'
CLIENT_SECRET = 'SUD5EDXWNSBLVCCCNWLXRZRTCKVOV1UCKOD4U1NIQA2XCTLM' 
VERSION = '20180605' 

In [12]:
# Foursquare request parameters
names, latitudes, longitudes = df['Neighborhood'], df['Latitude'], df['Longitude']
radius = 500
LIMIT = 100

venues_list=[]
for name, lat, lng in zip(names, latitudes, longitudes):

    # create the API request URL
    url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
        CLIENT_ID, 
        CLIENT_SECRET, 
        VERSION, 
        lat, 
        lng, 
        radius, 
        LIMIT)

    # make the GET request
    results = requests.get(url).json()["response"]['groups'][0]['items']

    # return only relevant information for each nearby venue
    venues_list.append([(
        name, 
        lat, 
        lng, 
        v['venue']['name'],   
        v['venue']['categories'][0]['name']) for v in results])

venues0 = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
venues0.columns = ['Neighborhood', 
              'Latitude', 
              'Longitude', 
              'Venue', 
              'Venue Category']

venues0.head()

Unnamed: 0,Neighborhood,Latitude,Longitude,Venue,Venue Category
0,East Fishkill,41.553331,-73.795621,Plated Modern American Bistro,New American Restaurant
1,East Fishkill,41.553331,-73.795621,East Fishkill Provisions / Smoke Haus Deli,BBQ Joint
2,East Fishkill,41.553331,-73.795621,Inn At Arbor Ridge Hopewell Junction,Hotel
3,East Fishkill,41.553331,-73.795621,Mr G's Deli,Deli / Bodega
4,East Fishkill,41.553331,-73.795621,Shell,Gas Station


In [13]:
# We'll create a copy to lower the number of Foursquare API calls
venues = venues0.copy()
venues.shape

(453, 5)

In [14]:
venues.head()

Unnamed: 0,Neighborhood,Latitude,Longitude,Venue,Venue Category
0,East Fishkill,41.553331,-73.795621,Plated Modern American Bistro,New American Restaurant
1,East Fishkill,41.553331,-73.795621,East Fishkill Provisions / Smoke Haus Deli,BBQ Joint
2,East Fishkill,41.553331,-73.795621,Inn At Arbor Ridge Hopewell Junction,Hotel
3,East Fishkill,41.553331,-73.795621,Mr G's Deli,Deli / Bodega
4,East Fishkill,41.553331,-73.795621,Shell,Gas Station


##### There were 453 results returned. Let's group them by neighborhood.

In [15]:
count = venues.groupby('Neighborhood').agg(Count = pd.NamedAgg(column = 'Venue', aggfunc = 'count'))
count

Unnamed: 0_level_0,Count
Neighborhood,Unnamed: 1_level_1
Allen Corners,1
Arlington,6
Arthursburg,10
Beacon,42
Beacon Hills,3
Beekman,2
Billings,9
Brinckerhoff,2
Brockway,3
Camby,1


##### It looks like there are many neighborhoods with only a few restaurants. Let's sort the dataframe and remove those with less than 10.

In [16]:
count.sort_values(by = 'Count', ascending = False, inplace = True)
count.reset_index(inplace = True)

# Keep a list of neighborhoods to keep and drop all others
keep = []
for index, row in count.iterrows():
    if row[1] < 10:
        count.drop(index, inplace = True)
    else:
        keep.append(row[0])
    
     
count

Unnamed: 0,Neighborhood,Count
0,Knapps Corner,51
1,Beacon,42
2,Fishkill,33
3,Wappingers Falls,29
4,Newburgh,26
5,Pawling,21
6,Fishkill Plains,19
7,Poughkeepsie,18
8,Pleasant Valley,16
9,Arthursburg,10


##### This has reduced our neighborhood search from 89 neighborhoods to 10. Now, we'll make the changes to the original dataframe and convert it to one hot encoding.

In [17]:
for index, row in venues.iterrows():
    flag = False
    for k in keep:
        if k == row[0]:
            flag = True
    
    if flag == False:
        venues.drop(index, inplace = True)
        
venues.shape

(265, 5)

In [18]:
# one hot encoding
onehot = pd.get_dummies(venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
onehot['Neighborhood'] = venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [onehot.columns[-1]] + list(onehot.columns[:-1])
onehot = onehot[fixed_columns]

onehot.head()

Unnamed: 0,Neighborhood,Accessories Store,American Restaurant,Arcade,Asian Restaurant,Automotive Shop,BBQ Joint,Bagel Shop,Bakery,Bank,Bar,Beer Garden,Big Box Store,Boat or Ferry,Bookstore,Boutique,Breakfast Spot,Brewery,Burger Joint,Café,Candy Store,Chinese Restaurant,Clothing Store,Coffee Shop,Concert Hall,Construction & Landscaping,Convenience Store,Cosmetics Shop,Creperie,Dance Studio,Deli / Bodega,Department Store,Dessert Shop,Diner,Discount Store,Dive Bar,Donut Shop,Dry Cleaner,Fast Food Restaurant,Flea Market,Flower Shop,Food,French Restaurant,Furniture / Home Store,Gas Station,Gastropub,Gay Bar,Gift Shop,Gourmet Shop,Gym,Gym / Fitness Center,Harbor / Marina,Hardware Store,Health Food Store,Home Service,Hotel,Ice Cream Shop,Indie Movie Theater,Irish Pub,Italian Restaurant,Japanese Restaurant,Jewelry Store,Kids Store,Lawyer,Lingerie Store,Liquor Store,Martial Arts Dojo,Men's Store,Mexican Restaurant,Middle Eastern Restaurant,Mobile Phone Shop,Movie Theater,Music Venue,New American Restaurant,Nightlife Spot,Optical Shop,Pet Store,Pharmacy,Pizza Place,Plaza,Ramen Restaurant,Restaurant,Salon / Barbershop,Sandwich Place,Scenic Lookout,Seafood Restaurant,Shoe Store,Shopping Mall,Skating Rink,Smoke Shop,Smoothie Shop,Snack Place,Spa,Steakhouse,Supermarket,Supplement Shop,Sushi Restaurant,Taco Place,Tapas Restaurant,Tattoo Parlor,Tex-Mex Restaurant,Theater,Thrift / Vintage Store,Toy / Game Store,Train Station,Video Game Store,Vietnamese Restaurant,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio
5,Fishkill Plains,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
6,Fishkill Plains,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
7,Fishkill Plains,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
8,Fishkill Plains,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
9,Fishkill Plains,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [19]:
# Pull list of categories
categories = onehot.columns.values.tolist()

# To simplify the process of finding just the restaurants, let's find all of them 
# with 'Restaurant' in the category type.
keep = []
for category in categories:
    if category.find('Restaurant') != -1:
        keep.append(category)

edit_onehot = onehot.drop(keep, axis = 1)
        
edit_onehot.head()

# The rest will be added to the keep list by eye. 
# (in practice, a list of keywords could easily be placed in the above for loop)

Unnamed: 0,Neighborhood,Accessories Store,Arcade,Automotive Shop,BBQ Joint,Bagel Shop,Bakery,Bank,Bar,Beer Garden,Big Box Store,Boat or Ferry,Bookstore,Boutique,Breakfast Spot,Brewery,Burger Joint,Café,Candy Store,Clothing Store,Coffee Shop,Concert Hall,Construction & Landscaping,Convenience Store,Cosmetics Shop,Creperie,Dance Studio,Deli / Bodega,Department Store,Dessert Shop,Diner,Discount Store,Dive Bar,Donut Shop,Dry Cleaner,Flea Market,Flower Shop,Food,Furniture / Home Store,Gas Station,Gastropub,Gay Bar,Gift Shop,Gourmet Shop,Gym,Gym / Fitness Center,Harbor / Marina,Hardware Store,Health Food Store,Home Service,Hotel,Ice Cream Shop,Indie Movie Theater,Irish Pub,Jewelry Store,Kids Store,Lawyer,Lingerie Store,Liquor Store,Martial Arts Dojo,Men's Store,Mobile Phone Shop,Movie Theater,Music Venue,Nightlife Spot,Optical Shop,Pet Store,Pharmacy,Pizza Place,Plaza,Salon / Barbershop,Sandwich Place,Scenic Lookout,Shoe Store,Shopping Mall,Skating Rink,Smoke Shop,Smoothie Shop,Snack Place,Spa,Steakhouse,Supermarket,Supplement Shop,Taco Place,Tattoo Parlor,Theater,Thrift / Vintage Store,Toy / Game Store,Train Station,Video Game Store,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio
5,Fishkill Plains,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
6,Fishkill Plains,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
7,Fishkill Plains,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
8,Fishkill Plains,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
9,Fishkill Plains,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [20]:
add_to_keep = ['BBQ Joint', 'Bagel Shop', 'Breakfast Spot', 'Burger Joint', 'Café', 'Coffee Shop', 'Deli / Bodega', 'Diner', 'Food', 'Gastropub', 'Pizza Place', 'Sandwich Place', 'Snack Place', 'Steakhouse', 'Taco Place', 'Wings Joint'] 

for k in add_to_keep:
    keep.append(k)
    
print(keep)

['American Restaurant', 'Asian Restaurant', 'Chinese Restaurant', 'Fast Food Restaurant', 'French Restaurant', 'Italian Restaurant', 'Japanese Restaurant', 'Mexican Restaurant', 'Middle Eastern Restaurant', 'New American Restaurant', 'Ramen Restaurant', 'Restaurant', 'Seafood Restaurant', 'Sushi Restaurant', 'Tapas Restaurant', 'Tex-Mex Restaurant', 'Vietnamese Restaurant', 'BBQ Joint', 'Bagel Shop', 'Breakfast Spot', 'Burger Joint', 'Café', 'Coffee Shop', 'Deli / Bodega', 'Diner', 'Food', 'Gastropub', 'Pizza Place', 'Sandwich Place', 'Snack Place', 'Steakhouse', 'Taco Place', 'Wings Joint']


##### Now we'll drop all other categories from the dataframe.

In [21]:
keep.append('Neighborhood')
onehot = onehot[keep]

onehot.head()

Unnamed: 0,American Restaurant,Asian Restaurant,Chinese Restaurant,Fast Food Restaurant,French Restaurant,Italian Restaurant,Japanese Restaurant,Mexican Restaurant,Middle Eastern Restaurant,New American Restaurant,Ramen Restaurant,Restaurant,Seafood Restaurant,Sushi Restaurant,Tapas Restaurant,Tex-Mex Restaurant,Vietnamese Restaurant,BBQ Joint,Bagel Shop,Breakfast Spot,Burger Joint,Café,Coffee Shop,Deli / Bodega,Diner,Food,Gastropub,Pizza Place,Sandwich Place,Snack Place,Steakhouse,Taco Place,Wings Joint,Neighborhood
5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,Fishkill Plains
6,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,Fishkill Plains
7,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,Fishkill Plains
8,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,Fishkill Plains
9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,Fishkill Plains


##### Now we'll group them by neighborhood and take the mean of occurrence. 

In [22]:
grouped = onehot.groupby('Neighborhood').mean().reset_index()
grouped

Unnamed: 0,Neighborhood,American Restaurant,Asian Restaurant,Chinese Restaurant,Fast Food Restaurant,French Restaurant,Italian Restaurant,Japanese Restaurant,Mexican Restaurant,Middle Eastern Restaurant,New American Restaurant,Ramen Restaurant,Restaurant,Seafood Restaurant,Sushi Restaurant,Tapas Restaurant,Tex-Mex Restaurant,Vietnamese Restaurant,BBQ Joint,Bagel Shop,Breakfast Spot,Burger Joint,Café,Coffee Shop,Deli / Bodega,Diner,Food,Gastropub,Pizza Place,Sandwich Place,Snack Place,Steakhouse,Taco Place,Wings Joint
0,Arthursburg,0.0,0.0,0.0,0.0,0.0,0.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.1,0.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Beacon,0.095238,0.02381,0.0,0.0,0.0,0.0,0.0,0.02381,0.02381,0.0,0.02381,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.02381,0.0,0.047619,0.02381,0.0,0.02381,0.0,0.0,0.047619,0.02381,0.0,0.0,0.0,0.0
2,Fishkill,0.030303,0.0,0.0,0.090909,0.0,0.151515,0.0,0.0,0.0,0.0,0.0,0.060606,0.0,0.030303,0.0,0.0,0.0,0.0,0.030303,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.030303,0.0,0.030303,0.0,0.0
3,Fishkill Plains,0.0,0.0,0.0,0.052632,0.0,0.052632,0.052632,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.052632,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.052632,0.0,0.157895,0.052632,0.0,0.0,0.0,0.0
4,Knapps Corner,0.0,0.0,0.0,0.0,0.0,0.019608,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.019608,0.0,0.0,0.0,0.0,0.0,0.0,0.019608,0.0,0.0,0.0,0.0,0.0,0.0,0.019608,0.0,0.0,0.0
5,Newburgh,0.076923,0.0,0.0,0.0,0.0,0.115385,0.0,0.0,0.0,0.0,0.0,0.0,0.076923,0.038462,0.0,0.0,0.0,0.038462,0.0,0.0,0.0,0.038462,0.038462,0.0,0.0,0.038462,0.0,0.0,0.0,0.0,0.0,0.076923,0.0
6,Pawling,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.047619,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.095238,0.0,0.0,0.047619,0.047619,0.0,0.0,0.0,0.047619,0.0
7,Pleasant Valley,0.0,0.0,0.0625,0.0625,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.125,0.0625,0.0,0.0,0.0,0.0
8,Poughkeepsie,0.055556,0.0,0.0,0.0,0.055556,0.0,0.0,0.0,0.0,0.055556,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.055556,0.055556,0.0,0.0,0.055556,0.055556,0.0,0.0,0.0,0.0
9,Wappingers Falls,0.068966,0.034483,0.0,0.034483,0.0,0.068966,0.0,0.034483,0.0,0.034483,0.0,0.0,0.0,0.0,0.0,0.0,0.034483,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.068966,0.034483,0.0,0.0,0.0,0.034483


In [23]:
grouped.shape

(10, 34)

##### We'll use the following function in a loop to sort the columns by descending order.

In [24]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [25]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
venues_sorted = pd.DataFrame(columns=columns)
venues_sorted['Neighborhood'] = grouped['Neighborhood']

for ind in np.arange(grouped.shape[0]):
    venues_sorted.iloc[ind, 1:] = return_most_common_venues(grouped.iloc[ind, :], num_top_venues)

venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Arthursburg,Burger Joint,Italian Restaurant,Café,Wings Joint,Middle Eastern Restaurant,Seafood Restaurant,Restaurant,Ramen Restaurant,New American Restaurant,Mexican Restaurant
1,Beacon,American Restaurant,Pizza Place,Café,Breakfast Spot,Asian Restaurant,Ramen Restaurant,Sandwich Place,Middle Eastern Restaurant,Diner,Mexican Restaurant
2,Fishkill,Italian Restaurant,Fast Food Restaurant,Restaurant,American Restaurant,Bagel Shop,Sushi Restaurant,Sandwich Place,Steakhouse,Chinese Restaurant,French Restaurant
3,Fishkill Plains,Pizza Place,Italian Restaurant,Sandwich Place,Fast Food Restaurant,Food,Tapas Restaurant,Japanese Restaurant,Middle Eastern Restaurant,Seafood Restaurant,Restaurant
4,Knapps Corner,Tex-Mex Restaurant,Coffee Shop,Snack Place,Italian Restaurant,Sushi Restaurant,Seafood Restaurant,Restaurant,Ramen Restaurant,New American Restaurant,Middle Eastern Restaurant


# Cluster Neighborhoods by Restaurant Types Using K-Means

In [26]:
# set number of clusters
kclusters = 5

grouped_clustering = grouped.drop('Neighborhood', axis = 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0, n_init = 30).fit(grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([2, 0, 3, 1, 0, 4, 0, 1, 0, 0], dtype=int32)

##### Now we'll add these cluster labels to the dataframe and plot them with Folium.

In [27]:
# add clustering labels
venues_sorted.insert(0, 'Cluster Label', kmeans.labels_)

venues_sorted['Latitude'] = ''
venues_sorted['Longitude'] = ''

for index, row in df.iterrows():
    for index2, row2 in venues_sorted.iterrows():
        if row['Neighborhood'] == row2['Neighborhood']:
            venues_sorted.loc[index2, 'Latitude'] = df.loc[index, 'Latitude']
            venues_sorted.loc[index2, 'Longitude'] = df.loc[index, 'Longitude']

# Reorder columns for clarity
venues_sorted = venues_sorted[['Neighborhood', 'Cluster Label', 'Latitude', 'Longitude', '1st Most Common Venue', '2nd Most Common Venue', '3rd Most Common Venue', '4th Most Common Venue', '5th Most Common Venue', '6th Most Common Venue', '7th Most Common Venue', '8th Most Common Venue', '9th Most Common Venue', '10th Most Common Venue']]
venues_sorted.head() 

Unnamed: 0,Neighborhood,Cluster Label,Latitude,Longitude,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Arthursburg,2,41.6284,-73.7715,Burger Joint,Italian Restaurant,Café,Wings Joint,Middle Eastern Restaurant,Seafood Restaurant,Restaurant,Ramen Restaurant,New American Restaurant,Mexican Restaurant
1,Beacon,0,41.5049,-73.9697,American Restaurant,Pizza Place,Café,Breakfast Spot,Asian Restaurant,Ramen Restaurant,Sandwich Place,Middle Eastern Restaurant,Diner,Mexican Restaurant
2,Fishkill,3,41.5356,-73.8987,Italian Restaurant,Fast Food Restaurant,Restaurant,American Restaurant,Bagel Shop,Sushi Restaurant,Sandwich Place,Steakhouse,Chinese Restaurant,French Restaurant
3,Fishkill Plains,1,41.6072,-73.8302,Pizza Place,Italian Restaurant,Sandwich Place,Fast Food Restaurant,Food,Tapas Restaurant,Japanese Restaurant,Middle Eastern Restaurant,Seafood Restaurant,Restaurant
4,Knapps Corner,0,41.6286,-73.9166,Tex-Mex Restaurant,Coffee Shop,Snack Place,Italian Restaurant,Sushi Restaurant,Seafood Restaurant,Restaurant,Ramen Restaurant,New American Restaurant,Middle Eastern Restaurant


In [28]:
# Create Folium object
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=10.4)

# Set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# Add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(venues_sorted['Latitude'], venues_sorted['Longitude'], venues_sorted['Neighborhood'], venues_sorted['Cluster Label']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

##### Now we've narrowed our search into 10 neighborhoods, each being classified into one of 5 categories. When running the k means with all 89 neighborhoods, the vast majority, almost all of them, were in the cluster shown in red. So, if we want to go somewhere that has the most typical variety of restaurants, which is what we would most likely choose, we've narrowed down our search to only 5 neighborhoods. Of course, we should explore the other areas too. Maybe we'd like the more atypical variety of restaurants more. 