In [1]:
#!conda install -c conda-forge geopy --yes

Solving environment: - ^C
failed

CondaError: KeyboardInterrupt



In [3]:
!conda install -c conda-forge folium=0.5.0 --yes

Solving environment: done

## Package Plan ##

  environment location: /opt/conda/envs/Python36

  added / updated specs: 
    - folium=0.5.0


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    python_abi-3.6             |          1_cp36m           4 KB  conda-forge
    branca-0.4.0               |             py_0          26 KB  conda-forge
    certifi-2019.11.28         |   py36h9f0ad1d_1         149 KB  conda-forge
    vincent-0.4.4              |             py_1          28 KB  conda-forge
    ca-certificates-2019.11.28 |       hecc5488_0         145 KB  conda-forge
    folium-0.5.0               |             py_0          45 KB  conda-forge
    openssl-1.1.1e             |       h516909a_0         2.1 MB  conda-forge
    altair-4.0.1               |             py_0         575 KB  conda-forge
    ------------------------------------------------------------
                       

## The Battle of the Neighborhoods 
### Download and Explore New York city geographical coordinates dataset
Neighborhood has a total of 5 boroughs and 306 neighborhoods. In order to segement the neighborhoods and explore them, we will 
essentially need a dataset that contains the 5 boroughs and the neighborhoods that exist in each borough as well as the the latitude and logitude coordinates of each neighborhood.

 Link to the dataset: https://geo.nyu.edu/catalog/nyu_2451_34572

First, let's download all the dependencies that we will need.

In [4]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analsysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

#!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

import csv # implements classes to read and write tabular data in CSV form

print('Libraries imported.')

Libraries imported.


In [5]:
!wget -q -O 'newyork_data.json' https://ibm.box.com/shared/static/fbpwbovar7lf8p5sgddm06cgipa2rxpe.json
print('Data downloaded!')

Data downloaded!


In [6]:
with open('newyork_data.json') as json_data:
    newyork_data = json.load(json_data)

In [7]:
neighborhoods_data = newyork_data['features']

In [8]:
neighborhoods_data[0]

{'type': 'Feature',
 'id': 'nyu_2451_34572.1',
 'geometry': {'type': 'Point',
  'coordinates': [-73.84720052054902, 40.89470517661]},
 'geometry_name': 'geom',
 'properties': {'name': 'Wakefield',
  'stacked': 1,
  'annoline1': 'Wakefield',
  'annoline2': None,
  'annoline3': None,
  'annoangle': 0.0,
  'borough': 'Bronx',
  'bbox': [-73.84720052054902,
   40.89470517661,
   -73.84720052054902,
   40.89470517661]}}

### Tranform the data into a pandas dataframe
The next task is essentially transforming this data of nested Python dictionaries into a pandas dataframe. Start by creating an empty dataframe.

In [9]:
column_names = ['Borough', 'Neighborhood', 'Latitude', 'Longitude'] 

# instantiate the dataframe
neighborhoods = pd.DataFrame(columns=column_names)
neighborhoods

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude


Then loop through the data and fill the dataframe one row at a time.

In [10]:
for data in neighborhoods_data:
    borough = neighborhood_name = data['properties']['borough'] 
    neighborhood_name = data['properties']['name']
        
    neighborhood_latlon = data['geometry']['coordinates']
    neighborhood_lat = neighborhood_latlon[1]
    neighborhood_lon = neighborhood_latlon[0]
    
    neighborhoods = neighborhoods.append({'Borough': borough,
                                          'Neighborhood': neighborhood_name,
                                          'Latitude': neighborhood_lat,
                                          'Longitude': neighborhood_lon}, ignore_index=True)

In [11]:
neighborhoods.head()

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude
0,Bronx,Wakefield,40.894705,-73.847201
1,Bronx,Co-op City,40.874294,-73.829939
2,Bronx,Eastchester,40.887556,-73.827806
3,Bronx,Fieldston,40.895437,-73.905643
4,Bronx,Riverdale,40.890834,-73.912585


Let's make sure that the dataset has all 5 boroughs and 306 neighborhoods.

In [12]:
print('The dataframe has {} boroughs and {} neighborhoods.'.format(
        len(neighborhoods['Borough'].unique()),
        neighborhoods.shape[0]
    )
)


The dataframe has 5 boroughs and 306 neighborhoods.


In [13]:
neighborhoods.to_csv('BON1_NYC_GEO.csv',index=False)

### Use geopy library to get the latitude and longitude values of New York City.

In [14]:
address = 'New York City, NY'

geolocator = Nominatim(user_agent="Jupyter")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of New York City are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of New York City are 40.7127281, -74.0060152.


## Create a map of New York with neighborhoods superimposed on top.
Folium is a great visualization library. We can zoom into the below map, and click on each circle mark to reveal the name of the neighborhood and its respective borough.

In [15]:
map_NewYork = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(neighborhoods['Latitude'], neighborhoods['Longitude'], neighborhoods['Borough'], neighborhoods['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_NewYork)  
    
map_NewYork

## Web scrapping of Population and Demographics data of New York city from Wikipedia

### A : POPULATION DATA
Web scrapping of Population data from wikipedia page - https://en.wikipedia.org/wiki/New_York_City

In [16]:
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup

### Web scrapping of Population data from wikipedia page using BeautifulSoup.
Beautiful Soup is a Python package for parsing HTML and XML documents (including having malformed markup, i.e. non-closed tags, so named after tag soup). It creates a parse tree for parsed pages that can be used to extract data from HTML, which is useful for web scraping.

In [28]:
website_url = requests.get('https://en.wikipedia.org/wiki/Demographics_of_New_York_City').text
soup = BeautifulSoup(website_url,'lxml')
table = soup.find('table',{'class':'wikitable sortable'})
#print(soup.prettify())

headers = [header.text for header in table.find_all('th')]

table_rows = table.find_all('tr')        
rows = []
for row in table_rows:
   td = row.find_all('td')
   row = [row.text for row in td]
   rows.append(row)

with open('BON2_POPULATION1.csv', 'w') as f:
   writer = csv.writer(f)
   writer.writerow(headers)
   writer.writerows(row for row in rows if row)

<!DOCTYPE html>
<html class="client-nojs" dir="ltr" lang="en">
 <head>
  <meta charset="utf-8"/>
  <title>
   Demographics of New York City - Wikipedia
  </title>
  <script>
   document.documentElement.className="client-js";RLCONF={"wgBreakFrames":!1,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June","July","August","September","October","November","December"],"wgMonthNamesShort":["","Jan","Feb","Mar","Apr","May","Jun","Jul","Aug","Sep","Oct","Nov","Dec"],"wgRequestId":"XnOZBApAMFMAAB7XNsYAAACU","wgCSPNonce":!1,"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":!1,"wgNamespaceNumber":0,"wgPageName":"Demographics_of_New_York_City","wgTitle":"Demographics of New York City","wgCurRevisionId":942006397,"wgRevisionId":942006397,"wgArticleId":1729017,"wgIsArticle":!0,"wgIsRedirect":!1,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["Webarchive templat

In [68]:
Pop_data=pd.read_csv('BON2_POPULATION1.csv')
Pop_data.drop(Pop_data.columns[[9,10,11]], axis=1,inplace=True)
print('Data downloaded!')
Pop_data.head()

Data downloaded!


Unnamed: 0,New York City's five boroughsvte,Jurisdiction,Population,Gross Domestic Product,Land area,Density,Borough,County,Estimate (2018)[12],squarekm,persons / sq. mi,persons /km2
0,The Bronx\n,\n Bronx\n,"1,432,132\n",42.695\n,"29,200\n",42.10\n,109.04\n,"34,653\n","13,231\n",,,
1,Brooklyn\n,\n Kings\n,"2,582,830\n",91.559\n,"34,600\n",70.82\n,183.42\n,"37,137\n","14,649\n",,,
2,Manhattan\n,\n New York\n,"1,628,701\n",600.244\n,"360,900\n",22.83\n,59.13\n,"72,033\n","27,826\n",,,
3,Queens\n,\n Queens\n,"2,278,906\n",93.310\n,"39,600\n",108.53\n,281.09\n,"21,460\n","8,354\n",,,
4,Staten Island\n,\n Richmond\n,"476,179\n",14.514\n,"30,300\n",58.37\n,151.18\n,"8,112\n","3,132\n",,,


## Remove whitespaces and rename columns

In [69]:
Pop_data.columns = Pop_data.columns.str.replace(' ', '')
Pop_data.columns = Pop_data.columns.str.replace('\'','')
Pop_data.rename(columns={'Borough':'persons_sq_mi','County':'persons_sq_km'}, inplace=True)
Pop_data.head(10)

Unnamed: 0,NewYorkCitysfiveboroughsvte,Jurisdiction,Population,GrossDomesticProduct,Landarea,Density,persons_sq_mi,persons_sq_km,Estimate(2018)[12],squarekm,persons/sq.mi,persons/km2
0,The Bronx\n,\n Bronx\n,"1,432,132\n",42.695\n,"29,200\n",42.10\n,109.04\n,"34,653\n","13,231\n",,,
1,Brooklyn\n,\n Kings\n,"2,582,830\n",91.559\n,"34,600\n",70.82\n,183.42\n,"37,137\n","14,649\n",,,
2,Manhattan\n,\n New York\n,"1,628,701\n",600.244\n,"360,900\n",22.83\n,59.13\n,"72,033\n","27,826\n",,,
3,Queens\n,\n Queens\n,"2,278,906\n",93.310\n,"39,600\n",108.53\n,281.09\n,"21,460\n","8,354\n",,,
4,Staten Island\n,\n Richmond\n,"476,179\n",14.514\n,"30,300\n",58.37\n,151.18\n,"8,112\n","3,132\n",,,
5,City of New York,8398748,842.343,97700,302.64,783.83,28188,"10,947\n",,,,
6,State of New York,19745289,1701.399,85700,47214,122284,416.4,159\n,,,,
7,Sources:[14] and see individual borough articl...,,,,,,,,,,,


In [70]:
Pop_data.rename(columns = {'NewYorkCitysfiveboroughsvte\n' : 'Borough',
                   'Jurisdiction\n':'County',
                   'Population\n':'Estimate_2017', 
                   'Landarea\n':'square_miles',
                    'Density\n':'square_km',
                          }, inplace=True)
Pop_data

Unnamed: 0,Borough,County,Estimate_2017,GrossDomesticProduct,square_miles,square_km,persons_sq_mi,persons_sq_km,Estimate(2018)[12],squarekm,persons/sq.mi,persons/km2
0,The Bronx\n,\n Bronx\n,"1,432,132\n",42.695\n,"29,200\n",42.10\n,109.04\n,"34,653\n","13,231\n",,,
1,Brooklyn\n,\n Kings\n,"2,582,830\n",91.559\n,"34,600\n",70.82\n,183.42\n,"37,137\n","14,649\n",,,
2,Manhattan\n,\n New York\n,"1,628,701\n",600.244\n,"360,900\n",22.83\n,59.13\n,"72,033\n","27,826\n",,,
3,Queens\n,\n Queens\n,"2,278,906\n",93.310\n,"39,600\n",108.53\n,281.09\n,"21,460\n","8,354\n",,,
4,Staten Island\n,\n Richmond\n,"476,179\n",14.514\n,"30,300\n",58.37\n,151.18\n,"8,112\n","3,132\n",,,
5,City of New York,8398748,842.343,97700,302.64,783.83,28188,"10,947\n",,,,
6,State of New York,19745289,1701.399,85700,47214,122284,416.4,159\n,,,,
7,Sources:[14] and see individual borough articl...,,,,,,,,,,,


## Replace newline('\n') from each string from left and right sides

In [71]:
Pop_data['Borough']=Pop_data['Borough'].replace(to_replace='\n', value='', regex=True)
Pop_data['County']=Pop_data['County'].replace(to_replace='\n', value='', regex=True)
Pop_data['Estimate_2017']=Pop_data['Estimate_2017'].replace(to_replace='\n', value='', regex=True)
Pop_data['square_miles']=Pop_data['square_miles'].replace(to_replace='\n', value='', regex=True)
Pop_data['square_km']=Pop_data['square_km'].replace(to_replace='\n', value='', regex=True)
Pop_data['persons_sq_mi']=Pop_data['persons_sq_mi'].replace(to_replace='\n', value='', regex=True)
Pop_data['squarekm']=Pop_data['squarekm'].replace(to_replace='\n', value='', regex=True)
Pop_data

Unnamed: 0,Borough,County,Estimate_2017,GrossDomesticProduct,square_miles,square_km,persons_sq_mi,persons_sq_km,Estimate(2018)[12],squarekm,persons/sq.mi,persons/km2
0,The Bronx,Bronx,1432132.0,42.695\n,29200.0,42.1,109.04,"34,653\n","13,231\n",,,
1,Brooklyn,Kings,2582830.0,91.559\n,34600.0,70.82,183.42,"37,137\n","14,649\n",,,
2,Manhattan,New York,1628701.0,600.244\n,360900.0,22.83,59.13,"72,033\n","27,826\n",,,
3,Queens,Queens,2278906.0,93.310\n,39600.0,108.53,281.09,"21,460\n","8,354\n",,,
4,Staten Island,Richmond,476179.0,14.514\n,30300.0,58.37,151.18,"8,112\n","3,132\n",,,
5,City of New York,8398748,842.343,97700,302.64,783.83,28188.0,"10,947\n",,,,
6,State of New York,19745289,1701.399,85700,47214.0,122284.0,416.4,159\n,,,,
7,Sources:[14] and see individual borough articles,,,,,,,,,,,


In [72]:
#Pop_data.reindex('person_sq_km')
Pop_data.loc[5:,['persons_sq_mi','persons_sq_km']] = Pop_data.loc[2:,['persons_sq_mi','persons_sq_km']].shift(1,axis=1)
Pop_data.loc[5:,['square_km','persons_sq_mi']] = Pop_data.loc[2:,['square_km','persons_sq_mi']].shift(1,axis=1)
Pop_data.loc[5:,['square_miles','square_km']] = Pop_data.loc[2:,['square_miles','square_km']].shift(1,axis=1)
Pop_data.loc[5:,['Estimate_2017','square_miles']] = Pop_data.loc[2:,['Estimate_2017','square_miles']].shift(1,axis=1)
Pop_data.loc[5:,['County','Estimate_2017']] = Pop_data.loc[2:,['County','Estimate_2017']].shift(1,axis=1)
Pop_data.loc[5:,['Borough','County']] = Pop_data.loc[2:,['Borough','County']].shift(1,axis=1)
Pop_data

Unnamed: 0,Borough,County,Estimate_2017,GrossDomesticProduct,square_miles,square_km,persons_sq_mi,persons_sq_km,Estimate(2018)[12],squarekm,persons/sq.mi,persons/km2
0,The Bronx,Bronx,1432132.0,42.695\n,29200.0,42.1,109.04,"34,653\n","13,231\n",,,
1,Brooklyn,Kings,2582830.0,91.559\n,34600.0,70.82,183.42,"37,137\n","14,649\n",,,
2,Manhattan,New York,1628701.0,600.244\n,360900.0,22.83,59.13,"72,033\n","27,826\n",,,
3,Queens,Queens,2278906.0,93.310\n,39600.0,108.53,281.09,"21,460\n","8,354\n",,,
4,Staten Island,Richmond,476179.0,14.514\n,30300.0,58.37,151.18,"8,112\n","3,132\n",,,
5,,City of New York,8398748.0,97700,842.343,302.64,783.83,28188,,,,
6,,State of New York,19745289.0,85700,1701.399,47214.0,122284.0,416.4,,,,
7,,Sources:[14] and see individual borough articles,,,,,,,,,,


In [73]:
Pop_data = Pop_data.fillna('')
Pop_data

Unnamed: 0,Borough,County,Estimate_2017,GrossDomesticProduct,square_miles,square_km,persons_sq_mi,persons_sq_km,Estimate(2018)[12],squarekm,persons/sq.mi,persons/km2
0,The Bronx,Bronx,1432132.0,42.695\n,29200.0,42.1,109.04,"34,653\n","13,231\n",,,
1,Brooklyn,Kings,2582830.0,91.559\n,34600.0,70.82,183.42,"37,137\n","14,649\n",,,
2,Manhattan,New York,1628701.0,600.244\n,360900.0,22.83,59.13,"72,033\n","27,826\n",,,
3,Queens,Queens,2278906.0,93.310\n,39600.0,108.53,281.09,"21,460\n","8,354\n",,,
4,Staten Island,Richmond,476179.0,14.514\n,30300.0,58.37,151.18,"8,112\n","3,132\n",,,
5,,City of New York,8398748.0,97700,842.343,302.64,783.83,28188,,,,
6,,State of New York,19745289.0,85700,1701.399,47214.0,122284.0,416.4,,,,
7,,Sources:[14] and see individual borough articles,,,,,,,,,,


In [74]:
i = Pop_data[((Pop_data.County == 'Sources: [2] and see individual borough articles'))].index
Pop_data.drop(i)

Unnamed: 0,Borough,County,Estimate_2017,GrossDomesticProduct,square_miles,square_km,persons_sq_mi,persons_sq_km,Estimate(2018)[12],squarekm,persons/sq.mi,persons/km2
0,The Bronx,Bronx,1432132.0,42.695\n,29200.0,42.1,109.04,"34,653\n","13,231\n",,,
1,Brooklyn,Kings,2582830.0,91.559\n,34600.0,70.82,183.42,"37,137\n","14,649\n",,,
2,Manhattan,New York,1628701.0,600.244\n,360900.0,22.83,59.13,"72,033\n","27,826\n",,,
3,Queens,Queens,2278906.0,93.310\n,39600.0,108.53,281.09,"21,460\n","8,354\n",,,
4,Staten Island,Richmond,476179.0,14.514\n,30300.0,58.37,151.18,"8,112\n","3,132\n",,,
5,,City of New York,8398748.0,97700,842.343,302.64,783.83,28188,,,,
6,,State of New York,19745289.0,85700,1701.399,47214.0,122284.0,416.4,,,,
7,,Sources:[14] and see individual borough articles,,,,,,,,,,


In [75]:
Pop_data.to_csv('BON2_POPULATION.csv',index=False)

## B : DEMOGRAPHICS DATA
We will web scrap Demographics data from wikipedia page - https://en.wikipedia.org/wiki/New_York_City

Web scrapping of Demographics data from wikipedia page using BeautifulSoup.
Beautiful Soup is a Python package for parsing HTML and XML documents (including having malformed markup, i.e. non-closed tags, so named after tag soup). It creates a parse tree for parsed pages that can be used to extract data from HTML, which is useful for web scraping.

In [94]:
website_url = requests.get('https://en.wikipedia.org/w/index.php?title=New_York_City&oldid=861524529').text
soup = BeautifulSoup(website_url,'lxml')
table = soup.find('table',{'class':'wikitable sortable collapsible'})
#print(soup.prettify())

headers = [header.text for header in table.find_all('th')]

table_rows = table.find_all('tr')        
rows = []
for row in table_rows:
    td = row.find_all('td')
    row = [row.text for row in td]
    rows.append(row)

with open('NYC_DEMO.csv', 'w') as f:
    writer = csv.writer(f)
    writer.writerow(headers)
    writer.writerows(row for row in rows if row)

In [95]:
Demo_data=pd.read_csv('NYC_DEMO.csv')
print('Data downloaded!')

Data downloaded!


In [96]:
Demo_data

Unnamed: 0,Racial composition,2010[239],1990[241],1970[241],1940[241]
0,White,44.0%,52.3%,76.6%,93.6%\n
1,—Non-Hispanic,33.3%,43.2%,62.9%[242],92.0%\n
2,Black or African American,25.5%,28.7%,21.1%,6.1%\n
3,Hispanic or Latino (of any race),28.6%,24.4%,16.2%[242],1.6%\n
4,Asian,12.7%,7.0%,1.2%,−\n


In [97]:
Demo_data.columns

Index(['Racial composition', '2010[239]', '1990[241]', '1970[241]',
       '1940[241]\n'],
      dtype='object')

In [101]:
Demo_data.rename(columns = {'2010[239]' : '2010',
                   '1990[241]':'1990',
                   '1970[241]':'1970', 
                   '1940[241]\n':'1940',
                    }, inplace=True)
Demo_data

Unnamed: 0,Racial composition,2010,1990,1970,1940
0,White,44.0%,52.3%,76.6%,93.6%\n
1,—Non-Hispanic,33.3%,43.2%,62.9%[242],92.0%\n
2,Black or African American,25.5%,28.7%,21.1%,6.1%\n
3,Hispanic or Latino (of any race),28.6%,24.4%,16.2%[242],1.6%\n
4,Asian,12.7%,7.0%,1.2%,−\n


In [102]:

Demo_data.columns

Index(['Racial composition', '2010', '1990', '1970', '1940'], dtype='object')

In [103]:
Demo_data.columns = Demo_data.columns.str.replace(' ', '')

In [104]:

Demo_data= Demo_data.replace('\n',' ', regex=True)
Demo_data

Unnamed: 0,Racialcomposition,2010,1990,1970,1940
0,White,44.0%,52.3%,76.6%,93.6%
1,—Non-Hispanic,33.3%,43.2%,62.9%[242],92.0%
2,Black or African American,25.5%,28.7%,21.1%,6.1%
3,Hispanic or Latino (of any race),28.6%,24.4%,16.2%[242],1.6%
4,Asian,12.7%,7.0%,1.2%,−


In [105]:
Demo_data['1970'] = Demo_data['1970'].str.rstrip('[242]')
Demo_data

Unnamed: 0,Racialcomposition,2010,1990,1970,1940
0,White,44.0%,52.3%,76.6%,93.6%
1,—Non-Hispanic,33.3%,43.2%,62.9%,92.0%
2,Black or African American,25.5%,28.7%,21.1%,6.1%
3,Hispanic or Latino (of any race),28.6%,24.4%,16.2%,1.6%
4,Asian,12.7%,7.0%,1.2%,−


## SAVE DATAFRAME TO CSV

In [107]:
Demo_data.to_csv('BON2_DEMOGRAPHICS.csv',index=False)

## Download and Explore New York city and its Boroughs Cuisine dataset

In [108]:
from PIL import Image

In [110]:
%matplotlib inline

import matplotlib as mpl
import matplotlib.pyplot as plt

mpl.style.use('ggplot') # optional: for ggplot-like style

# check for latest version of Matplotlib
print ('Matplotlib version: ', mpl.__version__) # >= 2.0.0

# install wordcloud


# import package and its set of stopwords
from wordcloud import WordCloud, STOPWORDS

print ('Wordcloud is installed and imported!')

Matplotlib version:  3.0.2


ModuleNotFoundError: No module named 'wordcloud'

In [113]:
!conda install -c conda-forge wordcloud==1.4.1 --yes
<div class="div-col columns column-width" style="-moz-column-width: 30em; -webkit-column-width: 30em; column-width: 30em;">
<ul><li><a href="/wiki/Bedford_Park,_Bronx" title="Bedford Park, Bronx">Bedford Park</a> – Mexican, Puerto Rican, Dominican, Korean (on 204th St.)</li>
<li><a href="/wiki/Belmont,_Bronx" title="Belmont, Bronx">Belmont</a> – Italian, Albanian (also known as "Arthur Avenue," "Little Italy")</li>
<li><a href="/wiki/City_Island,_Bronx" title="City Island, Bronx">City Island</a> – Italian, Seafood</li>
<li><a href="/wiki/Morris_Park,_Bronx" title="Morris Park, Bronx">Morris Park</a> – Italian, Albanian</li>
<li><a href="/wiki/Norwood,_Bronx" title="Norwood, Bronx">Norwood</a> – Filipino (formerly Irish, less so today)</li>
<li><a href="/wiki/Riverdale,_Bronx" title="Riverdale, Bronx">Riverdale</a> – Jewish</li>
<li><a href="/wiki/South_Bronx" title="South Bronx">South Bronx</a> – Puerto Rican, Dominican</li>
<li><a href="/wiki/Wakefield,_Bronx" title="Wakefield, Bronx">Wakefield</a> – Jamaican, West Indian</li>
<li><a href="/wiki/Woodlawn,_Bronx" title="Woodlawn, Bronx">Woodlawn</a> – Irish</li></ul>
 </div>

Solving environment: - ^C
failed

CondaError: KeyboardInterrupt



In [126]:
website_url = requests.get('https://en.wikipedia.org/wiki/Cuisine_of_New_York_City').text
soup = BeautifulSoup(website_url,'lxml')
uls = soup.find({'div':'div-col columns column-width'})
#print(soup.prettify())

headers = [header.text for header in ul.find_all('li')]

table_rows = ul.find_all('li')        
lis = []
for ul in uls:
    for li in ul.findAll('li'):
        if li.find('ul'):
            break
        lis.append(li)

with open('BON3_NYC_CUISINE.csv', 'w') as f:
    writer = csv.writer(f)
    writer.writerow(headers)
    writer.writerows(li for li in uls if li)

In [130]:
my_file = project.get_file("BON3_NYC_CUISINE.csv")

# Read the CSV data file from the object storage into a pandas DataFrame
my_file.seek(0)
import pandas as pd
NYC_CUISINE=pd.read_csv("BON3_NYC_CUISINE.csv")
NYC_CUISINE.drop(NYC_CUISINE.columns[[3,4,5,6,7]], axis=1,inplace=True) 
NYC_CUISINE.head()

'my_file = project.get_file("BON3_NYC_CUISINE.csv")\n\n# Read the CSV data file from the object storage into a pandas DataFrame\nmy_file.seek(0)\nimport pandas as pd\nNYC_CUISINE=pd.read_csv("BON3_NYC_CUISINE.csv")\nNYC_CUISINE.drop(NYC_CUISINE.columns[[3,4,5,6,7]], axis=1,inplace=True) \nNYC_CUISINE.head()'

In [131]:
NYC_CUISINE.shape

'NYC_CUISINE.shape'

In [None]:
NYC_CUISINE['Borough'].value_counts().to_frame()

## 1. NEW YORK CITY CUISINE - WORD CLOUD

In [None]:
CUISINE_WC = NYC_CUISINE[['Cuisine']]
CUISINE_WC

In [None]:
CUISINE_WC.to_csv('CUISINE_WC.txt', sep=',', index=False)

In [None]:
CUISINE_WC1 = open('CUISINE_WC.txt', 'r').read()

In [None]:
stopwords = set(STOPWORDS)

In [None]:
NYC_CUISINE_WC = WordCloud(
    background_color='white',
    max_words=2000,
    stopwords=stopwords
)

# generate the word cloud
NYC_CUISINE_WC.generate(CUISINE_WC1)

In [None]:
plt.imshow(NYC_CUISINE_WC, interpolation='bilinear')
plt.axis('off')

fig = plt.figure()
fig.set_figwidth(30)
fig.set_figheight(45)

plt.show()


## Most Preferred Food in New York City -

### Italian
### Purto Rican
### Mexican
### Jewish
### Indian
### Pakistani
### Dominican

### BROOKLYN CUISINE - WORD CLOUD

In [None]:
Brooklyn_data = NYC_CUISINE[NYC_CUISINE['Borough'] == 'Brooklyn'].reset_index(drop=True)
Brooklyn_data.head()

In [None]:
BR_CUISINE_WC = Brooklyn_data[['Cuisine']]
BR_CUISINE_WC

In [None]:
BR_CUISINE_WC.to_csv('BR_CUISINE.txt', sep=',', index=False)
BR_CUISINE_WC = open('BR_CUISINE.txt', 'r').read()
stopwords = set(STOPWORDS)

In [None]:
BR_CUISINE_NYC = WordCloud(
    background_color='white',
    max_words=2000,
    stopwords=stopwords
)

# generate the word cloud
BR_CUISINE_NYC.generate(BR_CUISINE_WC)

In [None]:
plt.imshow(BR_CUISINE_NYC, interpolation='bilinear')
plt.axis('off')

fig = plt.figure()
fig.set_figwidth(30)
fig.set_figheight(45)

plt.show()


### Most Preferred Food in Brooklyn is -

### Italian
### Purto Rican
### Mexican

### QUEENS CUISINE - WORD CLOUD¶

In [None]:
Queens_data = NYC_CUISINE[NYC_CUISINE['Borough'] == 'Queens'].reset_index(drop=True)
Queens_data.head()

In [None]:

Q_CUISINE_WC = Queens_data[['Cuisine']]
Q_CUISINE_WC

In [None]:
Q_CUISINE_WC.to_csv('Q_CUISINE.txt', sep=',', index=False)

Q_CUISINE_WC = open('Q_CUISINE.txt', 'r').read()

stopwords = set(STOPWORDS)
# instantiate a word cloud object
Q_CUISINE_NYC = WordCloud(
    background_color='white',
    max_words=2000,
    stopwords=stopwords
)

# generate the word cloud
Q_CUISINE_NYC.generate(Q_CUISINE_WC)

In [None]:
plt.imshow(Q_CUISINE_NYC, interpolation='bilinear')
plt.axis('off')

fig = plt.figure()
fig.set_figwidth(30)
fig.set_figheight(45)

plt.show()

### MANHATTAN CUISINE - WORD CLOUD

In [None]:
Manhattan_data = NYC_CUISINE[NYC_CUISINE['Borough'] == 'Manhattan'].reset_index(drop=True)
Manhattan_data.head()
MN_CUISINE_WC = Manhattan_data[['Cuisine']]
MN_CUISINE_WC

In [None]:
MN_CUISINE_WC.to_csv('MN_CUISINE.txt', sep=',', index=False)

MN_CUISINE_WC = open('MN_CUISINE.txt', 'r').read()

stopwords = set(STOPWORDS)

# instantiate a word cloud object
MN_CUISINE_NYC = WordCloud(
    background_color='white',
    max_words=2000,
    stopwords=stopwords
)

# generate the word cloud
MN_CUISINE_NYC.generate(MN_CUISINE_WC)

<wordcloud.wordcloud.WordCloud at 0x7f562c126c50>

# display the word cloud
plt.imshow(MN_CUISINE_NYC, interpolation='bilinear')
plt.axis('off')

fig = plt.figure()
fig.set_figwidth(30)
fig.set_figheight(45)

plt.show()

In [None]:
Bronx_data = NYC_CUISINE[NYC_CUISINE['Borough'] == 'The Bronx'].reset_index(drop=True)
Bronx_data.head()

In [None]:
BX_CUISINE_WC = Bronx_data[['Cuisine']]
BX_CUISINE_WC

In [None]:
BX_CUISINE_WC.to_csv('BX_CUISINE.txt', sep=',', index=False)

BX_CUISINE_WC = open('BX_CUISINE.txt', 'r').read()

stopwords = set(STOPWORDS)

# instantiate a word cloud object
BX_CUISINE_NYC = WordCloud(
    background_color='white',
    max_words=2000,
    stopwords=stopwords
)

# generate the word cloud
BX_CUISINE_NYC.generate(BX_CUISINE_WC)

<wordcloud.wordcloud.WordCloud at 0x7f562c149438>

# display the word cloud
plt.imshow(BX_CUISINE_NYC, interpolation='bilinear')
plt.axis('off')

fig = plt.figure()
fig.set_figwidth(30)
fig.set_figheight(45)

plt.show()

## Most Preferred Food in The Bronx is -

### Italian
### Puerto Rican
### Albanian
### Dominican

### Download and Explore Farmers Market dataset

In [None]:
import seaborn as sns


In [None]:
my_file = project.get_file("DOHMH_Farmers_Markets_and_Food_Boxes.csv")

# Read the CSV data file from the object storage into a pandas DataFrame
my_file.seek(0)
FM_NYC=pd.read_csv(my_file)
FM_NYC.head()

In [None]:
FM_NYC.rename(columns={'Service Type':'Service_Type'}, inplace=True)
print(FM_NYC.Service_Type.unique())
FM_NYC['Service_Type'].value_counts().to_frame()

In [None]:
fig.ax = plt.subplots(1, 1, figsize=(5, 5))
sns.countplot(x='Service_Type',data=FM_NYC)
ax.set_title("Service_Type")
for t in ax.patches:
    if (np.isnan(float(t.get_height()))):
        ax.annotate('', (t.get_x(), 0))
    else:
        ax.annotate(str(format(int(t.get_height()), ',d')), (t.get_x(), t.get_height()*1.01))
    
plt.show();

In [None]:
FM_NYC_filtered = FM_NYC[FM_NYC['Service_Type'] == 'Farmers Markets'].copy()
FM_NYC_filtered ['Borough'] = FM_NYC_filtered['Borough'].map(lambda x: x.strip())
print(FM_NYC_filtered.shape)
FM_NYC_filtered.head()

In [None]:
fig.ax = plt.subplots(1, 1, figsize=(5, 5))
sns.countplot(x='Borough',data=FM_NYC_filtered)
ax.set_title("Borough")
for t in ax.patches:
    if (np.isnan(float(t.get_height()))):
        ax.annotate('', (t.get_x(), 0))
    else:
        ax.annotate(str(format(int(t.get_height()), ',d')), (t.get_x(), t.get_height()*1.01))
        ax.set_xticklabels([t.get_text().split("T")[0] for t in ax.get_xticklabels()])

# This sets the yticks "upright" with 0, as opposed to sideways with 90.
plt.xticks(rotation=90) 
plt.show()

In [None]:
address = 'New York City, NY'

geolocator = Nominatim(user_agent="Jupyter")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of New York City are {}, {}.'.format(latitude, longitude))

In [None]:
map_markets = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, FacilityName, borough in zip(FM_NYC_filtered['Latitude'], FM_NYC_filtered['Longitude'], FM_NYC_filtered['FacilityName'], FM_NYC_filtered['Borough']):
            label = '{}, {}'.format(FacilityName, borough)
            label = folium.Popup(label, parse_html=True)
            folium.CircleMarker(
                [lat, lng],
                radius=5,
                popup=label,
                color='green',
                fill=True,
                fill_color='green',
                fill_opacity=0.7,
                parse_html = False).add_to(map_markets)  

map_markets

### Segmenting and Clustering Neighborhoods - Brooklyn and Manhattan

### Introduction
In this section of the capstone project, we will use the Foursquare API to explore neighborhoods in Brooklyn and Manhattan. We will use the explore function to get the most common venue categories in each neighborhood, and then use this feature to group the neighborhoods into clusters. We will use the k-means clustering algorithm to complete this task. Finally, we will use the Folium library to visualize the neighborhoods in Brooklyn and Manhattan and their emerging clusters.

#### Table of Contents
1. Download and Explore Dataset
2. Explore Neighborhoods in Brooklyn and Manhattan
3. Analyze Each Neighborhood
4. Cluster Neighborhoods and Examine Clusters

Download all the dependencies that are needed.

In [None]:
from sklearn.cluster import KMeans

from sklearn.metrics import silhouette_score

In [None]:
NYC_Geo=pd.read_csv('BON1_NYC_GEO.csv')
print('Data downloaded!')

In [None]:
NYC_Geo.head()


In [None]:
NYC_Geo['Borough'].value_counts().to_frame()
NYC_Geo.shape
print(NYC_Geo.Borough.unique())
NYC_Geo.isnull().sum()
BM_Geo = NYC_Geo.loc[(NYC_Geo['Borough'] == 'Brooklyn')|(NYC_Geo['Borough'] == 'Manhattan')]
BM_Geo = BM_Geo.reset_index(drop=True)
BM_Geo.head()

In [None]:
BM_Geo.shape

In [None]:
import time
start_time = time.time()

address = 'New York City, NY'

geolocator = Nominatim(user_agent="Jupyter")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of New York City are {}, {}.'.format(latitude, longitude))

print("--- %s seconds ---" % round((time.time() - start_time), 2))

In [None]:
map_BM = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(BM_Geo['Latitude'], BM_Geo['Longitude'], BM_Geo['Borough'], BM_Geo['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_BM)  
    
map_BM

In [None]:
CLIENT_ID = 'ET2OBTVBCI4JHKHQNMDH5V1CMM45MA0XB2CWSC3WXESGWXLA' # your Foursquare ID
CLIENT_SECRET = 'R0RI2RUOPB50DOVGIHTAKRPIYETIORH0GQM1YAQMUDLOZ3EJ' # your Foursquare Secret
VERSION = '20181218' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

### Explore Neighborhoods in Brooklyn and Manhattan

In [None]:
def getNearbyVenues(names, latitudes, longitudes, LIMIT=200, radius=1000):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [None]:
BM_venues = getNearbyVenues(names=BM_Geo['Neighborhood'],
                                  latitudes=BM_Geo['Latitude'],
                                  longitudes=BM_Geo['Longitude'],
                                  LIMIT=200)

print('The "BM_venues" dataframe has {} venues and {} unique venue types.'.format(
      len(BM_venues['Venue Category']),
      len(BM_venues['Venue Category'].unique())))

BM_venues.to_csv('BM_venues.csv', sep=',', encoding='UTF8')
BM_venues.head()

In [None]:
colnames = ['Neighborhood', 'Neighborhood Latitude', 'Neighborhood Longitude', 'Venue', 'Venue Latitude', 'Venue Longitude', 'Venue Category']
BM_venues = pd.read_csv('BM_venues.csv', skiprows=1, names=colnames)
BM_venues.columns = BM_venues.columns.str.replace(' ', '')
BM_venues.head()

In [None]:
BM_venues.shape

In [None]:
def Venues_Map(Borough_name, Borough_neighborhoods):
    
    # Use geopy library to get the latitude and longitude values 
    geolocator = Nominatim(user_agent="Jupyter")
    Borough_location = geolocator.geocode(Borough_name) #'Brooklyn, NY'
    Borough_latitude = Borough_location.latitude
    Borough_longitude = Borough_location.longitude
    print('The geographical coordinates of "{}" are {}, {}.'.format(Borough_name, Borough_latitude, Borough_longitude))
    
    # To verify the number of Boroughs and Neighborhoods in the extracted data
    print('The "{}" dataframe has {} different venue types and {} neighborhoods.'.format(
          Borough_name,
          len(Borough_neighborhoods['VenueCategory'].unique()),
          len(Borough_neighborhoods['Neighborhood'].unique())))
    
    # create map of city using latitude and longitude values
    map_Borough = folium.Map(location=[Borough_latitude, Borough_longitude], zoom_start=10)

    # add markers to map
    for lat, lng, venue, category in zip(Borough_neighborhoods['VenueLatitude'], Borough_neighborhoods['VenueLongitude'], Borough_neighborhoods['Venue'], Borough_neighborhoods['VenueCategory']):
        label = '{}, {}'.format(category, venue)
        label = folium.Popup(label, parse_html=True)
        folium.CircleMarker(
            [lat, lng],
            radius=0.1,
            popup=label,
            color='red',
            fill=True,
            fill_color='#FF0000',
            fill_opacity=0.3).add_to(map_Borough)  

    return map_Borough

In [None]:
Venues_Map('New York City, NY', BM_venues)
BM_venues.groupby('VenueCategory')['Venue'].count().sort_values(ascending=False)

In [None]:
BM_venues.groupby('Neighborhood').count()
print('There are {} uniques categories.'.format(len(BM_venues['VenueCategory'].unique())))

### Analyze Each Neighborhood¶

In [None]:
# one hot encoding
BM_onehot = pd.get_dummies(BM_venues[['VenueCategory']], prefix="", prefix_sep="")

#column lists before adding neighborhood
column_names = ['Neighborhood'] + list(BM_onehot.columns)

# add neighborhood column back to dataframe
BM_onehot['Neighborhood'] = BM_venues['Neighborhood'] 

# move neighborhood column to the first column
BM_onehot = BM_onehot[column_names]

BM_onehot.head()

In [None]:
restaurant_List = []
search = 'Restaurant'
for i in BM_onehot.columns :
    if search in i:
        restaurant_List.append(i)
restaurant_List

In [None]:
col_name = []
col_name = ['Neighborhood'] + restaurant_List
BM_restaurant = BM_onehot[col_name]
BM_restaurant = BM_restaurant.iloc[:,1::]

In [None]:
BM_restaurant_grouped = BM_restaurant.groupby('Neighborhood').sum().reset_index()
BM_restaurant_grouped['Total'] = BM_restaurant_grouped .sum(axis=1)

### Cluster Neighborhoods and Examine Clusters
First, let's determine the optimal value of K for our dataset using the Silhouette Coefficient Method

From sklearn documentation - https://scikit-learn.org/stable/modules/clustering.html#silhouette-coefficient

A higher Silhouette Coefficient score relates to a model with better-defined clusters. The Silhouette Coefficient is defined for each sample and is composed of two scores: `

a: The mean distance between a sample and all other points in the same class.

b: The mean distance between a sample and all other points in the next nearest cluster.

The Silhouette Coefficient is for a single sample is then given as:

s=b-a/max(a,b)

Now, to find the optimal value of k for KMeans, loop through 1..n for n_clusters in KMeans and calculate Silhouette Coefficient for each sample.

A higher Silhouette Coefficient indicates that the object is well matched to its own cluster and poorly matched to neighboring clusters.

In [None]:
BM_grouped_clustering = BM_restaurant_grouped.drop('Neighborhood', 1)

for n_cluster in range(2, 10):
    kmeans = KMeans(n_clusters=n_cluster).fit(BM_grouped_clustering)
    label = kmeans.labels_
    sil_coeff = silhouette_score(BM_grouped_clustering, label, metric='euclidean')
    print("For n_clusters={}, The Silhouette Coefficient is {}".format(n_cluster, sil_coeff))

### As we can see, n_clusters=2 has highest Silhouette Coefficient. This means that 2 should be the optimal number of clusters.
### For n_clusters=2, The Silhouette Coefficient is 0.4070573233323876

### Run k-means to cluster the neighborhood into 2 clusters.

In [None]:
kclusters = 2

BM_grouped_clustering = BM_restaurant_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(BM_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_

In [None]:
BM_results = pd.DataFrame(kmeans.cluster_centers_)
BM_results.columns = BM_grouped_clustering.columns
BM_results.index = ['cluster0','cluster1']
BM_results['Total Sum'] = BM_results.sum(axis = 1)
BM_results

### Dataframe with Neighborhood,Cluster No and Total Sum

In [None]:
BM_results_merged = pd.DataFrame(BM_restaurant_grouped['Neighborhood'])

BM_results_merged['Total'] = BM_restaurant_grouped['Total']
BM_results_merged = BM_results_merged.assign(Cluster_Labels = kmeans.labels_)

In [None]:
print(BM_results_merged.shape)
BM_results_merged
BM_merged = BM_Geo

BM_merged = BM_merged.join(BM_results_merged.set_index('Neighborhood'), on='Neighborhood')

print(BM_merged.shape)
BM_merged.head(10)

### Finally, let's visualize the resulting clusters

In [None]:
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i+x+(i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(BM_merged['Latitude'], BM_merged['Longitude'], BM_merged['Neighborhood'], BM_merged['Cluster_Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

### List Neighborhoods of Interest in New York City

### CLUSTER 1

In [None]:
BM_merged[BM_merged['Cluster_Labels'] == 1].reset_index(drop=True)

### Cluster 0 : Untapped Markets

In [None]:
BM_merged[BM_merged['Total'] == 0].reset_index(drop=True)

### Segmenting and Clustering Neighborhoods - Bronx, Queens and Staten Island

### Introduction
#### In this section of the capstone project, we will use the Foursquare API to explore neighborhoods in Bronx, Queens and Staten Island.

Table of Contents
1. Download and Explore Dataset
2. Explore Neighborhoods in Bronx, Queens and Staten Island
3. Analyze Each Neighborhood
4. Cluster Neighborhoods and Examine Clusters

In [None]:
BQS_Geo = NYC_Geo.loc[(NYC_Geo['Borough'] == 'Bronx')|(NYC_Geo['Borough'] == 'Queens')|(NYC_Geo['Borough'] == 'Staten Island')]
BQS_Geo = BQS_Geo.reset_index(drop=True)
BQS_Geo.head()

In [None]:
BQS_Geo.shape

In [None]:
map_BQS = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(BQS_Geo['Latitude'], BQS_Geo['Longitude'], BQS_Geo['Borough'], BQS_Geo['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_BQS)  
    
map_BQS

 ### Explore Neighborhoods in Bronx, Queens and Staten Island¶¶
#### Run the above function on each neighborhood and create a new dataframe called BQS_venues.

In [None]:
BQS_venues = getNearbyVenues(names=BQS_Geo['Neighborhood'],
                                  latitudes=BQS_Geo['Latitude'],
                                  longitudes=BQS_Geo['Longitude'],
                                  LIMIT=200)

print('The "BQS_venues" dataframe has {} venues and {} unique venue types.'.format(
      len(BQS_venues['Venue Category']),
      len(BQS_venues['Venue Category'].unique())))

BQS_venues.to_csv('BQS_venues.csv', sep=',', encoding='UTF8')
BQS_venues.head()

In [None]:
colnames = ['Neighborhood', 'Neighborhood Latitude', 'Neighborhood Longitude', 'Venue', 'Venue Latitude', 'Venue Longitude', 'Venue Category']
BQS_venues = pd.read_csv('BQS_venues.csv', skiprows=1, names=colnames)
BQS_venues.columns = BQS_venues.columns.str.replace(' ', '')
BQS_venues.head()

In [None]:
Venues_Map('New York City, NY', BQS_venues)
BQS_venues.groupby('VenueCategory')['Venue'].count().sort_values(ascending=False)

In [None]:
BQS_venues.groupby('Neighborhood').count()

In [None]:
print('There are {} uniques categories.'.format(len(BQS_venues['VenueCategory'].unique())))

### Analyze Each Neighborhood

In [None]:
BQS_onehot = pd.get_dummies(BQS_venues[['VenueCategory']], prefix="", prefix_sep="")

#column lists before adding neighborhood
column_names = ['Neighborhood'] + list(BQS_onehot.columns)

# add neighborhood column back to dataframe
BQS_onehot['Neighborhood'] = BQS_venues['Neighborhood'] 

# move neighborhood column to the first column
BQS_onehot = BQS_onehot[column_names]

BQS_onehot.head()

In [None]:
restaurant_List1 = []
search = 'Restaurant'
for i in BQS_onehot.columns :
    if search in i:
        restaurant_List1.append(i)

In [None]:
col_name = []
col_name = ['Neighborhood'] + restaurant_List1
BQS_restaurant = BQS_onehot[col_name]
BQS_restaurant = BQS_restaurant.iloc[:,1::]

In [None]:
BQS_restaurant_grouped = BQS_restaurant.groupby('Neighborhood').sum().reset_index()

BQS_restaurant_grouped['Total'] = BQS_restaurant_grouped .sum(axis=1)

### Cluster Neighborhoods and Examine Clusters¶
#### First, let's determine the optimal value of K for our dataset using the Silhouette Coefficient Method

In [None]:
BQS_grouped_clustering = BQS_restaurant_grouped.drop('Neighborhood', 1)

for n_cluster in range(2, 10):
    kmeans = KMeans(n_clusters=n_cluster).fit(BQS_grouped_clustering)
    label = kmeans.labels_
    sil_coeff = silhouette_score(BQS_grouped_clustering, label, metric='euclidean')
    print("For n_clusters={}, The Silhouette Coefficient is {}".format(n_cluster, sil_coeff))

As we can see, n_clusters=2 has highest Silhouette Coefficient. This means that 2 should be the optimal number of clusters.
For n_clusters=2, The Silhouette Coefficient is 0.5480109689584506

Run k-means to cluster the neighborhood into 2 clusters.

In [None]:
kclusters = 2

BQS_grouped_clustering = BQS_restaurant_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(BQS_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_

In [None]:

BQS_results = pd.DataFrame(kmeans.cluster_centers_)
BQS_results.columns = BQS_grouped_clustering.columns
BQS_results.index = ['cluster0','cluster1']
BQS_results['Total Sum'] = BQS_results.sum(axis = 1)
BQS_results

### Dataframe with Neighborhood,Cluster No and Total Sum

In [None]:
BQS_results_merged = pd.DataFrame(BQS_restaurant_grouped['Neighborhood'],)
BQS_results_merged['Total'] = BQS_restaurant_grouped['Total']
BQS_results_merged = BQS_results_merged.assign(Cluster_Labels = kmeans.labels_)
print(BQS_results_merged.shape)
BQS_results_merged

In [None]:
BQS_merged = BQS_Geo

BQS_merged = BQS_merged.join(BQS_results_merged.set_index('Neighborhood'), on='Neighborhood')

print(BQS_merged.shape)
BQS_merged.head(10) # check the last columns!

In [None]:
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i+x+(i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(BQS_merged['Latitude'], BQS_merged['Longitude'], BQS_merged['Neighborhood'], BQS_merged['Cluster_Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

### List Neighborhoods of Interest in New York City - Bronx, Queens and Staten Island
#### Cluster 1 : Saturated Markets

In [None]:
BQS_merged[BQS_merged['Cluster_Labels'] == 1].reset_index(drop=True)

### Cluster 0 : Untapped Markets

In [None]:
BQS_merged[BQS_merged['Total'] == 0].reset_index(drop=True)