# Download and Explore New York city geographical coordinates dataset

First, let's download and import all the libraries that we will need.

The Link to the dataset: https://geo.nyu.edu/catalog/nyu_2451_34572

In [1]:
import numpy as np # library to handle data in a vectorized manner
import pandas as pd # library for data analsysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
import json # library to handle JSON files
!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values
import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe
# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors
!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library
import csv # implements classes to read and write tabular data in CSV form
print('Libraries imported.')

Collecting package metadata (current_repodata.json): done
Solving environment: done

# All requested packages already installed.

Collecting package metadata (current_repodata.json): done
Solving environment: done

# All requested packages already installed.

Libraries imported.


The JSON file that is placed on a server, hence we run wget command to access it.

In [2]:
!wget -q -O 'newyork_data.json' https://ibm.box.com/shared/static/fbpwbovar7lf8p5sgddm06cgipa2rxpe.json
print('Data downloaded!')

Data downloaded!


#### Loading and Exploring the Data:

In [3]:
with open('newyork_data.json') as json_data:
    newyork_data = json.load(json_data)

In [4]:
neighborhoods_data = newyork_data['features']

In [5]:
neighborhoods_data[0]

{'type': 'Feature',
 'id': 'nyu_2451_34572.1',
 'geometry': {'type': 'Point',
  'coordinates': [-73.84720052054902, 40.89470517661]},
 'geometry_name': 'geom',
 'properties': {'name': 'Wakefield',
  'stacked': 1,
  'annoline1': 'Wakefield',
  'annoline2': None,
  'annoline3': None,
  'annoangle': 0.0,
  'borough': 'Bronx',
  'bbox': [-73.84720052054902,
   40.89470517661,
   -73.84720052054902,
   40.89470517661]}}

#### Transforming the data into a pandas DataFrame...

In [6]:
# define the dataframe columns
column_names = ['Borough', 'Neighborhood', 'Latitude', 'Longitude'] 

# instantiate the dataframe
neighborhoods = pd.DataFrame(columns=column_names)
neighborhoods

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude


In [7]:
for data in neighborhoods_data:
    borough = neighborhood_name = data['properties']['borough'] 
    neighborhood_name = data['properties']['name']
        
    neighborhood_latlon = data['geometry']['coordinates']
    neighborhood_lat = neighborhood_latlon[1]
    neighborhood_lon = neighborhood_latlon[0]
    
    neighborhoods = neighborhoods.append({'Borough': borough,
                                          'Neighborhood': neighborhood_name,
                                          'Latitude': neighborhood_lat,
                                          'Longitude': neighborhood_lon}, ignore_index=True)
neighborhoods.head()

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude
0,Bronx,Wakefield,40.894705,-73.847201
1,Bronx,Co-op City,40.874294,-73.829939
2,Bronx,Eastchester,40.887556,-73.827806
3,Bronx,Fieldston,40.895437,-73.905643
4,Bronx,Riverdale,40.890834,-73.912585


In [8]:
print('The dataframe has {} boroughs and {} neighborhoods.'.format(
        len(neighborhoods['Borough'].unique()),
        neighborhoods.shape[0]
    )
)

The dataframe has 5 boroughs and 306 neighborhoods.


In [9]:
neighborhoods.to_csv('BON1_NYC_GEO.csv',index=False)

#### Using GeoPy Library to get the Latitude and Longitude Values of New York City:

In [10]:
address = 'New York City, NY'

geolocator = Nominatim(user_agent="Jupyter")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of New York City are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of New York City are 40.7127281, -74.0060152.


#### Generating a Map (using Folium) with the Neighborhoods superimposed on Top:

In [11]:
# create map of Toronto using latitude and longitude values
map_NewYork = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(neighborhoods['Latitude'], neighborhoods['Longitude'], neighborhoods['Borough'], neighborhoods['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_NewYork)  
    
map_NewYork

## Webscraping the population and Demographics data of New York City:

In [12]:
import matplotlib.pyplot as plt
# conda install -c anaconda beautiful-soup --yes
from bs4 import BeautifulSoup # package for parsing HTML and XML documents
print('Libraries imported.')

Libraries imported.


#### Using BeautifulSoup, Web - Scrapping of population Data from Wikipedia Page.

In [13]:
website_url = requests.get('https://en.wikipedia.org/wiki/Demographics_of_New_York_City').text
soup = BeautifulSoup(website_url,'lxml')
table = soup.find('table',{'class':'wikitable sortable'})
#print(soup.prettify())

headers = [header.text for header in table.find_all('th')]

table_rows = table.find_all('tr')        
rows = []
for row in table_rows:
   td = row.find_all('td')
   row = [row.text for row in td]
   rows.append(row)

with open('BON2_POPULATION1.csv', 'w') as f:
   writer = csv.writer(f)
   writer.writerow(headers)
   writer.writerows(row for row in rows if row)

#### Hence, Loading the data from CSV:

In [14]:
Pop_data=pd.read_csv('BON2_POPULATION1.csv')
Pop_data.drop(Pop_data.columns[[9,10,11]], axis=1,inplace=True)
print('Data downloaded!')

Data downloaded!


In [15]:
Pop_data.columns = Pop_data.columns.str.replace(' ', '')
Pop_data.columns = Pop_data.columns.str.replace('\'','')
Pop_data.rename(columns={'Borough':'persons_sq_mi','County':'persons_sq_km'}, inplace=True)
Pop_data

Unnamed: 0,NewYorkCitysfiveboroughsvte\n,Jurisdiction\n,Population\n,GDP\n,Landarea\n,Density\n,persons_sq_mi,persons_sq_km,Estimate(2019),persons/mi2,persons/km2\n
0,The Bronx\n,\n Bronx\n,"1,418,207\n",42.695\n,42.10\n,109.04\n,"33,867\n","13,006\n",,,
1,Brooklyn\n,\n Kings\n,"2,559,903\n",91.559\n,70.82\n,183.42\n,"36,147\n","13,957\n",,,
2,Manhattan\n,\n New York\n,"1,628,706\n",600.244\n,22.83\n,59.13\n,"71,341\n","27,544\n",,,
3,Queens\n,\n Queens\n,"2,253,858\n",93.310\n,108.53\n,281.09\n,"20,767\n","8,018\n",,,
4,Staten Island\n,\n Richmond\n,"476,143\n",14.514\n,58.37\n,151.18\n,"8,157\n","3,150\n",,,
5,City of New York,8336817,842.343,302.64,783.83,27547,"10,636\n",,,,
6,State of New York,19453561,1731.910,47126.40,122056.82,412,159\n,,,,
7,Sources:[12][13][14] and see individual boroug...,,,,,,,,,,


In [16]:
Pop_data.rename(columns = {'NewYorkCitysfiveboroughsvte\n' : 'Borough',
                   'Jurisdiction\n':'County',
                   'Population\n':'Estimate_2017', 
                   'Landarea\n':'square_miles',
                    'Density\n':'square_km'}, inplace=True)
Pop_data

Unnamed: 0,Borough,County,Estimate_2017,GDP\n,square_miles,square_km,persons_sq_mi,persons_sq_km,Estimate(2019),persons/mi2,persons/km2\n
0,The Bronx\n,\n Bronx\n,"1,418,207\n",42.695\n,42.10\n,109.04\n,"33,867\n","13,006\n",,,
1,Brooklyn\n,\n Kings\n,"2,559,903\n",91.559\n,70.82\n,183.42\n,"36,147\n","13,957\n",,,
2,Manhattan\n,\n New York\n,"1,628,706\n",600.244\n,22.83\n,59.13\n,"71,341\n","27,544\n",,,
3,Queens\n,\n Queens\n,"2,253,858\n",93.310\n,108.53\n,281.09\n,"20,767\n","8,018\n",,,
4,Staten Island\n,\n Richmond\n,"476,143\n",14.514\n,58.37\n,151.18\n,"8,157\n","3,150\n",,,
5,City of New York,8336817,842.343,302.64,783.83,27547,"10,636\n",,,,
6,State of New York,19453561,1731.910,47126.40,122056.82,412,159\n,,,,
7,Sources:[12][13][14] and see individual boroug...,,,,,,,,,,


In [17]:
Pop_data['Borough']=Pop_data['Borough'].replace(to_replace='\n', value='', regex=True)
Pop_data['County']=Pop_data['County'].replace(to_replace='\n', value='', regex=True)
Pop_data['Estimate_2017']=Pop_data['Estimate_2017'].replace(to_replace='\n', value='', regex=True)
Pop_data['square_miles']=Pop_data['square_miles'].replace(to_replace='\n', value='', regex=True)
Pop_data['square_km']=Pop_data['square_km'].replace(to_replace='\n', value='', regex=True)
Pop_data['persons_sq_mi']=Pop_data['persons_sq_mi'].replace(to_replace='\n', value='', regex=True)
Pop_data['persons_sq_km']=Pop_data['persons_sq_km'].replace(to_replace='\n', value='', regex=True)
Pop_data

Unnamed: 0,Borough,County,Estimate_2017,GDP\n,square_miles,square_km,persons_sq_mi,persons_sq_km,Estimate(2019),persons/mi2,persons/km2\n
0,The Bronx,Bronx,1418207.0,42.695\n,42.1,109.04,33867.0,13006.0,,,
1,Brooklyn,Kings,2559903.0,91.559\n,70.82,183.42,36147.0,13957.0,,,
2,Manhattan,New York,1628706.0,600.244\n,22.83,59.13,71341.0,27544.0,,,
3,Queens,Queens,2253858.0,93.310\n,108.53,281.09,20767.0,8018.0,,,
4,Staten Island,Richmond,476143.0,14.514\n,58.37,151.18,8157.0,3150.0,,,
5,City of New York,8336817,842.343,302.64,783.83,27547.0,10636.0,,,,
6,State of New York,19453561,1731.91,47126.40,122056.82,412.0,159.0,,,,
7,Sources:[12][13][14] and see individual boroug...,,,,,,,,,,


In [18]:
Pop_data.loc[5:,['persons_sq_mi','persons_sq_km']] = Pop_data.loc[2:,['persons_sq_mi','persons_sq_km']].shift(1,axis=1)
Pop_data.loc[5:,['square_km','persons_sq_mi']] = Pop_data.loc[2:,['square_km','persons_sq_mi']].shift(1,axis=1)
Pop_data.loc[5:,['square_miles','square_km']] = Pop_data.loc[2:,['square_miles','square_km']].shift(1,axis=1)
Pop_data.loc[5:,['Estimate_2017','square_miles']] = Pop_data.loc[2:,['Estimate_2017','square_miles']].shift(1,axis=1)
Pop_data.loc[5:,['County','Estimate_2017']] = Pop_data.loc[2:,['County','Estimate_2017']].shift(1,axis=1)
Pop_data.loc[5:,['Borough','County']] = Pop_data.loc[2:,['Borough','County']].shift(1,axis=1)
Pop_data

Unnamed: 0,Borough,County,Estimate_2017,GDP\n,square_miles,square_km,persons_sq_mi,persons_sq_km,Estimate(2019),persons/mi2,persons/km2\n
0,The Bronx,Bronx,1418207.0,42.695\n,42.1,109.04,33867.0,13006.0,,,
1,Brooklyn,Kings,2559903.0,91.559\n,70.82,183.42,36147.0,13957.0,,,
2,Manhattan,New York,1628706.0,600.244\n,22.83,59.13,71341.0,27544.0,,,
3,Queens,Queens,2253858.0,93.310\n,108.53,281.09,20767.0,8018.0,,,
4,Staten Island,Richmond,476143.0,14.514\n,58.37,151.18,8157.0,3150.0,,,
5,,City of New York,8336817.0,302.64,842.343,783.83,27547.0,10636.0,,,
6,,State of New York,19453561.0,47126.40,1731.91,122056.82,412.0,159.0,,,
7,,Sources:[12][13][14] and see individual boroug...,,,,,,,,,


In [19]:
Pop_data = Pop_data.fillna('')
Pop_data

Unnamed: 0,Borough,County,Estimate_2017,GDP\n,square_miles,square_km,persons_sq_mi,persons_sq_km,Estimate(2019),persons/mi2,persons/km2\n
0,The Bronx,Bronx,1418207.0,42.695\n,42.1,109.04,33867.0,13006.0,,,
1,Brooklyn,Kings,2559903.0,91.559\n,70.82,183.42,36147.0,13957.0,,,
2,Manhattan,New York,1628706.0,600.244\n,22.83,59.13,71341.0,27544.0,,,
3,Queens,Queens,2253858.0,93.310\n,108.53,281.09,20767.0,8018.0,,,
4,Staten Island,Richmond,476143.0,14.514\n,58.37,151.18,8157.0,3150.0,,,
5,,City of New York,8336817.0,302.64,842.343,783.83,27547.0,10636.0,,,
6,,State of New York,19453561.0,47126.40,1731.91,122056.82,412.0,159.0,,,
7,,Sources:[12][13][14] and see individual boroug...,,,,,,,,,


#### Taking out the last row of Sources as it is Empty

In [20]:
i = Pop_data[((Pop_data.County == 'Sources:[12][13][14] and see individual borough articles'))].index
Pop_data.drop(i)

Unnamed: 0,Borough,County,Estimate_2017,GDP\n,square_miles,square_km,persons_sq_mi,persons_sq_km,Estimate(2019),persons/mi2,persons/km2\n
0,The Bronx,Bronx,1418207,42.695\n,42.1,109.04,33867,13006,,,
1,Brooklyn,Kings,2559903,91.559\n,70.82,183.42,36147,13957,,,
2,Manhattan,New York,1628706,600.244\n,22.83,59.13,71341,27544,,,
3,Queens,Queens,2253858,93.310\n,108.53,281.09,20767,8018,,,
4,Staten Island,Richmond,476143,14.514\n,58.37,151.18,8157,3150,,,
5,,City of New York,8336817,302.64,842.343,783.83,27547,10636,,,
6,,State of New York,19453561,47126.40,1731.91,122056.82,412,159,,,


In [21]:
Pop_data.to_csv('BON2_POPULATION.csv',index=False)

## DEMOGRAPHIC DATA:
#### We will web scrap Demographics data from wikipedia page - https://en.wikipedia.org/wiki/New_York_City

In [22]:
from bs4 import BeautifulSoup

In [23]:
website_url = requests.get('https://en.wikipedia.org/wiki/Demographics_of_New_York_City').text
soup = BeautifulSoup(website_url,'lxml')
table = soup.find('table',{'class':'wikitable collapsible collapsed'})
#print(soup.prettify())
headers = [header.text for header in table.find_all('th')]

table_rows = table.find_all('tr')        
rows = []
for row in table_rows:
   td = row.find_all('td')
   row = [row.text for row in td]
   rows.append(row)

with open('NYC_DEMO.csv', 'w') as f:
   writer = csv.writer(f)
   writer.writerow(headers)
   writer.writerows(row for row in rows if row)

In [24]:
Demo_data=pd.read_csv('NYC_DEMO.csv')
print('Data downloaded!')

Data downloaded!


In [25]:
Demo_data

Unnamed: 0,Racial composition\n,2010[citation needed]\n,1990[73]\n,1970[73]\n,1940[73]\n
0,White\n,44.0%\n,52.3%\n,76.6%\n,93.6%\n
1,—Non-Hispanic\n,33.3%\n,43.2%\n,62.9%[74]\n,92.0%\n
2,Black or African American\n,25.5%\n,28.7%\n,21.1%\n,6.1%\n
3,Hispanic or Latino (of any race)\n,28.6%\n,24.4%\n,16.2%[74]\n,1.6%\n
4,Asian\n,12.7%\n,7.0%\n,1.2%\n,–\n


In [26]:
Demo_data.columns

Index(['Racial composition\n', '2010[citation needed]\n', '1990[73]\n',
       '1970[73]\n', '1940[73]\n'],
      dtype='object')

In [27]:
Demo_data.rename(columns = {'Racial composition\n' : 'Racial composition',
    '2010[citation needed]\n' : '2010',
                   '1990[73]\n':'1990',
                   '1970[73]\n':'1970', 
                   '1940[73]\n':'1940',
                    }, inplace=True)
Demo_data

Unnamed: 0,Racial composition,2010,1990,1970,1940
0,White\n,44.0%\n,52.3%\n,76.6%\n,93.6%\n
1,—Non-Hispanic\n,33.3%\n,43.2%\n,62.9%[74]\n,92.0%\n
2,Black or African American\n,25.5%\n,28.7%\n,21.1%\n,6.1%\n
3,Hispanic or Latino (of any race)\n,28.6%\n,24.4%\n,16.2%[74]\n,1.6%\n
4,Asian\n,12.7%\n,7.0%\n,1.2%\n,–\n


In [28]:
Demo_data.columns

Index(['Racial composition', '2010', '1990', '1970', '1940'], dtype='object')

In [29]:
Demo_data= Demo_data.replace('\n',' ', regex=True)
Demo_data

Unnamed: 0,Racial composition,2010,1990,1970,1940
0,White,44.0%,52.3%,76.6%,93.6%
1,—Non-Hispanic,33.3%,43.2%,62.9%[74],92.0%
2,Black or African American,25.5%,28.7%,21.1%,6.1%
3,Hispanic or Latino (of any race),28.6%,24.4%,16.2%[74],1.6%
4,Asian,12.7%,7.0%,1.2%,–


In [30]:
Demo_data['1970'] = Demo_data['1970'].str.rstrip('[74]')
Demo_data

Unnamed: 0,Racial composition,2010,1990,1970,1940
0,White,44.0%,52.3%,76.6%,93.6%
1,—Non-Hispanic,33.3%,43.2%,62.9%[74],92.0%
2,Black or African American,25.5%,28.7%,21.1%,6.1%
3,Hispanic or Latino (of any race),28.6%,24.4%,16.2%[74],1.6%
4,Asian,12.7%,7.0%,1.2%,–


In [31]:
Demo_data['1970'] = Demo_data['1970'].str.replace('[[74]]', '')
Demo_data

  compiled = re.compile(pat, flags=flags)


Unnamed: 0,Racial composition,2010,1990,1970,1940
0,White,44.0%,52.3%,76.6%,93.6%
1,—Non-Hispanic,33.3%,43.2%,62.9%[7,92.0%
2,Black or African American,25.5%,28.7%,21.1%,6.1%
3,Hispanic or Latino (of any race),28.6%,24.4%,16.2%[7,1.6%
4,Asian,12.7%,7.0%,1.2%,–


#### Saving the DataFrame as a CSV File.

In [32]:
Demo_data.to_csv('BON2_DEMOGRAPHICS.csv',index=False)

In [33]:
pip install wordcloud

  from cryptography.utils import int_from_bytes
  from cryptography.utils import int_from_bytes
Collecting wordcloud
  Downloading wordcloud-1.8.1-cp37-cp37m-manylinux1_x86_64.whl (366 kB)
[K     |████████████████████████████████| 366 kB 31.4 MB/s eta 0:00:01
Installing collected packages: wordcloud
Successfully installed wordcloud-1.8.1
Note: you may need to restart the kernel to use updated packages.


In [34]:
from wordcloud import WordCloud, STOPWORDS

print ('Wordcloud is installed and imported!')

Wordcloud is installed and imported!


In [35]:
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.style.use('ggplot') # optional: for ggplot-like style
# check for latest version of Matplotlib
print ('Matplotlib version: ', mpl.__version__) # >= 2.0.0
import numpy as np # library to handle data in a vectorized manner
import pandas as pd # library for data analsysis
from PIL import Image # converting images into arrays

Matplotlib version:  3.2.2


In [36]:
!pip install keras==2.3.1
!pip install tensorflow==2.1.0
!pip install keras_applications==1.0.8
!pip install image-classifiers==1.0.0
!pip install efficientnet==1.0.0
print('Installed')

  from cryptography.utils import int_from_bytes
  from cryptography.utils import int_from_bytes
Collecting keras==2.3.1
  Downloading Keras-2.3.1-py2.py3-none-any.whl (377 kB)
[K     |████████████████████████████████| 377 kB 14.4 MB/s eta 0:00:01
Installing collected packages: keras
Successfully installed keras-2.3.1
  from cryptography.utils import int_from_bytes
  from cryptography.utils import int_from_bytes
Collecting tensorflow==2.1.0
  Downloading tensorflow-2.1.0-cp37-cp37m-manylinux2010_x86_64.whl (421.8 MB)
[K     |████████████████████████████████| 421.8 MB 37 kB/s s eta 0:00:01
Collecting tensorboard<2.2.0,>=2.1.0
  Downloading tensorboard-2.1.1-py3-none-any.whl (3.8 MB)
[K     |████████████████████████████████| 3.8 MB 43.4 MB/s eta 0:00:01
Collecting tensorflow-estimator<2.2.0,>=2.1.0rc0
  Downloading tensorflow_estimator-2.1.0-py2.py3-none-any.whl (448 kB)
[K     |████████████████████████████████| 448 kB 49.2 MB/s eta 0:00:01
Collecting gast==0.2.2
  Downloading gast-0.

In [37]:
!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library
print('installed')

Collecting package metadata (current_repodata.json): done
Solving environment: done

# All requested packages already installed.

installed


In [38]:
!conda install -c anaconda seaborn -y
import seaborn as sns

Collecting package metadata (current_repodata.json): done
Solving environment: done

# All requested packages already installed.



In [39]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analsysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors
import matplotlib.pyplot as plt
from matplotlib.ticker import NullFormatter
import matplotlib.ticker as ticker

# notice: installing seaborn might takes a few minutes
!conda install -c anaconda seaborn -y
import seaborn as sns

!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

print('Libraries imported.')

Collecting package metadata (current_repodata.json): done
Solving environment: done

# All requested packages already installed.

Collecting package metadata (current_repodata.json): done
Solving environment: done

# All requested packages already installed.

Collecting package metadata (current_repodata.json): done
Solving environment: done

# All requested packages already installed.

Libraries imported.
