## The Battle of the Neighborhoods - Week 2

### Part 1: Getting New York City geographical and population datasets

The dataset exists for free on the web. Link to the dataset: https://geo.nyu.edu/catalog/nyu_2451_34572

In [1]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analsysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

import folium # map rendering library

import csv 

print('Libraries imported.')

Libraries imported.


The JSON file was downloaded from 'https://ibm.box.com/shared/static/fbpwbovar7lf8p5sgddm06cgipa2rxpe.json'
and placed in this directory as `newyork_data.json`.

#### Load and explore the data

In [3]:
# JSON text is expected to be UTF-8, so state the encoding explicitly instead
# of relying on the platform's default text encoding.
with open('newyork_data.json', encoding='utf-8') as json_data:
    newyork_data = json.load(json_data)

All the neighborhood records live under the `features` key, so a new variable is defined to hold that list.

In [4]:
# The parsed file is stored in `newyork_data` (see the loading cell); its
# 'features' entry is the list holding one record per neighborhood.
neighborhoods_data = newyork_data['features']

Take a look at the first item in this list.

In [6]:
# Display the first feature record to see which keys each entry carries.
neighborhoods_data[0]

{'type': 'Feature',
 'id': 'nyu_2451_34572.1',
 'geometry': {'type': 'Point',
  'coordinates': [-73.84720052054902, 40.89470517661]},
 'geometry_name': 'geom',
 'properties': {'name': 'Wakefield',
  'stacked': 1,
  'annoline1': 'Wakefield',
  'annoline2': None,
  'annoline3': None,
  'annoangle': 0.0,
  'borough': 'Bronx',
  'bbox': [-73.84720052054902,
   40.89470517661,
   -73.84720052054902,
   40.89470517661]}}

All this data is being read into a pandas dataframe.

In [7]:
# Column layout of the neighborhood table, in display order.
column_names = [
    'Borough',
    'Neighborhood',
    'Latitude',
    'Longitude',
]

# Start from an empty frame that already carries the target columns.
neighborhoods = pd.DataFrame(columns=column_names)

In [8]:
# Display the (still empty) frame to confirm the column layout.
neighborhoods

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude


Then loop through the data and fill the dataframe one row at a time.

In [9]:
# Collect one record per feature and build the dataframe in a single step.
# DataFrame.append was removed in pandas 2.0 and copied the whole frame on
# every iteration; building from a list of records is linear and also makes
# this cell safe to re-run without duplicating rows.
neighborhood_records = []
for data in neighborhoods_data:
    properties = data['properties']

    # Coordinates are stored as [longitude, latitude].
    neighborhood_latlon = data['geometry']['coordinates']
    neighborhood_lat = neighborhood_latlon[1]
    neighborhood_lon = neighborhood_latlon[0]

    neighborhood_records.append({'Borough': properties['borough'],
                                 'Neighborhood': properties['name'],
                                 'Latitude': neighborhood_lat,
                                 'Longitude': neighborhood_lon})

# Passing the column list keeps the column order stable even when
# `neighborhoods_data` is empty.
neighborhoods = pd.DataFrame(neighborhood_records,
                             columns=['Borough', 'Neighborhood', 'Latitude', 'Longitude'])

In [10]:
# Show the first rows to check that the fields landed in the right columns.
neighborhoods.head()

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude
0,Bronx,Wakefield,40.894705,-73.847201
1,Bronx,Co-op City,40.874294,-73.829939
2,Bronx,Eastchester,40.887556,-73.827806
3,Bronx,Fieldston,40.895437,-73.905643
4,Bronx,Riverdale,40.890834,-73.912585


Check the number of boroughs and neighborhoods in New York.

In [11]:
# Count distinct boroughs and total rows (one row per neighborhood), then report both.
borough_count = len(neighborhoods['Borough'].unique())
neighborhood_count = neighborhoods.shape[0]
print('The dataframe has {} boroughs and {} neighborhoods.'.format(borough_count, neighborhood_count))

The dataframe has 5 boroughs and 306 neighborhoods.


In [14]:
# Persist the neighborhood table to NYC_GEO.csv; index=False leaves the
# row index out of the file.
neighborhoods.to_csv('NYC_GEO.csv',index=False)

#### Getting longitude and latitude using geopy library

In [12]:
# Look up the coordinates of New York City; they are used to centre the map.
address = 'New York City, NY'

geolocator = Nominatim(user_agent="Jupyter")
location = geolocator.geocode(address)
# geocode() yields None when the service finds no match; fail with a clear
# message instead of an AttributeError on `.latitude` below.
if location is None:
    raise ValueError('Could not geocode address: {}'.format(address))
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of New York City are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of New York City are 40.7127281, -74.0060152.


#### Create a map of New York with neighborhoods superimposed on top.

**Folium** is used to visualize New York and its neighborhoods.

In [13]:
# create map of New York using the latitude and longitude values geocoded above
map_NewYork = folium.Map(location=[latitude, longitude], zoom_start=10)

# add one circle marker per neighborhood; the popup reads "<neighborhood>, <borough>"
for lat, lng, borough, neighborhood in zip(neighborhoods['Latitude'], neighborhoods['Longitude'], neighborhoods['Borough'], neighborhoods['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    # wrap the text in a Popup object so it can be attached to the marker
    label = folium.Popup(label, parse_html=True)
    # NOTE(review): parse_html is also passed to CircleMarker below; it looks
    # like a Popup option - confirm CircleMarker actually uses it.
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_NewYork)  
    
map_NewYork

### Getting Population and Demographics data of New York City from Wikipedia using Web Scraping

In [16]:
from bs4 import BeautifulSoup # package for parsing HTML and XML documents

Web scraping of population data from the Wikipedia page https://en.wikipedia.org/wiki/Demographics_of_New_York_City

In [17]:
# Download the page. A timeout stops the cell from hanging forever and
# raise_for_status() turns an HTTP error page into an exception instead of
# letting it be parsed as if it were the article.
response = requests.get('https://en.wikipedia.org/wiki/Demographics_of_New_York_City', timeout=30)
response.raise_for_status()
website_url = response.text  # holds the page HTML (name kept from the original cell)
soup = BeautifulSoup(website_url,'lxml')

# find() returns None when no table carries this class (e.g. after a page
# redesign); report that explicitly rather than failing on `table.find_all`.
table = soup.find('table',{'class':'wikitable sortable'})
if table is None:
    raise ValueError("No 'wikitable sortable' table found on the page")

# Text of every header cell, in document order.
headers = [header.text for header in table.find_all('th')]

# Text of the data cells of every row; header-only rows produce an empty list.
table_rows = table.find_all('tr')
rows = []
for table_row in table_rows:
    cells = table_row.find_all('td')
    rows.append([cell.text for cell in cells])

# Write with an explicit encoding so the file matches what pd.read_csv expects
# by default and does not depend on the platform's default encoding.
# NOTE(review): the csv docs recommend opening with newline=''; that is left
# unchanged here because the later cleaning cells match the '\r\n' sequences
# that currently end up in the data - confirm before changing.
with open('POPULATION.csv', 'w', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(headers)
    # Skip the empty lists produced by header-only rows.
    writer.writerows(row for row in rows if row)

In [92]:
# Load the scraped table back from POPULATION.csv and display it as read,
# before any cleaning.
Pop_data=pd.read_csv('POPULATION.csv')
Pop_data

Unnamed: 0,New York City's five boroughsvte,Jurisdiction,Population,Gross Domestic Product,Land area,Density,Borough,County,Estimate (2017)[12],billions(US$)[13],per capita(US$),square miles,squarekm,persons / sq. mi,persons /sq. km
0,The Bronx\r\n,\r\n Bronx\r\n,"1,471,160\r\n",28.787\r\n,"19,570\r\n",42.10\r\n,109.04\r\n,"34,653\r\n","13,231\r\n",,,,,,
1,Brooklyn\r\n,\r\n Kings\r\n,"2,648,771\r\n",63.303\r\n,"23,900\r\n",70.82\r\n,183.42\r\n,"37,137\r\n","14,649\r\n",,,,,,
2,Manhattan\r\n,\r\n New York\r\n,"1,664,727\r\n",629.682\r\n,"378,250\r\n",22.83\r\n,59.13\r\n,"72,033\r\n","27,826\r\n",,,,,,
3,Queens\r\n,\r\n Queens\r\n,"2,358,582\r\n",73.842\r\n,"31,310\r\n",108.53\r\n,281.09\r\n,"21,460\r\n","8,354\r\n",,,,,,
4,Staten Island\r\n,\r\n Richmond\r\n,"479,458\r\n",11.249\r\n,"23,460\r\n",58.37\r\n,151.18\r\n,"8,112\r\n","3,132\r\n",,,,,,
5,City of New York,8622698,806.863,93574,302.64,783.83,28188,"10,947\r\n",,,,,,,
6,State of New York,19849399,1547.116,78354,47214,122284,416.4,159\r\n,,,,,,,
7,Sources:[14] and see individual borough articl...,,,,,,,,,,,,,,


In [93]:
# Iterating a DataFrame yields its column labels, so this lists each column
# name with its position - the positions are used for the drop that follows.
list(enumerate(Pop_data))

[(0, "New York City's five boroughsvte\r\n"),
 (1, 'Jurisdiction\r\n'),
 (2, 'Population\r\n'),
 (3, 'Gross Domestic Product\r\n'),
 (4, 'Land area\r\n'),
 (5, 'Density\r\n'),
 (6, 'Borough'),
 (7, 'County'),
 (8, 'Estimate (2017)[12]'),
 (9, 'billions(US$)[13]'),
 (10, 'per capita(US$)'),
 (11, 'square miles'),
 (12, 'squarekm'),
 (13, 'persons / sq. mi'),
 (14, 'persons /sq. km\r\n')]

In [94]:
# Positions of the columns that are not needed downstream
# (see the enumerated column listing).
unused_positions = [3, 4, 9, 10, 11, 12, 13, 14]

# Remove those columns from the existing frame.
Pop_data.drop(columns=Pop_data.columns[unused_positions], inplace=True)
print('Data Cleaned')

Data Cleaned


In [95]:
# Strip blanks and apostrophes out of every column label.
Pop_data.columns = (
    Pop_data.columns
    .str.replace(' ', '')
    .str.replace('\'','')
)

# Relabel three columns by what their values contain. The scraped header
# labels do not line up with the data beneath them, hence the
# unexpected-looking mapping.
Pop_data.rename(
    columns={
        'Estimate(2017)[12]': 'persons_sq_km',
        'County': 'persons_sq_mi',
        'Borough': 'square_km',
    },
    inplace=True,
)
Pop_data

Unnamed: 0,NewYorkCitysfiveboroughsvte,Jurisdiction,Population,Density,square_km,persons_sq_mi,persons_sq_km
0,The Bronx\r\n,\r\n Bronx\r\n,"1,471,160\r\n",42.10\r\n,109.04\r\n,"34,653\r\n","13,231\r\n"
1,Brooklyn\r\n,\r\n Kings\r\n,"2,648,771\r\n",70.82\r\n,183.42\r\n,"37,137\r\n","14,649\r\n"
2,Manhattan\r\n,\r\n New York\r\n,"1,664,727\r\n",22.83\r\n,59.13\r\n,"72,033\r\n","27,826\r\n"
3,Queens\r\n,\r\n Queens\r\n,"2,358,582\r\n",108.53\r\n,281.09\r\n,"21,460\r\n","8,354\r\n"
4,Staten Island\r\n,\r\n Richmond\r\n,"479,458\r\n",58.37\r\n,151.18\r\n,"8,112\r\n","3,132\r\n"
5,City of New York,8622698,806.863,783.83,28188,"10,947\r\n",
6,State of New York,19849399,1547.116,122284,416.4,159\r\n,
7,Sources:[14] and see individual borough articl...,,,,,,


In [96]:
# Show the exact column labels, including any hidden control characters.
Pop_data.columns

Index(['NewYorkCitysfiveboroughsvte\r\n', 'Jurisdiction\r\n', 'Population\r\n',
       'Density\r\n', 'square_km', 'persons_sq_mi', 'persons_sq_km'],
      dtype='object')

In [97]:
# The scraped labels carry trailing line-break characters whose exact form
# depends on how the CSV was written. Strip surrounding whitespace first so
# the rename below matches whatever line ending is present, instead of
# silently doing nothing when it is not exactly '\r\n'.
Pop_data.columns = Pop_data.columns.str.strip()

# Give the remaining scraped columns meaningful names. The land-area column
# was already dropped by position earlier, so it needs no entry here.
Pop_data.rename(columns={'NewYorkCitysfiveboroughsvte': 'Borough',
                         'Jurisdiction': 'County',
                         'Population': 'Estimate_2017',
                         'Density': 'square_miles'}, inplace=True)
Pop_data

Unnamed: 0,Borough,County,Estimate_2017,square_miles,square_km,persons_sq_mi,persons_sq_km
0,The Bronx\r\n,\r\n Bronx\r\n,"1,471,160\r\n",42.10\r\n,109.04\r\n,"34,653\r\n","13,231\r\n"
1,Brooklyn\r\n,\r\n Kings\r\n,"2,648,771\r\n",70.82\r\n,183.42\r\n,"37,137\r\n","14,649\r\n"
2,Manhattan\r\n,\r\n New York\r\n,"1,664,727\r\n",22.83\r\n,59.13\r\n,"72,033\r\n","27,826\r\n"
3,Queens\r\n,\r\n Queens\r\n,"2,358,582\r\n",108.53\r\n,281.09\r\n,"21,460\r\n","8,354\r\n"
4,Staten Island\r\n,\r\n Richmond\r\n,"479,458\r\n",58.37\r\n,151.18\r\n,"8,112\r\n","3,132\r\n"
5,City of New York,8622698,806.863,783.83,28188,"10,947\r\n",
6,State of New York,19849399,1547.116,122284,416.4,159\r\n,
7,Sources:[14] and see individual borough articl...,,,,,,


In [98]:
# Delete every '\r\n' sequence found in the values of each column.
for column in ['Borough', 'County', 'Estimate_2017', 'square_miles',
               'square_km', 'persons_sq_mi', 'persons_sq_km']:
    Pop_data[column] = Pop_data[column].replace(to_replace='\r\n', value='', regex=True)
Pop_data

Unnamed: 0,Borough,County,Estimate_2017,square_miles,square_km,persons_sq_mi,persons_sq_km
0,The Bronx,Bronx,1471160.0,42.1,109.04,34653.0,13231.0
1,Brooklyn,Kings,2648771.0,70.82,183.42,37137.0,14649.0
2,Manhattan,New York,1664727.0,22.83,59.13,72033.0,27826.0
3,Queens,Queens,2358582.0,108.53,281.09,21460.0,8354.0
4,Staten Island,Richmond,479458.0,58.37,151.18,8112.0,3132.0
5,City of New York,8622698,806.863,783.83,28188.0,10947.0,
6,State of New York,19849399,1547.116,122284.0,416.4,159.0,
7,Sources:[14] and see individual borough articles,,,,,,


In [99]:
# Shift the data in the rows from index 5 onward one column to the right, so
# that their values line up with the borough rows above.
# Each statement handles one adjacent column pair: shift(1, axis=1) moves the
# left column's value into the right column and leaves the left one empty.
# The pairs are processed from the rightmost to the leftmost so that no value
# is overwritten before it has been moved.
# NOTE(review): the right-hand side slices from row 2 while the target starts
# at row 5; presumably only rows 5+ are written because the assignment aligns
# on the index - confirm.
Pop_data.loc[5:,['persons_sq_mi','persons_sq_km']] = Pop_data.loc[2:,['persons_sq_mi','persons_sq_km']].shift(1,axis=1)
Pop_data.loc[5:,['square_km','persons_sq_mi']] = Pop_data.loc[2:,['square_km','persons_sq_mi']].shift(1,axis=1)
Pop_data.loc[5:,['square_miles','square_km']] = Pop_data.loc[2:,['square_miles','square_km']].shift(1,axis=1)
Pop_data.loc[5:,['Estimate_2017','square_miles']] = Pop_data.loc[2:,['Estimate_2017','square_miles']].shift(1,axis=1)
Pop_data.loc[5:,['County','Estimate_2017']] = Pop_data.loc[2:,['County','Estimate_2017']].shift(1,axis=1)
Pop_data.loc[5:,['Borough','County']] = Pop_data.loc[2:,['Borough','County']].shift(1,axis=1)
Pop_data

Unnamed: 0,Borough,County,Estimate_2017,square_miles,square_km,persons_sq_mi,persons_sq_km
0,The Bronx,Bronx,1471160.0,42.1,109.04,34653.0,13231.0
1,Brooklyn,Kings,2648771.0,70.82,183.42,37137.0,14649.0
2,Manhattan,New York,1664727.0,22.83,59.13,72033.0,27826.0
3,Queens,Queens,2358582.0,108.53,281.09,21460.0,8354.0
4,Staten Island,Richmond,479458.0,58.37,151.18,8112.0,3132.0
5,,City of New York,8622698.0,806.863,783.83,28188.0,10947.0
6,,State of New York,19849399.0,1547.116,122284.0,416.4,159.0
7,,Sources:[14] and see individual borough articles,,,,,


In [100]:
# Swap every missing value for an empty string so blanks render cleanly.
Pop_data = Pop_data.fillna(value='')
Pop_data

Unnamed: 0,Borough,County,Estimate_2017,square_miles,square_km,persons_sq_mi,persons_sq_km
0,The Bronx,Bronx,1471160.0,42.1,109.04,34653.0,13231.0
1,Brooklyn,Kings,2648771.0,70.82,183.42,37137.0,14649.0
2,Manhattan,New York,1664727.0,22.83,59.13,72033.0,27826.0
3,Queens,Queens,2358582.0,108.53,281.09,21460.0,8354.0
4,Staten Island,Richmond,479458.0,58.37,151.18,8112.0,3132.0
5,,City of New York,8622698.0,806.863,783.83,28188.0,10947.0
6,,State of New York,19849399.0,1547.116,122284.0,416.4,159.0
7,,Sources:[14] and see individual borough articles,,,,,


In [102]:
# Drop the trailing footnote row ("Sources:...") that was scraped along with
# the table. The original cell called drop() without keeping the result, so
# the row stayed in Pop_data and was written to the CSV by the next cell.
# Matching on the 'Sources:' prefix also keeps this working if the footnote
# number or wording changes.
is_source_note = Pop_data['County'].astype(str).str.startswith('Sources:')
Pop_data = Pop_data.loc[~is_source_note]
Pop_data

Unnamed: 0,Borough,County,Estimate_2017,square_miles,square_km,persons_sq_mi,persons_sq_km
0,The Bronx,Bronx,1471160,42.1,109.04,34653.0,13231
1,Brooklyn,Kings,2648771,70.82,183.42,37137.0,14649
2,Manhattan,New York,1664727,22.83,59.13,72033.0,27826
3,Queens,Queens,2358582,108.53,281.09,21460.0,8354
4,Staten Island,Richmond,479458,58.37,151.18,8112.0,3132
5,,City of New York,8622698,806.863,783.83,28188.0,10947
6,,State of New York,19849399,1547.116,122284.0,416.4,159


In [103]:
# Save the cleaned population table to POPULATION_1.csv; index=False leaves
# the row index out of the file.
Pop_data.to_csv('POPULATION_1.csv',index=False)

### Web scraping demographics data from the Wikipedia page https://en.wikipedia.org/wiki/New_York_City

In [105]:
# Download the page. A timeout stops the cell from hanging forever and
# raise_for_status() turns an HTTP error page into an exception instead of
# letting it be parsed as if it were the article.
response = requests.get('https://en.wikipedia.org/wiki/New_York_City', timeout=30)
response.raise_for_status()
website_url = response.text  # holds the page HTML (name kept from the original cell)
soup = BeautifulSoup(website_url,'lxml')

# find() returns None when no table carries this class (e.g. after a page
# redesign); report that explicitly rather than failing on `table.find_all`.
table = soup.find('table',{'class':'wikitable sortable collapsible'})
if table is None:
    raise ValueError("No 'wikitable sortable collapsible' table found on the page")

# Text of every header cell, in document order.
headers = [header.text for header in table.find_all('th')]

# Text of the data cells of every row; header-only rows produce an empty list.
table_rows = table.find_all('tr')
rows = []
for table_row in table_rows:
    cells = table_row.find_all('td')
    rows.append([cell.text for cell in cells])

with open('NYC_DEMO.csv', 'w', encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(headers)
    # Skip the empty lists produced by header-only rows.
    writer.writerows(row for row in rows if row)

In [109]:
# Load the scraped demographics table back from NYC_DEMO.csv and display it
# as read, before any cleaning.
Demo_data=pd.read_csv('NYC_DEMO.csv')
Demo_data

Unnamed: 0,Racial composition,2010[249],1990[251],1970[251],1940[251]
0,White,44.0%,52.3%,76.6%,93.6%\r\n
1,—Non-Hispanic,33.3%,43.2%,62.9%[252],92.0%\r\n
2,Black or African American,25.5%,28.7%,21.1%,6.1%\r\n
3,Hispanic or Latino (of any race),28.6%,24.4%,16.2%[252],1.6%\r\n
4,Asian,12.7%,7.0%,1.2%,−\r\n


In [110]:
# Show the exact column labels, including footnote markers and control characters.
Demo_data.columns

Index(['Racial composition', '2010[249]', '1990[251]', '1970[251]',
       '1940[251]\r\n'],
      dtype='object')

In [111]:
# Normalise the column labels without hard-coding Wikipedia's footnote
# numbers, which change whenever the article's references are renumbered:
#   1. strip surrounding whitespace / line breaks,
#   2. remove bracketed footnote markers such as "[249]",
#   3. remove the blanks inside the label (as the original cell did).
Demo_data.columns = (
    Demo_data.columns
    .str.strip()
    .str.replace(r'\[\d+\]', '', regex=True)
    .str.replace(' ', '', regex=False)
)
Demo_data

Unnamed: 0,Racialcomposition,2010,1990,1970,1940
0,White,44.0%,52.3%,76.6%,93.6%\r\n
1,—Non-Hispanic,33.3%,43.2%,62.9%[252],92.0%\r\n
2,Black or African American,25.5%,28.7%,21.1%,6.1%\r\n
3,Hispanic or Latino (of any race),28.6%,24.4%,16.2%[252],1.6%\r\n
4,Asian,12.7%,7.0%,1.2%,−\r\n


In [112]:
# Clean every value in the table:
#   * remove bracketed footnote markers such as "[252]" wherever they occur.
#     The original used str.rstrip('[252]'), which strips any trailing run of
#     the characters '[', '2', '5', ']' (so a value ending in 2 or 5 would be
#     truncated) and only touched the 1970 column;
#   * strip leading/trailing whitespace, including the '\r\n' line breaks,
#     rather than replacing them with a space that then ends up in the CSV.
Demo_data = (
    Demo_data
    .replace(r'\[\d+\]', '', regex=True)
    .replace(r'^\s+|\s+$', '', regex=True)
)
Demo_data

Unnamed: 0,Racialcomposition,2010,1990,1970,1940
0,White,44.0%,52.3%,76.6%,93.6%
1,—Non-Hispanic,33.3%,43.2%,62.9%,92.0%
2,Black or African American,25.5%,28.7%,21.1%,6.1%
3,Hispanic or Latino (of any race),28.6%,24.4%,16.2%,1.6%
4,Asian,12.7%,7.0%,1.2%,−


In [113]:
# Save the cleaned demographics table to DEMOGRAPHICS.csv; index=False leaves
# the row index out of the file.
Demo_data.to_csv('DEMOGRAPHICS.csv',index=False)