In [1]:
import os
import re
import math
from collections import defaultdict
import pandas as pd
import numpy as np
import json
import folium
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline

from IPython.core.display import display, HTML

# Turn slice copy warnings off
pd.options.mode.chained_assignment = None

import findspark
findspark.init()
from pyspark.sql import *
from pyspark.sql.functions import *
from pyspark.sql.functions import min
from pyspark.sql.types import ArrayType, StringType, DoubleType, IntegerType, FloatType

from pyspark.sql import SparkSession
from pyspark import SparkContext

# Obtain a Spark session through the builder's getOrCreate() call
spark = SparkSession.builder.getOrCreate()
# Keep a handle to the SparkContext that belongs to the session
sc = spark.sparkContext

In [2]:
# Directory that holds the input files of this notebook
data_folder = './data/'

# Location of the "Housing New York Units by Building" CSV export
housing_data_path = os.path.join(data_folder, 'Housing_New_York_Units_by_Building.csv')

In [3]:
# Read the data into a pandas dataframe (the dataset is small enough that Spark is not needed)
housing_data = pd.read_csv(housing_data_path)

# Show the first rows in a nice format. Only the last expression of a cell is
# rendered automatically, so the preview has to be displayed explicitly.
display(housing_data.head())

# Get an idea of the available columns
housing_data.columns


Index(['Project ID', 'Project Name', 'Project Start Date',
       'Project Completion Date', 'Building ID', 'Number', 'Street', 'Borough',
       'Postcode', 'BBL', 'BIN', 'Community Board', 'Council District',
       'Census Tract', 'NTA - Neighborhood Tabulation Area', 'Latitude',
       'Longitude', 'Latitude (Internal)', 'Longitude (Internal)',
       'Building Completion Date', 'Reporting Construction Type',
       'Extended Affordability Only', 'Prevailing Wage Status',
       'Extremely Low Income Units', 'Very Low Income Units',
       'Low Income Units', 'Moderate Income Units', 'Middle Income Units',
       'Other Income Units', 'Studio Units', '1-BR Units', '2-BR Units',
       '3-BR Units', '4-BR Units', '5-BR Units', '6-BR+ Units',
       'Unknown-BR Units', 'Counted Rental Units',
       'Counted Homeownership Units', 'All Counted Units', 'Total Units'],
      dtype='object')

We can see that many of the columns have no value for our research purpose. The only thing we want to achieve is to get an idea of the housing prices per borough. We don't care about individual buildings/projects, since we will aggregate them per borough.

In [4]:
# Count the missing values of every column in one pass, then report them
nan_counts = housing_data.isna().sum()
for column_name, n_missing in nan_counts.items():
    print(column_name + ': ' + str(n_missing))

# Count the rows that have an exact copy elsewhere in the frame
# (keep=False flags every member of a group of identical rows)
dupl = housing_data.duplicated(keep=False).sum()
print("\n\nNumber of duplicate rows: " + str(dupl))


Project ID: 0
Project Name: 0
Project Start Date: 0
Project Completion Date: 1319
Building ID: 811
Number: 0
Street: 0
Borough: 0
Postcode: 907
BBL: 862
BIN: 1136
Community Board: 0
Council District: 6
Census Tract: 901
NTA - Neighborhood Tabulation Area: 901
Latitude: 907
Longitude: 907
Latitude (Internal): 887
Longitude (Internal): 887
Building Completion Date: 1123
Reporting Construction Type: 0
Extended Affordability Only: 0
Prevailing Wage Status: 0
Extremely Low Income Units: 0
Very Low Income Units: 0
Low Income Units: 0
Moderate Income Units: 0
Middle Income Units: 0
Other Income Units: 0
Studio Units: 0
1-BR Units: 0
2-BR Units: 0
3-BR Units: 0
4-BR Units: 0
5-BR Units: 0
6-BR+ Units: 0
Unknown-BR Units: 0
Counted Rental Units: 0
Counted Homeownership Units: 0
All Counted Units: 0
Total Units: 0


Number of duplicate rows: 0


There are no duplicate rows in the dataset, but there are quite a lot of rows with NaN values. Fortunately, none of the NaN values occur in the columns we are interested in.

In [5]:
# Keep only the borough and the unit counts per income band
features = [
    'Borough',
    'Extremely Low Income Units',
    'Very Low Income Units',
    'Low Income Units',
    'Moderate Income Units',
    'Middle Income Units',
    'Other Income Units',
    'Total Units',
]
housing_data = housing_data.loc[:, features]
housing_data.head()


Unnamed: 0,Borough,Extremely Low Income Units,Very Low Income Units,Low Income Units,Moderate Income Units,Middle Income Units,Other Income Units,Total Units
0,Queens,0,0,0,2,0,0,2
1,Queens,0,0,2,0,0,0,2
2,Staten Island,0,0,1,0,0,0,1
3,Brooklyn,0,0,4,0,0,0,4
4,Brooklyn,0,0,2,0,0,0,2


Below you can find a description of the different columns: 
Extremely Low Income Units are units with rents that are affordable to households earning 0 to 30% of the area median income (AMI).

Very Low Income Units are units with rents that are affordable to households earning 31 to 50% of the area median income (AMI).

Low Income Units are units with rents that are affordable to households earning 51 to 80% of the area median income (AMI).

Moderate Income Units are units with rents that are affordable to households earning 81 to 120% of the area median income (AMI).

Middle Income Units are units with rents that are affordable to households earning 121 to 165% of the area median income (AMI).

Other Income Units are units reserved for building superintendents.

All these values are based on the Area Median Income (AMI). This value describes the median income for a family of a given size in the NYC metropolitan area. For the year 2018, the values were as follows:
* \$104,300 - Family of four
* \$93,900 - Family of three
* \$83,500 - Family of two
* \$73,100 - Individual

The category of a unit thus depends on the price of that unit and on the size of the unit.

In [6]:
# Replace the spaces in the column names with underscores
housing_data.columns = [name.replace(' ', '_') for name in housing_data.columns]

# Sum the unit counts of all buildings within each borough
housing_data_agg = housing_data.groupby(['Borough']).sum()

# For every income band, add a column holding its share of the borough's total units.
# Each pair is (name of the new share column, name of the unit-count column).
share_columns = [
    ('eli_rel', 'Extremely_Low_Income_Units'),
    ('vli_rel', 'Very_Low_Income_Units'),
    ('li_rel', 'Low_Income_Units'),
    ('moi_rel', 'Moderate_Income_Units'),
    ('mii_rel', 'Middle_Income_Units'),
    ('oi_rel', 'Other_Income_Units'),
]
total_units = housing_data_agg['Total_Units']
for share_name, units_name in share_columns:
    housing_data_agg[share_name] = housing_data_agg[units_name] / total_units

display(housing_data_agg.head())

# Turn the Borough index back into a regular column
housing_data_agg = housing_data_agg.reset_index()



Unnamed: 0_level_0,Extremely_Low_Income_Units,Very_Low_Income_Units,Low_Income_Units,Moderate_Income_Units,Middle_Income_Units,Other_Income_Units,Total_Units,eli_rel,vli_rel,li_rel,moi_rel,mii_rel,oi_rel
Borough,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
Bronx,7223,8447,16587,1575,610,246,36000,0.200639,0.234639,0.46075,0.04375,0.016944,0.006833
Brooklyn,4702,7677,16339,1981,1949,150,43953,0.106978,0.174664,0.371738,0.045071,0.044343,0.003413
Manhattan,4209,7159,10964,1998,6982,116,46376,0.090758,0.154369,0.236415,0.043083,0.150552,0.002501
Queens,1151,2357,4443,1102,1017,27,11972,0.096141,0.196876,0.371116,0.092048,0.084948,0.002255
Staten Island,886,264,950,23,28,6,2703,0.327784,0.097669,0.351461,0.008509,0.010359,0.00222


In [7]:
#Create a map to see how housing prices are distributed over the different boroughs

m = folium.Map(location=[40.75, -74])

# Load the borough outlines; the context manager closes the file handle
# as soon as the GeoJSON has been parsed
with open(os.path.join('data', 'nyc-boroughs.json')) as geojson_file:
    borough_edge = json.load(geojson_file)

# Plain outline layer of the boroughs
folium.GeoJson(
    borough_edge,
    name='geojson'
).add_to(m)

# Shade each borough by its share of low income units (li_rel).
# folium.Choropleth is used instead of the deprecated Map.choropleth method.
# key_on names the GeoJSON property that is matched against the 'Borough' column.
folium.Choropleth(
    geo_data=borough_edge,
    data=housing_data_agg,
    columns=['Borough', 'li_rel'],
    key_on='feature.properties.borough',
    fill_color='BuPu', fill_opacity=0.7, line_opacity=0.3,
    legend_name='Share of low income units',
    highlight=True
).add_to(m)

m