In [2]:
import os
import re
import math
from collections import defaultdict
import pandas as pd
import numpy as np
import json
import folium
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline

from IPython.core.display import display, HTML

# Turn slice copy warnings off
pd.options.mode.chained_assignment = None

import findspark
findspark.init()
from pyspark.sql import *
from pyspark.sql.functions import *
from pyspark.sql.functions import min
from pyspark.sql.types import ArrayType, StringType, DoubleType, IntegerType, FloatType

from pyspark.sql import SparkSession
from pyspark import SparkContext

spark = SparkSession.builder.getOrCreate()
sc = spark.sparkContext

In [3]:
data_folder = './data/'

In [4]:
housing_data_path = data_folder + 'Housing_New_York_Units_by_Building.csv'

In [53]:
# Read the data in a spark dataframe
housing_data = spark.read.format("csv").option("header", "true").load(housing_data_path)

#Show the first rows in a nice format
df.limit(5).toPandas().head()

#Get an idea of the used columns
df.toPandas().describe()

Unnamed: 0,Project ID,Project Name,Project Start Date,Project Completion Date,Building ID,Number,Street,Borough,Postcode,BBL,...,2-BR Units,3-BR Units,4-BR Units,5-BR Units,6-BR+ Units,Unknown-BR Units,Counted Rental Units,Counted Homeownership Units,All Counted Units,Total Units
count,3396,3396,3396,2077,2585,3396,3396,3396,2489,2534,...,3396,3396,3396,3396,3396,3396,3396,3396,3396,3396
unique,1689,879,786,697,2577,1485,805,5,120,2108,...,129,68,27,6,4,19,227,89,249,288
top,53017,CONFIDENTIAL,06/25/2015,06/26/2017,927213,----,----,Brooklyn,11233,1009720001,...,0,0,0,0,0,0,0,0,1,1
freq,83,811,90,70,2,811,811,1449,141,35,...,1312,1916,3077,3362,3383,3066,1101,2219,829,825


Unnamed: 0,Project ID,Project Name,Project Start Date,Project Completion Date,Building ID,Number,Street,Borough,Postcode,BBL,...,2-BR Units,3-BR Units,4-BR Units,5-BR Units,6-BR+ Units,Unknown-BR Units,Counted Rental Units,Counted Homeownership Units,All Counted Units,Total Units
0,67358,CONFIDENTIAL,06/29/2018,,,----,----,Queens,,,...,0,2,0,0,0,0,0,2,2,2
1,67359,CONFIDENTIAL,06/29/2018,,,----,----,Queens,,,...,0,2,0,0,0,0,0,2,2,2
2,67362,CONFIDENTIAL,06/29/2018,,,----,----,Staten Island,,,...,1,0,0,0,0,0,0,1,1,1
3,67365,CONFIDENTIAL,06/29/2018,,,----,----,Brooklyn,,,...,4,0,0,0,0,0,0,4,4,4
4,67373,CONFIDENTIAL,06/29/2018,,,----,----,Brooklyn,,,...,0,0,0,0,0,2,0,2,2,2


In [44]:
#df.select('Borough').groupBy('PostCode').count().show()
df.select('Borough').distinct().show()


+-------------+
|      Borough|
+-------------+
|       Queens|
|     Brooklyn|
|Staten Island|
|    Manhattan|
|        Bronx|
+-------------+



['Project ID',
 'Project Name',
 'Project Start Date',
 'Project Completion Date',
 'Building ID',
 'Number',
 'Street',
 'Borough',
 'Postcode',
 'BBL',
 'BIN',
 'Community Board',
 'Council District',
 'Census Tract',
 'NTA - Neighborhood Tabulation Area',
 'Latitude',
 'Longitude',
 'Latitude (Internal)',
 'Longitude (Internal)',
 'Building Completion Date',
 'Reporting Construction Type',
 'Extended Affordability Only',
 'Prevailing Wage Status',
 'Extremely Low Income Units',
 'Very Low Income Units',
 'Low Income Units',
 'Moderate Income Units',
 'Middle Income Units',
 'Other Income Units',
 'Studio Units',
 '1-BR Units',
 '2-BR Units',
 '3-BR Units',
 '4-BR Units',
 '5-BR Units',
 '6-BR+ Units',
 'Unknown-BR Units',
 'Counted Rental Units',
 'Counted Homeownership Units',
 'All Counted Units',
 'Total Units']

In [52]:
exprs = [col(column).alias(column.replace(' ', '_')) for column in df.columns]
tmp = df.select(*exprs)
tmp = tmp.select(['Borough', 'Total_Units', 'Extremely_Low_Income_Units', 'Very Low Income Units', 'Moderate Income Units', 'Middle Income Units', 'Other Income Units'])

tmp = tmp.withColumn("nrUnits", tmp.Total_Units.cast(IntegerType()))
tmp = tmp.withColumn("eliUnits", tmp.Extremely_Low_Income_Units.cast(IntegerType()))

tmp.show()

tmp2 = tmp.select(['Borough', 'nrUnits', 'LowIncomeU']).groupBy('Borough').agg({"nrUnits": "sum", "LowIncomeU": "sum"})

tmp2 = tmp2.toPandas()
type(tmp2)
tmp2.columns

tmp2.head(5)

+-------------+-----------+--------------------------+-------+----------+
|      Borough|Total_Units|Extremely_Low_Income_Units|nrUnits|LowIncomeU|
+-------------+-----------+--------------------------+-------+----------+
|       Queens|          2|                         0|      2|         0|
|       Queens|          2|                         0|      2|         0|
|Staten Island|          1|                         0|      1|         0|
|     Brooklyn|          4|                         0|      4|         0|
|     Brooklyn|          2|                         0|      2|         0|
|Staten Island|          2|                         0|      2|         0|
|Staten Island|          2|                         0|      2|         0|
|     Brooklyn|          1|                         0|      1|         0|
|        Bronx|          2|                         0|      2|         0|
|        Bronx|          3|                         0|      3|         0|
|        Bronx|          2|           

Unnamed: 0,Borough,sum(LowIncomeU),sum(nrUnits)
0,Queens,1151,11972
1,Brooklyn,4702,43953
2,Staten Island,886,2703
3,Manhattan,4209,46376
4,Bronx,7223,36000


In [41]:
m = folium.Map(location=[40.75, -74])

borough_edge = json.load(open(os.path.join('data', 'nyc-boroughs.json')))

folium.GeoJson(
    borough_edge,
    name='geojson'
).add_to(m)


m.choropleth(
    geo_data=borough_edge, 
             data=tmp2,
             columns=['Borough', 'sum(nrUnits)'],
            key_on = 'properties.borough',
            fill_color='BuPu', fill_opacity=0.7, line_opacity=0.3,
            legend_name='Percentage of Voters Voting for UDC')

m