# NYC write joined PLUTO-LL84 dataset

In this notebook we join the PLUTO dataset to the LL84 dataset by BBL, then create the validation dataset that we use in the paper by augmenting the joined dataset with HDD and CDD data.

Because we cannot distribute the HDD and CDD rasters that we have used in processing this data, we have created a file `data/hdd_cdd_2015.csv` that contains the HDD and CDD values of each building in the joined dataset (by latitude and longitude).

In [1]:
import sys
import os
import time

import pandas as pd
import numpy as np

import CBECSLib

In [2]:
# When True, degree days come from the distributable CSV cache rather than
# from the original NOAA rasters (which cannot be redistributed).
USE_CACHED_DATA = True

if USE_CACHED_DATA:
    # Maps an exact (lat, lon) pair to its (cdd, hdd) pair.
    ds = {}

    # `with` guarantees the handle is closed even when a row fails to parse.
    with open("data/hdd_cdd_2015.csv","r") as f:
        f.readline()  # skip the header row
        for line in f:
            line = line.strip()
            if line!="":
                # Column order in the cache file: lat, lon, cdd, hdd.
                lat,lon,cdd,hdd = map(float, line.split(","))
                ds[(lat,lon)] = (cdd,hdd)

    def getDegreeDays(lat, lon):
        ''' Return the cached (cdd, hdd) pair for an exact (lat, lon) key.

        Call as: getDegreeDays(33.7490,-84.3880)

        Raises KeyError when the coordinate pair is not present in
        data/hdd_cdd_2015.csv; the match is on exact float equality.
        '''
        return ds[(lat,lon)]
else:
    from netCDF4 import Dataset

    def _readRasterVariables(path, names):
        ''' Slice out the named variables of a netCDF file, then close it.

        The file is closed before returning, also when a read fails.
        Returns a list with one entry per requested name, in order.
        '''
        raster = Dataset(os.path.expanduser(path),"r")
        try:
            return [raster.variables[name][:] for name in names]
        finally:
            raster.close()

    cdds, lats, lons = _readRasterVariables(
        "~/Dropbox/data/Degree Days/degree_day_rasters/rcp8.5_noaa/cddnoaa_2015.nc",
        ["cooldegday", "lat", "lon"]
    )
    hdds, = _readRasterVariables(
        "~/Dropbox/data/Degree Days/degree_day_rasters/rcp8.5_noaa/hddnoaa_2015.nc",
        ["heatdegday"]
    )

    def getIndices(lat, lon):
        ''' Return (latIndex, lonIndex) for a coordinate via np.searchsorted.

        NOTE(review): np.searchsorted expects `lats` and `lons` to be sorted
        ascending and yields an insertion point, not the nearest grid cell --
        confirm this matches the raster layout. The lookup is left as-is so
        results stay consistent with the cached CSV.
        '''
        latIndex = np.searchsorted(lats,lat)
        lonIndex = np.searchsorted(lons,lon)

        return latIndex,lonIndex

    def getDegreeDays(lat, lon):
        ''' Return the (cdd, hdd) pair read from the rasters for a coordinate.

        Call as: getDegreeDays(33.7490,-84.3880)
        '''
        # Both rasters are indexed with the same cell, so compute it once.
        indices = getIndices(lat,lon)

        return cdds[indices], hdds[indices]

In [3]:
# Maps a property type string to an integer class code. Each line of the
# mapping file has the form `<property type>|<integer>`; the join cell later
# skips property types whose code is -1.
propertyTypeMapping = {}

# `with` closes the handle even if reading raises.
with open("data/nyc/NYC_PropertyType_PBA_mapping.csv","r") as f:
    lines = f.read().strip().split("\n")

for line in lines:
    # Tolerate blank lines (and an empty file) instead of failing to unpack.
    if line.strip() == "":
        continue
    k,v = line.split("|")
    propertyTypeMapping[k] = int(v)

In [4]:
# Maps a BBL to its (lat, lon) pair. Each line of the centroid file is
# `bbl,lon,lat`; note that the stored tuple swaps the order to (lat, lon).
# Keys are floats -- presumably so they compare equal to BBL values that
# pandas parses as numbers; confirm against the LL84 BBL column dtype.
centroidMapping = {}

# `with` closes the handle even if reading raises.
with open("output/nyc/centroidList.csv","r") as f:
    lines = f.read().strip().split("\n")

for line in lines:
    # Tolerate blank lines (and an empty file) instead of failing to unpack.
    if line.strip() == "":
        continue
    bbl,lon,lat = line.split(",")
    centroidMapping[float(bbl)] = (float(lat), float(lon))

# Create PLUTO dataset

Here we combine all of the individual PLUTO dataset files into a single CSV.

In [5]:
# Directory that is scanned for the individual PLUTO `.csv` files to combine.
BASE_DIR = "data/nyc/BORO_zip_files_csv"

In [6]:
# One DataFrame per PLUTO CSV file found directly inside BASE_DIR.
csvFrames = []

# os.listdir returns entries in arbitrary order; sorting makes the row order
# of the combined dataset reproducible across machines and runs.
for fn in sorted(os.listdir(BASE_DIR)):
    if fn.endswith(".csv"):
        df = pd.read_csv(os.path.join(BASE_DIR, fn), header=0)
        csvFrames.append(df)

  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)


In [7]:
# Stack the per-file frames and renumber the rows 0..n-1 in one step.
df = pd.concat(csvFrames, ignore_index=True)

In [8]:
# Persist the combined frame as a pipe-delimited, UTF-8 encoded CSV.
# `index` is left at its pandas default, so the row index is written as an
# additional leading column.
# NOTE(review): that column presumably comes back as an extra unnamed column
# when the file is re-read -- confirm before changing the on-disk layout.
df.to_csv("output/nyc/allBuildings.csv",sep="|",encoding='utf-8')

# Load PLUTO dataset

In [9]:
plutoData = pd.read_csv("output/nyc/allBuildings.csv", header=0, sep="|", encoding='utf-8')

columnsToKeep = [
    "NumFloors",
    "BBL"
]
print plutoData.shape
plutoData.dropna(subset=columnsToKeep, inplace=True)
print plutoData.shape

plutoNumFloorMapping = {}
for row in plutoData.iterrows():
    row = row[1]
    
    bbl = row["BBL"]
    numFloors  = row["NumFloors"]
    
    plutoNumFloorMapping[bbl] = numFloors
print len(plutoNumFloorMapping)

  interactivity=interactivity, compiler=compiler, result=result)


(858370, 85)
(858370, 85)
858370


In [10]:
ll84Data = pd.read_csv("data/nyc/nyc_benchmarking_disclosure_data_reported_in_2016.csv", header=0, delimiter="|")

columnsToKeep = [
    "NYC Borough, Block and Lot (BBL)",
    "Primary Property Type - Self Selected",
    "Source EUI (kBtu/ft²)",
    "Site EUI (kBtu/ft²)",
    "Property GFA - Self-reported (ft²)",
]
print ll84Data.shape
ll84Data.dropna(subset=columnsToKeep, inplace=True)
print ll84Data.shape

(13223, 57)
(11438, 57)


In [11]:
tempLatLons = {}

valData = []
valTargets = []
valClassVals = []
valBBLs = []
for row in ll84Data.iterrows():
    
    row = row[1]
    
    bbl = row["NYC Borough, Block and Lot (BBL)"]
    primaryPropertyType = row["Primary Property Type - Self Selected"]
    sourceEUI = float(row["Source EUI (kBtu/ft²)"])
    siteEUI = float(row["Site EUI (kBtu/ft²)"])
    sqft = float(row["Property GFA - Self-reported (ft²)"])
    
    if (bbl in centroidMapping) and (bbl in plutoNumFloorMapping):    
        lat, lon = centroidMapping[bbl]
        numFloors = plutoNumFloorMapping[bbl]
        
        cdd, hdd = getDegreeDays(lat,lon)
        
        tempLatLons[(lat,lon)] = (cdd, hdd)
        
        if propertyTypeMapping[primaryPropertyType] != -1 and siteEUI>0 and sqft>0:
            valData.append((sqft, cdd, hdd, numFloors))
            valTargets.append((siteEUI*sqft))
            valClassVals.append(propertyTypeMapping[primaryPropertyType])
            valBBLs.append(bbl)

            
valData = np.array(valData)
valTargets = np.array(valTargets)
valClassVals = np.array(valClassVals)
valBBLs = np.array(valBBLs)

print "Classes in this dataset: ", len(set(valClassVals))

oneHotClasses, uniqueVals = CBECSLib.doOneHot(valClassVals, uniqueVals=sorted(CBECSLib.pbaLabels.keys()), returnNames=True)
X_val = np.hstack([valData,oneHotClasses])
Y_val = np.log10(valTargets)
print X_val.shape

Classes in this dataset:  17
20 classes
(2612, 24)


In [12]:
# Write each validation array to its .npy file, in this fixed order.
for outputPath, outputArray in [
    ("output/nyc/ll84_X_2016.npy", X_val),
    ("output/nyc/ll84_Y_2016.npy", Y_val),
    ("output/nyc/ll84_classVals_2016.npy", valClassVals),
    ("output/nyc/ll84_bbls_2016.npy", valBBLs),
]:
    np.save(outputPath, outputArray)