# This loads and integrates PLUTO, DOB/ECB, and ACS data and aggregates to the census tract level. It truncates the data to include 2014 data, which impacts DOB/ECB datasets.

# Use this output of tracts by features (3180 x 718 matrix) to cross-validate with gas leaks as trained on 2013 data. Use features to predict 2015 gas leaks for final model evaluation.

In [1]:
import pandas as pd
import numpy as np
import geopandas as gp
import pickle
import datetime
import os

In [2]:
# global variables for data pathfiles

FDNY_TRACT_13 = 'processed_data/tract_output/tract_incidents_2013.csv'
FDNY_TRACT_14 = 'processed_data/tract_output/tract_incidents_2014.csv'
FDNY_TRACT_15 = 'processed_data/tract_output/tract_incidents_2015.csv'
PLUTO_BK = 'raw_data/PLUTO/Brooklyn/BKMapPLUTO.shp'
PLUTO_BX = 'raw_data/PLUTO/Bronx/BXMapPLUTO.shp'
PLUTO_MN = 'raw_data/PLUTO/Manhattan/MNMapPLUTO.shp'
PLUTO_QN = 'raw_data/PLUTO/Queens/QNMapPLUTO.shp'
PLUTO_SI = 'raw_data/PLUTO/Staten_Island/SIMapPLUTO.shp'
MASTER_PLUTO_PICKLE_BBL = 'processed_data/master_pluto_bbl.pickle'
DOB_COMPLAINTS = 'raw_data/DOB_Complaints_Received.csv'
DOB_ECB = 'raw_data/DOB_ECB_Violations.csv'
DOB_VIOLATIONS = 'raw_data/DOB_Violations.csv'
DOB_PERMITS = 'raw_data/Historical_DOB_Permit_Issuance.csv'
BIN_BBL = 'raw_data/building_0117.csv'
PAD = 'raw_data/PAD/bobaadr.txt'
CENSUS_TRACT_RACE = 'raw_data/CENSUS_TRACT_RACE_INCOME/ACS_15_5YR_DP05_with_ann.csv'
CENSUS_TRACT_INCOME = 'raw_data/CENSUS_TRACT_RACE_INCOME/ACS_15_5YR_S1901_with_ann.csv'
NYC_TRACTS = 'raw_data/nyc_tract/nyct2010.shp'
ZIP_TRACTS = 'raw_data/zip_tract_122015.csv'

### Import raw data

FDNY data per tract -- taken from an earlier spatial join with street polygons
- 2014 FDNY data

In [3]:
# import FDNY data
fdny = pd.read_csv(FDNY_TRACT_14)
fdny['TRACT'] = fdny['GEOID'].astype(str)
del fdny['GEOID']

NYC zip code and census tract shapefiles

In [4]:
# import NYC census tract shapefiles
nyc_tracts = gp.read_file(NYC_TRACTS)

# boro code to TRACT
boro_to_ct = {'1':'36061','2':'36005','3':'36047','4':'36081','5':'36085'}
NYC_st_ct = [v for k,v in boro_to_ct.items()]

def boro2stct(data):
    return boro_to_ct[data]

nyc_tracts['TRACT'] = nyc_tracts['BoroCode'].apply(lambda x: boro2stct(x))+nyc_tracts['CT2010'].astype(str)

# read in zip_tract
zip_tracts = pd.read_csv(ZIP_TRACTS,usecols=['ZIP','TRACT'],dtype={'ZIP':str,'TRACT':str})
zip_tracts.rename(columns = {'ZIP':'ZipCode'},inplace=True)

# merge to get NYC zips only
zip_tracts_nyc = nyc_tracts.merge(zip_tracts,how='left',on='TRACT')

NYC PLUTO (2015)

In [5]:
def import_filter_pluto():
    
    # import PLUTO for 5 boros
    BK = gp.read_file(PLUTO_BK)
    BX = gp.read_file(PLUTO_BX)
    MN = gp.read_file(PLUTO_MN)
    QN = gp.read_file(PLUTO_QN)
    SI = gp.read_file(PLUTO_SI)
    
    # merge 5 boro PLUTO datasets 
    pluto_agg = BK.append(BX)
    pluto_agg = pluto_agg.append(MN)
    pluto_agg = pluto_agg.append(QN)
    pluto_agg = pluto_agg.append(SI)
    
    # select key columns

    pluto_select = pluto_agg[['ZipCode',
    'BBL',
    'Tract2010',
    'BldgClass',
    'LandUse',
    'BldgArea',
    'ComArea',
    'ResArea',
    'OfficeArea',
    'RetailArea',
    'UnitsRes',
    'UnitsTotal',
    'AssessTot',
    'YearBuilt',
    'BuiltFAR','LotArea']]
    
    # create pickle

    with open(MASTER_PLUTO_PICKLE_BBL, 'wb') as handle:
        pickle.dump(pluto_select, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [6]:
# ***********************
# NOTE: Since Geopandas does not allow filtering select columns, 
# you'll have to load full PLUTO set, merge, and select columns,
# then save as a pickle file for later use.
# ***********************

if os.path.exists(MASTER_PLUTO_PICKLE_BBL):
    print "File exists. Loading pickle..."
    # load pickle of PLUTO data
    with open(MASTER_PLUTO_PICKLE_BBL, 'rb') as handle:
        master_pluto = pickle.load(handle)
    
else:
    print "File does not yet exist. Importing and filtering PLUTO. This could take several minutes..."
    # first time only, import, filter, and save processed PLUTO as a pickle for future use
    import_filter_pluto()
    
    # load pickle of PLUTO data
    with open(MASTER_PLUTO_PICKLE_BBL, 'rb') as handle:
        master_pluto = pickle.load(handle)

File exists. Loading pickle...


DOB and ECB permits and violations

In [7]:
# list of building bins and BBLs
bin_bbl = pd.read_csv(BIN_BBL,usecols=['BBL','BIN'])

In [8]:
# DOB complaints
dob_complaints = pd.read_csv(DOB_COMPLAINTS,usecols=['Complaint Number', 'Date Entered', 
                                 'BIN', 'Complaint Category', 
                                 'Disposition Date','Disposition Code', 
                                 'Inspection Date'])

# merge BBL to the dob dataset
dob_complaints_bbl = dob_complaints.merge(bin_bbl,how='inner',on='BIN')
dob_complaints_bbl['BBL'] = dob_complaints_bbl['BBL'].astype(int).astype(str)

# remove handful of outlier BBLs
dob_bbl_indx = dob_complaints_bbl['BBL'].apply(lambda x: len(str(x))==10)
dob_complaints_bbl = dob_complaints_bbl[dob_bbl_indx].copy()

In [9]:
# DOB violations 
dob_violations = pd.read_csv(DOB_VIOLATIONS,dtype={'BORO':str,
                                                   'BLOCK':str,'LOT':str,
                                                   'ISSUE_DATE':str,
                                                   'DISPOSITION_DATE':str})

In [10]:
# ECB violations
ecb = pd.read_csv(DOB_ECB,usecols=['BIN','BORO','BLOCK','LOT','SEVERITY','VIOLATION_TYPE',
                                   'VIOLATION_DESCRIPTION',
                                   'INFRACTION_CODE1','ISSUE_DATE',
                                   'SECTION_LAW_DESCRIPTION1'],dtype={'BORO':str,
                                                                      'BLOCK':str,
                                                                      'LOT':str,
                                                                      'ISSUE_DATE':str})

  interactivity=interactivity, compiler=compiler, result=result)


In [11]:
# DOB work permits
permits = pd.read_csv(DOB_PERMITS,usecols=['Zip Code','BOROUGH','Block','Lot',
                                           'Bldg Type','Residential','Permit Type',
                                           'Oil Gas','Issuance Date'],dtype={'BOROUGH':str,
                                                                           'Block':str,
                                                                           'Lot':str})

Census Data

In [12]:
# race data
race = pd.read_csv(CENSUS_TRACT_RACE,skiprows=[1],usecols=['GEO.id2', 'HC01_VC03', 
                                                           'HC01_VC49', 'HC01_VC50', 
                                                           'HC01_VC51','HC01_VC56', 
                                                           'HC01_VC64', 'HC01_VC69', 'HC01_VC23'])

# rename columns
race.rename(columns={'HC01_VC03': 'TOTAL_POPULATION', 'HC01_VC49': 'WHITE',
                        'HC01_VC50': 'BLACK_AFRICAN_AMERICAN', 'HC01_VC51': 'AMERICAN_INDIAN_AND_ALASKA_NATIVE',
                        'HC01_VC56': 'ASIAN', 'HC01_VC64': 'NATIVE_HAWAIIAN_AND_OTHER_PACIFIC_ISLANDER',
                        'HC01_VC69': 'SOME_OTHER_RACE', 'HC01_VC23': 'MEDIAN_AGE', 'GEO.id2': 'GEOID'}, inplace=True)

# convert values to float
def make_float(expected_float):
    try:
        if type(expected_float) == str:
            expected_float = expected_float.replace('+', '').replace(',', '')
        return float(expected_float)
    except:
        # print expected_float
        return np.nan

for i in race.columns[race.columns!='GEOID']:
    race[i] = race[i].apply(lambda x: make_float(x))

In [13]:
# income data
income = pd.read_csv(CENSUS_TRACT_INCOME,skiprows=[1],usecols=['GEO.id2', 'HC01_EST_VC01', 'HC01_EST_VC15'])

# rename columns
income.rename(columns={'HC01_EST_VC01': 'TOTAL_HOUSEHOLDS', 
                       'HC01_EST_VC15': 'MEAN_INCOME', 'GEO.id2': 'GEOID'}, inplace=True)

#convert values to float
for i in income.columns[income.columns!='GEOID']:
    income[i] = income[i].apply(lambda x: make_float(x))

### DATE SELECTION: 2014 data (use to cross-validate and predict 2015 gas leaks)
Time-sensitive datasets: FDNY,DOB/ECB

We'll assume the same PLUTO and ACS variables stand for all years.

** Note that the DOB Work Permits data does not extend past 2013, but we're including it given the assumption that historical work permits still apply to buildings existing in 2013-2015 and could be factors. (Exceptions clearly would occur to this assumption). So we will use the same historical data (work permits through 2013) for both training and test data

#### Prep datatime fields

In [14]:
# dob/ecb date conversion
dob_complaints_bbl['date_entered'] = dob_complaints_bbl['Date Entered'].apply(
    lambda x: datetime.datetime.strptime(x,'%m/%d/%Y'))

permits['issuance_date'] = permits['Issuance Date'].apply(
    lambda x: datetime.datetime.strptime(x,'%m/%d/%Y'))

In [15]:
# deal with inconsistent datetime
def dob_date(data):
    try:
        return datetime.datetime.strptime(data, '%Y%m%d')
    except:
        try:
            y,md = data.split('  ')
            if y in ['11','12','13','14','15']:
                data = '20'+y+md
                return datetime.datetime.strptime(data, '%Y%m%d')
        except:
            try:
                data = str(data)[:8]
                return datetime.datetime.strptime(data, '%Y%m%d')
            except:
                return float("NaN")
            
dob_violations['issue_date'] = dob_violations['ISSUE_DATE'].apply(
    lambda x: dob_date(x))

# cut out records with incoherent date format or years before 2013
dob_violations = dob_violations[~dob_violations['issue_date'].isnull()]
dob_violations['issue_date'] = dob_violations['issue_date'].apply(lambda x: x.date())

In [16]:
# deal with inconsistent datetime
def ecb_date(data):
    try:
        return datetime.datetime.strptime(str(data), '%Y%m%d')
    except:
        return float("NaN")

ecb['issue_date'] = ecb['ISSUE_DATE'].apply(
    lambda x: ecb_date(x))

# cut out the 85 records with incoherent date format
ecb = ecb[~ecb['issue_date'].isnull()]   

### Set dates for 2014

In [17]:
# date filter method
def filter_date(data, date_col):
    start_date = datetime.date(2014,1,1)
    end_date = datetime.date(2015,1,1) # not inclusive of end date
        
    # truncate to selected dates
    return data[(data[date_col]>=start_date) & (data[date_col]<end_date)]

In [18]:
# segment data by date
dob_complaints_sliced = filter_date(dob_complaints_bbl,'date_entered')
dob_violations_sliced = filter_date(dob_violations,'issue_date')
ecb_sliced = filter_date(ecb,'issue_date')

### Filter, aggregate and scale data

- Preprocess and scale PLUTO attributes
- Aggregate PLUTO data by census tract
- Preprocess and scale building data attributes
- Aggregate building data by census tract
- Aggregate census tract data by census tract

#### Preprocess and scale PLUTO attributes

In [19]:
# merge bin_bbl with pluto
master_pluto['BBL_int'] = master_pluto['BBL'].astype(int)

# concatenate full TRACT number
master_pluto['st_ct_FIPS'] = master_pluto['BBL'].apply(lambda x: boro2stct(str(x)[:1]))
master_pluto['ctract'] = master_pluto['Tract2010'].apply(lambda x: str(x)+'00' if len(x)==4 else str(x))

master_pluto['TRACT'] = master_pluto['st_ct_FIPS']+master_pluto['ctract']

In [20]:
# calculate building age
# note: to keep equal comparison b/t train and test we're keeping age as of 2013 for both
def year_calc(data):
    if (data < 1800) | (data > 2013):
        return float('NaN')
    else:
        return 2013-data 
    
master_pluto['age'] = master_pluto.YearBuilt.apply(lambda x: year_calc(x))

In [21]:
def scale_and_group_tract(data,tract_col_name,field,multiple=True,
                        header_prefix=None,dispose=False,og=False):
    '''Creates a general "group by" and scaling function that:
    - groups data for a given variable category in an input dataframe by census tract
    - then creates a ratio of each variable category in the tract for all values in tract
    - produces a new sparse matrix with rows=census tracts and cols=each category of the variable,
    where the values are the ratio of the category / all instances in the census tract
    '''
    # group selected variable by tract
    if multiple:
        if og:
            data2 = data.copy()
            data2[field] = data2[field].fillna('NA')
            data = data2.copy()
        
        temp_df = data.groupby([tract_col_name,field])[
            field].count().unstack(level=-1).reset_index()
        
        # create df of ratio of select category of variable per all instances in tract
        tract_matrix = pd.DataFrame()

        for i in range(len(temp_df[tract_col_name])):
            tract_matrix[str(
                temp_df[tract_col_name][i])] = temp_df.T[i][1:]/temp_df.T[i][1:].sum()
    
        tract_matrix = tract_matrix.T.reset_index()

        tract_matrix['TRACT'] = tract_matrix['index'].astype(str)
        if og:
            tract_matrix = tract_matrix.drop(['index','NA'],axis=1)
        else:
            tract_matrix = tract_matrix.drop('index',axis=1)
        
        if dispose:
            tract_matrix = tract_matrix.rename(columns={tract_matrix.columns[0]:'No_disposition'})

        
        # update header to specific source data (for less confusion when merging data later)
        if header_prefix:
            new_columns = []
            for col in tract_matrix.columns:
                if col != 'TRACT':
                    new_columns.append(header_prefix+col)
                else:
                    new_columns.append(col)
            tract_matrix.columns = new_columns
            
        return tract_matrix       
    
    else:
    
        return pd.DataFrame((data.groupby(tract_col_name)[field].sum())/data.groupby(
                tract_col_name)[field].count()).reset_index()

In [22]:
# groupby tract: average age
avg_bldg_age_by_tract = scale_and_group_tract(master_pluto,'TRACT','age',multiple=False)

In [23]:
# groupby tract: # building class in tract / total building in tract
bldgclass_by_tract = scale_and_group_tract(master_pluto,'TRACT','BldgClass',
                                       multiple=True,header_prefix='bldg_class_')

In [24]:
# groupby tract: # landuse in tract / total land uses in tract
landuse_by_tract = scale_and_group_tract(master_pluto,'TRACT','LandUse',
                                       multiple=True,header_prefix='landuse_')

In [25]:
# the following assorted pluto attributes are aggregated by zip
pluto_attrib_by_tract = pd.DataFrame()

# function to create ratio of given PLUTO category per zip code
def pluto_attributes_tract(data,tract_col_name,oldfield,newfield,denominator='BldgArea'):
    pluto_attrib_by_tract[newfield] = data.groupby(
        tract_col_name)[oldfield].sum()*1.0/data.groupby(tract_col_name)[denominator].sum()

In [26]:
# commercial ratio by tract
pluto_attributes_tract(master_pluto,'TRACT','ComArea','com_ratio',denominator='BldgArea')

# residential ratio by tract
pluto_attributes_tract(master_pluto,'TRACT','ResArea','res_ratio',denominator='BldgArea')

# office ratio by tract
pluto_attributes_tract(master_pluto,'TRACT','OfficeArea','office_ratio',denominator='BldgArea')

# retail ratio by tract
pluto_attributes_tract(master_pluto,'TRACT','RetailArea','retail_ratio',denominator='BldgArea')

# res / total units by tract
pluto_attributes_tract(master_pluto,'TRACT','UnitsRes','res_unit_ratio',denominator='UnitsTotal')

# mean unit area by tract
pluto_attributes_tract(master_pluto,'TRACT','BldgArea','unit_area',denominator='UnitsTotal')

# assessed value per sq foot
pluto_attributes_tract(master_pluto,'TRACT','AssessTot','value_per_ft',denominator='LotArea')

In [27]:
# total units by zip code
pluto_attrib_by_tract['total_units'] = master_pluto.groupby('TRACT')['UnitsTotal'].sum()

# reset index
pluto_attrib_by_tract = pluto_attrib_by_tract.reset_index()

#### Preprocess and scale the DOB and ECB data

In [28]:
# add zeros to even out block and code
def add_zero(data):
    if len(str(data))==4:
        return '0'+str(data)
    if len(str(data))==3:
        return '00'+str(data)
    elif len(str(data))==2:
        return '000'+str(data)
    elif len(str(data))==1:
        return '0000'+str(data)
    else:
        return data

def add_zero_lt(data):
    if len(str(data))==5:
        return str(data)[1:]
    else:
        return data

def st_clip(data):
    if len(str(data))>1:
        return str(data)[:1]
    else:
        return str(data)    
    
dob_violations_sliced['block_'] = dob_violations_sliced['BLOCK'].apply(lambda x: add_zero(x))
dob_violations_sliced['lot_'] = dob_violations_sliced['LOT'].apply(lambda x: add_zero_lt(x))

ecb_sliced['BORO'] = ecb_sliced['BORO'].apply(lambda x: st_clip(x))
ecb_sliced['block_'] = ecb_sliced['BLOCK'].apply(lambda x: add_zero(x))
ecb_sliced['lot_'] = ecb_sliced['LOT'].apply(lambda x: add_zero_lt(x))

# concatenate BBL
dob_violations_sliced['BBL'] = dob_violations_sliced['BORO'].astype(
    str)+dob_violations_sliced['BLOCK'].astype(str)+dob_violations_sliced['lot_'].astype(str)

ecb_sliced['BBL'] = ecb_sliced['BORO'].astype(
    str)+ecb_sliced['BLOCK'].astype(str)+ecb_sliced['lot_'].astype(str)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is tryin

In [29]:
# permits dataset
BOROUGH_to_ct = {'MANHATTAN':'1','BRONX':'2','BROOKLYN':'3','QUEENS':'4','STATEN ISLAND':'5'}

permits['Boro'] = permits['BOROUGH'].apply(lambda x: BOROUGH_to_ct[x])

permits['block_'] = permits['Block'].apply(lambda x: add_zero(x))
permits['lot_'] = permits['Lot'].apply(lambda x: add_zero_lt(x))

permits['BBL'] = permits['Boro'].astype(
    str)+permits['block_'].astype(str)+permits['lot_'].astype(str)

In [30]:
# zip to bbl
zip_bbl = master_pluto[['TRACT','BBL']].copy()
zip_bbl['BBL'] = zip_bbl['BBL'].astype(int).astype(str)

# merge DOB/ECB to PLUTO
dob_complaints_tract = dob_complaints_sliced.merge(zip_bbl,how='inner',on='BBL')

ecb_tract = ecb_sliced.merge(zip_bbl,how='inner',on='BBL')

dob_violations_tract = dob_violations_sliced.merge(zip_bbl,how='inner',on='BBL')

permits_tract = permits.merge(zip_bbl,how='inner',on='BBL')

Pre-process DOB/ECB data as with the PLUTO data (group by tract and scale)

In [31]:
# same as above, group various DOB / ECB datasets to tract and scale

# groupby tract: # complaint category in tract / total complaints in tract
complaints_by_tract = scale_and_group_tract(dob_complaints_tract,'TRACT','Complaint Category',
                                       multiple=True,header_prefix='DOB_complaint_')

# groupby tract: # disposition code in tract / total dispositions in tract
disposition_by_tract = scale_and_group_tract(dob_complaints_tract,'TRACT','Disposition Code',
                                       multiple=True,header_prefix='DOB_dispos_',dispose=True)

# groupby tract:  # dob violations type in tract / total violations in tract
violations_by_tract = scale_and_group_tract(dob_violations_tract,'TRACT','VIOLATION_TYPE',
                                       multiple=True,header_prefix='DOB_violation_')

# groupby tract: # ecb violation type in tract / total violations in tract
ecb_violations_by_tract = scale_and_group_tract(ecb_tract,'TRACT','VIOLATION_TYPE',
                                      multiple=True,header_prefix='ECB_violation_')

# groupby tract: # ecb violation type in tract / total violations in tract
ecb_infractions_by_tract = scale_and_group_tract(ecb_tract,'TRACT','INFRACTION_CODE1',
                                       multiple=True,header_prefix='ECB_infraction_')

# groupby tract: # permit type in tract / total permits in tract
permits_by_tract = scale_and_group_tract(permits_tract,'TRACT','Permit Type',
                                       multiple=True,header_prefix='DOB_permit_')

# groupby tract: # oil or gas permits in tract / total permits in tract
oil_gas_permits_by_tract = scale_and_group_tract(permits_tract,'TRACT','Oil Gas',
                                       multiple=True,header_prefix='DOB_permit_',og=True)

Pre-process census data and aggregate by census tract

In [32]:
# merge race and income
census = race.merge(income,how='outer',on='GEOID')

census['TRACT'] = census.GEOID.apply(lambda x: str(x))

# demo data by tract: race population / total population of tract
for col in census.columns[3:9]:
    census[col+'_ratio'] = census[col]*1.0/census['TOTAL_POPULATION']
    del census[col]

### Merge datasets

In [33]:
# merge PLUTO datasets

# age and building class
merged_pluto = avg_bldg_age_by_tract.merge(bldgclass_by_tract,how='left',on='TRACT')

# merge land use
merged_pluto = merged_pluto.merge(landuse_by_tract,how='left',on='TRACT')

# merge remaing PLUTO attributes
merged_pluto = merged_pluto.merge(pluto_attrib_by_tract,how='left',on='TRACT')

# merge with FDNY
pluto_fdny = merged_pluto.merge(fdny,how='left',on='TRACT')

In [34]:
# merge DOB / ECB data

# dob complaints
pluto_fdny_dob = pluto_fdny.merge(complaints_by_tract,how='left',on='TRACT')

# dob dispositions
pluto_fdny_dob = pluto_fdny_dob.merge(disposition_by_tract,how='left',on='TRACT')

# dob violations
pluto_fdny_dob = pluto_fdny_dob.merge(violations_by_tract,how='left',on='TRACT')

# ecb violations
pluto_fdny_dob = pluto_fdny_dob.merge(ecb_violations_by_tract,how='left',on='TRACT')

# ecb infractions
pluto_fdny_dob = pluto_fdny_dob.merge(ecb_infractions_by_tract,how='left',on='TRACT')

# dob permit type
pluto_fdny_dob = pluto_fdny_dob.merge(permits_by_tract,how='left',on='TRACT')

# dob oil or gas permit
pluto_fdny_dob = pluto_fdny_dob.merge(oil_gas_permits_by_tract,how='left',on='TRACT')

In [40]:
# merge census data
pluto_fdny_dob_census = pluto_fdny_dob.merge(census,how='left',on='TRACT')

In [41]:
# merge spatial data
nyc_spatial = zip_tracts_nyc[['NTACode','NTAName','geometry','TRACT','ZipCode']]

master_merged = pluto_fdny_dob_census.merge(nyc_spatial,how='left',on='TRACT')

# move target (i.e. dependent) variable to last column and rename "total_gas_incidents"
master_merged['total_gas_incidents'] = master_merged['total_gas_incidents_yr'].astype(float)
 
del master_merged['total_gas_incidents_yr']

# add column gas incidents per building unit
master_merged['gas_incidents_per_bldg_unit'] = master_merged[
    'total_gas_incidents']*1.0/master_merged['total_units']

In [42]:
master_merged.to_csv('processed_data/pluto_fdny_dob_census_to_tract_2014.csv',index=False)

### OUTPUT: final merged 

#### PLUTO features
- Avg building age per census tract
- Ratio of each building class per census tract
- Ratio of each land use per census tract
- Building use ratio (commercial, residential, office, retail) per census tract
- Residential unit density per census tract
- Ave Unit area per census tract
- Value per ft per census tract
- Total units per census tract

#### DOB/ECB features
- Ratio of each DOB complaint type per census tract
- Ratio of each DOB complaint disposition per census tract
- Ratio of each DOB violation type per census tract
- Ratio of each ECB violation type per census tract
- Ratio of each DOB work permit type per census tract
- Ratio of oil or gas permits out of all permits per census tract

#### Census data features
- Total population per census tract
- Total households per census tract
- Mean income census tract
- Ratio of various racial groups out of total population per census tract