In [3]:
import pandas as pd
import numpy as np
import geopandas as gp
import pickle

In [4]:
# Locations of every input file and of the cached PLUTO pickle.
# All paths are relative to the directory the notebook is run from.

# FDNY incidents
FDNY_RAW = 'raw_data/Incidents_Responded_to_by_Fire_Companies.csv'

# zip code shapes
NYC_ZIPS = 'raw_data/NYC_ZIPS/ZIP_CODE_040114.shp'

# PLUTO shapefiles, one per borough, plus the pickle built from them
PLUTO_BK = 'raw_data/PLUTO/Brooklyn/BKMapPLUTO.shp'
PLUTO_BX = 'raw_data/PLUTO/Bronx/BXMapPLUTO.shp'
PLUTO_MN = 'raw_data/PLUTO/Manhattan/MNMapPLUTO.shp'
PLUTO_QN = 'raw_data/PLUTO/Queens/QNMapPLUTO.shp'
PLUTO_SI = 'raw_data/PLUTO/Staten_Island/SIMapPLUTO.shp'
MASTER_PLUTO_PICKLE = 'processed_data/master_pluto.pickle'

# DOB / ECB tables
DOB_COMPLAINTS = 'raw_data/DOB_Complaints_Received.csv'
DOB_ECB = 'raw_data/DOB_ECB_Violations.csv'
DOB_VIOLATIONS = 'raw_data/DOB_Violations.csv'
DOB_PERMITS = 'raw_data/Historical_DOB_Permit_Issuance.csv'

# PAD file used as the bin -> zipcode lookup
PAD = 'raw_data/PAD/bobaadr.txt'

### Import raw data

FDNY data

In [5]:
# read the FDNY incident export, keeping only the three columns used below;
# ZIP_CODE is parsed as text rather than as a number
fdny = pd.read_csv(
    FDNY_RAW,
    dtype={'ZIP_CODE':str},
    usecols=['IM_INCIDENT_KEY','INCIDENT_TYPE_DESC','ZIP_CODE']
)

NYC zip code shapefiles

In [6]:
# import NYC zipcode shapefiles
# (read with geopandas from NYC_ZIPS; the ZIPCODE, geometry, POPULATION and
# AREA columns of the result are the ones used later in this notebook)
nyc_zips = gp.read_file(NYC_ZIPS)

NYC PLUTO (2015)

In [7]:
# ***********************
# NOTE: Since Geopandas does not allow filtering select columns, 
# for the first time only: load PLUTO, merge, 
# and select columns and then create pickle file
# Then just load pickle file.
# ***********************
# import PLUTO for 5 boros

# BK = gp.read_file(PLUTO_BK)
# BX = gp.read_file(PLUTO_BX)
# MN = gp.read_file(PLUTO_MN)
# QN = gp.read_file(PLUTO_QN)
# SI = gp.read_file(PLUTO_SI)

In [8]:
# ***********************
# only necessary first time, then pickle is created
# ***********************
# merge 5 boro PLUTO datasets 

# pluto_agg = BK.append(BX)
# pluto_agg = pluto_agg.append(MN)
# pluto_agg = pluto_agg.append(QN)
# pluto_agg = pluto_agg.append(SI)

In [9]:
# ***********************
# only necessary first time, then pickle is created
# ***********************
# select key columns

# pluto_select = pluto_agg[['ZipCode',
# 'BldgClass',
# 'LandUse',
# 'BldgArea',
# 'ComArea',
# 'ResArea',
# 'OfficeArea',
# 'RetailArea',
# 'UnitsRes',
# 'UnitsTotal',
# 'AssessTot',
# 'YearBuilt',
# 'BuiltFAR','LotArea']]

In [10]:
# ***********************
# only necessary first time, then pickle is created
# **********************
# create pickle

# with open(MASTER_PLUTO_PICKLE, 'wb') as handle:
#     pickle.dump(pluto_select, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [11]:
# load pickle of PLUTO data
# NOTE(review): pickle.load can execute arbitrary code stored in the file, so
# MASTER_PLUTO_PICKLE must only ever be a file produced by this project
# (the commented-out cells above show how it was created).
with open(MASTER_PLUTO_PICKLE, 'rb') as handle:
    master_pluto = pickle.load(handle)

DOB and ECB permits and violations

In [12]:
# DOB complaints: read only the columns listed here
dob_complaints = pd.read_csv(
    DOB_COMPLAINTS,
    usecols=[
        'Complaint Number',
        'Date Entered',
        'BIN',
        'Complaint Category',
        'Disposition Date',
        'Disposition Code',
        'Inspection Date'
    ]
)

In [13]:
# DOB violations: read only the columns listed here
dob_violations = pd.read_csv(
    DOB_VIOLATIONS,
    usecols=[
        'BIN',
        'ISSUE_DATE',
        'VIOLATION_TYPE_CODE',
        'DISPOSITION_DATE',
        'DISPOSITION_COMMENTS',
        'DESCRIPTION',
        'ECB_NUMBER',
        'VIOLATION_CATEGORY',
        'VIOLATION_TYPE'
    ]
)

  interactivity=interactivity, compiler=compiler, result=result)


In [14]:
# ECB violations: read only the columns listed here
ecb = pd.read_csv(
    DOB_ECB,
    usecols=[
        'BIN',
        'SEVERITY',
        'VIOLATION_TYPE',
        'VIOLATION_DESCRIPTION',
        'INFRACTION_CODE1',
        'SECTION_LAW_DESCRIPTION1'
    ]
)

  interactivity=interactivity, compiler=compiler, result=result)


In [15]:
# DOB work permits: read only the columns listed here
permits = pd.read_csv(
    DOB_PERMITS,
    usecols=['Zip Code','Permit Type','Oil Gas','Issuance Date']
)

### Filter, aggregate and scale data

- Filter FDNY by gas leaks
- Aggregate FDNY by zip codes
- Preprocess and scale PLUTO attributes
- Aggregate PLUTO data by zip
- Preprocess and scale building data attributes
- Aggregate building data by zip 

#### Filter FDNY for gas leaks and aggregate by zip

In [16]:
# split incident description to get code, filter
def code_split(data):
    '''Return the part of `data` that precedes the first ' -' separator.

    If the separator does not occur, the whole string is returned unchanged.
    '''
    code, _, _ = data.partition(' -')
    return code

# pull the leading code out of each incident description, then keep code 412
fdny['incident_code'] = fdny['INCIDENT_TYPE_DESC'].apply(code_split)
fdny_gas = fdny.loc[fdny['incident_code']=='412']

# count the filtered incidents per zip code (columns: ZIP_CODE, IM_INCIDENT_KEY)
fdny_gas_zip = (
    fdny_gas.groupby('ZIP_CODE')['IM_INCIDENT_KEY']
    .count()
    .reset_index()
)

#### Preprocess and scale PLUTO attributes

In [17]:
# convert PLUTO zip to string
# (the per-zip tables built from PLUTO are later joined on a text 'ZipCode' key)
# NOTE(review): if ZipCode was stored as a float this produces values such as
# '11201.0' — confirm the dtype held in the pickle.
master_pluto['ZipCode'] = master_pluto['ZipCode'].astype(str)

In [18]:
# calculate building age
def year_calc(data):
    '''Turn a construction year into a building age measured from 2017.

    Years earlier than 1800 or later than 2016 are treated as invalid and
    give NaN.
    '''
    before_range = data < 1800
    after_range = data > 2016
    if before_range | after_range:
        return float('NaN')
    return 2017-data

# building age derived from YearBuilt, one value per PLUTO row
master_pluto['age'] = master_pluto['YearBuilt'].apply(year_calc)

In [19]:
def _zip_label(value):
    '''Render a zip-code group key as text, without the ".0" of an integral float.'''
    if isinstance(value, float) and value.is_integer():
        return str(int(value))
    return str(value)


def scale_and_group_zip(data,zip_col_name,field,multiple=True,
                        header_prefix=None,dispose=False,og=False):
    '''Aggregate `field` of `data` per zip code.

    Parameters
    ----------
    data : DataFrame holding `zip_col_name` and `field`; it is not modified.
    zip_col_name : name of the column to group by.
    field : name of the column to summarise.
    multiple : if True (default) `field` is treated as categorical and the
        result has one row per zip code and one column per category, holding
        (rows of that category in the zip) / (all counted rows in the zip).
        Categories that never occur in a zip are NaN. If False, the per-zip
        mean of `field` is returned instead.
    header_prefix : optional text prepended to every category column name.
    dispose : if True, the first category column is renamed 'No_disposition'.
    og : if True, missing values of `field` are counted as a category 'NA'
        (so they stay in the denominator) and that column is then dropped.

    Returns
    -------
    DataFrame. With multiple=True the zip codes are in a text column named
    'ZipCode' (placed last); with multiple=False the columns are
    `zip_col_name` and `field`.
    '''
    if not multiple:
        # sum divided by the non-null count, i.e. the per-zip mean of `field`
        per_zip = data.groupby(zip_col_name)[field]
        return pd.DataFrame(per_zip.sum()/per_zip.count()).reset_index()

    if og:
        # work on a copy so the caller's frame keeps its missing values
        data = data.copy()
        data[field] = data[field].fillna('NA')

    # rows = zip codes, columns = categories, values = number of occurrences
    counts = data.groupby([zip_col_name,field])[field].count().unstack(level=-1)

    # divide each row by its own total in one vectorised step
    zip_matrix = counts.div(counts.sum(axis=1),axis=0)

    # zip codes read as floats (e.g. 10001.0) would otherwise become '10001.0'
    # and never match the 'ZipCode' text keys of the other tables
    zip_labels = [_zip_label(key) for key in zip_matrix.index]
    zip_matrix = zip_matrix.reset_index(drop=True)
    zip_matrix['ZipCode'] = zip_labels

    if og:
        # the 'NA' column only exists when something was actually missing
        zip_matrix = zip_matrix.drop('NA',axis=1,errors='ignore')

    if dispose:
        # NOTE(review): this renames whichever category sorts first — confirm
        # that it really is the "no disposition" code in the source data.
        zip_matrix = zip_matrix.rename(columns={zip_matrix.columns[0]:'No_disposition'})

    # update header to specific source data (for less confusion when merging data later)
    if header_prefix:
        # str() because category labels are not necessarily text
        zip_matrix.columns = [col if col == 'ZipCode' else header_prefix+str(col)
                              for col in zip_matrix.columns]

    return zip_matrix

In [20]:
# mean building age per zip code (columns: ZipCode, age)
avg_bldg_age_by_zip = scale_and_group_zip(
    master_pluto, 'ZipCode', 'age', multiple=False)

In [21]:
# share of each building class among all buildings in the zip
bldgclass_by_zip = scale_and_group_zip(
    master_pluto, 'ZipCode', 'BldgClass',
    multiple=True, header_prefix='bldg_class_')

In [22]:
# share of each land use among all land uses in the zip
landuse_by_zip = scale_and_group_zip(
    master_pluto, 'ZipCode', 'LandUse',
    multiple=True, header_prefix='landuse_')

In [23]:
# accumulator for the assorted per-zip PLUTO ratios; pluto_attributes_zip
# adds one column to it per call
pluto_attrib_by_zip = pd.DataFrame()

def pluto_attributes_zip(data,zip_col_name,oldfield,newfield,denominator='BldgArea'):
    '''Add column `newfield` to the module-level `pluto_attrib_by_zip` frame.

    For every value of `zip_col_name` the column holds the sum of `oldfield`
    divided by the sum of `denominator`. Nothing is returned.
    '''
    by_zip = data.groupby(zip_col_name)
    numerator_total = by_zip[oldfield].sum()*1.0
    denominator_total = by_zip[denominator].sum()
    pluto_attrib_by_zip[newfield] = numerator_total/denominator_total

In [24]:
# (source column, new column, denominator column) for each per-zip ratio,
# computed in this order
ratio_specs = [
    ('ComArea',    'com_ratio',      'BldgArea'),    # commercial ratio
    ('ResArea',    'res_ratio',      'BldgArea'),    # residential ratio
    ('OfficeArea', 'office_ratio',   'BldgArea'),    # office ratio
    ('RetailArea', 'retail_ratio',   'BldgArea'),    # retail ratio
    ('UnitsRes',   'res_unit_ratio', 'UnitsTotal'),  # res / total units
    ('BldgArea',   'unit_area',      'UnitsTotal'),  # mean unit area
    ('AssessTot',  'value_per_ft',   'LotArea'),     # assessed value per sq foot
]

for source_col, ratio_name, denom_col in ratio_specs:
    pluto_attributes_zip(master_pluto,'ZipCode',source_col,ratio_name,denominator=denom_col)

In [25]:
# add the total unit count per zip code, then move the zip code out of the
# index into an ordinary column
pluto_attrib_by_zip = pluto_attrib_by_zip.assign(
    total_units=master_pluto.groupby('ZipCode')['UnitsTotal'].sum()
).reset_index()

#### Preprocess and scale the DOB and ECB data

In [26]:
# merge BIN with zipcodes pulled from another dataset
# (the PAD file serves as the bin -> zipcode lookup; only those two columns are read)
pad = pd.read_csv(PAD,usecols=['bin','zipcode'])
# zip to integer-formatted text
def zipint(data):
    '''Return `data` as the text of its integer value.

    NaN is returned when `int(data)` raises ValueError.
    '''
    try:
        whole = int(data)
    except ValueError:
        return float('NaN')
    return str(whole)
# normalise the PAD zip codes to text, name the key like the DOB/ECB tables,
# and discard rows whose zip code could not be parsed
pad['zipcode'] = pad.zipcode.apply(zipint)
pad = pad.rename(columns={'bin':'BIN'})
pad = pad[~pad.zipcode.isnull()]
# Keep one row per BIN: the tables joined to `pad` are left-merged on BIN, and
# any BIN listed more than once here would duplicate their rows and distort
# the per-zip ratios.
# NOTE(review): presumably PAD holds one row per address, so repeated BINs are
# expected; the first zip code listed for a BIN is the one kept — confirm.
pad = pad.drop_duplicates(subset='BIN')

In [27]:
# attach a zipcode column to each DOB / ECB table by left-joining pad on BIN
dob_complaints = pd.merge(dob_complaints, pad, on='BIN', how='left')
dob_violations = pd.merge(dob_violations, pad, on='BIN', how='left')
ecb = pd.merge(ecb, pad, on='BIN', how='left')

In [28]:
# Same approach as for PLUTO: each table below has one row per zip code and
# one column per category, holding that category's share of the zip's records.

# DOB complaint category / all complaints in zip
complaints_by_zip = scale_and_group_zip(
    dob_complaints, 'zipcode', 'Complaint Category',
    multiple=True, header_prefix='DOB_complaint_')

# DOB disposition code / all dispositions in zip
disposition_by_zip = scale_and_group_zip(
    dob_complaints, 'zipcode', 'Disposition Code',
    multiple=True, header_prefix='DOB_dispos_', dispose=True)

# DOB violation type / all DOB violations in zip
violations_by_zip = scale_and_group_zip(
    dob_violations, 'zipcode', 'VIOLATION_TYPE',
    multiple=True, header_prefix='DOB_violation_')

# ECB violation type / all ECB violations in zip
ecb_violations_by_zip = scale_and_group_zip(
    ecb, 'zipcode', 'VIOLATION_TYPE',
    multiple=True, header_prefix='ECB_violation_')

# ECB infraction code / all ECB infractions in zip
ecb_infractions_by_zip = scale_and_group_zip(
    ecb, 'zipcode', 'INFRACTION_CODE1',
    multiple=True, header_prefix='ECB_infraction_')

# permit type / all permits in zip
permit_by_zip = scale_and_group_zip(
    permits, 'Zip Code', 'Permit Type',
    multiple=True, header_prefix='DOB_permit_')

# oil or gas permits / all permits in zip
oil_gas_permit_by_zip = scale_and_group_zip(
    permits, 'Zip Code', 'Oil Gas',
    multiple=True, header_prefix='DOB_permit_', og=True)

Preprocess the NYC zipcode shapefiles

In [29]:
# remove duplicate zips (just keeping first listed)

# True for every row whose ZIPCODE already appeared on an earlier row
repeated_zip = nyc_zips.ZIPCODE.duplicated()

# index labels of the rows that are removed
index_list = set(nyc_zips.index[repeated_zip])

# Filtering rows directly replaces the earlier per-row rescan and the
# transpose / drop columns / transpose round trip, which was quadratic in the
# number of rows and left every column with object dtype.
nyc_zips_set = nyc_zips[~repeated_zip].copy()

### Merge datasets

In [30]:
# prepare the gas incident counts for merging: a text 'ZipCode' join key and
# the count under the name 'target.gas_incidents'; the two source columns are
# left out of the copy that gets merged
fdny_gas_zip['ZipCode'] = fdny_gas_zip.ZIP_CODE.astype(str)
fdny_gas_zip['target.gas_incidents'] = fdny_gas_zip.IM_INCIDENT_KEY
fdny_gas_zip_2 = fdny_gas_zip.drop(['ZIP_CODE','IM_INCIDENT_KEY'],axis=1)

Note: FDNY zips are a subset of PLUTO zips, so all PLUTO tables are merged first and the FDNY counts are then left-joined onto the merged PLUTO table.

In [31]:
# merge PLUTO datasets: start from average building age and left-join, in
# order, building class shares, land use shares and the remaining attributes
merged_pluto = avg_bldg_age_by_zip
for pluto_table in (bldgclass_by_zip, landuse_by_zip, pluto_attrib_by_zip):
    merged_pluto = merged_pluto.merge(pluto_table,how='left',on='ZipCode')

# merge with FDNY
pluto_fdny = merged_pluto.merge(fdny_gas_zip_2,how='left',on='ZipCode')

In [32]:
# merge DOB / ECB data: left-join every per-zip table onto the PLUTO + FDNY
# frame, in the order listed
dob_ecb_tables = (
    complaints_by_zip,        # dob complaints
    disposition_by_zip,       # dob dispositions
    violations_by_zip,        # dob violations
    ecb_violations_by_zip,    # ecb violations
    ecb_infractions_by_zip,   # ecb infractions
    permit_by_zip,            # dob permit type
    oil_gas_permit_by_zip,    # dob oil or gas permit
)

pluto_fdny_dob = pluto_fdny
for dob_table in dob_ecb_tables:
    pluto_fdny_dob = pluto_fdny_dob.merge(dob_table,how='left',on='ZipCode')

In [33]:
# give the deduplicated zip shapes a text 'ZipCode' join key and keep only
# the columns that go into the final table
nyc_zips_set['ZipCode'] = nyc_zips_set.ZIPCODE.astype(str)
nyc_zips_clean = nyc_zips_set.loc[:, ['ZipCode','geometry','POPULATION','AREA']].copy()

# attach the shape columns, then index the result by zip code
all_merged = pd.merge(pluto_fdny_dob, nyc_zips_clean, how='left', on='ZipCode')
master_merged = all_merged.set_index('ZipCode')

### OUTPUT: final merged 

#### FDNY
- Merge with zip code shapefile

#### PLUTO features
- Avg building age per zipcode
- Ratio of each building class per zip code
- Ratio of each land use per zip code
- Building use ratio (commercial, residential, office, retail) per zip code
- Residential share of total units per zip code
- Average unit area per zip code
- Value per ft per zip code
- Total units per zip code

#### DOB/ECB features
- Ratio of each DOB complaint type per zip code
- Ratio of each DOB complaint disposition per zip code
- Ratio of each DOB violation type per zip code
- Ratio of each ECB violation type per zip code
- Ratio of each ECB infraction code per zip code
- Ratio of each DOB work permit type per zip code
- Ratio of oil or gas permits out of all permits per zip code

In [32]:
# ZipCode is the index of master_merged, so the index has to be written out;
# with index=False the rows of the CSV could not be matched back to a zip code
master_merged.to_csv('processed_data/master_merged.csv',index=True)