In [1]:
import pandas as pd
import numpy as np
import geopandas as gp
import pickle
import geocoder
import datetime
import re
import os
import shapely
import fiona
import rtree

In [2]:
# global variables for data pathfiles

FDNY_RAW = "raw_data/Incidents_Responded_to_by_Fire_Companies.csv"
NYC_ZIPS = 'raw_data/NYC_ZIPS/ZIP_CODE_040114.shp'
PLUTO_BK = 'raw_data/PLUTO/Brooklyn/BKMapPLUTO.shp'
PLUTO_BX = 'raw_data/PLUTO/Bronx/BXMapPLUTO.shp'
PLUTO_MN = 'raw_data/PLUTO/Manhattan/MNMapPLUTO.shp'
PLUTO_QN = 'raw_data/PLUTO/Queens/QNMapPLUTO.shp'
PLUTO_SI = 'raw_data/PLUTO/Staten_Island/SIMapPLUTO.shp'
MASTER_PLUTO_PICKLE = 'processed_data/master_pluto.pickle'
DOB_COMPLAINTS = 'raw_data/DOB_Complaints_Received.csv'
DOB_ECB = 'raw_data/DOB_ECB_Violations.csv'
DOB_VIOLATIONS = 'raw_data/DOB_Violations.csv'
DOB_PERMITS = 'raw_data/Historical_DOB_Permit_Issuance.csv'
PAD = 'raw_data/PAD/bobaadr.txt'
NYC_TRACTS = 'raw_data/nyc_tract/nyct2010.shp'
STREET_DIRECTORY_IN = 'raw_data/Streets_shp/'
STREET_FILENAME_IN = 'StreetSegment.shp'
STREET_DIRECTORY_OUT = 'processed_data/Streets_shp/'
STREET_FILENAME_OUT = 'nycLeftStreetZip.shp'

### Import raw data

FDNY data

In [3]:
# import FDNY data
fdny = pd.read_csv(FDNY_RAW,usecols=['IM_INCIDENT_KEY','INCIDENT_TYPE_DESC',
                                'ZIP_CODE','STREET_HIGHWAY', 'BOROUGH_DESC'],
                   dtype={'ZIP_CODE':str})

NYC zip code shapefiles

In [4]:
# import NYC zipcode shapefiles
nyc_zips = gp.read_file(NYC_ZIPS)

# import NYC census tract shapefiles
nyc_tracts = gp.read_file(NYC_TRACTS)

NYC PLUTO (2015)

In [5]:
# ***********************
# NOTE: Since Geopandas does not allow filtering select columns, 
# for the first time only: load PLUTO, merge, 
# and select columns and then create pickle file
# Then just load pickle file.
# ***********************
# import PLUTO for 5 boros

# BK = gp.read_file(PLUTO_BK)
# BX = gp.read_file(PLUTO_BX)
# MN = gp.read_file(PLUTO_MN)
# QN = gp.read_file(PLUTO_QN)
# SI = gp.read_file(PLUTO_SI)

In [6]:
# ***********************
# only necessary first time, then pickle is created
# ***********************
# merge 5 boro PLUTO datasets 

# pluto_agg = BK.append(BX)
# pluto_agg = pluto_agg.append(MN)
# pluto_agg = pluto_agg.append(QN)
# pluto_agg = pluto_agg.append(SI)

In [7]:
# ***********************
# only necessary first time, then pickle is created
# ***********************
# select key columns

# pluto_select = pluto_agg[['ZipCode',
# 'BBL',
# 'BldgClass',
# 'LandUse',
# 'BldgArea',
# 'ComArea',
# 'ResArea',
# 'OfficeArea',
# 'RetailArea',
# 'UnitsRes',
# 'UnitsTotal',
# 'AssessTot',
# 'YearBuilt',
# 'BuiltFAR',
# 'geometry',
# 'LotArea']]

In [8]:
# ***********************
# only necessary first time, then pickle is created
# **********************
# create pickle

# with open(MASTER_PLUTO_PICKLE, 'wb') as handle:
#     pickle.dump(pluto_select, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [9]:
# load pickle of PLUTO data
with open(MASTER_PLUTO_PICKLE, 'rb') as handle:
    master_pluto = pickle.load(handle)

DOB and ECB permits and violations

In [10]:
# DOB complaints
dob_complaints = pd.read_csv(DOB_COMPLAINTS,usecols=['Complaint Number', 'Date Entered', 
                                 'BIN', 'Complaint Category', 
                                 'Disposition Date','Disposition Code', 
                                 'Inspection Date'])

In [11]:
dob_violations = pd.read_csv(DOB_VIOLATIONS)

  interactivity=interactivity, compiler=compiler, result=result)


In [12]:
# ECB violations
ecb = pd.read_csv(DOB_ECB,usecols=['BIN','BORO','BLOCK','LOT','SEVERITY','VIOLATION_TYPE',
                                   'VIOLATION_DESCRIPTION',
                                   'INFRACTION_CODE1','ISSUE_DATE',
                                   'SECTION_LAW_DESCRIPTION1'])

  interactivity=interactivity, compiler=compiler, result=result)


In [13]:
# DOB work permits
permits = pd.read_csv(DOB_PERMITS,usecols=['Zip Code','BOROUGH','Block','Lot',
                                           'Bldg Type','Residential','Permit Type',
                                           'Oil Gas','Issuance Date'])

  interactivity=interactivity, compiler=compiler, result=result)


### Filter, aggregate and scale data

- Filter FDNY by gas leaks
- Aggregate FDNY by zip codes
- Preprocess and scale PLUTO attributes
- Aggregate PLUTO data by zip
- Preprocess and scale building data attributes
- Aggregate building data by zip 

#### Filter FDNY for gas leaks

In [14]:
# split incident description to get code, filter
def code_split(data):
    a = data.split(' -')
    return a[0]

# clean street data
fdny['BOROUGH_DESC'] = fdny['BOROUGH_DESC'].apply(
    lambda x: x.split('-', 1)[1]).str.replace('Manhattan', 'New York')
fdny['BOROUGH_DESC'] = fdny['BOROUGH_DESC'].str.replace('Brooklyn', 'Kings')
fdny['BOROUGH_DESC'] = fdny['BOROUGH_DESC'].str.replace('Staten Island', 'RICHMOND')
fdny['ADDRESS_W_ZIP'] = fdny['STREET_HIGHWAY'] + ","+ fdny[
    'BOROUGH_DESC'] + ', '  + fdny['ZIP_CODE']

# filter for gas leaks
fdny['incident_code'] = fdny.INCIDENT_TYPE_DESC.apply(lambda x: code_split(x))
fdny_gas = fdny[fdny.incident_code=='412'].copy()

#### Load street polygons and filter for NYC zips

In [15]:
def streets():
    # open your file with geopandas
    all_streets = gp.GeoDataFrame.from_file(STREET_DIRECTORY_IN + STREET_FILENAME_IN)

    nyc_counties = ['Queens', 'New York', 'Richmond', 'Bronx', 'Kings']

    streets = all_streets.loc[:,['LeftCounty', 'RightCount', 
                                 'LeftCensus', 'RightCensu', 'LeftPostal', 
                                 'RightPosta', 'Label', 'geometry']]

    streets['LeftPostal'] = streets['LeftPostal'].astype(str)
    streets['RightPosta'] = streets['RightPosta'].astype(str)

    # select all roads where county in NYC or ZIP CODE in NYC
    nyc_streets = streets[(streets['RightCount'].isin(nyc_counties)) | (
            streets['LeftCounty'].isin(nyc_counties)) | (
            streets['LeftPostal'].isin(nyc_zips)) | (streets['RightPosta'].isin(nyc_zips))]

    nyc_streets.loc[:,'LEFT_STREET_NAME'] = nyc_streets.loc[
        :,'Label'].str.upper() + ', ' + nyc_streets.loc[
        :,'LeftCounty'].str.upper() + ', ' + nyc_streets.loc[:,'LeftPostal']
    nyc_streets.loc[:,'RIGHT_STREET_NAME'] = nyc_streets.loc[
        :,'Label'].str.upper() + ', ' + nyc_streets.loc[
        :,'RightCount'].str.upper() + ', ' + nyc_streets.loc[:,'RightPosta']

    # GROUP EACH INTO LIST FOR LOOPING/SHAPEFILE CREATION
    geom = list(nyc_streets['geometry'].values)
    left_str = list(nyc_streets['LEFT_STREET_NAME'].values)
    # right_str = list(nyc_streets['RIGHT_STREET_NAME'].values)

    # dictionaries to hold line segments
    left_streets = {}
    right_streets = {}

    # Disolve roads with same name county and zip to a single feature

    for i in range(len(geom)):
        left_street = left_str[i]
    #     right_street = right_str[i]
        geometry = geom[i]

        # if the feature's street doesn't yet exist, create it and assign a list
        if left_street not in left_streets:
            left_streets[left_street] = []
        # append the feature to the list of features
        left_streets[left_street].append(geometry)


    #     # if the feature's street doesn't yet exist, create it and assign a list
    #     if right_street not in right_streets:
    #         right_streets[right_street] = []
    #     # append the feature to the list of features
    #     right_streets[right_street].append(geometry)

    # create a geopandas geodataframe, with columns for street and geometry
    streets_dissolved = gp.GeoDataFrame(columns=['left_street', 'geometry'], crs=all_streets.crs)

    # iterate your dictionary
    for street, street_list in left_streets.items():
        # create a geoseries from the list of features
        geometry = gp.GeoSeries(street_list)
        # use unary_union to join them, thus returning polygon or multi-polygon
        geometry = geometry.unary_union
        # set your street and geometry values
        streets_dissolved.set_value(street, 'left_street', street)
        streets_dissolved.set_value(street, 'geometry', geometry)

    # save to file
    streets_dissolved.to_file(STREET_DIRECTORY_OUT + STREET_FILENAME_OUT, driver="ESRI Shapefile")

In [16]:
# load streets dissolved
if os.path.exists(STREET_DIRECTORY_OUT + STREET_FILENAME_OUT):
    streets_dissolved = gp.GeoDataFrame.from_file(STREET_DIRECTORY_OUT + STREET_FILENAME_OUT)
else:
    streets()

In [17]:
def replacement(rep, value):
    if value in rep:
        value = rep[value]
    return value

In [18]:
# Clean_address removes extension on numbered street. i.e. 2ND ST turns to 2 ST.
def clean_address(add):
    add = str(add)
    add = re.sub(' +',' ', add)
    string = []
    for i in add.split(' '):
        num = re.sub('[^0-9]','', i)
        if (num != '') & (',' not in i):
            string.append(num)
        else:
            
            string.append(i)
    return " ".join(string).strip()

In [19]:
# Preproc of Address naming covention prior to address cleaning

fdny_gas['ADDRESS_W_ZIP'] = fdny_gas['ADDRESS_W_ZIP'].str.upper()
fdny_gas['ADDRESS_W_ZIP'] = fdny_gas['ADDRESS_W_ZIP'].str.replace('ST NICHOLAS', 'SAINT NICHOLAS')
fdny_gas['ADDRESS_W_ZIP'] = fdny_gas['ADDRESS_W_ZIP'].str.replace('.', '')
fdny_gas['ADDRESS_W_ZIP'] = fdny_gas['ADDRESS_W_ZIP'].str.replace('ST MARKS', 'SAINT MARKS')


preproc={'BRIGHTON12 ST, KINGS, 11235': 'BRIGHTON 12 ST, KINGS, 11235',
         'BRIGHTON3 PL, KINGS, 11235': 'BRIGHTON 3 PL, KINGS, 11235',
         'BRIGHTON3 ST, KINGS, 11235': 'BRIGHTON 3 ST, KINGS, 11235',
         'BRIGHTON5 ST, KINGS, 11235': 'BRIGHTON 5 ST, KINGS, 11235',
         'BRIGHTON6 ST, KINGS, 11235': 'BRIGHTON 6 ST, KINGS, 11235',
         'BRIGHTON7 ST, KINGS, 11235': 'BRIGHTON 7 ST, KINGS, 11235',
         'BRIGHTON8 CT, KINGS, 11235': 'BRIGHTON 8 CT, KINGS, 11235',
         'BRIGHTONBEACH AVE, KINGS, 11235': 'BRIGHTON BEACH AVE, KINGS, 11235',
         'HARLEM RIVER DR W, NEW YORK, 10032': 'HARLEM RIVER DRWY, NEW YORK, 10033'}

fdny_gas['ADDRESS_W_ZIP'] = np.vectorize(replacement)(preproc, fdny_gas['ADDRESS_W_ZIP'])

In [20]:
fdny_gas.loc[:,'left_street'] = fdny_gas['ADDRESS_W_ZIP'].apply(clean_address)

In [21]:
streets_dissolved.loc[:,'left_street'] = streets_dissolved['left_stree'].apply(clean_address)

In [22]:
replacements = {'DOUGLASS BLVD, NEW YORK, 10039': 'FREDERICK DOUGLASS BLVD, NEW YORK, 10039',
                'GRANDCONCOURSE, BRONX, 10458': 'FREDERICK DOUGLASS BLVD, NEW YORK, 10039',
                'AVED, NEW YORK, 10009': 'AVENUE D, NEW YORK, 10009',
                'F D R DR W, NEW YORK, 10009': 'FDR DR, NEW YORK, 10009',
                'F D R DR W, NEW YORK, 10002': 'FDR DR, NEW YORK, 10002',
                'FDR DR W, NEW YORK, 10009': 'FDR DR, NEW YORK, 10009',
                'FDR DR W, NEW YORK, 10002': 'FDR DR, NEW YORK, 10002',
                'E 116 BLVD, NEW YORK, 10035': 'E 116 ST, NEW YORK, 10035',
                'DOUGLASS BLVD, NEW YORK, 10030': 'FREDERICK DOUGLASS BLVD, NEW YORK, 10030',
                'GASTON-BLVD-STONE AVE, KINGS, 11212': 'MOTHER GASTON BLVD, KINGS, 11212',
                'POWELL BLVD, NEW YORK, 10030': 'ADAM CLAYTON POWELL JR BLVD, NEW YORK, 10030',
                'ST JOHNS PL, KINGS, 11213': 'SAINT JOHNS PL, KINGS, 11213',
                'AVEC, NEW YORK, 10009': 'AVENUE C, NEW YORK, 10009',
                'GRANDCONCOURSE, BRONX, 10456': 'GRAND CONCOURSE, BRONX, 10456',
                'GRANDCONCOURSE, BRONX, 10453': 'GRAND CONCOURSE, BRONX, 10453',
                'GRANDCONCOURSE, BRONX, 10452': 'GRAND CONCOURSE, BRONX, 10452',
                'GRANDCONCOURSE, BRONX, 10451': 'GRAND CONCOURSE, BRONX, 10451',
                'GRANDCONCOURSE, BRONX, 10457': 'GRAND CONCOURSE, BRONX, 10457',
                'BELT PKWY N, KINGS, 11236': 'BELT PKWY, KINGS, 11236',
                'BELT PKWY N, KINGS, 11214': 'BELT PKWY, KINGS, 11214',
                'BELT PKWY S, KINGS, 11214': 'BELT PKWY, KINGS, 11214',
                'BELT PKWY N, KINGS, 11235': 'BELT PKWY, KINGS, 11235',
                'BELT PKWY S, KINGS, 11235': 'BELT PKWY, KINGS, 11235',
                'BELT PKWY N, KINGS, 11228': 'BELT PKWY, KINGS, 11228',
                'BELT PKWY S, KINGS, 11228': 'BELT PKWY, KINGS, 11228',
                'AVE W, KINGS, 11229': 'AVENUE W, KINGS, 11229',
                'AVE W, KINGS, 11223': 'AVENUE W, KINGS, 11223',
                'ST MARKS AVE, KINGS, 11213': 'SAINT MARKS AVE, KINGS, 11213',
                'STMARKS AVE, KINGS, 11213': 'SAINT MARKS AVE, KINGS, 11213',
                'ST MARKS AVE, KINGS, 11233': 'SAINT MARKS AVE, KINGS, 11233',
                'ST MARKS AVE, KINGS, 11216': 'SAINT MARKS AVE, KINGS, 11216',
                'ST MARKS AVE, KINGS, 11238': 'SAINT MARKS AVE, KINGS, 11238',
                'ST MARKS AVE, KINGS, 11217': 'SAINT MARKS AVE, KINGS, 11217',
                'MALCOLMX BLVD, NEW YORK, 10030': 'MALCOLM X BLVD, NEW YORK, 10030',
                'LENOX AVE, NEW YORK, 10030': 'MALCOLM X BLVD, NEW YORK, 10030',
                'LENOX AVE, NEW YORK, 10027': 'MALCOLM X BLVD, NEW YORK, 10037',
                'LENOX AVE, NEW YORK, 10027': 'MALCOLM X BLVD, NEW YORK, 10027',
                'UNIVERSITY AVE, BRONX, 10452': 'DR MARTIN LUTHER KING JR BLVD, BRONX, 10452',
                'UNIVERSITY AVE, BRONX, 10453': 'DR MARTIN LUTHER KING JR BLVD, BRONX, 10452',
                'W 91 ST, NEW YORK, 10025': 'W 91 ST, NEW YORK, 10024',
                'DOUGLASS BLVD, NEW YORK, 10027': 'FREDERICK DOUGLASS BLVD, NEW YORK, 10027',
                'DOUGLASS BLVD, NEW YORK, 10026': 'FREDERICK DOUGLASS BLVD, NEW YORK, 10026',
                'DOUGLASS BLVD, NEW YORK, 10030': 'FREDERICK DOUGLASS BLVD, NEW YORK, 10030',
                'DOUGLASS BLVD, NEW YORK, 10039': 'FREDERICK DOUGLASS BLVD, NEW YORK, 10039',
                'E 78 ST, NEW YORK, 10021': 'E 78 ST, NEW YORK, 10075',
                'E 77 ST, NEW YORK, 10021': 'E 77 ST, NEW YORK, 10075', 
                '7 AVE, NEW YORK, 10039': 'ADAM CLAYTON POWELL JR BLVD, NEW YORK, 10039',
                '7 AVE, NEW YORK, 10026': 'ADAM CLAYTON POWELL JR BLVD, NEW YORK, 10026',
                '7 AVE, NEW YORK, 10027': 'ADAM CLAYTON POWELL JR BLVD, NEW YORK, 10027',
                '7 AVE, NEW YORK, 10030': 'ADAM CLAYTON POWELL JR BLVD, NEW YORK, 10030',
                'BRUCKNER BLVD SB, BRONX, 10472': 'BRUCKNER BLVD, BRONX, 10472',
                'E 61 ST, NEW YORK, 10021': 'E 61 ST, NEW YORK, 10065',
                'BEACH 54 ST, QUEENS, 11691': 'BEACH 54 ST, QUEENS, 11692',
                'BEACH 121 ST, QUEENS, 11414': 'BEACH 121 ST, QUEENS, 11694',
                'BEACH 73RD ST, QUEENS, 99999': 'BEACH 73RD ST, QUEENS, 11692'}

In [23]:
fdny_gas['left_street'] = np.vectorize(replacement)(replacements, fdny_gas['left_street'])

In [24]:
fdny_gas.loc[:,'left_street2'] = fdny_gas.loc[:,'left_street']

fdny_gas['left_street'] = fdny_gas['left_street'].str.replace(' ', '')
streets_dissolved['left_street'] = streets_dissolved['left_street'].str.replace(' ', '')

In [25]:
fdny_gas = fdny_gas.merge(streets_dissolved, how='left', on='left_street')

In [26]:
print "Percent geo-associated: %s" % str(round((
            len(fdny_gas[~fdny_gas['geometry'].isnull()])/(len(fdny_gas)*1.)*100.),2))

Percent geo-associated: 92.85


####  Spatial join of street polygon point in census tract

In [64]:
# load fdny gas streets shapefile
if os.path.exists('processed_data/fdny_gas_street_shapes'):
    fdny_shape_path = 'processed_data/fdny_gas_street_shapes'
    print "FDNY Gas Street Shapes file exists at: {}".format(fdny_shape_path)
else:
    fdny_gas_geoms = fdny_gas[~fdny_gas['geometry'].isnull()]
    fdny_gas_gp = gp.GeoDataFrame(fdny_gas_geoms)
    fdny_gas_gp.to_file('processed_data/fdny_gas_street_shapes', driver="ESRI Shapefile")

FDNY Gas Street Shapes file exists at: processed_data/fdny_gas_street_shapes


In [99]:
# index = rtree.index.Index()
# filename1 = fdny_shape_path
# filename2 = NYC_ZIPS

# with fiona.open(filename1, 'r') as layer1:
#     with fiona.open(filename2, 'r') as layer2:

#         index = rtree.index.Index()
#         for feat1 in layer1:
#             fid = int(feat1['id'])
#             geom1 = shapely.geometry.shape(feat1['geometry'])
#             index.insert(fid, geom1.bounds)
            
#         for feat2 in layer2:
#             geom2 = shapely.geometry.shape(feat2['geometry'])
#             for fid in list(index.intersection(geom2.bounds)):
#                 if fid != int(feat2['id']):
#                     feat1 = layer1[fid]
#                     geom1 = shapely.geometry.shape(feat1['geometry'])
#                     if geom1.intersects(geom2):
#                         print '{} intersects {}'.format(feat2['id'],feat1['id'])

In [101]:
test = gp.GeoDataFrame(fdny_gas_geoms)

In [102]:
test.columns

Index([   u'IM_INCIDENT_KEY', u'INCIDENT_TYPE_DESC',     u'STREET_HIGHWAY',
                 u'ZIP_CODE',       u'BOROUGH_DESC',      u'ADDRESS_W_ZIP',
            u'incident_code',        u'left_street',       u'left_street2',
                 u'geometry',         u'left_stree'],
      dtype='object')

In [None]:
filename1 = fdny_shape_path
filename2 = NYC_TRACTS

with fiona.open(filename1, 'r') as streets:
    with fiona.open(filename2, 'r') as tracts:
        
        for st in streets:
            print st['geometry']

{'type': 'MultiLineString', 'coordinates': [[(591448.3999999999, 4512440.51), (591294.3500000001, 4512184.58)], [(591607.3300000001, 4512702.9399999995), (591448.3999999999, 4512440.51)], [(591639.72, 4512759.52), (591607.3300000001, 4512702.9399999995)], [(591729.8200000001, 4512907.52), (591718.1499999999, 4512888.9399999995)], [(591674.47, 4512816.79), (591639.72, 4512759.52)], [(591718.1499999999, 4512888.9399999995), (591674.47, 4512816.79)], [(591814.46, 4513046.68), (591806.25, 4513034.8100000005)], [(592148.44, 4513600.4), (592144.8200000001, 4513592.8100000005)], [(592144.8200000001, 4513592.8100000005), (592128.0, 4513567.84)], [(592128.0, 4513567.84), (592051.77, 4513445.88)], [(592158.27, 4513617.51), (592154.27, 4513613.6899999995)], [(592151.27, 4513610.77), (592148.44, 4513600.4)], [(592154.27, 4513613.6899999995), (592151.27, 4513610.77)], [(591886.81, 4513168.71), (591814.46, 4513046.68)], [(592051.77, 4513445.88), (591886.81, 4513168.71)], [(591758.44, 4512955.0600000

In [None]:
for feat2 in layer2:
                geom2 = shape(feat2['geometry'])
                for fid in list(index.intersection(geom2.bounds)):
                    if fid != int(feat2['id']):
                        feat1 = layer1[fid]
                        geom1 = shape(feat1['geometry'])
                        if geom1.intersects(geom2):
                            print '{} intersects {}'.format(feat2['id'], feat1['id'])

In [101]:
# spatial join of street polygons with census tracts: 
# by finding a 'representative point' on the street polygon (line), which is
# a central point that lies on the street polygon, and joining with the census tract
# polygon within which that point lies

with fiona.open(NYC_TRACTS, 'r') as tracts:
        index = rtree.index.Index()
        street_id = fdny_gas.index.values
        street_geom = fdny_gas.geometry.values
        features = {} # cache the features so you only have to query the dataframe once
        
        for i in range(0,len(street_id)):
            fid = int(street_id[i])
            geom1 = shape(street_geom[i].representative_point())
            index.insert(fid, geom1.bounds)
            features[fid] = (feat1, geom1,)

        for feat2 in layer2:
            geom2 = shape(feat2['geometry'])
            for fid in list(index.intersection(geom2.bounds)):
                if fid != int(feat2['id']):
                    feat1 = layer1[fid]
                    geom1 = shape(feat1['geometry'])
                    if geom1.intersects(geom2):
                        print '{} intersects {}'.format(feat2['id'], feat1['id'])

Unnamed: 0,IM_INCIDENT_KEY,INCIDENT_TYPE_DESC,STREET_HIGHWAY,ZIP_CODE,BOROUGH_DESC,ADDRESS_W_ZIP,incident_code,left_street,left_street2,geometry,left_stree
0,55672695,412 - Gas leak (natural gas or LPG),43 ST,11103,Queens,"43 ST, QUEENS, 11103",412,"43ST,QUEENS,11103","43 ST, QUEENS, 11103","(LINESTRING (591448.3999999999 4512440.51, 591...","43RD ST, QUEENS, 11103"
1,55673190,412 - Gas leak (natural gas or LPG),LINDEN PL,11354,Queens,"LINDEN PL, QUEENS, 11354",412,"LINDENPL,QUEENS,11354","LINDEN PL, QUEENS, 11354","(LINESTRING (598730.1499999999 4513176.96, 598...","LINDEN PL, QUEENS, 11354"
2,55673240,412 - Gas leak (natural gas or LPG),LELAND AVE,10460,Bronx,"LELAND AVE, BRONX, 10460",412,"LELANDAVE,BRONX,10460","LELAND AVE, BRONX, 10460","(LINESTRING (595778.52 4521017.7, 595779.93999...","LELAND AVE, BRONX, 10460"
3,55673374,412 - Gas leak (natural gas or LPG),BRIGHTON7 ST,11235,Kings,"BRIGHTON 7 ST, KINGS, 11235",412,"BRIGHTON7ST,KINGS,11235","BRIGHTON 7 ST, KINGS, 11235","(LINESTRING (587973.8899999999 4492398.36, 587...","BRIGHTON 7TH ST, KINGS, 11235"
4,55673442,412 - Gas leak (natural gas or LPG),216 ST,11361,Queens,"216 ST, QUEENS, 11361",412,"216ST,QUEENS,11361","216 ST, QUEENS, 11361","(LINESTRING (604245.4199999999 4512660.75, 604...","216TH ST, QUEENS, 11361"


#### Preprocess and scale PLUTO attributes

In [15]:
# convert PLUTO zip to string
master_pluto['ZipCode'] = master_pluto['ZipCode'].astype(str)

In [16]:
# calculate building age
def year_calc(data):
    if (data < 1800) | (data > 2016):
        return float('NaN')
    else:
        return 2017-data

master_pluto['age'] = master_pluto.YearBuilt.apply(lambda x: year_calc(x))

In [17]:
def scale_and_group_zip(data,zip_col_name,field,multiple=True,
                        header_prefix=None,dispose=False,og=False):
    '''Creates a general "group by" and scaling function that:
    - groups data for a given variable category in an input dataframe by zip code
    - then creates a ratio of each variable category in the zip for all values in zip
    - produces a new sparse matrix with rows=zip codes and cols=each category of the variable,
    where the values are the ratio of the category / all instances in the zip
    '''
    # group selected variable by zip code
    if multiple:
        if og:
            data2 = data.copy()
            data2[field] = data2[field].fillna('NA')
            data = data2.copy()
        
        temp_df = data.groupby([zip_col_name,field])[
            field].count().unstack(level=-1).reset_index()
        
        # create df of ratio of select category of variable per all instances in zip
        zip_matrix = pd.DataFrame()

        for i in range(len(temp_df[zip_col_name])):
            zip_matrix[str(
                temp_df[zip_col_name][i])] = temp_df.T[i][1:]/temp_df.T[i][1:].sum()
    
        zip_matrix = zip_matrix.T.reset_index()

        zip_matrix['ZipCode'] = zip_matrix['index'].astype(str)
        if og:
            zip_matrix = zip_matrix.drop(['index','NA'],axis=1)
        else:
            zip_matrix = zip_matrix.drop('index',axis=1)
        
        if dispose:
            zip_matrix = zip_matrix.rename(columns={zip_matrix.columns[0]:'No_disposition'})
            
        # catch decimal zipcodes and strip
        if len(zip_matrix.ZipCode[10])>5:
            zip_matrix['ZipCode'] = zip_matrix['ZipCode'].apply(lambda x: x[:-2])

        
        # update header to specific source data (for less confusion when merging data later)
        if header_prefix:
            new_columns = []
            for col in zip_matrix.columns:
                if col != 'ZipCode':
                    new_columns.append(header_prefix+col)
                else:
                    new_columns.append(col)
            zip_matrix.columns = new_columns
            
        return zip_matrix       
    
    else:
    
        return pd.DataFrame((data.groupby(zip_col_name)[field].sum())/data.groupby(
                zip_col_name)[field].count()).reset_index()

In [18]:
# groupby zip - average age
avg_bldg_age_by_zip = scale_and_group_zip(master_pluto,'ZipCode','age',multiple=False)

In [19]:
# groupby zip - # building class in zip / total building in zip
bldgclass_by_zip = scale_and_group_zip(master_pluto,'ZipCode','BldgClass',
                                       multiple=True,header_prefix='bldg_class_')

In [20]:
# groupby zip - # landuse in zip / total land uses in zip
landuse_by_zip = scale_and_group_zip(master_pluto,'ZipCode','LandUse',
                                       multiple=True,header_prefix='landuse_')

In [21]:
# the following assorted pluto attributes are aggregated by zip
pluto_attrib_by_zip = pd.DataFrame()

# function to create ratio of given PLUTO category per zip code
def pluto_attributes_zip(data,zip_col_name,oldfield,newfield,denominator='BldgArea'):
    pluto_attrib_by_zip[newfield] = data.groupby(
        zip_col_name)[oldfield].sum()*1.0/data.groupby(zip_col_name)[denominator].sum()

In [22]:
# commercial ratio by zip code
pluto_attributes_zip(master_pluto,'ZipCode','ComArea','com_ratio',denominator='BldgArea')

# residential ratio by zip code
pluto_attributes_zip(master_pluto,'ZipCode','ResArea','res_ratio',denominator='BldgArea')

# office ratio by zip code
pluto_attributes_zip(master_pluto,'ZipCode','OfficeArea','office_ratio',denominator='BldgArea')

# retail ratio by zip code
pluto_attributes_zip(master_pluto,'ZipCode','RetailArea','retail_ratio',denominator='BldgArea')

# res / total units by zip code
pluto_attributes_zip(master_pluto,'ZipCode','UnitsRes','res_unit_ratio',denominator='UnitsTotal')

# mean unit area by zip code
pluto_attributes_zip(master_pluto,'ZipCode','BldgArea','unit_area',denominator='UnitsTotal')

# assessed value per sq foot
pluto_attributes_zip(master_pluto,'ZipCode','AssessTot','value_per_ft',denominator='LotArea')

In [23]:
# total units by zip code
pluto_attrib_by_zip['total_units'] = master_pluto.groupby('ZipCode')['UnitsTotal'].sum()

# reset index
pluto_attrib_by_zip = pluto_attrib_by_zip.reset_index()

#### Preprocess and scale the DOB and ECB data

In [24]:
# merge BIN with zipcodes pulled from another dataset
pad = pd.read_csv(PAD,usecols=['bin','zipcode'])
# zip to int
def zipint(data):
    try:
        return str(int(data))
    except ValueError:
        return float('NaN')
pad['zipcode'] = pad.zipcode.apply(lambda x: zipint(x))
pad = pad.rename(columns={'bin':'BIN'})
pad = pad[~pad.zipcode.isnull()]

In [25]:
# merge zip to dob/ecb datasets
dob_complaints = dob_complaints.merge(pad,how='left',on='BIN')
dob_violations = dob_violations.merge(pad,how='left',on='BIN')
ecb = ecb.merge(pad,how='left',on='BIN')

Slice DOB data to include only data before 12/31/2015, same as FDNY data for now, we'll consider issuances before 1/1/2013 to be relevant.

In [26]:
# convert date string to datetime
dob_complaints['date_entered'] = dob_complaints['Date Entered'].apply(
    lambda x: datetime.datetime.strptime(x,'%m/%d/%Y'))

In [27]:
permits['issuance_date'] = permits['Issuance Date'].apply(
    lambda x: datetime.datetime.strptime(x,'%m/%d/%Y'))

In [None]:
# deal with inconsistent datetime
def dob_date(data):
    try:
        return datetime.datetime.strptime(data, '%Y%m%d')
    except:
        try:
            y,md = data.split('  ')
            if y in ['11','12','13','14','15']:
                data = '20'+y+md
                return datetime.datetime.strptime(data, '%Y%m%d')
        except:
            try:
                data = str(data)[:8]
                return datetime.datetime.strptime(data, '%Y%m%d')
            except:
                return float("NaN")

dob_violations['issue_date'] = dob_violations['ISSUE_DATE'].apply(
    lambda x: dob_date(x))

# cut out the approx 50 records with incoherent date format
dob_violations = dob_violations[~dob_violations['issue_date'].isnull()]

In [None]:
# deal with inconsistent datetime
def ecb_date(data):
    try:
        return datetime.datetime.strptime(str(data), '%Y%m%d')
    except:
        return float("NaN")

ecb['issue_date'] = ecb['ISSUE_DATE'].apply(
    lambda x: ecb_date(x))

# cut out the 85 records with incoherent date format
ecb = ecb[~ecb['issue_date'].isnull()]

In [None]:
# truncate dates after 12/31/2015, but keeping dates prior
dob_complaints = dob_complaints[dob_complaints.date_entered<'2016-01-01']

dob_violations = dob_violations[dob_violations.issue_date<datetime.datetime(2016,1,1,0,0)]

ecb = ecb[ecb.issue_date<datetime.datetime(2016,1,1,0,0)]

permits = permits[permits.issuance_date<datetime.datetime(2016,1,1,0,0)]

Pre-process DOB/ECB data as with the PLUTO data (group by zip and scale)

In [None]:
# same as above, group various DOB / ECB datasets to zip and scale

# groupby zip - # complaint category in zip / total complaints in zip
complaints_by_zip = scale_and_group_zip(dob_complaints,'zipcode','Complaint Category',
                                       multiple=True,header_prefix='DOB_complaint_')

# groupby zip - # disposition code in zip / total dispositions in zip
disposition_by_zip = scale_and_group_zip(dob_complaints,'zipcode','Disposition Code',
                                       multiple=True,header_prefix='DOB_dispos_',dispose=True)

# groupby zip - # dob violations type in zip / total violations in zip
violations_by_zip = scale_and_group_zip(dob_violations,'zipcode','VIOLATION_TYPE',
                                       multiple=True,header_prefix='DOB_violation_')

# groupby zip - # ecb violation type in zip / total violations in zip
ecb_violations_by_zip = scale_and_group_zip(ecb,'zipcode','VIOLATION_TYPE',
                                       multiple=True,header_prefix='ECB_violation_')

# groupby zip - # ecb violation type in zip / total violations in zip
ecb_infractions_by_zip = scale_and_group_zip(ecb,'zipcode','INFRACTION_CODE1',
                                       multiple=True,header_prefix='ECB_infraction_')

# groupby zip - # permit type in zip / total permits in zip
permit_by_zip = scale_and_group_zip(permits,'Zip Code','Permit Type',
                                       multiple=True,header_prefix='DOB_permit_')

# groupby zip - # oil or gas permits in zip / total permits in zip
oil_gas_permit_by_zip = scale_and_group_zip(permits,'Zip Code','Oil Gas',
                                       multiple=True,header_prefix='DOB_permit_',og=True)

Preprocess the NYC zipcode shapefiles

In [None]:
# remove duplicate zips (just keeping first listed)
index_list = []
for i in nyc_zips.ZIPCODE:
    temp_index = nyc_zips.ZIPCODE[nyc_zips.ZIPCODE==i].index.tolist()

    if len(temp_index)>1:
        index_list += temp_index[1:]

index_list = set(index_list)

zip_t = nyc_zips.T

zip_drop = zip_t.drop(index_list,axis=1)

nyc_zips_set = zip_drop.T

### Merge datasets

In [None]:
# merge gas
fdny_gas_zip['ZipCode'] = fdny_gas_zip['ZIP_CODE'].astype(str)
fdny_gas_zip['target.gas_incidents'] = fdny_gas_zip['IM_INCIDENT_KEY']
fdny_gas_zip_2 = fdny_gas_zip.drop(['ZIP_CODE','IM_INCIDENT_KEY'],axis=1)

Note, since FDNY zips are a subset of PLUTO zips, merging all PLUTO first, then performing left-join of FDNY on PLUTO

In [None]:
# merge PLUTO datasets

# age and building class
merged_pluto = avg_bldg_age_by_zip.merge(bldgclass_by_zip,how='left',on='ZipCode')

# merge land use
merged_pluto = merged_pluto.merge(landuse_by_zip,how='left',on='ZipCode')

# merge remaing PLUTO attributes
merged_pluto = merged_pluto.merge(pluto_attrib_by_zip,how='left',on='ZipCode')

# merge with FDNY
pluto_fdny = merged_pluto.merge(fdny_gas_zip_2, how = 'left',on='ZipCode')

In [None]:
# merge DOB / ECB data

# dob complaints
pluto_fdny_dob = pluto_fdny.merge(complaints_by_zip,how='left',on='ZipCode')

# dob dispositions
pluto_fdny_dob = pluto_fdny_dob.merge(disposition_by_zip,how='left',on='ZipCode')

# dob violations
pluto_fdny_dob = pluto_fdny_dob.merge(violations_by_zip,how='left',on='ZipCode')

# ecb violations
pluto_fdny_dob = pluto_fdny_dob.merge(ecb_violations_by_zip,how='left',on='ZipCode')

# ecb infractions
pluto_fdny_dob = pluto_fdny_dob.merge(ecb_infractions_by_zip,how='left',on='ZipCode')

# dob permit type
pluto_fdny_dob = pluto_fdny_dob.merge(permit_by_zip,how='left',on='ZipCode')

# dob oil or gas permit
pluto_fdny_dob = pluto_fdny_dob.merge(oil_gas_permit_by_zip,how='left',on='ZipCode')

In [None]:
# merge zip shapefiles
nyc_zips_set['ZipCode'] = nyc_zips_set['ZIPCODE'].astype(str)
nyc_zips_clean = nyc_zips_set[['ZipCode','geometry','POPULATION','AREA']].copy()

all_merged = pluto_fdny_dob.merge(nyc_zips_clean,how='left',on='ZipCode')

master_merged = all_merged.set_index('ZipCode',drop=True)

In [None]:
master_merged.to_csv('processed_data/master_merged.csv',index=False)

### OUTPUT: final merged 

#### FDNY
- Merge with zip code shapefile

#### PLUTO features
- Avg building age per zipcode
- Ratio of each building class per zip code
- Ratio of each land use per zip code
- Building use ratio (commercial, residential, office, retail) per zip code
- Residential unit density per zip code
- Ave Unit area per zip code
- Value per ft per zip code
- Total units per zip code

##### DOB/ECB reatures
- Ratio of each DOB complaint type per zip code
- Ratio of each DOB complaint disposition per zip code
- Ratio of each DOB violation type per zip code
- Ratio of each ECB violation type per zip code
- Ratio of each DOB work permit type per zip code
- Ratio of oil or gas permits out of all permits per zip code