<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Imports" data-toc-modified-id="Imports-1">Imports</a></span></li><li><span><a href="#Read-in-Shapefiles" data-toc-modified-id="Read-in-Shapefiles-2">Read in Shapefiles</a></span></li></ul></div>

## Imports

In [1]:
import pandas as pd
import geopandas as gpd
import numpy as np
import matplotlib.pyplot as plt
from shapely.geometry import Point,Polygon
import pyproj

## Read in Shapefiles

In [3]:
mappluto = gpd.read_file('./Data/nyc_mappluto_18v2_1_shp/MapPLUTO.shp')

In [None]:
mappluto.isna().sum()

In [9]:
ct_shp = gpd.read_file('./nyct2010_19a/nyct2010_19a/nyct2010.shp')

In [None]:
ct_shp.info()

In [None]:
print(f'PLUTO CRS: {mappluto.crs}')
print(f'NYCT CRS: {ct_shp.crs}')

In [None]:
tracts = ct_shp[['BoroName','BoroCode','BoroCT2010','NTAName','NTACode','PUMA','geometry']]

In [None]:
## Since the CRS for these two shapefiles is the same
## I can perform a spatial join on the datasets to get census tracts for all properties
pluto_merged = gpd.sjoin(mappluto,tracts,how = 'left',op ='intersects')

In [None]:
pluto_merged.head(20).T

In [None]:
pluto_merged.columns

In [None]:
pluto_cols_to_keep = ['BoroName','Borough','Block','Lot','BoroCT2010','CT2010','Tract2010','CB2010','ZipCode','Address',
                      'ZoneDist1','LandUse','BldgArea','ComArea','ResArea','OfficeArea',
                      'RetailArea','GarageArea','StrgeArea','FactryArea','OtherArea','NumBldgs',
                      'NumFloors','UnitsRes','UnitsTotal','Ext','ProxCode','BsmtCode',
                      'YearBuilt','BBL','BoroCode_left','NTAName','NTACode',
                      'XCoord','YCoord','Shape_Leng','Shape_Area','geometry']

In [None]:
pluto_gdf = pluto_merged[pluto_cols_to_keep]

In [None]:
pluto_gdf.isna().sum()

In [None]:
pluto_gdf[['BoroCode_left','BoroCT2010','CT2010','Tract2010']][pluto_gdf['BoroCT2010'].isna()]

In [None]:
pluto_gdf.info()

In [None]:
# Columns to One-Hot Encode
pluto_cols_to_dummy = ['Ext','ProxCode','BsmtCode','LandUse'] # also 'ZoneDist1', but has too many values

# Columns to convert to numerics
pluto_cols_to_num = ['BldgArea','ComArea','ResArea','OfficeArea','RetailArea',
                     'GarageArea','StrgeArea','FactryArea','OtherArea','NumBldgs',
                     'NumFloors','UnitsRes','UnitsTotal','YearBuilt'] # possibly residfar and builtfar

# Columns to convert to strings
pluto_cols_to_str = ['BoroName','Borough','Block','Lot','BoroCT2010','CT2010','Tract2010',
                     'CB2010','ZipCode','Address','BBL','BoroCode_left','NTAName','NTACode']

In [None]:
for i in pluto_cols_to_dummy:
    print(i,': ',pluto_gdf[i].unique())

In [None]:
# Data Dictionary for PLUTO Categorical Columns
pluto_cat_dict = {
    'LandUse': {
        '1':'One_Two_Fam_Bldg',
        '2':'Multi_Fam_Walkup',
        '3':'Multi_Fam_Elevator',
        '4':'Mixed_Use_Res_Com',
        '5':'Comm_Office_Bldg',
        '6':'Indus_Manuf_Bldg',
        '7':'Transpo_Util',
        '8':'Pub_Facil_Instit',
        '9':'Open_Space',
        '10':'Parking_Garage',
        '11':'Vacant',
        np.nan:'Not_Provided'
    },
    'Ext': {
        'EG':'Exten_Garage',
        'G':'Garage',
        'E':'Extension',
        np.nan:'No_Ext_Gar'
    },
    'BsmtCode':{
        '0':'No_Bsmt',
        '1':'Above_Gr_Full_Bsmt',
        '2':'Below_Gr_Full_Bsmt',
        '3':'Above_Gr_Part_Bsmt',
        '4':'Below_Gr_Part_Bsmt',
        '5':'Unknown',
        np.nan:'Not_Provided'
    },
    'ProxCode':{
        '0':'NA',
        '1':'Detached',
        '2':'Semi_Attached',
        '3':'Attached',
        np.nan:'Not_Provided'
    }
}

In [None]:
pluto_gdf.replace(pluto_cat_dict, inplace = True)

In [None]:
pluto_gdf[pluto_cols_to_dummy].head()

In [None]:
# Convert Numeric Columns to Float
for i in pluto_cols_to_num:
    print(f'Before, {i}: {pluto_gdf[i].dtype}')
    pluto_gdf[i] = pluto_gdf[i].astype(float)
    print(f'After, {i}: {pluto_gdf[i].dtype}')

In [None]:
# Convert String Columns to String
for i in pluto_cols_to_str:
    print(f'Before, {i}: {pluto_gdf[i].dtype}')
    pluto_gdf[i] = pluto_gdf[i].astype(str)
    print(f'After, {i}: {pluto_gdf[i].dtype}')

In [None]:
na_tracts_index = pluto_gdf[(pluto_gdf['BoroCT2010']=='nan') & (pluto_gdf['Tract2010']=='None')].index

In [None]:
na_tracts_index

In [None]:
pluto_gdf = pluto_gdf.drop(index = na_tracts_index)

In [None]:
pluto_gdf.reset_index(inplace=True,drop = True)

In [None]:
pluto_gdf[pluto_gdf['BoroCT2010']=='nan']

In [None]:
#pluto_gdf['BoroCT2010'][pluto_gdf['BoroCT2010']=='nan'] = 
tract_for_nas = pluto_gdf[pluto_gdf['BoroCT2010']=='nan']['Tract2010'].apply(lambda x: x+'00' if len(x) == 4 else x)
borocode_for_nas = pluto_gdf.loc[tract_for_nas.index]['BoroCode_left']
boro_tract = borocode_for_nas+tract_for_nas

In [None]:
pluto_gdf['BoroCT2010'].loc[boro_tract.index] = boro_tract

In [None]:
pluto_gdf[pluto_gdf['BoroCT2010']=='nan']

In [152]:
pluto_dummies = pd.get_dummies(pluto_gdf,columns=pluto_cols_to_dummy)

In [153]:
pluto_dummies.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 859443 entries, 0 to 859442
Data columns (total 62 columns):
BoroName                       859443 non-null object
Borough                        859443 non-null object
Block                          859443 non-null object
Lot                            859443 non-null object
BoroCT2010                     859443 non-null object
CT2010                         859443 non-null object
Tract2010                      859443 non-null object
CB2010                         859443 non-null object
ZipCode                        859443 non-null object
Address                        859443 non-null object
ZoneDist1                      858866 non-null object
BldgArea                       859443 non-null float64
ComArea                        859443 non-null float64
ResArea                        859443 non-null float64
OfficeArea                     859443 non-null float64
RetailArea                     859443 non-null float64
GarageArea  

In [154]:
pluto_dummies.columns

Index(['BoroName', 'Borough', 'Block', 'Lot', 'BoroCT2010', 'CT2010',
       'Tract2010', 'CB2010', 'ZipCode', 'Address', 'ZoneDist1', 'BldgArea',
       'ComArea', 'ResArea', 'OfficeArea', 'RetailArea', 'GarageArea',
       'StrgeArea', 'FactryArea', 'OtherArea', 'NumBldgs', 'NumFloors',
       'UnitsRes', 'UnitsTotal', 'YearBuilt', 'BBL', 'BoroCode_left',
       'NTAName', 'NTACode', 'XCoord', 'YCoord', 'Shape_Leng', 'Shape_Area',
       'geometry', 'Ext_Exten_Garage', 'Ext_Extension', 'Ext_Garage',
       'Ext_No_Ext_Gar', 'ProxCode_Attached', 'ProxCode_Detached',
       'ProxCode_NA', 'ProxCode_Not_Provided', 'ProxCode_Semi_Attached',
       'BsmtCode_Above_Gr_Full_Bsmt', 'BsmtCode_Above_Gr_Part_Bsmt',
       'BsmtCode_Below_Gr_Full_Bsmt', 'BsmtCode_Below_Gr_Part_Bsmt',
       'BsmtCode_No_Bsmt', 'BsmtCode_Not_Provided', 'BsmtCode_Unknown',
       'LandUse_01', 'LandUse_02', 'LandUse_03', 'LandUse_04', 'LandUse_05',
       'LandUse_06', 'LandUse_07', 'LandUse_08', 'LandUse_09',
   

In [156]:
pluto_dummies['YearBuilt'] = [year if year != 0 else np.nan for year in pluto_dummies['YearBuilt']]

In [159]:
pluto_dummies['YearBuilt'] = pluto_dummies['YearBuilt'].fillna(pluto_dummies.groupby(['BoroCT2010'])['YearBuilt'].transform('mean'))

In [161]:
pluto_dummies['YearBuilt'].isna().sum()

13

In [162]:
pluto_dummies = pluto_dummies[pluto_dummies['YearBuilt'].notna()]
pluto_dummies.reset_index(inplace = True, drop = True)
pluto_dummies.head()

Unnamed: 0,BoroName,Borough,Block,Lot,BoroCT2010,CT2010,Tract2010,CB2010,ZipCode,Address,...,LandUse_03,LandUse_04,LandUse_05,LandUse_06,LandUse_07,LandUse_08,LandUse_09,LandUse_Not_Provided,LandUse_Parking_Garage,LandUse_Vacant
0,Manhattan,MN,1,10,1000500,5,5,1022,10004,1 GOVERNORS ISLAND,...,0,0,0,0,0,1,0,0,0,0
1,Manhattan,MN,1,101,1000100,1,1,1001,10004,1 LIBERTY ISLAND,...,0,0,0,0,0,1,0,0,0,0
2,Manhattan,MN,1,201,1000100,1,1,1000,10004,1 ELLIS ISLAND,...,0,0,0,0,0,0,0,1,0,0
3,Manhattan,MN,2,1,1000900,9,9,1025,10004,4 SOUTH STREET,...,0,0,0,0,1,0,0,0,0,0
4,Manhattan,MN,2,2,1000900,9,9,1025,10004,10 SOUTH STREET,...,0,0,0,0,1,0,0,0,0,0


In [168]:
pluto_cols_to_sum = ['BldgArea','NumBldgs','UnitsRes','UnitsTotal','Ext_Exten_Garage',
                     'Ext_Extension', 'Ext_Garage','Ext_No_Ext_Gar', 'ProxCode_Attached',
                     'ProxCode_Detached','ProxCode_NA', 'ProxCode_Not_Provided',
                     'ProxCode_Semi_Attached','BsmtCode_Above_Gr_Full_Bsmt',
                     'BsmtCode_Above_Gr_Part_Bsmt','BsmtCode_Below_Gr_Full_Bsmt',
                     'BsmtCode_Below_Gr_Part_Bsmt','BsmtCode_No_Bsmt','BsmtCode_Not_Provided',
                     'BsmtCode_Unknown','LandUse_01', 'LandUse_02', 'LandUse_03', 'LandUse_04',
                     'LandUse_05','LandUse_06', 'LandUse_07', 'LandUse_08', 'LandUse_09',
                     'LandUse_Not_Provided', 'LandUse_Parking_Garage', 'LandUse_Vacant',
                     'ComArea', 'ResArea', 'OfficeArea', 'RetailArea',
                     'GarageArea', 'StrgeArea', 'FactryArea', 'OtherArea']

pluto_cols_to_ratio = ['ComArea', 'ResArea', 'OfficeArea', 'RetailArea',
                      'GarageArea', 'StrgeArea', 'FactryArea', 'OtherArea']

pluto_cols_to_avg = ['NumFloors','YearBuilt']

In [169]:
def column_calc(data,columns_to_calc,groupby_col,calc):
    
    df = data
    gb = groupby_col
    cols = columns_to_calc+gb
    
    if calc == 'mean':
        group_df = df[cols].groupby(by = gb,as_index = False).mean()
        group_df.rename(columns = {col:'avg_'+col for col in columns_to_calc},inplace = True)
    elif calc == 'median':
        group_df = df[cols].groupby(by = gb,as_index = False).median()
        group_df.rename(columns = {col:'med_'+col for col in columns_to_calc},inplace = True)
    elif calc == 'sum':
        group_df = df[cols].groupby(by = gb,as_index = False).sum()
        group_df.rename(columns = {col:'tot_'+col for col in columns_to_calc},inplace = True)
    elif calc == 'count':
        group_df = df[cols].groupby(by = gb,as_index = False).count()
        group_df.rename(columns = {col:'count_'+col for col in columns_to_calc},inplace = True)
    
    return(group_df)

In [170]:
avg_cols = column_calc(data = pluto_dummies, columns_to_calc = pluto_cols_to_avg,
            groupby_col=['BoroCT2010'],calc = 'mean')

In [171]:
avg_cols.head()

Unnamed: 0,BoroCT2010,avg_NumFloors,avg_YearBuilt
0,1000100,0.0,1900.0
1,1000201,4.780488,1912.138889
2,1000202,4.915179,1928.875
3,1000500,1.2,1923.0
4,1000600,5.612121,1926.380645


In [172]:
sum_cols = column_calc(data = pluto_dummies, columns_to_calc = pluto_cols_to_sum,
                      groupby_col = ['BoroCT2010'], calc = 'sum')

In [173]:
sum_cols.head()

Unnamed: 0,BoroCT2010,tot_BldgArea,tot_NumBldgs,tot_UnitsRes,tot_UnitsTotal,tot_Ext_Exten_Garage,tot_Ext_Extension,tot_Ext_Garage,tot_Ext_No_Ext_Gar,tot_ProxCode_Attached,...,tot_LandUse_Parking_Garage,tot_LandUse_Vacant,tot_ComArea,tot_ResArea,tot_OfficeArea,tot_RetailArea,tot_GarageArea,tot_StrgeArea,tot_FactryArea,tot_OtherArea
0,1000100,1145016.0,24.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,...,0.0,0.0,1145016.0,0.0,0.0,0.0,0.0,0.0,0.0,1145016.0
1,1000201,1592632.0,43.0,1055.0,1089.0,0.0,17.0,0.0,24.0,24.0,...,3.0,1.0,692133.0,900498.0,1700.0,17118.0,0.0,0.0,0.0,673315.0
2,1000202,4111815.0,92.0,3568.0,3638.0,0.0,8.0,1.0,47.0,12.0,...,0.0,0.0,993682.0,2733569.0,33685.0,54640.0,9860.0,1500.0,0.0,891104.0
3,1000500,5721187.0,194.0,0.0,5.0,0.0,0.0,0.0,5.0,0.0,...,0.0,0.0,5721187.0,0.0,21146.0,4500.0,0.0,0.0,0.0,5695541.0
4,1000600,6327663.0,189.0,6051.0,6299.0,0.0,28.0,0.0,137.0,90.0,...,2.0,2.0,1693672.0,4637698.0,55974.0,233245.0,34253.0,19933.0,0.0,1214990.0


In [174]:
avg_bldgarea_tract = pluto_dummies[['BoroCT2010','BldgArea']].groupby(['BoroCT2010'], as_index = False).mean()
avg_bldgarea_tract.rename(columns = {'BldgArea':'avg_BldgArea'},inplace = True)
unitarea_tract = pd.merge(sum_cols[['BoroCT2010','tot_UnitsTotal']],avg_bldgarea_tract, on = 'BoroCT2010')
unitarea_tract['avg_UnitArea'] = unitarea_tract.apply(lambda row: 0 if float(row['tot_UnitsTotal']) == 0 else row['avg_BldgArea'] / float(row['tot_UnitsTotal']), axis=1)

In [175]:
unitarea_tract.head()

Unnamed: 0,BoroCT2010,tot_UnitsTotal,avg_BldgArea,avg_UnitArea
0,1000100,0.0,572508.0,0.0
1,1000201,1089.0,38844.68,35.670049
2,1000202,3638.0,73425.27,20.182866
3,1000500,5.0,1144237.0,228847.48
4,1000600,6299.0,38349.47,6.088184


In [176]:
#add ratio columns to sum_cols dataframe
for col in pluto_cols_to_ratio:
    sum_cols[f'ratio_{col}'] = sum_cols[['BoroCT2010',f'tot_{col}','tot_BldgArea']].apply(
            lambda row: 0 if float(row['tot_BldgArea']) == 0 else row[f'tot_{col}'] / float(row['tot_BldgArea']),axis=1)

In [177]:
sum_cols.head().T

Unnamed: 0,0,1,2,3,4
BoroCT2010,1000100.0,1000201.0,1000202.0,1000500.0,1000600.0
tot_BldgArea,1145020.0,1592630.0,4111820.0,5721190.0,6327660.0
tot_NumBldgs,24.0,43.0,92.0,194.0,189.0
tot_UnitsRes,0.0,1055.0,3568.0,0.0,6051.0
tot_UnitsTotal,0.0,1089.0,3638.0,5.0,6299.0
tot_Ext_Exten_Garage,0.0,0.0,0.0,0.0,0.0
tot_Ext_Extension,0.0,17.0,8.0,0.0,28.0
tot_Ext_Garage,0.0,0.0,1.0,0.0,0.0
tot_Ext_No_Ext_Gar,2.0,24.0,47.0,5.0,137.0
tot_ProxCode_Attached,0.0,24.0,12.0,0.0,90.0


In [179]:
## Merge all dataframes with calculated columns
## boro_tract_df is dataframe of merged dfs
sum_avg_df = pd.merge(sum_cols,avg_cols, on = 'BoroCT2010')
boro_tract_df = pd.merge(sum_avg_df,unitarea_tract[['BoroCT2010','avg_BldgArea','avg_UnitArea']], on = 'BoroCT2010')

In [180]:
boro_tract_df.head().T

Unnamed: 0,0,1,2,3,4
BoroCT2010,1000100.0,1000201.0,1000202.0,1000500.0,1000600.0
tot_BldgArea,1145020.0,1592630.0,4111820.0,5721190.0,6327660.0
tot_NumBldgs,24.0,43.0,92.0,194.0,189.0
tot_UnitsRes,0.0,1055.0,3568.0,0.0,6051.0
tot_UnitsTotal,0.0,1089.0,3638.0,5.0,6299.0
tot_Ext_Exten_Garage,0.0,0.0,0.0,0.0,0.0
tot_Ext_Extension,0.0,17.0,8.0,0.0,28.0
tot_Ext_Garage,0.0,0.0,1.0,0.0,0.0
tot_Ext_No_Ext_Gar,2.0,24.0,47.0,5.0,137.0
tot_ProxCode_Attached,0.0,24.0,12.0,0.0,90.0


In [181]:
boro_tract_df.isna().sum()

BoroCT2010                         0
tot_BldgArea                       0
tot_NumBldgs                       0
tot_UnitsRes                       0
tot_UnitsTotal                     0
tot_Ext_Exten_Garage               0
tot_Ext_Extension                  0
tot_Ext_Garage                     0
tot_Ext_No_Ext_Gar                 0
tot_ProxCode_Attached              0
tot_ProxCode_Detached              0
tot_ProxCode_NA                    0
tot_ProxCode_Not_Provided          0
tot_ProxCode_Semi_Attached         0
tot_BsmtCode_Above_Gr_Full_Bsmt    0
tot_BsmtCode_Above_Gr_Part_Bsmt    0
tot_BsmtCode_Below_Gr_Full_Bsmt    0
tot_BsmtCode_Below_Gr_Part_Bsmt    0
tot_BsmtCode_No_Bsmt               0
tot_BsmtCode_Not_Provided          0
tot_BsmtCode_Unknown               0
tot_LandUse_01                     0
tot_LandUse_02                     0
tot_LandUse_03                     0
tot_LandUse_04                     0
tot_LandUse_05                     0
tot_LandUse_06                     0
t

In [182]:
boro_tract_df.to_csv('./census_tract_pluto.csv',index = False)

In [6]:
boro_tract_df = pd.read_csv('./census_tract_pluto.csv')

In [12]:
## Make Census Tract column be of type string
boro_tract_df['BoroCT2010'] = boro_tract_df['BoroCT2010'].astype(str)

In [14]:
## Unique Census Tracts in boro_tract_df
len(boro_tract_df['BoroCT2010'].unique())

2159

In [15]:
## Unique Census Tracts in ct_shp
len(ct_shp['BoroCT2010'].unique())

2166

In [16]:
## Tracts in ct_shp that are not in boro_tract_df
in_ct_not_pluto = set(ct_shp['BoroCT2010'])-set(boro_tract_df['BoroCT2010'])
in_ct_not_pluto

{'1014300', '3070203', '4009900', '4065501', '4099900', '5008900', '5990100'}

In [17]:
## Tracts in boro_tract_df that are not in ct_shp
in_pluto_not_ct = set(boro_tract_df['BoroCT2010'])-set(ct_shp['BoroCT2010'])
in_pluto_not_ct

set()

In [18]:
## Check tracts in ct_shp that are not in boro_tract_df
ct_shp[ct_shp['BoroCT2010'].isin(in_ct_not_pluto)]

Unnamed: 0,CTLabel,BoroCode,BoroName,CT2010,BoroCT2010,CDEligibil,NTACode,NTAName,PUMA,Shape_Leng,Shape_Area,geometry
1126,999.0,4,Queens,99900,4099900,I,QN47,Ft. Totten-Bay Terrace-Clearview,4103,11991.019878,6627584.0,"POLYGON ((1045849.478027344 229592.6734008789,..."
1335,99.0,4,Queens,9900,4009900,I,QN99,park-cemetery-etc-Queens,4101,8559.817271,2682960.0,"POLYGON ((1006968.779418945 223920.224609375, ..."
1381,702.03,3,Brooklyn,70203,3070203,I,BK99,park-cemetery-etc-Brooklyn,4009,146017.760623,43216570.0,(POLYGON ((1021176.479003906 151374.7969970703...
1536,89.0,5,Staten Island,8900,5008900,I,SI22,West New Brighton-New Brighton-St. George,3903,168.524337,581.9571,"POLYGON ((956043.3608398438 174428.1091918945,..."
1560,655.01,4,Queens,65501,4065501,I,QN21,Middle Village,4110,11694.156834,8754118.0,"POLYGON ((1022222.239196777 200786.5441894531,..."
1622,9901.0,5,Staten Island,990100,5990100,I,SI99,park-cemetery-etc-Staten Island,3901,4743.128085,635702.0,(POLYGON ((970217.0223999023 145643.3322143555...
2053,143.0,1,Manhattan,14300,1014300,I,MN99,park-cemetery-etc-Manhattan,3806,32721.411972,38312330.0,"POLYGON ((997412.2476196289 230101.8157958984,..."


In [19]:
boro_tract_df.shape

(2159, 53)

In [3]:
## Read in dataset of fire_incidents with census tracts
tract_fires = pd.read_csv('./census_tracts_fires.csv')
tract_fires.head()

Unnamed: 0,action_taken1_desc,action_taken2_desc,action_taken3_desc,aes_presence_desc,arrival_date_time,borough_desc,co_detector_present_desc,detector_presence_desc,fire_box,fire_origin_below_grade_flag,...,geometry,x,y,index_right,BoroName,BoroCode,BoroCT2010,NTAName,NTACode,PUMA
0,11 - Extinguishment by fire service personnel,51 - Ventilate,,,2018-06-30T23:58:53.000,2 - Bronx,,,2135.0,,...,POINT (1006187.164915165 233275.6845102735),1006187.0,233275.68451,2032.0,Bronx,2,2002500,Mott Haven-Port Morris,BX39,3710
1,11 - Extinguishment by fire service personnel,51 - Ventilate,64 - Shut down system,,2018-06-30T22:11:36.000,1 - Manhattan,,,1271.0,,...,POINT (993402.8159240253 230290.3614200134),993402.8,230290.36142,1943.0,Manhattan,1,1019100,Upper West Side,MN12,3806
2,21 - Search,51 - Ventilate,64 - Shut down system,,2018-06-30T20:45:56.000,1 - Manhattan,,,1191.0,,...,POINT (997778.1591829141 223509.9802548879),997778.2,223509.980255,1628.0,Manhattan,1,1014602,Yorkville,MN32,3805
3,21 - Search,51 - Ventilate,64 - Shut down system,,2018-06-30T20:08:55.000,1 - Manhattan,,,559.0,,...,POINT (984787.7832632038 208101.0833157133),984787.8,208101.083316,2049.0,Manhattan,1,1007100,West Village,MN23,3810
4,11 - Extinguishment by fire service personnel,,,1 - Present,2018-06-30T18:24:00.000,1 - Manhattan,,1 - Present,810.0,,...,POINT (990393.5943648223 213211.0516181875),990393.6,213211.051618,2157.0,Manhattan,1,1008000,Murray Hill-Kips Bay,MN20,3808


In [20]:
## Make tract column be of type string
tract_fires['BoroCT2010']=tract_fires['BoroCT2010'].astype(str)

In [21]:
## Tracts in the fire dataset, not in boro_tract_df
## these tracts can be dropped
tracts_to_drop = list(set(tract_fires['BoroCT2010'])-set(boro_tract_df['BoroCT2010']))
tracts_to_drop

['4065501', '4009900', '1014300', '4099900']

These census tracts, which are in the fire dataset, but are not present in the mappluto shapefile, correspond to Fort Totten (4099900), Astoria Park (4009900), Central Park (1014300), and St. John Cemetary (4065501).

In [22]:
## drop rows where census tracts are one of the four above in tracts_to_drop
tract_fires = tract_fires[tract_fires['BoroCT2010'].isin(tracts_to_drop) == False]

In [23]:
tract_fires.head()

Unnamed: 0,action_taken1_desc,action_taken2_desc,action_taken3_desc,aes_presence_desc,arrival_date_time,borough_desc,co_detector_present_desc,detector_presence_desc,fire_box,fire_origin_below_grade_flag,...,geometry,x,y,index_right,BoroName,BoroCode,BoroCT2010,NTAName,NTACode,PUMA
0,11 - Extinguishment by fire service personnel,51 - Ventilate,,,2018-06-30T23:58:53.000,2 - Bronx,,,2135.0,,...,POINT (1006187.164915165 233275.6845102735),1006187.0,233275.68451,2032.0,Bronx,2,2002500,Mott Haven-Port Morris,BX39,3710
1,11 - Extinguishment by fire service personnel,51 - Ventilate,64 - Shut down system,,2018-06-30T22:11:36.000,1 - Manhattan,,,1271.0,,...,POINT (993402.8159240253 230290.3614200134),993402.8,230290.36142,1943.0,Manhattan,1,1019100,Upper West Side,MN12,3806
2,21 - Search,51 - Ventilate,64 - Shut down system,,2018-06-30T20:45:56.000,1 - Manhattan,,,1191.0,,...,POINT (997778.1591829141 223509.9802548879),997778.2,223509.980255,1628.0,Manhattan,1,1014602,Yorkville,MN32,3805
3,21 - Search,51 - Ventilate,64 - Shut down system,,2018-06-30T20:08:55.000,1 - Manhattan,,,559.0,,...,POINT (984787.7832632038 208101.0833157133),984787.8,208101.083316,2049.0,Manhattan,1,1007100,West Village,MN23,3810
4,11 - Extinguishment by fire service personnel,,,1 - Present,2018-06-30T18:24:00.000,1 - Manhattan,,1 - Present,810.0,,...,POINT (990393.5943648223 213211.0516181875),990393.6,213211.051618,2157.0,Manhattan,1,1008000,Murray Hill-Kips Bay,MN20,3808


In [24]:
tract_fires['incident_date_time'] = pd.to_datetime(tract_fires['incident_date_time'])
tract_fires['incident_month'] = [i.month for i in tract_fires['incident_date_time']]
tract_fires['incident_year'] = [i.year for i in tract_fires['incident_date_time']]

In [25]:
fire_cols_to_group = ['BoroCT2010','incident_year','incident_month']
grouped_tract_fires = tract_fires[fire_cols_to_group].groupby(by = ['BoroCT2010','incident_year','incident_month']).size().reset_index(name = 'incident_count')
grouped_tract_fires.head()

Unnamed: 0,BoroCT2010,incident_year,incident_month,incident_count
0,1000201,2014,6,2
1,1000201,2015,2,1
2,1000201,2015,11,1
3,1000201,2016,7,1
4,1000201,2017,11,1


In [26]:
year_range = range(tract_fires['incident_year'].min(),tract_fires['incident_year'].max()+1)
month_range = range(1,13)

In [27]:
date_tract_dict = {(yr,mon,tract):0 for yr in year_range for mon in month_range for tract in boro_tract_df['BoroCT2010']}

In [28]:
for i,row in grouped_tract_fires.iterrows():
    date_tract_dict[(row['incident_year'],row['incident_month'],row['BoroCT2010'])] = row['incident_count'] 

In [29]:
date_tract_fires = pd.DataFrame(columns = ['year','month','tract','count'])
date_tract_fires = pd.DataFrame([[i[0],i[1],i[2],date_tract_dict[i]] for i in date_tract_dict],columns = ['year','month','tract','count'])

In [30]:
date_tract_fires.sort_values(by = ['year','month','tract'],inplace = True)
date_tract_fires.reset_index(inplace = True, drop = True)
date_tract_fires.head()

Unnamed: 0,year,month,tract,count
0,2013,1,1000100,0
1,2013,1,1000201,0
2,2013,1,1000202,1
3,2013,1,1000500,0
4,2013,1,1000600,0


In [32]:
date_tract_fires.shape

(155448, 4)

In [33]:
year_tract_fires = date_tract_fires[['year','tract','count']].groupby(['year','tract'],as_index = False).sum()
year_tract_fires

Unnamed: 0,year,tract,count
0,2013,1000100,0
1,2013,1000201,0
2,2013,1000202,5
3,2013,1000500,0
4,2013,1000600,14
5,2013,1000700,5
6,2013,1000800,4
7,2013,1000900,4
8,2013,1001001,0
9,2013,1001002,7


In [34]:
merged_tract_fires = pd.merge(left = date_tract_fires, right = boro_tract_df,
         how = 'left',left_on = 'tract',right_on = 'BoroCT2010')

In [35]:
merged_annual_fires = pd.merge(left = year_tract_fires, right = boro_tract_df,
         how = 'left',left_on = 'tract',right_on = 'BoroCT2010')

In [36]:
print(merged_tract_fires.shape)
merged_tract_fires.head()

(155448, 57)


Unnamed: 0,year,month,tract,count,BoroCT2010,tot_BldgArea,tot_NumBldgs,tot_UnitsRes,tot_UnitsTotal,tot_Ext_Exten_Garage,...,ratio_OfficeArea,ratio_RetailArea,ratio_GarageArea,ratio_StrgeArea,ratio_FactryArea,ratio_OtherArea,avg_NumFloors,avg_YearBuilt,avg_BldgArea,avg_UnitArea
0,2013,1,1000100,0,1000100,1145016.0,24.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1900.0,572508.0,0.0
1,2013,1,1000201,0,1000201,1592632.0,43.0,1055.0,1089.0,0.0,...,0.001067,0.010748,0.0,0.0,0.0,0.422769,4.780488,1912.138889,38844.68,35.670049
2,2013,1,1000202,1,1000202,4111815.0,92.0,3568.0,3638.0,0.0,...,0.008192,0.013289,0.002398,0.000365,0.0,0.216718,4.915179,1928.875,73425.27,20.182866
3,2013,1,1000500,0,1000500,5721187.0,194.0,0.0,5.0,0.0,...,0.003696,0.000787,0.0,0.0,0.0,0.995517,1.2,1923.0,1144237.0,228847.48
4,2013,1,1000600,0,1000600,6327663.0,189.0,6051.0,6299.0,0.0,...,0.008846,0.036861,0.005413,0.00315,0.0,0.192012,5.612121,1926.380645,38349.47,6.088184


In [37]:
print(merged_annual_fires.shape)
merged_annual_fires.head()

(12954, 56)


Unnamed: 0,year,tract,count,BoroCT2010,tot_BldgArea,tot_NumBldgs,tot_UnitsRes,tot_UnitsTotal,tot_Ext_Exten_Garage,tot_Ext_Extension,...,ratio_OfficeArea,ratio_RetailArea,ratio_GarageArea,ratio_StrgeArea,ratio_FactryArea,ratio_OtherArea,avg_NumFloors,avg_YearBuilt,avg_BldgArea,avg_UnitArea
0,2013,1000100,0,1000100,1145016.0,24.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1900.0,572508.0,0.0
1,2013,1000201,0,1000201,1592632.0,43.0,1055.0,1089.0,0.0,17.0,...,0.001067,0.010748,0.0,0.0,0.0,0.422769,4.780488,1912.138889,38844.68,35.670049
2,2013,1000202,5,1000202,4111815.0,92.0,3568.0,3638.0,0.0,8.0,...,0.008192,0.013289,0.002398,0.000365,0.0,0.216718,4.915179,1928.875,73425.27,20.182866
3,2013,1000500,0,1000500,5721187.0,194.0,0.0,5.0,0.0,0.0,...,0.003696,0.000787,0.0,0.0,0.0,0.995517,1.2,1923.0,1144237.0,228847.48
4,2013,1000600,14,1000600,6327663.0,189.0,6051.0,6299.0,0.0,28.0,...,0.008846,0.036861,0.005413,0.00315,0.0,0.192012,5.612121,1926.380645,38349.47,6.088184


In [292]:
merged_tract_fires.to_csv('./merged_fire_tract_data.csv', index = False)

In [38]:
merged_annual_fires.to_csv('./merged_annual_fire_data.csv', index = False)