In [217]:
import geopandas as gp
import geocoder
import pandas as pd
import re
import numpy as np
import os


In [218]:
if os.path.exists('../processed_data/fdny_gas.csv'):
    # SAME FROM EXPLORATION, ONLY NEEDS TO BE RUN THE FIRST TIME
    fdny_gas = pd.read_csv('../processed_data/fdny_gas.csv', dtype={'ZIP_CODE': str})
else:
    fdny = pd.read_csv("../raw_data/Incidents_Responded_to_by_Fire_Companies.csv",
                       usecols=['IM_INCIDENT_KEY','INCIDENT_TYPE_DESC',
                                'ZIP_CODE','CO_DETECTOR_PRESENT_DESC', 
                                'STREET_HIGHWAY', 'BOROUGH_DESC'],
                       dtype={'ZIP_CODE':str})

    # split incident description to get code, filter
    def code_split(data):
        a = data.split(' -')
        return a[0]

    fdny['incident_code'] = fdny.INCIDENT_TYPE_DESC.apply(lambda x: code_split(x))

    fdny['BOROUGH_DESC'] = fdny['BOROUGH_DESC'].apply(lambda x: x.split('-', 1)[1]).str.replace('Manhattan', 'New York')
    fdny['BOROUGH_DESC'] = fdny['BOROUGH_DESC'].str.replace('Brooklyn', 'Kings')
    fdny['BOROUGH_DESC'] = fdny['BOROUGH_DESC'].str.replace('Staten Island', 'RICHMOND')

    fdny['ADDRESS_W_ZIP'] = fdny['STREET_HIGHWAY'] + ","+ fdny['BOROUGH_DESC'] + ', '  + fdny['ZIP_CODE']


    fdny_gas = fdny[fdny.incident_code=='412']

    fdny_gas.to_csv('../processed_data/fdny_gas.csv')

In [219]:
# Gather zip codes from fdny for selection later
nyc_zips = list(fdny_gas.ZIP_CODE.unique())

## Shapefile > Merge with Pluto

1. Work with shapefile of roads within NYS. 
2. Reduce dataset to NYC.
3. Normalize FDNY Road Naming Convention (WIP ~92%)
4. Disolve by street name, county, and zip code, creating segments of roads associated with a specific zip code
5. Create buffer around desolved street
6. Perform spatial selection to identify geographic features that intersect street buffer
7. Associate 

In [220]:
# define your directories for input and output shapefiles
# NYS ROADS SHAPEFILE FROM: http://gis.ny.gov/gisdata/inventories/details.cfm?DSID=932

dir_input = '../raw_data/Streets_shp/'
name_in = 'StreetSegment.shp'
dir_output = '../processed_data/Streets_shp/'
name_out = 'nycLeftStreetZip.shp'

In [221]:
if os.path.exists(dir_output + name_out):
    streets_dissolved = gp.GeoDataFrame.from_file(dir_output + name_out)
else:
    

    # open your file with geopandas
    all_streets = gp.GeoDataFrame.from_file(dir_input + name_in)

    nyc_counties = ['Queens', 'New York', 'Richmond', 'Bronx', 'Kings']

    streets = all_streets.loc[:,['LeftCounty', 'RightCount', 'LeftCensus', 'RightCensu', 'LeftPostal', 'RightPosta', 'Label', 'geometry']]

    streets['LeftPostal'] = streets['LeftPostal'].astype(str)
    streets['RightPosta'] = streets['RightPosta'].astype(str)

    # select all roads where county in NYC or ZIP CODE in NYC
    nyc_streets = streets[(streets['RightCount'].isin(nyc_counties)) | (streets['LeftCounty'].isin(nyc_counties)) | (streets['LeftPostal'].isin(nyc_zips)) | (streets['RightPosta'].isin(nyc_zips))]

    nyc_streets.loc[:,'LEFT_STREET_NAME'] = nyc_streets.loc[:,'Label'].str.upper() + ', ' + nyc_streets.loc[:,'LeftCounty'].str.upper() + ', ' + nyc_streets.loc[:,'LeftPostal']
    nyc_streets.loc[:,'RIGHT_STREET_NAME'] = nyc_streets.loc[:,'Label'].str.upper() + ', ' + nyc_streets.loc[:,'RightCount'].str.upper() + ', ' + nyc_streets.loc[:,'RightPosta']

    # GROUP EACH INTO LIST FOR LOOPING/SHAPEFILE CREATION
    geom = list(nyc_streets['geometry'].values)
    left_str = list(nyc_streets['LEFT_STREET_NAME'].values)
    # right_str = list(nyc_streets['RIGHT_STREET_NAME'].values)

    # dictionaries to hold line segments
    left_streets = {}
    right_streets = {}

    # Disolve roads with same name county and zip to a single feature

    for i in range(len(geom)):
        left_street = left_str[i]
    #     right_street = right_str[i]
        geometry = geom[i]

        # if the feature's street doesn't yet exist, create it and assign a list
        if left_street not in left_streets:
            left_streets[left_street] = []
        # append the feature to the list of features
        left_streets[left_street].append(geometry)


    #     # if the feature's street doesn't yet exist, create it and assign a list
    #     if right_street not in right_streets:
    #         right_streets[right_street] = []
    #     # append the feature to the list of features
    #     right_streets[right_street].append(geometry)

    # create a geopandas geodataframe, with columns for street and geometry
    streets_dissolved = gp.GeoDataFrame(columns=['left_street', 'geometry'], crs=all_streets.crs)

    # iterate your dictionary
    for street, street_list in left_streets.items():
        # create a geoseries from the list of features
        geometry = gp.GeoSeries(street_list)
        # use unary_union to join them, thus returning polygon or multi-polygon
        geometry = geometry.unary_union
        # set your street and geometry values
        streets_dissolved.set_value(street, 'left_street', street)
        streets_dissolved.set_value(street, 'geometry', geometry)

    # save to file
    streets_dissolved.to_file(dir_output + name_out, driver="ESRI Shapefile")

In [222]:
def replacement(rep, value):
    if value in rep:
        value = rep[value]
    return value

In [223]:
# Clean_address removes extension on numbered street. i.e. 2ND ST turns to 2 ST.
def clean_address(add):
    add = str(add)
    add = re.sub(' +',' ', add)
    string = []
    for i in add.split(' '):
        num = re.sub('[^0-9]','', i)
        if (num != '') & (',' not in i):
            string.append(num)
        else:
            
            string.append(i)
    return " ".join(string).strip()


In [224]:
# Preproc of Address naming covention prior to address cleaning

fdny_gas['ADDRESS_W_ZIP'] = fdny_gas['ADDRESS_W_ZIP'].str.upper()
fdny_gas['ADDRESS_W_ZIP'] = fdny_gas['ADDRESS_W_ZIP'].str.replace('ST NICHOLAS', 'SAINT NICHOLAS')
fdny_gas['ADDRESS_W_ZIP'] = fdny_gas['ADDRESS_W_ZIP'].str.replace('.', '')
fdny_gas['ADDRESS_W_ZIP'] = fdny_gas['ADDRESS_W_ZIP'].str.replace('ST MARKS', 'SAINT MARKS')


preproc={'BRIGHTON12 ST, KINGS, 11235': 'BRIGHTON 12 ST, KINGS, 11235',
         'BRIGHTON3 PL, KINGS, 11235': 'BRIGHTON 3 PL, KINGS, 11235',
         'BRIGHTON3 ST, KINGS, 11235': 'BRIGHTON 3 ST, KINGS, 11235',
         'BRIGHTON5 ST, KINGS, 11235': 'BRIGHTON 5 ST, KINGS, 11235',
         'BRIGHTON6 ST, KINGS, 11235': 'BRIGHTON 6 ST, KINGS, 11235',
         'BRIGHTON7 ST, KINGS, 11235': 'BRIGHTON 7 ST, KINGS, 11235',
         'BRIGHTON8 CT, KINGS, 11235': 'BRIGHTON 8 CT, KINGS, 11235',
         'BRIGHTONBEACH AVE, KINGS, 11235': 'BRIGHTON BEACH AVE, KINGS, 11235',
         'HARLEM RIVER DR W, NEW YORK, 10032': 'HARLEM RIVER DRWY, NEW YORK, 10033'}

fdny_gas['ADDRESS_W_ZIP'] = np.vectorize(replacement)(preproc, fdny_gas['ADDRESS_W_ZIP'])

In [225]:
fdny_gas.loc[:,'left_street'] = fdny_gas['ADDRESS_W_ZIP'].apply(clean_address)

In [226]:
streets_dissolved.loc[:,'left_street'] = streets_dissolved['left_stree'].apply(clean_address)

In [227]:
replacements = {'DOUGLASS BLVD, NEW YORK, 10039': 'FREDERICK DOUGLASS BLVD, NEW YORK, 10039',
                'GRANDCONCOURSE, BRONX, 10458': 'FREDERICK DOUGLASS BLVD, NEW YORK, 10039',
                'AVED, NEW YORK, 10009': 'AVENUE D, NEW YORK, 10009',
                'F D R DR W, NEW YORK, 10009': 'FDR DR, NEW YORK, 10009',
                'F D R DR W, NEW YORK, 10002': 'FDR DR, NEW YORK, 10002',
                'FDR DR W, NEW YORK, 10009': 'FDR DR, NEW YORK, 10009',
                'FDR DR W, NEW YORK, 10002': 'FDR DR, NEW YORK, 10002',
                'E 116 BLVD, NEW YORK, 10035': 'E 116 ST, NEW YORK, 10035',
                'DOUGLASS BLVD, NEW YORK, 10030': 'FREDERICK DOUGLASS BLVD, NEW YORK, 10030',
                'GASTON-BLVD-STONE AVE, KINGS, 11212': 'MOTHER GASTON BLVD, KINGS, 11212',
                'POWELL BLVD, NEW YORK, 10030': 'ADAM CLAYTON POWELL JR BLVD, NEW YORK, 10030',
                'ST JOHNS PL, KINGS, 11213': 'SAINT JOHNS PL, KINGS, 11213',
                'AVEC, NEW YORK, 10009': 'AVENUE C, NEW YORK, 10009',
                'GRANDCONCOURSE, BRONX, 10456': 'GRAND CONCOURSE, BRONX, 10456',
                'GRANDCONCOURSE, BRONX, 10453': 'GRAND CONCOURSE, BRONX, 10453',
                'GRANDCONCOURSE, BRONX, 10452': 'GRAND CONCOURSE, BRONX, 10452',
                'GRANDCONCOURSE, BRONX, 10451': 'GRAND CONCOURSE, BRONX, 10451',
                'GRANDCONCOURSE, BRONX, 10457': 'GRAND CONCOURSE, BRONX, 10457',
                'BELT PKWY N, KINGS, 11236': 'BELT PKWY, KINGS, 11236',
                'BELT PKWY N, KINGS, 11214': 'BELT PKWY, KINGS, 11214',
                'BELT PKWY S, KINGS, 11214': 'BELT PKWY, KINGS, 11214',
                'BELT PKWY N, KINGS, 11235': 'BELT PKWY, KINGS, 11235',
                'BELT PKWY S, KINGS, 11235': 'BELT PKWY, KINGS, 11235',
                'BELT PKWY N, KINGS, 11228': 'BELT PKWY, KINGS, 11228',
                'BELT PKWY S, KINGS, 11228': 'BELT PKWY, KINGS, 11228',
                'AVE W, KINGS, 11229': 'AVENUE W, KINGS, 11229',
                'AVE W, KINGS, 11223': 'AVENUE W, KINGS, 11223',
                'ST MARKS AVE, KINGS, 11213': 'SAINT MARKS AVE, KINGS, 11213',
                'STMARKS AVE, KINGS, 11213': 'SAINT MARKS AVE, KINGS, 11213',
                'ST MARKS AVE, KINGS, 11233': 'SAINT MARKS AVE, KINGS, 11233',
                'ST MARKS AVE, KINGS, 11216': 'SAINT MARKS AVE, KINGS, 11216',
                'ST MARKS AVE, KINGS, 11238': 'SAINT MARKS AVE, KINGS, 11238',
                'ST MARKS AVE, KINGS, 11217': 'SAINT MARKS AVE, KINGS, 11217',
                'MALCOLMX BLVD, NEW YORK, 10030': 'MALCOLM X BLVD, NEW YORK, 10030',
                'LENOX AVE, NEW YORK, 10030': 'MALCOLM X BLVD, NEW YORK, 10030',
                'LENOX AVE, NEW YORK, 10027': 'MALCOLM X BLVD, NEW YORK, 10037',
                'LENOX AVE, NEW YORK, 10027': 'MALCOLM X BLVD, NEW YORK, 10027',
                'UNIVERSITY AVE, BRONX, 10452': 'DR MARTIN LUTHER KING JR BLVD, BRONX, 10452',
                'UNIVERSITY AVE, BRONX, 10453': 'DR MARTIN LUTHER KING JR BLVD, BRONX, 10452',
                'W 91 ST, NEW YORK, 10025': 'W 91 ST, NEW YORK, 10024',
                'DOUGLASS BLVD, NEW YORK, 10027': 'FREDERICK DOUGLASS BLVD, NEW YORK, 10027',
                'DOUGLASS BLVD, NEW YORK, 10026': 'FREDERICK DOUGLASS BLVD, NEW YORK, 10026',
                'DOUGLASS BLVD, NEW YORK, 10030': 'FREDERICK DOUGLASS BLVD, NEW YORK, 10030',
                'DOUGLASS BLVD, NEW YORK, 10039': 'FREDERICK DOUGLASS BLVD, NEW YORK, 10039',
                'E 78 ST, NEW YORK, 10021': 'E 78 ST, NEW YORK, 10075',
                'E 77 ST, NEW YORK, 10021': 'E 77 ST, NEW YORK, 10075', 
                '7 AVE, NEW YORK, 10039': 'ADAM CLAYTON POWELL JR BLVD, NEW YORK, 10039',
                '7 AVE, NEW YORK, 10026': 'ADAM CLAYTON POWELL JR BLVD, NEW YORK, 10026',
                '7 AVE, NEW YORK, 10027': 'ADAM CLAYTON POWELL JR BLVD, NEW YORK, 10027',
                '7 AVE, NEW YORK, 10030': 'ADAM CLAYTON POWELL JR BLVD, NEW YORK, 10030',
                'BRUCKNER BLVD SB, BRONX, 10472': 'BRUCKNER BLVD, BRONX, 10472',
                'E 61 ST, NEW YORK, 10021': 'E 61 ST, NEW YORK, 10065',
                'BEACH 54 ST, QUEENS, 11691': 'BEACH 54 ST, QUEENS, 11692',
                'BEACH 121 ST, QUEENS, 11414': 'BEACH 121 ST, QUEENS, 11694',
                'BEACH 73RD ST, QUEENS, 99999': 'BEACH 73RD ST, QUEENS, 11692'}

In [228]:
streets_dissolved[streets_dissolved.left_stree.str.contains('E 1ST')]

Unnamed: 0,geometry,left_stree,left_street
5043,"(LINESTRING (585227.6399999999 4508637.87, 585...","E 1ST ST, NEW YORK, 10003","E 1 ST, NEW YORK, 10003"
5047,"(LINESTRING (585552.22 4508464.09, 585593.95 4...","E 1ST ST, NEW YORK, 10009","E 1 ST, NEW YORK, 10009"
11222,"(LINESTRING (587255.1100000001 4493592.25, 587...","E 1ST ST, KINGS, 11223","E 1 ST, KINGS, 11223"
15469,"LINESTRING (585106.2 4508702.54, 585094.23 450...","E 1ST ST, NEW YORK, 10012","E 1 ST, NEW YORK, 10012"


In [229]:
fdny_gas['left_street'] = np.vectorize(replacement)(replacements, fdny_gas['left_street'])

In [230]:
fdny_gas.loc[:,'left_street2'] = fdny_gas.loc[:,'left_street']

fdny_gas['left_street'] = fdny_gas['left_street'].str.replace(' ', '')
streets_dissolved['left_street'] = streets_dissolved['left_street'].str.replace(' ', '')

In [231]:
# count = 0
# b=list(fdny_gas.left_stree.values)
# for i in streets_dissolved.left_stree.values:
#     if i in b:
#         count +=1

In [232]:
fdny_gas = fdny_gas.merge(streets_dissolved, how='left', on='left_street')

In [233]:
print "Percent geo-associated: %s" % str(round((len(fdny_gas[~fdny_gas['geometry'].isnull()])/(len(fdny_gas)*1.)*100.),2))

Percent geo-associated: 92.85


In [234]:

len(fdny_gas[fdny_gas['geometry'].isnull()])

4339

In [235]:
temp = pd.DataFrame(fdny_gas[(fdny_gas['geometry'].isnull())]['left_street'].value_counts()).reset_index()


In [236]:
temp.head()

Unnamed: 0,index,left_street
0,"HARLEMRIVERDRW,NEWYORK,10032",40
1,"CO-OPCITYBLVD,BRONX,10475",36
2,"E1ST,NEWYORK,10002",32
3,"MONUMENTWALK,KINGS,11201",31
4,"AVEA,NEWYORK,10009",30


In [237]:
num_g = 10
print temp[temp['left_street'] > num_g].sum()[1], len(temp[temp['left_street'] > num_g])

1715 98


In [238]:
# buffer of 50 selected arbitrarily. 
# May need to be tweaked, but a visual inspection looks promising.

buf = streets_dissolved.buffer(50)



In [239]:
# replace geometry field with buffer geometry field. this will retain the name of each road
streets_dissolved['geometry'] = buf


In [240]:
# write to street segment buffer shape file

streets_dissolved.to_file('../processed_data/street_buffer', driver="ESRI Shapefile")

In [241]:
# Performs 

# streets_dissolved = gp.GeoDataFrame.from_file('../processed_data/street_buffer/New_street_buffer.shp')

# county_code = ['081', '047', '061', '085', '005']

# bg = gp.GeoDataFrame.from_file('../raw_data/BlockGroups/tl_2015_36_bg.shp', crs=all_streets.crs)

# nyc_bg = bg[bg['COUNTYFP'].isin(county_code)]

# nyc_bg.to_file('../raw_data/nyc_block_groups', driver='ESRI Shapefile')

# nyc_bg.crs == left_street.crs

# temp = gp.sjoin(left_street, nyc_bg, how='left', op='intersects')

# temp.to_file('../raw_data/test', driver='ESRI Shapefile')

# temp[temp.left_stree.str.contains('AUSTIN ST')]#['ALAND'].sum()

# temp = gp.sjoin(left_street, nyc_bg, how='left')
# F = temp.groupby('left_stree')['geometry', 'ALAND'].agg(['sum'])

# pointSumByPoly[pointSumByPoly.index.str.contains('AUSTIN ST')]

In [242]:
def make_float(expected_float):
    try:
        if type(expected_float) == str:
            expected_float = expected_float.replace('+', '').replace(',', '')
        return float(expected_float)
    except:
        # print expected_float
        return np.nan

In [243]:
# JOINING CENSUS TRACT DATA

ct_race = pd.read_csv('../raw_data/CENSUS_TRACT_RACE_INCOME/ACS_15_5YR_DP05_with_ann.csv', skiprows=[1],
                      usecols=['GEO.id2', 'HC01_VC03', 'HC01_VC49', 'HC01_VC50', 'HC01_VC51',
                               'HC01_VC56', 'HC01_VC64', 'HC01_VC69', 'HC01_VC23'])

# should add two or more race categories to get 100% of data...
ct_race.rename(columns={'HC01_VC03': 'TOTAL_POPULATION', 'HC01_VC49': 'WHITE',
                        'HC01_VC50': 'BLACK_AFRICAN_AMERICAN', 'HC01_VC51': 'AMERICAN_INDIAN_AND_ALASKA_NATIVE',
                        'HC01_VC56': 'ASIAN', 'HC01_VC64': 'NATIVE_HAWAIIAN_AND_OTHER_PACIFIC_ISLANDER',
                        'HC01_VC69': 'SOME_OTHER_RACE', 'HC01_VC23': 'MEDIAN_AGE', 'GEO.id2': 'GEOID'}, inplace=True)

for i in ct_race.columns:
    if i != 'GEOID':
        ct_race[i] = ct_race[i].apply(lambda x: make_float(x))

# NOT NEEDED UNTIL LATER
# for i in ct_race.columns[3:]:
#     ct_race[i] = ct_race[i]/(ct_race['TOTAL_POPULATION']*1.)

In [244]:
ct_income = pd.read_csv('../raw_data/CENSUS_TRACT_RACE_INCOME/ACS_15_5YR_S1901_with_ann.csv', 
                        skiprows=[1], usecols=['GEO.id2', 'HC01_EST_VC01', 'HC01_EST_VC15'])

# Mean selected to preform aggregation in groupby -> median would not provide accurate calculations
ct_income.rename(columns={'HC01_EST_VC01': 'TOTAL_HOUSEHOLDS', 'HC01_EST_VC15': 'MEAN_INCOME', 'GEO.id2': 'GEOID'}, inplace=True)

for i in ct_income.columns:
    if i != 'GEOID':
        ct_income[i] = ct_income[i].apply(lambda x: make_float(x))

ct_income.loc[:, 'CENSUS_TRACT_INCOME'] = ct_income['TOTAL_HOUSEHOLDS'] * ct_income['MEAN_INCOME']

del ct_income['MEAN_INCOME']

In [245]:
ct_acs_clean = pd.merge(ct_income, ct_race, how='outer', on='GEOID')

In [246]:
ct = gp.GeoDataFrame.from_file('../raw_data/CENSUS_TRACT_SHAPEFILE/cb_2015_36_tract_500k.shp')
# selecting nyc tracts
county_code = ['081', '047', '061', '085', '005']
ct.loc[:,'GEOID'] = ct.loc[:,'GEOID'].astype(int)
ct = ct[ct['COUNTYFP'].isin(county_code)][['ALAND', 'GEOID', 'geometry']]

In [247]:
gdf_ct_acs = pd.merge(ct, ct_acs_clean, how='left', on='GEOID')

In [248]:
gdf_ct_acs.to_file('../processed_data/ct_w_data', driver='ESRI Shapefile')

In [249]:
# RUN IN QGIS PYTHON CONSOLE. QGIS NOT AVAILABLE THROUGH MAC ANACONDA2 DISTRIBUTION
# REQUIRES LOADING OF STREET DISSOLVE BUFFER & CENSUS TRACTS INTO QGIS THEN RUNNING THE CODE BELOW. AN 
# OUTPUT SHAPEFILE WILL BE GENERATED WITH THE PROPER PROJECTION.

# import os, processing
# from qgis.core import QgsMapLayerRegistry

# # Set crs same as block group below
# crs = 'EPSG:2263'  

# shapefiles = QgsMapLayerRegistry.instance().mapLayers().values() 
# for shapes in shapefiles:
#     myfilepath = shapes.dataProvider().dataSourceUri()
#     (myDirectory,nameFile) = os.path.split(myfilepath)
#     processing.runalg("qgis:reprojectlayer", shapes, crs, myDirectory + "/" + "New_" + shapes.name())

In [250]:
# load geo data with proper nyc projections. they are now both epsg:2263
gdf_ct_acs = gp.GeoDataFrame.from_file('../raw_data/ct_w_data/New_ct_w_data.shp')
streets_dissolved_buffer = gp.GeoDataFrame.from_file('../raw_data/street_buffer/New_street_buffer.shp')

In [251]:
# creates a row associated with left_stree for every time a census tract intersects the road segment buffer
temp = gp.sjoin(streets_dissolved_buffer, gdf_ct_acs, how='left')


In [253]:
sum_temp = temp.groupby('left_str_1')['geometry', 'MEDIAN_AGE'].agg(['sum', 'count'])

# mean of median values not best practice, but only way to quantify across census tracts
sum_temp['MEDIAN_AGE_BUFFER'] = list(sum_temp.reset_index().MEDIAN_AGE.loc[:,'sum'] / sum_temp.reset_index().MEDIAN_AGE.loc[:,'count'])


In [254]:
sum_temp2 = temp.groupby('left_str_1')['geometry', 'ALAND','TOTAL_POPU', 
                                      'TOTAL_HOUS', 'WHITE', 'BLACK_AFRI',
                                       'ASIAN', 'NATIVE_HAW',
                                       'AMERICAN_I', 'SOME_OTHER'].agg('sum')

In [255]:
for i in sum_temp2.columns[3:]:
    sum_temp2[i] = sum_temp2[i]/(sum_temp2['TOTAL_POPU']*1.)
    
sum_temp = sum_temp[['MEDIAN_AGE_BUFFER']].reset_index()
sum_temp2 = sum_temp2.reset_index()

In [256]:
street_acs = pd.merge(sum_temp, sum_temp2, how='left', on='left_str_1')

In [257]:
del street_acs[street_acs.columns[1]]

In [258]:
street_acs.rename(columns={street_acs.columns[1]: 'MEDIAN_AGE_BUFFER'}, inplace=True)
# convert from square meters to square miles
street_acs.loc[:,'ALAND'] = street_acs.loc[:,'ALAND']*0.000000386102159

In [259]:
street_acs.loc[:,'POPULATION_DENSITY'] =street_acs.loc[:,'TOTAL_POPU']/street_acs.loc[:,'ALAND']
for i in ['ALAND', 'TOTAL_POPU', 'TOTAL_HOUS']:
    del street_acs[i]

In [260]:
fdny_final = fdny_gas.groupby('left_street')['IM_INCIDENT_KEY'].agg('count').reset_index()

In [262]:
fdny_final.columns = ['left_str_1', 'NUM_LEAKS']

In [263]:
fdny_final = fdny_final.merge(street_acs, how='left', on='left_str_1')

In [264]:
fdny_final.sort_values(by='NUM_LEAKS', ascending=False)

Unnamed: 0,left_str_1,NUM_LEAKS,MEDIAN_AGE_BUFFER,WHITE,BLACK_AFRI,ASIAN,NATIVE_HAW,AMERICAN_I,SOME_OTHER,POPULATION_DENSITY
4903,"FREDERICKDOUGLASSBLVD,NEWYORK,10039",220,32.116667,0.121834,0.662401,0.010818,0.000325,0.008719,0.142967,99356.725181
3539,"COLUMBUSAVE,NEWYORK,10025",161,40.883333,0.511465,0.233593,0.082701,0.000000,0.002329,0.122043,108165.618149
3890,"DRMARTINLUTHERKINGJRBLVD,BRONX,10452",139,30.450000,0.118605,0.380908,0.002923,0.000000,0.003443,0.459285,70000.948401
3938,"E102ST,NEWYORK,10029",124,32.520000,0.342441,0.313374,0.064290,0.000000,0.003166,0.253247,19083.939930
4020,"E143ST,BRONX,10451",123,27.566667,0.145436,0.344460,0.005563,0.000000,0.006698,0.482119,47675.814976
8451,"WEBSTERAVE,BRONX,10456",122,28.911111,0.076282,0.430546,0.004584,0.000000,0.003366,0.461895,63833.366807
525,"159ST,QUEENS,11433",117,32.628571,0.065114,0.652283,0.077463,0.000000,0.005741,0.177846,27321.111709
3903,"DUMONTAVE,KINGS,11212",115,30.025000,0.060111,0.808067,0.012259,0.000000,0.005569,0.098174,52857.468945
3944,"E105ST,NEWYORK,10029",113,33.733333,0.323223,0.304599,0.073000,0.003114,0.003344,0.266657,22469.539945
4977,"GATESAVE,KINGS,11221",111,32.170000,0.247901,0.610771,0.041637,0.000000,0.004091,0.066533,58745.420667


In [265]:
fdny_final.corr()

Unnamed: 0,NUM_LEAKS,MEDIAN_AGE_BUFFER,WHITE,BLACK_AFRI,ASIAN,NATIVE_HAW,AMERICAN_I,SOME_OTHER,POPULATION_DENSITY
NUM_LEAKS,1.0,-0.192267,-0.166836,0.146766,-0.115668,0.010082,0.087171,0.16544,0.301137
MEDIAN_AGE_BUFFER,-0.192267,1.0,0.351516,-0.260049,0.266613,0.020441,-0.153138,-0.470351,-0.362019
WHITE,-0.166836,0.351516,1.0,-0.762332,-0.019891,-0.064838,-0.233556,-0.498991,-0.231494
BLACK_AFRI,0.146766,-0.260049,-0.762332,1.0,-0.440295,0.019311,0.063037,0.083427,0.073552
ASIAN,-0.115668,0.266613,-0.019891,-0.440295,1.0,0.00157,0.019627,-0.180979,-0.020586
NATIVE_HAW,0.010082,0.020441,-0.064838,0.019311,0.00157,1.0,0.001703,0.070334,0.003008
AMERICAN_I,0.087171,-0.153138,-0.233556,0.063037,0.019627,0.001703,1.0,0.276238,0.112928
SOME_OTHER,0.16544,-0.470351,-0.498991,0.083427,-0.180979,0.070334,0.276238,1.0,0.32418
POPULATION_DENSITY,0.301137,-0.362019,-0.231494,0.073552,-0.020586,0.003008,0.112928,0.32418,1.0
