In [2]:
import pandas as pd

In [3]:
# Load the EPA extract that was already filtered down to NYC, then preview it.
nyc_epa_csv = 'github_data/EPA_NYC_out.csv'
EPA_NYC_df = pd.read_csv(nyc_epa_csv)
EPA_NYC_df.head()

Unnamed: 0,OBJECTID,GEOID10,GEOID20,STATEFP,COUNTYFP,TRACTCE,BLKGRPCE,CSA,CSA_Name,CBSA,...,D5DRI,D5DE,D5DEI,D2A_Ranked,D2B_Ranked,D3B_Ranked,D4A_Ranked,NatWalkInd,Shape_Length,Shape_Area
0,148837,360810000000.0,360810000000.0,36,81,25900,2,408.0,"New York-Newark, NY-NJ-CT-PA",35620.0,...,0.506986,0.000222,0.528185,19,15,17,18,17.333333,1016.744396,64619.12044
1,148838,360810000000.0,360810000000.0,36,81,16600,2,408.0,"New York-Newark, NY-NJ-CT-PA",35620.0,...,0.02918,6.1e-05,0.14427,9,9,15,19,14.333333,1485.032311,91377.23524
2,148839,360811000000.0,360811000000.0,36,81,122702,1,408.0,"New York-Newark, NY-NJ-CT-PA",35620.0,...,0.15478,6.7e-05,0.159039,1,1,10,18,9.666667,1110.284578,72542.55913
3,148840,360810000000.0,360810000000.0,36,81,28300,5,408.0,"New York-Newark, NY-NJ-CT-PA",35620.0,...,0.535719,0.000238,0.567026,5,20,1,19,10.833333,1096.360269,36881.7998
4,148841,360811000000.0,360811000000.0,36,81,148300,4,408.0,"New York-Newark, NY-NJ-CT-PA",35620.0,...,0.082963,3.6e-05,0.084785,14,12,19,14,15.333333,2652.063831,286388.7504


As shown below, the dataset contains 2167 census tracts for NYC (a Google search suggests there were 2168 in 2019, so the count is consistent). Note that this data is from 2019, i.e. before the 2020 census updates (and pre-COVID, which is what the professor recommended).

In [4]:
# Preview the two columns that together identify a census tract.
tract_id_columns = ['TRACTCE', 'COUNTYFP']
EPA_NYC_df.loc[:, tract_id_columns].head()

Unnamed: 0,TRACTCE,COUNTYFP
0,25900,81
1,16600,81
2,122702,81
3,28300,81
4,148300,81


NOTE: A row's census tract is identified by the combination of TRACTCE and COUNTYFP (a tract code on its own is only unique within a county, so both columns are needed).

In [31]:
# Count distinct census tracts and distinct census block groups.
# A tract is keyed by (TRACTCE, COUNTYFP); a block group adds BLKGRPCE to that key.
tract_keys = ['TRACTCE', 'COUNTYFP']
block_group_keys = tract_keys + ['BLKGRPCE']

unique_groups = EPA_NYC_df.groupby(tract_keys).ngroups
unique_block_groups = EPA_NYC_df.groupby(block_group_keys).ngroups

print("Number of unique tracts:", unique_groups)
print("Number of unique block groups:", unique_block_groups)

Number of unique tracts: 2167
Number of unique block groups: 6493


## Next Step: Data visualizations and statistics
- 1: pick the features we want to examine (mainly from the EPA dataset) - DONE
- 2: group by the 2167 census tracts (or not; we can keep the data at the 2018 census block group level for now) - Probably unnecessary
    - figure out how to aggregate the data when grouping (mean, median, max)
- 3: regression, clustering, PCA, correlations, etc. - See other notebooks for this

In [6]:
# # group by census tract, figure out how to aggregate Variables of Interest
# res = EPA_NYC_df.groupby(['TRACTCE', 'COUNTYFP'])

In [8]:
# Cleaning some variables (turning totals into ratios).
#
# This runs on the unfiltered frame, which contains rows where TotPop == 0 or
# Ac_Land == 0. A plain division would silently produce +/-inf (or NaN for 0/0)
# on those rows, and inf is not reported by .isna(). Zero denominators are
# therefore masked to NaN first, so every undefined ratio is a NaN that the
# usual missing-value checks can detect. Rows with non-zero denominators get
# exactly the same values as a plain division.
population = EPA_NYC_df['TotPop'].where(EPA_NYC_df['TotPop'] != 0)
land_area = EPA_NYC_df['Ac_Land'].where(EPA_NYC_df['Ac_Land'] != 0)

EPA_NYC_df['Prop_over3kmonth'] = EPA_NYC_df['R_HiWageWk'] / population
EPA_NYC_df['Prop_land_unprotected'] = EPA_NYC_df['Ac_Unpr'] / land_area


In [10]:
# Summary statistics of the TotEmp column across all rows.
tot_emp_summary = EPA_NYC_df['TotEmp'].describe()
print(tot_emp_summary)


count     6493.000000
mean       672.690744
std       3314.438818
min          0.000000
25%         47.000000
50%        141.000000
75%        358.000000
max      73521.000000
Name: TotEmp, dtype: float64


In [24]:
# Columns that feed the ratio features; each is checked for missing values.
columns_to_check = ['TotPop', 'Ac_Land', 'R_HiWageWk', 'Ac_Unpr']

# Compute the per-column "has any NaN" flag once, then report column by column.
nan_flags = EPA_NYC_df[columns_to_check].isna().any()
for column in columns_to_check:
    contains_nan = nan_flags[column]
    print(f"'{column}' contains NaN values: {contains_nan}")

# The two denominator columns: count how many rows hold an exact zero in each.
specific_columns = ['TotPop', 'Ac_Land']
zeros_in_specific_columns = (EPA_NYC_df[specific_columns] == 0).sum()
print(zeros_in_specific_columns)

'TotPop' contains NaN values: False
'Ac_Land' contains NaN values: False
'R_HiWageWk' contains NaN values: False
'Ac_Unpr' contains NaN values: False
TotPop     271
Ac_Land    206
dtype: int64


In [27]:
# Drop CBGs that have no population or no land: a row is kept only when
# every column listed in `specific_columns` is non-zero.
has_zero = (EPA_NYC_df[specific_columns] == 0).any(axis=1)
mask = ~has_zero
# Number of rows that are about to be removed.
print(has_zero.sum())
pre_filtered_df = EPA_NYC_df[mask]


271


In [32]:
# Restrict the filtered frame to the columns used in the analysis.
# The list order is the column order of the resulting frame.
# Names are used exactly as spelled here (mixed case, e.g. 'NatWalkInd').
vars_of_interest = [
    # block-group identifiers
    'TRACTCE',
    'COUNTYFP',
    'BLKGRPCE',
    # features
    'NatWalkInd',
    'Prop_land_unprotected',
    'D4B025',
    'D4A',
    'D2B_E5MIXA',
    'D2A_JPHH',
    'D1D',
    'D1C5_ENT',
    'D1B',
    'E_PctLowWage',
    'Prop_over3kmonth',
    'R_PCTLOWWAGE',
    'P_WrkAge',
]
filtered_EPA_df = pre_filtered_df.loc[:, vars_of_interest]

In [33]:
# Sanity check: the filtered frame should contain no missing values.
# Build the boolean NaN mask once and derive every statistic from it.
nan_mask = filtered_EPA_df.isna()

# Rows that contain at least one NaN.
nan_rows_count = nan_mask.any(axis=1).sum()
print(nan_rows_count)

# Columns that contain at least one NaN (count, then names).
columns_with_nan = nan_mask.any(axis=0)
nan_columns_count = columns_with_nan.sum()
print(nan_columns_count)
nan_columns = filtered_EPA_df.columns[columns_with_nan].tolist()
print(f"Columns with any NaN values: {nan_columns}")


0
0
Columns with any NaN values: []


Next, normalize the data and perform clustering (possibly with PCA beforehand). The cleaned data is exported at the end of this notebook for that analysis.

In [36]:
# Show how many rows survived filtering, followed by the (truncated) frame itself.
remaining_rows = len(filtered_EPA_df)
print(remaining_rows, filtered_EPA_df, sep="\n")

6222
      TRACTCE  COUNTYFP  BLKGRPCE  NatWalkInd  Prop_land_unprotected  \
0       25900        81         2   17.333333               1.000000   
1       16600        81         2   14.333333               1.000000   
2      122702        81         1    9.666667               1.000000   
3       28300        81         5   10.833333               1.000000   
4      148300        81         4   15.333333               0.973361   
...       ...       ...       ...         ...                    ...   
6488    22401         5         1   13.166667               0.920363   
6489    38900         5         1   13.666667               1.000000   
6490     7100         5         2   16.833333               1.000000   
6491    21100         5         3   12.666667               1.000000   
6492    19700         5         1   15.000000               1.000000   

        D4B025     D4A  D2B_E5MIXA  D2A_JPHH         D1D  D1C5_ENT  \
0     1.000000  242.74    0.754524  2.256790   85.297717  3.

In [39]:
# Persist the cleaned frame for downstream analysis; the row index is not written.
analysis_csv = 'github_data/EPA_df_for_analysis.csv'
filtered_EPA_df.to_csv(analysis_csv, index=False)