A pipeline that aggregates census-tract-level social vulnerability data to the county level and re-evaluates the thresholds used to identify disadvantaged counties.

Dataset: https://screeningtool.geoplatform.gov/en/methodology#3/33.47/-97.5

In [None]:
import math
from pathlib import Path
import os

import pandas as pd
import numpy as numpy
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import ruptures as rpt
import pandas as pd
import vaex as vx
import geopandas as gpd

# Capture seaborn's current palette first, then switch the default to "Set2".
color_pal = sns.color_palette()
sns.set_palette("Set2")

# Filesystem roots for this environment. Alternatives kept for reference:
#   HOME = Path(os.environ['HOME'])
#   HOME = Path("/mnt/c/Users/isabe")
#   PROJECT = HOME / "Documents/repos/eagle-comp"
HOME = Path("/notebooks")
PROJECT = HOME.joinpath("eagle-comp")
DATA = PROJECT.joinpath("data")
EAGLE_DATA = Path("/datasets/eagle-comp")

In [None]:
# Tract-level table; the first CSV column becomes the row index.
communities_csv = DATA / "social-vulnerability" / "1.0-communities.csv"
communities = pd.read_csv(communities_csv, index_col=0)

  communities = pd.read_csv(DATA / "social-vulnerability/1.0-communities.csv", index_col=0)


In [None]:
def agg_to_county(df, population_col):
    """Aggregate tract-level rows to counties as a population-weighted mean.

    The county key is the row index floor-divided by 1_000_000. This assumes
    the index is an integer tract identifier whose leading digits identify
    the county -- TODO confirm against the input file.

    Every column (including ``population_col`` itself) is averaged using
    ``population_col`` as the weight. Missing values in a column are treated
    as 0 but still contribute their population to the denominator, so they
    pull the county value towards 0.

    Args:
        df: Tract-level frame with numeric (or boolean) columns. It is not
            modified.
        population_col: Name of the column in ``df`` holding the weights.

    Returns:
        A frame indexed by ``county_fips`` with one row per county and the
        columns of ``df`` (minus any pre-existing ``county_fips`` column).
        Counties whose summed population is 0 get NaN in every column.
    """
    # Plain array of group keys: nothing is written back into the caller's frame.
    county_keys = (df.index // 1_000_000).to_numpy()
    weights = df[population_col]

    # A ``county_fips`` column may exist if the frame was produced by an older
    # version of this function; it is a key, not a value to be averaged.
    values = df.drop(columns="county_fips", errors="ignore")
    # Cast before filling so boolean/object flag columns are handled as numbers.
    values = values.astype(float).fillna(0.0)

    weighted_sums = values.mul(weights, axis=0).groupby(county_keys).sum()
    total_weights = weights.groupby(county_keys).sum()

    # pandas division yields NaN for 0/0 without emitting a RuntimeWarning.
    county_means = weighted_sums.div(total_weights, axis=0)
    return county_means.rename_axis("county_fips")

In [None]:
# Column names of the tract-level table, grouped by burden category.

climate_change = [
    "Expected agricultural loss rate (Natural Hazards Risk Index)",
    "Expected building loss rate (Natural Hazards Risk Index)",
    "Expected population loss rate (Natural Hazards Risk Index)",
    "Share of properties at risk of flood in 30 years",
    "Share of properties at risk of fire in 30 years",
]

energy = [
    "Energy burden",
    "PM2.5 in the air",
]

health = [
    "Current asthma among adults aged greater than or equal to 18 years",
    "Diagnosed diabetes among adults aged greater than or equal to 18 years",
    "Coronary heart disease among adults aged greater than or equal to 18 years",
    "Low life expectancy (percentile)",
]

# Not included:
#   "Housing burden (percent)"
#   "Greater than or equal to the 90th percentile for share of the tract's land area that is covered by impervious surface or cropland as a percent and is low income?"
housing = [
    "Tract experienced historic underinvestment",
    "Median value ($) of owner-occupied housing units",
    "Share of the tract's land area that is covered by impervious surface or cropland as a percent",
    "Share of homes with no kitchen or indoor plumbing (percent)",
    "Percent pre-1960s housing (lead paint indicator)",
]

legacy_pollution = [
    "Is there at least one abandoned mine in this census tract, where missing data is treated as False?",
    "Is there at least one Formerly Used Defense Site (FUDS) in the tract, where missing data is treated as False?",
    "Proximity to hazardous waste sites",
    "Proximity to NPL (Superfund) sites",
    "Proximity to Risk Management Plan (RMP) facilities",
]

transportation = [
    "Diesel particulate matter exposure",
    "DOT Travel Barriers Score (percentile)",
    "Traffic proximity and volume",
]

water_and_wastewater = [
    "Leaky underground storage tanks",
    "Wastewater discharge",
]

# Not included:
#   "Low median household income as a percent of area median income"
workforce_development = [
    "Linguistic isolation (percent)",
    "Median household income as a percent of area median income",
    "Percent of individuals < 100% Federal Poverty Line",
    "Unemployment (percent)",
]

low_income = ["Percent of individuals below 200% Federal Poverty Line"]

high_school_education = ["Percent individuals age 25 or over with less than high school degree"]

total_population = ["Total population"]

# Every column carried into the county aggregation, in category order.
features = [
    *climate_change,
    *energy,
    *health,
    *housing,
    *legacy_pollution,
    *transportation,
    *water_and_wastewater,
    *workforce_development,
    *low_income,
    *high_school_education,
    *total_population,
]

In [None]:
# Work on an explicit copy so the column assignments below never target a
# slice of `communities`.
filtered = communities.loc[:, features].copy()

underinvestment_col = "Tract experienced historic underinvestment"
travel_barriers_col = "DOT Travel Barriers Score (percentile)"

# Missing underinvestment flags count as False; any other value keeps its
# truthiness. The whole column is replaced (instead of `.loc[:, col] = ...`)
# because recent pandas versions perform `.loc[:, col]` assignment in place and
# may keep the old dtype, which would leave this column out of the bool -> int
# conversion below.
underinvestment = filtered[underinvestment_col]
filtered[underinvestment_col] = underinvestment.notna() & underinvestment.astype(bool)

# Turn the travel-barriers percentile into a flag: strictly above 90.
filtered[travel_barriers_col] = filtered[travel_barriers_col] > 90

# Boolean flags become 0/1 so their weighted mean is a population share.
bool_columns = filtered.columns[filtered.dtypes == bool]
filtered[bool_columns] = filtered[bool_columns].astype(int)

socio_counties = agg_to_county(filtered, "Total population")

  filtered.loc[:, "Tract experienced historic underinvestment"] = filtered["Tract experienced historic underinvestment"].astype(bool)
  return (col * populations).sum() / populations.sum()
  return (col * populations).sum() / populations.sum()
  return (col * populations).sum() / populations.sum()
  return (col * populations).sum() / populations.sum()
  return (col * populations).sum() / populations.sum()
  return (col * populations).sum() / populations.sum()
  return (col * populations).sum() / populations.sum()
  return (col * populations).sum() / populations.sum()
  return (col * populations).sum() / populations.sum()
  return (col * populations).sum() / populations.sum()
  return (col * populations).sum() / populations.sum()
  return (col * populations).sum() / populations.sum()
  return (col * populations).sum() / populations.sum()
  return (col * populations).sum() / populations.sum()
  return (col * populations).sum() / populations.sum()
  return (col * populations).sum() / popu

In [None]:
from scipy.stats import percentileofscore


def _to_percentiles(frame):
    """Return, column by column, each value's percentile within its own column."""
    return frame.apply(
        lambda column: column.apply(lambda value: percentileofscore(column, value))
    )


def _flagged_counties(burden_percentiles, gate, burden_cutoff=90):
    """List the index labels that exceed a burden cutoff and pass the gate.

    Args:
        burden_percentiles: Frame of percentiles, one column per indicator.
        gate: Boolean frame on the same index; a row passes when all of its
            columns are True.
        burden_cutoff: A row is burdened when any indicator is strictly above
            this percentile.

    Returns:
        Index labels, in index order, where both conditions hold.
    """
    exceeds_burden = (burden_percentiles > burden_cutoff).any(axis=1)
    passes_gate = gate.all(axis=1)
    flagged = exceeds_burden & passes_gate
    return flagged[flagged].index.tolist()


# Socioeconomic gate shared by the housing, transportation and energy categories.
percentile_low_income = _to_percentiles(socio_counties[low_income])
is_low_income = percentile_low_income >= 65

percentile_county_housing = _to_percentiles(socio_counties[housing])
disadvantaged_housing = _flagged_counties(percentile_county_housing, is_low_income)

percentile_county_transportation = _to_percentiles(socio_counties[transportation])
disadvantaged_transportation = _flagged_counties(percentile_county_transportation, is_low_income)

percentile_workforce_development = _to_percentiles(socio_counties[workforce_development])
high_school_attainment = socio_counties[high_school_education]
# NOTE(review): this compares the raw aggregated value (not a percentile) with
# `< 10`. Confirm the column's units and the direction of the comparison.
disadvantaged_workforce_development = _flagged_counties(
    percentile_workforce_development, high_school_attainment < 10
)

# Previously this category skipped the final transpose the others applied, so
# `dropna()` discarded the single remaining row and the list came out empty
# unless every county qualified. It now uses the same selection as the rest.
percentile_energy = _to_percentiles(socio_counties[energy])
disadvantaged_energy = _flagged_counties(percentile_energy, is_low_income)