In [241]:
%reload_ext autoreload
%autoreload 2

from IPython.core.display import Markdown
from tqdm.auto import tqdm
import config
import docs
import geopandas as gpd
import pandas as pd
import requests
from datetime import datetime
import numpy as np

# Never truncate long cell values when frames are rendered.
pd.set_option('display.max_colwidth', None)

# Indicator key and its configuration, looked up in the project-level TOML.
INDICATOR = 'enviroscreen_score'
CONFIG = config.get_config(INDICATOR, '../config.toml')

# Normalize Windows separators so the path renders as a usable Markdown link.
raw_dir_path = str(CONFIG['raw_dir']).replace('\\', '/')
display(
    Markdown('## Raw data path'),
    Markdown(f"[{raw_dir_path}]({raw_dir_path})"),
)

## Raw data path

[C:/Users/tan/src/regional-pm-2023/data/raw/qol/social_equity/enviroscreen_score](C:/Users/tan/src/regional-pm-2023/data/raw/qol/social_equity/enviroscreen_score)

# Quality of Life: Social Equity

## Social Equity Score

In [242]:
# Render this indicator's description via the project `docs` helper.
# NOTE(review): presumably read from the workbook at `indicators_xlsx_path`;
# the trailing `nan` in the rendered output suggests an empty field there.
docs.describe_indicator(
    indicator=INDICATOR,
    indicators_xlsx_path=CONFIG['indicators_xlsx_path'],
)

CalEnviroScreen social equity score and population in top percentiles.

nan

In [243]:
# List the output columns (name, description, type) documented for this
# indicator; the frame saved at the end of the notebook should match them.
docs.list_schema(
    indicator=INDICATOR,
    indicators_xlsx_path=CONFIG['indicators_xlsx_path'],
)

Unnamed: 0_level_0,name,description,type
column,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
year,Year,Year of record.,datetime
region_score,Region Score,"CalEnviroScreen social equity score, in a given year.",float
population_25th,Population above 25 Percentile,"Percentage of residents above 25th percentile of score, in a given year.",float
population_50th,Population above 50 Percentile,"Percentage of residents above 50th percentile of score, in a given year.",float


In [244]:
# List the data sources registered for this indicator.
docs.list_sources(
    indicator=INDICATOR,
    indicators_xlsx_path=CONFIG['indicators_xlsx_path'],
)

Unnamed: 0_level_0,name,organization,active,notes
source,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
enviroscreen,CalEnviroscreen,California Office of Environment Health Hazard Assessment,True,


In [245]:
# Fetch the documented update steps; `steps` is kept because later cells
# display individual rows with `steps.loc[i]`.
steps = docs.list_update_steps(
    indicator=INDICATOR,
    indicators_xlsx_path=CONFIG['indicators_xlsx_path'],
)
steps

Unnamed: 0,step
0,Download data from availible releases.
1,Calculate enviroscreen score and percentile populations from availible releases.


In [246]:
# List any author remarks recorded for this indicator.
docs.list_remarks(
    indicator=INDICATOR,
    indicators_xlsx_path=CONFIG['indicators_xlsx_path'],
)

Unnamed: 0,author,note
0,,


### Step 0: Download available releases
I was able to find and download data for CalEnviroscreen versions [4](https://oehha.ca.gov/calenviroscreen/report/calenviroscreen-40), [3](https://oehha.ca.gov/calenviroscreen/report/calenviroscreen-30), and [2](https://oehha.ca.gov/calenviroscreen/report/calenviroscreen-version-20) (I did not find anything archived for the first release).

In [247]:
# Show the text of update step 0 (row 0 of the `steps` table).
display(steps.loc[0])

step    Download data from availible releases.
Name: 0, dtype: object

### Step 1: Calculate scores

The region score is the population-weighted average of the individual tract scores.

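The populations at the 25th and 50th percentiles are the summed populations of the tracts with the worst 25% and worst 50% of social equity scores (CalEnviroScreen percentile of at least 75 and at least 50, respectively), each expressed as a share of the total population of all tracts included for that year.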

In [248]:
# Show the text of update step 1 (row 1 of the `steps` table).
display(steps.loc[1])

step    Calculate enviroscreen score and percentile populations from availible releases.
Name: 1, dtype: object

In [249]:
# CalEnviroScreen 4.0 results, labelled with the year 2021.
# Keeps San Diego County tracts that have both a score and a percentile, and
# flags tracts whose percentile is at least 75 / at least 50.
version_4 = (
    pd.read_excel(
        CONFIG['raw_dir']
        / (
            'calenviroscreen/'
            '4/'
            'calenviroscreen40resultsdatadictionaryf2021/'
            'calenviroscreen40resultsdatadictionary_F_2021.xlsx'
        )
    )
    # Copy the release-specific headers into the common column names shared
    # by every release in this notebook.
    .assign(
        tract=lambda df: df['Census Tract'],
        county=lambda df: df['California County'],
        population=lambda df: df['Total Population'],
        score=lambda df: df['CES 4.0 Score'],
        percentile=lambda df: df['CES 4.0 Percentile'],
        year=datetime(2021, 1, 1),
    )
    # Rows missing either value cannot be scored or ranked.
    .dropna(subset=['score', 'percentile'])
    [
        [
            'tract',
            'county',
            'population',
            'score',
            'percentile',
            'year',
        ]
    ]
    .query("`county` == 'San Diego'")
    # Vectorized comparisons; percentiles are non-null at this point, so the
    # result is a plain boolean column.
    .assign(
        pop_top_25_pct=lambda df: df['percentile'] >= 75,
        pop_top_50_pct=lambda df: df['percentile'] >= 50,
    )
    .set_index(['year', 'tract'])
)
version_4.tail(3)

Unnamed: 0_level_0,Unnamed: 1_level_0,county,population,score,percentile,pop_top_25_pct,pop_top_50_pct
year,tract,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2021-01-01,6073021800,San Diego,2403,2.936723,0.655572,False,False
2021-01-01,6073017306,San Diego,2439,2.885369,0.605144,False,False
2021-01-01,6073008311,San Diego,2936,1.67119,0.126072,False,False


In [250]:
# CalEnviroScreen 3.0 results (June 2018 update), labelled with the year 2018.
# Keeps San Diego County tracts that have both a score and a percentile, and
# flags tracts whose percentile is at least 75 / at least 50.
version_3 = (
    pd.read_csv(
        CONFIG['raw_dir']
        / (
            'calenviroscreen/'
            '3/'
            'calenviroscreen-3.0-results-june-2018-update.csv'
        )
    )
    # Copy the release-specific headers into the common column names shared
    # by every release in this notebook.
    .assign(
        tract=lambda df: df['Census Tract'],
        county=lambda df: df['California County'],
        population=lambda df: df['Total Population'],
        score=lambda df: df['CES 3.0 Score'],
        # The leading space is part of the header name used for this file.
        percentile=lambda df: df[' CES 3.0 Percentile'],
        year=datetime(2018, 1, 1),
    )
    # Rows missing either value cannot be scored or ranked.
    .dropna(subset=['score', 'percentile'])
    [
        [
            'tract',
            'county',
            'population',
            'score',
            'percentile',
            'year',
        ]
    ]
    .query("`county` == 'San Diego'")
    # Vectorized comparisons; percentiles are non-null at this point, so the
    # result is a plain boolean column.
    .assign(
        pop_top_25_pct=lambda df: df['percentile'] >= 75,
        pop_top_50_pct=lambda df: df['percentile'] >= 50,
    )
    .set_index(['year', 'tract'])
)
version_3.tail(3)

Unnamed: 0_level_0,Unnamed: 1_level_0,county,population,score,percentile,pop_top_25_pct,pop_top_50_pct
year,tract,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2018-01-01,6073008324,San Diego,6600,2.11,0.37,False,False
2018-01-01,6073008331,San Diego,2466,1.72,0.19,False,False
2018-01-01,6073008102,San Diego,3395,1.09,0.04,False,False


In [251]:
def parse_percentile_range(df: pd.DataFrame) -> pd.Series:
    """Return the midpoint of each EnviroScreen percentile-range label.

    Labels look like ``'11-15%'`` or ``'96-100% (highest scores)'``; the two
    integer bounds are extracted and averaged (``'11-15%'`` -> ``13.0``).

    Parameters
    ----------
    df : pd.DataFrame
        Frame with a ``percentile`` column holding the range labels.

    Returns
    -------
    pd.Series
        Midpoints aligned to ``df.index``. Any value that is missing or is
        not a ``low-high`` label yields ``<NA>``.
    """
    # Work on a string view so missing values and stray non-string values are
    # handled uniformly: they fail to match below instead of being coerced.
    labels = df['percentile'].astype('string')
    # Capture the two integer bounds explicitly rather than stripping a
    # character set off the label's tail.
    bounds = labels.str.extract(r'^\s*(\d+)\s*-\s*(\d+)')
    low = pd.to_numeric(bounds[0], errors='coerce').astype('Int64')
    high = pd.to_numeric(bounds[1], errors='coerce').astype('Int64')
    return (low + high) / 2

In [252]:
# CalEnviroScreen 2.0 results (October 2014 update), labelled with the year
# 2014. This release only carries a percentile *range* per tract, so the
# midpoint of the range stands in for the percentile.
# NOTE(review): the rendered `tract` values are floats here but integers in
# the other releases - confirm whether the index should be cast.
version_2 = (
    gpd.read_file(
        CONFIG['raw_dir']
        / (
            'calenviroscreen/'
            '2/'
            'CES20_UpdateOct2014.gdb'
        )
    )
    [
        [
            'Tract_1',
            'County',
            'Population',
            'CESScore',
            'PercentileRange',
        ]
    ]
    .rename(
        columns={
            'Tract_1': 'tract',
            'County': 'county',
            'Population': 'population',
            'CESScore': 'score',
            'PercentileRange': 'percentile',
        }
    )
    # No point percentile is available, only a range; use its midpoint.
    .assign(percentile=parse_percentile_range)
    # Match the other releases: tracts without a score or percentile are
    # excluded up front, so they do not inflate the population denominator.
    .dropna(subset=['score', 'percentile'])
    .query("`county` == 'San Diego'")
    .assign(year=datetime(2014, 1, 1))
    # Vectorized flags. The midpoint may be a nullable float, so coerce the
    # comparison result to a plain boolean column.
    .assign(
        pop_top_25_pct=lambda df: (
            df['percentile'].ge(75).fillna(False).astype(bool)
        ),
        pop_top_50_pct=lambda df: (
            df['percentile'].ge(50).fillna(False).astype(bool)
        ),
    )
    .set_index(['year', 'tract'])
)
version_2.tail(3)

Unnamed: 0_level_0,Unnamed: 1_level_0,county,population,score,percentile,pop_top_25_pct,pop_top_50_pct
year,tract,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2014-01-01,6073022000.0,San Diego,2022,11.029751,13.0,False,False
2014-01-01,6073022000.0,San Diego,3391,17.486004,33.0,False,False
2014-01-01,6073021000.0,San Diego,7225,12.175017,18.0,False,False


In [253]:
# Stack the three releases into one tract-level frame indexed by
# (year, tract); the aggregation cells below all read from `df`.
df = pd.concat([version_4, version_3, version_2])
df.tail(3)

Unnamed: 0_level_0,Unnamed: 1_level_0,county,population,score,percentile,pop_top_25_pct,pop_top_50_pct
year,tract,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2014-01-01,6073022000.0,San Diego,2022,11.029751,13.0,False,False
2014-01-01,6073022000.0,San Diego,3391,17.486004,33.0,False,False
2014-01-01,6073021000.0,San Diego,7225,12.175017,18.0,False,False


In [254]:
def _population_weighted_score(tracts: pd.DataFrame) -> float:
    """Average the ``score`` column of one year's tracts, weighted by
    ``population``."""
    return np.average(tracts['score'], weights=tracts['population'])


# Regional score per year: population-weighted mean of the tract scores,
# ignoring tracts that have no score.
region_score = (
    df
    .dropna(subset='score')
    .groupby('year')
    .apply(_population_weighted_score)
    .sort_index()
)

In [255]:
# Population per year living in tracts flagged `pop_top_25_pct`
# (percentile at least 75).
pop_top_25_pct = (
    df
    .query('`pop_top_25_pct` == True')
    .groupby('year')
    ['population']
    .sum()
    # A year with no flagged tract would otherwise be absent and turn into
    # NaN when divided by the total population; report it as 0 instead.
    .reindex(df.index.unique(level='year'), fill_value=0)
    .sort_index()
)

In [256]:
# Population per year living in tracts flagged `pop_top_50_pct`
# (percentile at least 50).
pop_top_50_pct = (
    df
    .query('`pop_top_50_pct` == True')
    .groupby('year')
    ['population']
    .sum()
    # A year with no flagged tract would otherwise be absent and turn into
    # NaN when divided by the total population; report it as 0 instead.
    .reindex(df.index.unique(level='year'), fill_value=0)
    .sort_index()
)

In [257]:
# Total population per year across every tract in `df`; used as the
# denominator for the percentile population shares.
population = df['population'].groupby(level='year').sum()

In [258]:
# Assemble the indicator table, one row per year. The population columns are
# shares of the yearly total (fractions, not percentages), and every column
# is rounded to two decimals.
enviroscreen_score = pd.DataFrame(
    {
        'region_score': region_score,
        'population_25th': pop_top_25_pct.div(population),
        'population_50th': pop_top_50_pct.div(population),
    }
).round(2)
enviroscreen_score

Unnamed: 0_level_0,region_score,population_25th,population_50th
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2014-01-01,18.41,0.04,0.23
2018-01-01,18.98,0.05,0.23
2021-01-01,19.96,0.07,0.29


### Save Data

In [259]:
# Normalize Windows separators so the path renders as a usable Markdown link.
clean_dir_path = str(CONFIG['clean_dir']).replace('\\', '/')
display(
    Markdown('#### Clean data path'),
    Markdown(f"[{clean_dir_path}]({clean_dir_path})"),
)

#### Clean data path

[C:/Users/tan/src/regional-pm-2023/data/clean/qol/social_equity/enviroscreen_score](C:/Users/tan/src/regional-pm-2023/data/clean/qol/social_equity/enviroscreen_score)

In [260]:
# Write the indicator table to the clean-data directory; the `year` index is
# written as the first CSV column.
clean_csv_path = CONFIG['clean_dir'] / f'{INDICATOR}_odp.csv'
enviroscreen_score.to_csv(clean_csv_path)