In [15]:
# Re-import edited modules automatically before each cell runs, so changes to the
# local helper modules are picked up without restarting the kernel.
%reload_ext autoreload
%autoreload 2

from pathlib import Path

from IPython.core.display import Markdown
from tqdm.auto import tqdm
import config
import docs
import pandas as pd
import requests

# Do not truncate long cell contents when DataFrames are rendered.
pd.set_option('display.max_colwidth', None)

# Indicator key handed to the config and docs helpers throughout this notebook.
INDICATOR = 'impaired_waterbodies'
# Settings for this indicator; the cells below read the keys 'raw_dir',
# 'clean_dir', 'indicators_xlsx_path', 'legacy_xlsx_path' and 'legacy_sheet'.
CONFIG = config.get_config(INDICATOR, '../config.toml')

# Render the raw-data directory as a Markdown link. Backslashes are replaced with
# forward slashes so a Windows-style path still works as a link target.
display(Markdown('## Raw data path'))
raw_dir_path = str(CONFIG['raw_dir']).replace('\\', '/')
display(Markdown(f"[{raw_dir_path}]({raw_dir_path})"))


## Raw data path

[C:/Users/tan/src/regional-pm-2023/data/raw/environment/water_quality/impaired_waterbodies](C:/Users/tan/src/regional-pm-2023/data/raw/environment/water_quality/impaired_waterbodies)

# Healthy Environment: Water Quality

## Impaired Waterbodies

In [16]:
# Show the description recorded for this indicator.
# NOTE(review): the recorded output ends with a bare `nan` -- presumably an empty
# field in the indicators workbook; confirm and fill it in or suppress it.
docs.describe_indicator(
    indicator=INDICATOR,
    indicators_xlsx_path=CONFIG['indicators_xlsx_path'],
)

Area and length of impaired (polluted) waterbodies.

nan

In [17]:
# Show the documented output columns (name, description, type) for this indicator.
docs.list_schema(
    indicator=INDICATOR,
    indicators_xlsx_path=CONFIG['indicators_xlsx_path'],
)

Unnamed: 0_level_0,name,description,type
column,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Impaired Waterbodies,Year,Year of record.,datetime
miles_impaired,Miles Impaired,"Length in miles of linear waterbodies (rivers, coastlines, etc.) impaired (polluted) waterbodies, in a given year.",float
acres_impaired,Acres Impaired,"Area in acres of non-linear waterbodies (lakes, bays, etc.) of impaired (polluted) waterbodies, in a given year.",float


In [18]:
# Show the documented data sources for this indicator.
docs.list_sources(
    indicator=INDICATOR,
    indicators_xlsx_path=CONFIG['indicators_xlsx_path'],
)

Unnamed: 0_level_0,name,organization,active,notes
source,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
integrated_reports,CWB Integrated Report,California Water Resources Control Board,True,Reported every few years.


In [19]:
# Keep the update steps in `steps`; each step section below echoes its own row
# with `steps.loc[i]`.
steps = docs.list_update_steps(
    indicator=INDICATOR,
    indicators_xlsx_path=CONFIG['indicators_xlsx_path'],
)
steps

Unnamed: 0,step
0,Extract legacy data from legacy PM sheet.
1,Download Appendix data from new integrated report.
2,Extract/calculate new widths and areas in San Diego region from report.


In [20]:
# Show the remarks (author and note) recorded for this indicator.
docs.list_remarks(
    indicator=INDICATOR,
    indicators_xlsx_path=CONFIG['indicators_xlsx_path'],
)

Unnamed: 0,author,note
0,TAN,Current SME is Keith Greer <Keith.Greer@sandag.org>.


### Step 0: Extract legacy data

In [21]:
display(steps.loc[0])

# Legacy series: a fixed 4-row x 3-column window (columns A:C, after skipping the
# first 4 rows) of the legacy sheet. The window has no header row, so column names
# are supplied here. The year is parsed to a timestamp and used as the index.
# NOTE(review): nrows=4 is hard-coded -- confirm the legacy sheet has exactly four
# historical rows.
old_data = (
    pd.read_excel(
        CONFIG['legacy_xlsx_path'],
        sheet_name=CONFIG['legacy_sheet'],
        header=None,
        names=['year', 'miles_impaired', 'acres_impaired'],
        usecols='A:C',
        skiprows=4,
        nrows=4,
    )
    .assign(year=lambda legacy: pd.to_datetime(legacy['year'], format='%Y'))
    .set_index('year')
    .round(1)
)
old_data

Unnamed: 0_level_0,miles_impaired,acres_impaired
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2004-01-01,259.0,21282.0
2007-01-01,493.0,21578.0
2010-01-01,708.0,22758.0
2018-01-01,2650.7,21417.2


### Step 1: Download new data

In [29]:
# Echo update step 1 (from the `steps` table) as the heading for this section.
display(steps.loc[1])

step    Download Appendix data from new integrated report.
Name: 1, dtype: object

The newest (2022) integrated report can be downloaded [here](https://www.waterboards.ca.gov/water_issues/programs/water_quality_assessment/2020_2022_integrated_report.html). Note that report URLs do not follow a consistent pattern from one reporting cycle to the next, so the link for a past or future report has to be looked up individually.

* Appendix A contains new 2022 data.
* Appendix H contains 2018 data from the previous report (useful for checking the calculation).

### Step 2: Extract new data and calculate new measures

In [28]:
# Echo update step 2 (from the `steps` table) as the heading for this section.
display(steps.loc[2])

step    Extract/calculate new widths and areas in San Diego region from report.
Name: 2, dtype: object

In [22]:
def calculate_impaired_waterbodies(raw_data: pd.DataFrame) -> pd.DataFrame:
    """Total the impaired waterbody size per year and unit for San Diego county.

    Parameters
    ----------
    raw_data : pd.DataFrame
        Listing rows with the columns ``year`` (four-digit year), ``county``
        (comma-separated county names), ``category``, ``waterbody_id``,
        ``size_affected`` and ``unit``.

    Returns
    -------
    pd.DataFrame
        One row per year (timestamp index, 1 January of that year) and one
        column per unit found in the data. ``miles`` and ``acres`` are renamed
        to ``miles_impaired`` and ``acres_impaired``. Values are rounded to one
        decimal place.

    Notes
    -----
    Only rows whose category is ``5``, ``'4a'`` or ``'4b'`` are counted.
    NOTE(review): the match on ``5`` is numeric; a text cell ``'5'`` would not
    match -- confirm how the source workbooks store this column.
    """
    return (
        raw_data
        .assign(year=lambda df: pd.to_datetime(df.year, format='%Y'))
        # Split on the bare comma and strip afterwards, so both "A, B" and
        # "A,B" yield clean county names.
        .assign(county=lambda df: df.county.str.split(','))
        .explode(column='county')
        .assign(county=lambda df: df.county.str.strip())
        .query("`county` == 'San Diego'")
        .query("`category` in [5, '4a', '4b']")
        # Count each waterbody once *per year*. Presumably a waterbody is
        # repeated across several listing rows; the first row's size and unit
        # are kept. Including `year` in the key keeps multi-year input from
        # losing every year after a waterbody's first appearance.
        .drop_duplicates(['year', 'waterbody_id'])
        .groupby(by=['year', 'unit']).size_affected.sum()
        .reset_index()
        .pivot(index='year', columns='unit', values='size_affected')
        .rename(
            columns={
                'miles': 'miles_impaired',
                'acres': 'acres_impaired',
            }
        )
        # Drop the leftover "unit" label on the column axis.
        .rename_axis(None, axis=1)
        .round(1)
    )

In [23]:
# Validation run: recompute 2018 from the previous report's appendix so the result
# can be compared with the 2018 row of the legacy sheet.
# NOTE(review): in the recorded outputs acres agree (21417.2) but miles do not
# (2677.2 here vs 2650.7 in the legacy sheet) -- the difference is unexplained.
# The sheet name is spelled 'FInal' here; presumably that matches the workbook,
# so do not "correct" it without checking.
test_data = calculate_impaired_waterbodies(
    pd.read_excel(
        CONFIG['raw_dir'] / 'apx-h-2018-303d-list.xlsx',
        sheet_name='Proposed FInal 303(d) List',
        header=None,
        names=['waterbody_id', 'category', 'size_affected', 'unit', 'county'],
        usecols='D,F,G,H,L',
        skiprows=3,
    )
    .assign(year=2018)
)
test_data

Unnamed: 0_level_0,acres_impaired,miles_impaired
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2018-01-01,21417.2,2677.2


In [24]:
# New 2022 figures: read the same five columns (D, F, G, H, L) from the new
# report's appendix, tag every row with the report year, and aggregate.
new_data = calculate_impaired_waterbodies(
    pd.read_excel(
        CONFIG['raw_dir'] / 'apx-a-303d-list.xlsx',
        sheet_name='303(d)List',
        header=None,
        names=['waterbody_id', 'category', 'size_affected', 'unit', 'county'],
        usecols='D,F,G,H,L',
        skiprows=3,
    )
    .assign(year=2022)
)
new_data

Unnamed: 0_level_0,acres_impaired,miles_impaired
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2022-01-01,23696.4,864.5


In [25]:
# Append the newly computed row(s) to the legacy series. concat aligns on column
# names, so the different column order of the two frames does not matter.
impaired_waterbodies = pd.concat([old_data, new_data])
impaired_waterbodies

Unnamed: 0_level_0,miles_impaired,acres_impaired
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2004-01-01,259.0,21282.0
2007-01-01,493.0,21578.0
2010-01-01,708.0,22758.0
2018-01-01,2650.7,21417.2
2022-01-01,864.5,23696.4


### Save Data

In [26]:
# Render the clean-data directory as a Markdown link. Backslashes are replaced
# with forward slashes so a Windows-style path still works as a link target.
display(Markdown('#### Clean data path'))
clean_dir_path = str(CONFIG['clean_dir']).replace('\\', '/')
display(Markdown(f"[{clean_dir_path}]({clean_dir_path})"))

#### Clean data path

[C:/Users/tan/src/regional-pm-2023/data/clean/environment/water_quality/impaired_waterbodies](C:/Users/tan/src/regional-pm-2023/data/clean/environment/water_quality/impaired_waterbodies)

In [27]:
# Write the combined series (year index included) to
# <clean_dir>/<INDICATOR>_odp.csv.
# Create the output directory first: to_csv does not create missing parent
# directories, so a fresh checkout without the clean-data folder would fail here.
CONFIG['clean_dir'].mkdir(parents=True, exist_ok=True)
impaired_waterbodies.to_csv(
    CONFIG['clean_dir']
    / f'{INDICATOR}_odp.csv'
)