In [46]:
%reload_ext autoreload
%autoreload 2

from IPython.core.display import Markdown
from tqdm.auto import tqdm
import config
import docs
import pandas as pd
import requests

# Do not truncate long cell values when DataFrames are rendered.
pd.set_option('display.max_colwidth', None)

# Key of the indicator built by this notebook, and the settings loaded for it.
INDICATOR = 'beach_widths'
CONFIG = config.get_config(INDICATOR, '../config.toml')

# Backslashes are swapped for forward slashes before the path is turned into
# a Markdown link; heading and link are then shown one after the other.
raw_dir_path = str(CONFIG['raw_dir']).replace('\\', '/')
display(
    Markdown('## Raw data path'),
    Markdown(f"[{raw_dir_path}]({raw_dir_path})"),
)


## Raw data path

[C:/Users/tan/src/regional-pm-2023/data/raw/environment/beaches/beach_widths](C:/Users/tan/src/regional-pm-2023/data/raw/environment/beaches/beach_widths)

# Healthy Environment: Beaches

## Beach Widths

In [47]:
# `docs` is a project-local helper module. Judging by the rendered output,
# this call shows the indicator's description as recorded in the indicators
# workbook referenced by CONFIG['indicators_xlsx_path'].
docs.describe_indicator(
    indicator=INDICATOR,
    indicators_xlsx_path=CONFIG['indicators_xlsx_path'],
)

Average fall widths of beaches.

nan

In [48]:
# Show the output schema for this indicator. The rendered table lists each
# column with its display name, description and type.
docs.list_schema(
    indicator=INDICATOR,
    indicators_xlsx_path=CONFIG['indicators_xlsx_path'],
)

Unnamed: 0_level_0,name,description,type
column,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
year,Year,Year of record.,datetime
cell,Littoral Cell,Littoral Cell for record.,string
beach,Beach,Beach for record.,string
width,Width,"Average fall widths of beaches in feet, in a given year for a given beach.",float


In [49]:
# Show the data sources registered for this indicator (name, organization,
# active flag and notes, per the rendered table).
docs.list_sources(
    indicator=INDICATOR,
    indicators_xlsx_path=CONFIG['indicators_xlsx_path'],
)

Unnamed: 0_level_0,name,organization,active,notes
source,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
shoreline_monitoring,Shoreline Monitoring Program,SANDAG Shoreline Monitoring Program,True,Appendixes contain new as well as all historical data.


In [50]:
# Keep the update steps in a variable: the "Step N" sections further down
# echo individual rows with steps.loc[N].
steps = docs.list_update_steps(
    indicator=INDICATOR,
    indicators_xlsx_path=CONFIG['indicators_xlsx_path'],
)
steps

Unnamed: 0,step
0,Extract legacy data from legacy PM sheet.
1,Download Report data from new integrated report.
2,Extract/calculate new widths in San Diego region from report.


In [51]:
# Show free-form remarks recorded for this indicator (author and note, per
# the rendered table).
docs.list_remarks(
    indicator=INDICATOR,
    indicators_xlsx_path=CONFIG['indicators_xlsx_path'],
)

Unnamed: 0,author,note
0,TAN,Current SMEs are Keith Greer <Keith.Greer@sandag.org> and Courtney Pesce <Courtney.Pesce@sandag.org>


### Step 0: Extract legacy data

In [52]:
display(steps.loc[0])

# The legacy sheet block read here is 18 columns wide (A:R): two label
# columns followed by one column per year, 2005 through 2020.
legacy_columns = ['cell', 'beach'] + list(range(2005, 2021))

# Read data from historical sheet.
legacy_wide = pd.read_excel(
    CONFIG['legacy_xlsx_path'],
    sheet_name=CONFIG['legacy_sheet'],
    skiprows=4,
    nrows=12,
    usecols='A:R',
    header=None,
    names=legacy_columns,
)
# Excel's merged cells leave the littoral cell name on one row only;
# carry it forward onto the rows below it.
legacy_wide['cell'] = legacy_wide['cell'].ffill()

# One row per (cell, beach, year) instead of one column per year.
legacy_tall = legacy_wide.melt(
    id_vars=['cell', 'beach'],
    var_name='year',
    value_name='width',
)
# Year numbers become timestamps.
legacy_tall['year'] = pd.to_datetime(legacy_tall['year'], format='%Y')

# Indexing by the three key columns leaves `width` as the only data column.
old_data = legacy_tall.set_index(['year', 'cell', 'beach'])
old_data.tail(2)

step    Extract legacy data from legacy PM sheet.
Name: 0, dtype: object

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,width
year,cell,beach,Unnamed: 3_level_1
2020-01-01,Oceanside Littoral Cell,Carlsbad,119.9
2020-01-01,Oceanside Littoral Cell,Oceanside,248.0


### Step 1: Download new data

In [53]:
# Echo the update-step description that this section covers.
display(steps.loc[1])

step    Download Report data from new integrated report.
Name: 1, dtype: object

The latest Shoreline Monitoring reports can be found [here](https://www.sandag.org/projects-and-programs/environment/shoreline-management/monitoring-program).

* Appendix C contains the newest width data (as well as all previous data).
* `beach_transects.xlsx` is a manually built mapping of individual transects (sensor stations) to the beaches in the report. We were not able to find the historical mapping, so it had to be reverse engineered; the resulting numbers are very close to the legacy values (0-1 foot off for all beaches).

### Step 2: Calculate new widths

In [54]:
# Echo the update-step description that this section covers.
display(steps.loc[2])

step    Extract/calculate new widths in San Diego region from report.
Name: 2, dtype: object

In [55]:
# Mappings are needed to aggregate transect stations by beach.
beach_transects = pd.read_excel(CONFIG['raw_dir']/'beach_transects.xlsx')
# Only the final expression of a cell is rendered automatically, and this
# cell continues with a function definition, so the preview has to be
# displayed explicitly or it is silently discarded.
display(beach_transects.head(2))


def calculate_beach_widths(
    year: int,
    column: str,
    beach_transects: pd.DataFrame,
    source_path=None,
    sheet_name: str = 'MSL Beachwidths',
) -> pd.DataFrame:
    """Calculate average fall beach widths for a single year.

    Reads one width column from the shoreline report workbook, attaches each
    transect to its beach through ``beach_transects`` and averages the
    transect widths per beach.

    Args:
        year: Year the selected column represents. It is stored as a
            January 1 timestamp in the ``year`` index level.
        column: Excel column letter(s) holding that year's widths
            (for example ``'BE'``).
        beach_transects: Transect-to-beach mapping. It is joined on a
            ``transect`` column (matched against the first 7 characters of
            the workbook's transect label) and must provide a ``beach``
            column.
        source_path: Workbook to read. When omitted, falls back to
            ``CONFIG['raw_dir'] / 'Appendix C-MSL Shorelines.xlsx'``.
        sheet_name: Worksheet that holds the widths.

    Returns:
        A DataFrame indexed by ``(year, cell, beach)`` with one ``width``
        column: the mean transect width per beach, rounded to one decimal.
        Groups keep their first-seen order (``sort=False``).

    Note:
        The merge is a left join, so a transect without an entry in
        ``beach_transects`` gets a missing ``beach``; with groupby's default
        handling of missing keys such rows do not contribute to any beach.
    """
    if source_path is None:
        # Resolved at call time so the notebook-level CONFIG is only needed
        # when the caller does not pass an explicit workbook.
        source_path = CONFIG['raw_dir']/'Appendix C-MSL Shorelines.xlsx'

    widths = (
        pd.read_excel(
            source_path,
            sheet_name=sheet_name,
            skiprows=6,
            header=None,
            usecols=f'A,B,{column}',
            names=['cell', 'transect', 'width']
        )
        # transect code is first 7 characters.
        .assign(transect=lambda df: df.transect.str[0:7])
        # Fill in gaps from Excel's merged cell.
        .assign(cell=lambda df: df.cell.ffill())
        # Stamp every row with the requested year, then convert to datetime.
        .assign(year=year)
        .assign(year=lambda df: pd.to_datetime(df.year, format='%Y'))
    )
    return (
        widths
        .merge(beach_transects, how='left', on='transect')
        .drop(columns='transect')
        .groupby(by=['year', 'cell', 'beach'], sort=False)
        [['width']]
        .mean()
        .round(1)
    )

Calculate the 2020 data and compare it with the values in the legacy PM XLSX.

They are slightly off, but never by more than a foot. This is the closest match to past values I have been able to achieve, given that we were unable to locate the original transect mapping.

In [56]:
# Recalculate a year that also exists in the legacy sheet so the transect
# mapping can be checked. 'BE' is presumably the workbook column holding the
# 2020 widths; confirm against the report.
beach_widths_2020 = calculate_beach_widths(
    year=2020,
    column='BE',
    beach_transects=beach_transects,
)

# Legacy minus recalculated width, aligned on the (year, cell, beach) index.
width_diff_2020 = old_data.loc['2020', 'width'] - beach_widths_2020.width

# Enforce the "no more than a foot off" expectation instead of checking it by
# eye. The difference is rounded first so float noise cannot trip the check.
# A beach present on only one side yields NaN, which also fails it.
within_tolerance = width_diff_2020.round(1).abs().le(1.0)
assert within_tolerance.all(), (
    'Recalculated 2020 widths differ from the legacy sheet by more than 1 ft:\n'
    f'{width_diff_2020[~within_tolerance]}'
)

width_diff_2020

  old_data.loc['2020', 'width'] - beach_widths_2020.width


year        cell                         beach                    
2020-01-01  Silver Strand Littoral Cell  Imperial Beach               0.0
                                         Silver Strand State Beach    0.5
                                         Coronado                     0.0
            Mission Beach Littoral Cell  Ocean Beach                  0.0
                                         Pacific/ Mission Beaches     0.0
            Oceanside Littoral Cell      La Jolla                     0.2
                                         San Diego                    0.0
                                         Del Mar                      0.5
                                         Solana Beach                 0.0
                                         Encinitas                    0.0
                                         Carlsbad                     0.9
                                         Oceanside                    0.0
Name: width, dtype: float64

In [57]:
# Widths for the year being added to the series. 'BG' is presumably the
# workbook column holding the 2021 values; confirm against the report when
# updating for a new year.
new_data = calculate_beach_widths(
    year=2021,
    column='BG',
    beach_transects=beach_transects,
)


In [58]:
# Stack the legacy series on top of the newly calculated year.
# verify_integrity makes pandas raise if a (year, cell, beach) key would
# appear more than once, e.g. when the legacy sheet already contains the
# year that was just recalculated. Without it such rows would be duplicated
# silently in the saved CSV.
beach_widths = pd.concat(
    [
        old_data,
        new_data,
    ],
    verify_integrity=True,
)
beach_widths

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,width
year,cell,beach,Unnamed: 3_level_1
2005-01-01,Silver Strand Littoral Cell,Imperial Beach,114.50
2005-01-01,Silver Strand Littoral Cell,Silver Strand State Beach,438.50
2005-01-01,Silver Strand Littoral Cell,Coronado,737.00
2005-01-01,Mission Beach Littoral Cell,Ocean Beach,225.00
2005-01-01,Mission Beach Littoral Cell,Pacific/ Mission Beaches,240.75
...,...,...,...
2021-01-01,Oceanside Littoral Cell,Del Mar,143.50
2021-01-01,Oceanside Littoral Cell,Solana Beach,204.00
2021-01-01,Oceanside Littoral Cell,Encinitas,135.80
2021-01-01,Oceanside Littoral Cell,Carlsbad,114.90


### Save Data

In [59]:
# Backslashes are swapped for forward slashes before the path is turned into
# a Markdown link; heading and link are then shown one after the other.
clean_dir_path = str(CONFIG['clean_dir']).replace('\\', '/')
display(
    Markdown('#### Clean data path'),
    Markdown(f"[{clean_dir_path}]({clean_dir_path})"),
)

#### Clean data path

[C:/Users/tan/src/regional-pm-2023/data/clean/environment/beaches/beach_widths](C:/Users/tan/src/regional-pm-2023/data/clean/environment/beaches/beach_widths)

In [60]:
# Write the combined series, index levels included, to the clean directory.
clean_csv_path = CONFIG['clean_dir'] / f'{INDICATOR}_odp.csv'
beach_widths.to_csv(clean_csv_path)