## Timezone & datum conversion

Function of script:
* Generic preprocessing that is not specific to the project (processing that needs practitioner input, such as removing years with too little data, is done in the prep_ts step).
* Convert to UTC and CGVD2013

Context in workflow:
* Must be run after merge and before proc

In [None]:
import os
import re

import pandas as pd
from dask import compute, delayed
import pytz
from pathlib import Path

# Number of decimal places kept when rounding the converted water levels.
dec = 3

# Directory layout, relative to the working directory.
DATA = Path("data")
INPUTS = DATA / "inputs"
OUTPUTS = DATA / "outputs"

stnlist_csv = INPUTS / "metadata.csv"   # station metadata table
input_dir = OUTPUTS / "wl_merged"       # files read by this step
output_dir = OUTPUTS / "wl_preproc"     # files written by this step

# Create the output directory (and any missing parents). exist_ok=True avoids
# the check-then-create race of testing for existence first.
output_dir.mkdir(parents=True, exist_ok=True)

# Station metadata; the code below reads its 'stn_num', 'lcl_stnd_tz' and
# 'cd_in_cg13' columns.
stnlist = pd.read_csv(stnlist_csv, encoding='latin1')

def get_tz_from_str(tz_str):
    """Convert a fixed-offset string such as 'UTC-4' or 'UTC+5:30' to a pytz timezone.

    Args:
        tz_str: Timezone string of the form 'UTC', 'UTC<sign>H' or
            'UTC<sign>H:MM'. A missing sign means a positive offset.

    Returns:
        A pytz fixed-offset timezone, or None when ``tz_str`` does not start
        with 'UTC'.

    Raises:
        ValueError: If the text after 'UTC' is not a valid offset.
    """
    if not tz_str.startswith("UTC"):
        return None

    offset = tz_str[3:].strip()
    if not offset:
        # A bare 'UTC' carries no offset at all.
        return pytz.FixedOffset(0)

    match = re.fullmatch(r'([+-]?)(\d{1,2})(?::(\d{1,2}))?', offset)
    if match is None:
        raise ValueError(f"Invalid UTC offset in timezone string: {tz_str!r}")

    sign, hours, minutes = match.groups()
    # The sign applies to the whole offset: minutes move further from UTC in
    # the same direction as the hours (UTC-3:30 -> -210, UTC+5:30 -> +330).
    total_minutes = int(hours) * 60 + int(minutes or 0)
    return pytz.FixedOffset(-total_minutes if sign == '-' else total_minutes)

@delayed
def convert_file(filename):
    """Convert one merged water-level file to UTC timestamps and CGVD2013 levels.

    Reads ``filename`` from ``input_dir`` (whitespace-separated ``date time wl``
    rows after a 28-line header), interprets the timestamps in the offset given
    by the station's ``lcl_stnd_tz`` entry in ``stnlist``, converts them to UTC,
    adds the station's ``cd_in_cg13`` value to the water levels, and writes a
    CSV with the columns ``DateTime_utc`` and ``wl_CGVD2013`` to ``output_dir``.

    Args:
        filename: Name of the input file. It must contain the 5-digit station
            number followed by an underscore; a ``.DAT`` extension is replaced
            by ``.csv`` in the output name.

    Raises:
        FileNotFoundError: If the input file does not exist.
        ValueError: If no station number can be found in ``filename`` or the
            station's timezone string cannot be interpreted.
        IndexError: If the station is not listed in ``stnlist``.
    """
    filepath_in = os.path.join(input_dir, filename)
    filepath_out = os.path.join(output_dir, filename.replace(".DAT", ".csv"))

    if not os.path.exists(filepath_in):
        raise FileNotFoundError(f"The file '{filepath_in}' does not exist.")

    # Resolve the station metadata once, failing with a clear message instead
    # of an opaque AttributeError / IndexError further down.
    stn_match = re.search(r'(\d{5})_', filename)
    if stn_match is None:
        raise ValueError(f"Cannot find a 5-digit station number in '{filename}'.")
    stn_num = int(stn_match.group(1))

    stn_rows = stnlist[stnlist['stn_num'] == stn_num]
    if stn_rows.empty:
        raise IndexError(f"Station {stn_num} is not listed in the station metadata.")
    tz_str = stn_rows['lcl_stnd_tz'].values[0]
    cd_cgvd2013_factor = stn_rows['cd_in_cg13'].values[0]

    # A missing metadata cell is read as NaN (a float), not a string.
    tz = get_tz_from_str(tz_str) if isinstance(tz_str, str) else None
    if tz is None:
        raise ValueError(f"Unsupported timezone {tz_str!r} for station {stn_num}.")

    df = pd.read_csv(filepath_in, skiprows=28, sep=r'\s+', names=['date', 'time', 'wl'],
                     dtype={'wl': 'float64'},
                     encoding='latin-1')
    df['datetime'] = pd.to_datetime(df['date'] + ' ' + df['time'],
                                    format='%Y/%m/%d %H:%M', errors='coerce')
    # Drop rows with an unparseable timestamp or a missing value.
    df = df.dropna()

    ## Step 1 - Convert to UTC
    # Vectorised localisation; a fixed offset has no ambiguous or missing times.
    df['DateTime_utc'] = df['datetime'].dt.tz_localize(tz).dt.tz_convert(pytz.utc)

    ## Step 2 - Convert to CGVD2013
    # The 'cd_in_cg13' value is added to the water level; where that value is
    # negative this is effectively a subtraction.
    # Levels equal to 999.999 are passed through unchanged (this looks like the
    # missing-data flag - TODO confirm). A small tolerance guards against
    # float round-off when the file is parsed.
    is_flag = (df['wl'] - 999.999).abs() < 1e-6
    df['wl_CGVD2013'] = (df['wl'] + cd_cgvd2013_factor).round(dec).mask(is_flag, 999.999)

    df.drop(columns=['datetime', 'time', 'date', 'wl'], inplace=True)
    df.to_csv(filepath_out, index=False)

# Zero-padded, 5-character station IDs taken from the metadata table.
stations_todo = [str(stn_num).zfill(5) for stn_num in stnlist['stn_num'].values]
print(f'station_todo: {len(stations_todo)}')

# One hourly input file is expected per station.
filenames = [station + '_HOURLY.DAT' for station in stations_todo]

# Queue one delayed conversion per file, then run them all through dask.
delayed_results = list(map(convert_file, filenames))
computed_results = compute(*delayed_results)