## Download latest WLs (real-time data from CHS API)

Function of script:
* Download real-time data between the start_yr and end_yr specified

Context in workflow:
* This script can be used in two ways:
    * to compare the historical and real-time datasets. For this, use an older start date (e.g., 2000).
    * to add recent years without overlap. For this, use a more recent start date (e.g., 2023).
* If you only want to process historical data, you can skip this step and the merge step and go directly from metadata to preproc, as long as you update the paths in the preproc step.

Notes: 
* I've already checked that the real-time stations match the historical stations in terms of timezone and datum (not shown).
* Some stations don't have real-time data. For these, no file is downloaded.
* Fulford Harbour (07330) does not have real-time data, so the script currently skips it explicitly.
* API docs:
    * https://tides.gc.ca/tides/node/215
    * https://api-iwls.dfo-mpo.gc.ca/swagger-ui/index.html

In [4]:
import os
import datetime as dt
import time
import json
import urllib.request
import pandas as pd
from pathlib import Path

# First and last year of real-time data to download.
start_yr = 2023
end_yr = 2025

# Root folder for all inputs and outputs of this step.
DATA = Path("data")

# Station list read by the download loop (uses the stn_num and stn_name columns).
stnlist_csv = DATA / "inputs" / "metadata.csv"
# Destination folder; one "<stn_num>_realtime_wl.csv" file is written per station.
output_dir = DATA / "outputs" / "wl_realtime"

# Create the output folder (and any missing parents) in a single call, so
# there is no window between an existence check and the creation.
output_dir.mkdir(parents=True, exist_ok=True)
    
def sid_from_code(code: str):
    """Return the station ID from the station code.

    Queries the ``stations`` endpoint of the IWLS API, filtered by ``code``,
    and returns the ``id`` field of the first record in the response.

    Parameters
    ----------
    code : str
        Station code, e.g., "08860"

    Returns
    -------
    The ``id`` value of the first station record returned by the API.

    Raises
    ------
    IndexError
        If the API returns an empty list, i.e. no station matches ``code``.
    """
    url = f"https://api-iwls.dfo-mpo.gc.ca/api/v1/stations?code={code}"
    with urllib.request.urlopen(url) as response:
        stations = json.loads(response.read().decode())
    if not stations:
        # Same exception type as indexing an empty list, but with a message
        # that says which station code could not be resolved.
        raise IndexError(f"No station found for code {code!r} ({url})")
    return stations[0]['id']

def real_time_data(
    code: str, start: dt.datetime, end: dt.datetime
) -> "pd.DataFrame | None":
    """Return hourly sea level time series from station data from the
    Integrated Water Level System of the Canadian Hydrographic Service.

    The period is requested in windows of at most one month. A window that
    fails to download is reported with ``print`` and skipped, so the result
    may contain gaps.

    Parameters
    ----------
    code : str
      Station code, e.g. "00490" for the Halifax station.
    start : dt.datetime
      Start date.
    end : dt.datetime
      End date.

    Returns
    -------
    pd.DataFrame or None
      Hourly water level time series with ``eventDate`` parsed to datetimes
      and the ``timeSeriesId`` column removed. ``None`` when no records were
      retrieved (empty period, no data for the station, or every request
      failed).

    Notes
    -----
    https://api-iwls.dfo-mpo.gc.ca/swagger-ui/index.html
    """
    if start >= end:
        # Empty window: nothing to request, so skip the station lookup too.
        return None

    api = "https://api-iwls.dfo-mpo.gc.ca/api/v1"
    sid = sid_from_code(code)
    raw = []
    s = start
    while s < end:
        # The API might grumble for long requests, so limit to one month at a time.
        e = min(s + pd.DateOffset(months=1), end)

        start_str = s.strftime('%Y-%m-%dT%H:%M:%SZ')
        end_str = e.strftime('%Y-%m-%dT%H:%M:%SZ')

        url = f"{api}/stations/{sid}/data?time-series-code=wlo&from={start_str}&to={end_str}&resolution=SIXTY_MINUTES"

        try:
            with urllib.request.urlopen(url) as response:
                payload = response.read()
            raw.extend(json.loads(payload.decode()))
        except Exception as err:
            # Deliberate best effort: report the failed window and carry on
            # with the next one rather than aborting the whole station.
            print(f"Error fetching data: {err}\n{url}")
        else:
            # Pause after each successful request (presumably to go easy on
            # the API); the response is already closed at this point.
            time.sleep(1)

        s = e

    if not raw:
        return None

    # Convert raw data to DataFrame
    df = pd.DataFrame(raw)
    df['eventDate'] = pd.to_datetime(df['eventDate'])
    df = df.drop(columns=['timeSeriesId'])
    # The `to` of one window is the `from` of the next, so a record that sits
    # exactly on a window boundary can be returned twice; keep the first copy.
    df = df.drop_duplicates(subset='eventDate').reset_index(drop=True)
    return df

In [None]:
# Station list; the loop below uses the stn_num and stn_name columns.
stnlist = pd.read_csv(stnlist_csv, encoding='latin1')

# Cover whole calendar years: from the first instant of start_yr to the last
# second of end_yr, so that Dec 31 of end_yr is included in full.
start = pd.to_datetime(f'{start_yr}-01-01')
end = pd.to_datetime(f'{end_yr}-12-31 23:59:59')

for _, row in stnlist.iterrows():
    # Station codes are five characters, left-padded with zeros.
    stn_num = str(row['stn_num']).zfill(5)
    stn_name = row['stn_name']
    print(stn_num, stn_name)

    # Fulford Harbour (07330) does not have real-time data.
    if stn_num == '07330':
        continue

    df = real_time_data(stn_num, start, end)
    if df is None:
        # No records came back for this station, so no file is written.
        print(f"  no real-time data for {stn_num}; skipping")
        continue

    df.to_csv(os.path.join(output_dir, f'{stn_num}_realtime_wl.csv'), index=False)