# Streamflow Data Download and Preprocessing

Metadata for streamflow to be downloaded from: https://wateroffice.ec.gc.ca/station_metadata/station_characteristics_e.html using the specifications `Province = Alberta`, `Parameter Type = Flows`, and `Regulation = Natural` and saved as `station_metadata.csv`.

The stations listed in the metadata file subject to the date specifications are downloaded below from HYDAT and saved to `combined_streamflow.csv`.

In [38]:
import urllib.parse
import pandas as pd

batch_size = 50

# define start and end years
start_year = 1980
end_year = 2022

metadata = pd.read_csv("station_metadata.csv")

def build_wateroffice_url(stations, start_date, end_date, parameter="flow"):
    """
    Build a Wateroffice batch-download URL for daily data.
    
    Parameters
    ----------
    stations : list of str
        List of hydrometric station numbers (e.g. ["11AB104", "11AB105"]).
    start_date : str
        Start date in YYYY-MM-DD.
    end_date : str
        End date in YYYY-MM-DD.
    parameter : str
        Usually "flow" for discharge.
    
    Returns
    -------
    str
        A full URL that can be used to download a combined CSV of all stations.
    """

    base = "https://wateroffice.ec.gc.ca/services/daily_data/csv/inline?"

    # Encode station parameters properly
    # stations[]=11AB104&stations[]=11AB105&...
    station_params = "&".join([f"stations[]={urllib.parse.quote(s)}" for s in stations])

    # Encode parameter (e.g. flow)
    param_part = f"parameters[]={urllib.parse.quote(parameter)}"

    # Add date range
    date_part = f"start_date={start_date}&end_date={end_date}"

    url = base + station_params + "&" + param_part + "&" + date_part
    return url

def download_wateroffice_data(stations, start_date, end_date, parameter="flow"):
    url = build_wateroffice_url(stations, start_date, end_date, parameter)
    return pd.read_csv(url)

# filter the stations based on given year range
filtered_metadata = metadata[(metadata['Year From'] <= start_year) & (metadata['Year To'] >= end_year)]
study_stations = filtered_metadata["Station Number"].tolist()

all_data = []

for i in range(0, len(study_stations), batch_size):
    batch_stations = study_stations[i:i + batch_size]
    
    df_batch = download_wateroffice_data(
        batch_stations,
        start_date=f"{start_year}-01-01",
        end_date=f"{end_year}-12-31"
    )

    all_data.append(df_batch[[" ID", "Date", "Value/Valeur"]])
    
    # # Save each batch to a separate CSV file
    # batch_number = i // batch_size + 1
    # df_batch.to_csv(f"raw_streamflow_batch_data/daily_data ({batch_number}).csv", index=False)

# Combine all files into a single long-format DataFrame
df_long = pd.concat(all_data, ignore_index=True)

# Convert Date to datetime to ensure proper sorting
df_long["Date"] = pd.to_datetime(df_long["Date"])

# Pivot into wide format: rows = dates, columns = station IDs
df_wide = df_long.pivot(index="Date", columns=" ID", values="Value/Valeur")

# Sort rows (by Date) and columns (station IDs)
df_wide = df_wide.sort_index().sort_index(axis=1)

df_wide.to_csv("combined_streamflow.csv")

print("Data downloaded and saved to combined_streamflow.csv")
print(f"{df_wide.shape[0]} days of data saved for {df_wide.shape[1]-1} stations")

Data downloaded and saved to combined_streamflow.csv
15706 days of data saved for 179 stations
