This notebook calculates the GloFAS thresholds equivalent to the observed flood thresholds. It uses GloFAS reanalysis data from 1979 and a quantile-mapping approach to map values between the observed and reanalysis datasets.

In [1]:
# import relevant packages
import numpy as np
import pandas as pd
from scipy import stats
import os

In [2]:
# define country and base directory
country = 'mozambique'

# The base directory defaults to the original scratch location. Setting the
# FLOOD_AA_DIR environment variable overrides it, so the notebook can be run
# on another machine without editing this cell.
directory = os.environ.get('FLOOD_AA_DIR', '/s3/scratch/jamie.towner/flood_aa')

# results of this notebook are written below <directory>/<country>/outputs/thresholds
output_directory = os.path.join(directory, country, "outputs/thresholds")
os.makedirs(output_directory, exist_ok=True)  # create directory if it does not already exist

In [9]:
# directories holding the station metadata, the gauge observations and the GloFAS reanalysis
reanalysis_data_directory = os.path.join(directory, country, "data/forecasts/glofas_reanalysis/all_stations")
observed_data_directory = os.path.join(directory, country, "data/observations/gauging_stations/all_stations")
metadata_directory = os.path.join(directory, country, "data/metadata")

# csv file names expected inside those directories
station_info_file = "metadata_observations.csv"
reanalysis_data_file = "glofas_reanalysis_complete_series.csv"
observed_data_file = "observations_complete_series.csv"

# full path of each input file
station_info_path = os.path.join(metadata_directory, station_info_file)
reanalysis_data_path = os.path.join(reanalysis_data_directory, reanalysis_data_file)
observed_data_path = os.path.join(observed_data_directory, observed_data_file)

# read the three csv files (observations, reanalysis, metadata) into dataframes
observed_data, reanalysis_data, station_info = (
    pd.read_csv(csv_path)
    for csv_path in (observed_data_path, reanalysis_data_path, station_info_path)
)

In [10]:
# parse the "date" column of both time series as datetimes
for frame in (observed_data, reanalysis_data):
    frame["date"] = pd.to_datetime(frame["date"], format='mixed')

# coerce the observed threshold columns to numeric; entries that cannot be parsed become NaN
for threshold_column in ('obs_bankfull', 'obs_moderate', 'obs_severe'):
    station_info[threshold_column] = pd.to_numeric(station_info[threshold_column], errors='coerce')

In [13]:
def _sanitize_station_name(name):
    """Keep only alphanumeric characters, spaces and underscores of `name`, then replace each space with an underscore."""
    kept_characters = [char for char in name if char.isalnum() or char in (' ', '_')]
    return "".join(kept_characters).replace(' ', '_')


# Sanitize the metadata station names: every character that is not alphanumeric,
# a space or an underscore is dropped, and spaces are turned into underscores
station_info['station name'] = [_sanitize_station_name(name) for name in station_info['station name']]

# Strip leading/trailing whitespace from the column labels of the observed and reanalysis data
for frame in (observed_data, reanalysis_data):
    frame.columns = frame.columns.str.strip()

In [14]:
# Map each station's observed flood thresholds onto the GloFAS reanalysis
# distribution by empirical quantile mapping:
#   1. find the percentile rank of the threshold within the observed series
#   2. read off the reanalysis value located at that same percentile
# Only ranks matter for this mapping, so both series are used on their original
# scale. Standardizing them first (z-scores) does not change the ranks of a
# valid series, but it divides by zero when a series is constant.

# metadata columns holding the observed thresholds that are mapped
threshold_names = ['obs_bankfull', 'obs_moderate', 'obs_severe']

# columns of the output table, fixed up front so that an empty result still has a header
result_columns = ['station', 'threshold_name', 'threshold_value',
                  'percentile_rank_observed', 'value_reanalysis']

# initialize list to store results
results = []

# loop over each station in the metadata
for _, row in station_info.iterrows():
    station = row['station name']

    # skip station if any threshold is missing (NaN)
    if any(pd.isna(row[name]) for name in threshold_names):
        continue

    # get observed and reanalysis data for the station, without missing values
    data_observed = observed_data[station].dropna().values
    data_reanalysis = reanalysis_data[station].dropna().values

    # A station without any valid data cannot be mapped. Previously the NaN rank
    # produced for an empty observed series was turned into 0 by the built-in
    # max()/min() clamp (which does not propagate NaN), so the station was
    # silently assigned the reanalysis minimum; an empty reanalysis series made
    # np.interp raise. Skip such stations explicitly instead.
    if data_observed.size == 0 or data_reanalysis.size == 0:
        print(f"Skipping {station}: no observed or reanalysis data available")
        continue

    # sorted reanalysis values and the percentile assigned to each of them;
    # both are the same for every threshold, so compute them once per station
    reanalysis_sorted = np.sort(data_reanalysis)
    percentiles = np.linspace(0, 100, reanalysis_sorted.size)

    # loop over each threshold
    for threshold_name in threshold_names:
        threshold_value = row[threshold_name]

        # percentile rank of the threshold within the observed data
        percentile_rank_observed = stats.percentileofscore(data_observed, threshold_value)

        # keep the rank inside [0, 100]; np.clip lets a NaN pass through
        # unchanged rather than converting it to a valid-looking 0
        percentile_rank_observed = float(np.clip(percentile_rank_observed, 0, 100))

        # reanalysis value at the same percentile (linear interpolation between sorted values)
        value_reanalysis = np.interp(percentile_rank_observed, percentiles, reanalysis_sorted)

        # store results
        results.append({
            'station': station,
            'threshold_name': threshold_name,
            'threshold_value': threshold_value,
            'percentile_rank_observed': percentile_rank_observed,
            'value_reanalysis': value_reanalysis
        })

# convert results to a dataframe and display it
results_df = pd.DataFrame(results, columns=result_columns)
results_df

Unnamed: 0,station,threshold_name,threshold_value,percentile_rank_observed,value_reanalysis
0,Limpopo_em_Mapai,obs_bankfull,4.365,98.092426,985.165442
1,Limpopo_em_Mapai,obs_moderate,5.550667,99.670716,4256.715563
2,Limpopo_em_Mapai,obs_severe,6.377333,99.897809,6128.158339
3,Limpopo__Combomune,obs_bankfull,4.5,94.933185,383.495024
4,Limpopo__Combomune,obs_moderate,7.452,99.653551,4238.880464
5,Limpopo__Combomune,obs_severe,8.418,99.863895,5839.625824
6,Limpopo_em_Chokwe,obs_bankfull,5.0,97.665285,1187.666264
7,Limpopo_em_Chokwe,obs_moderate,7.11,99.594152,4797.012057
8,Limpopo_em_Chokwe,obs_severe,7.674667,99.786166,6356.340579
9,Limpopo_em_Sicacate,obs_bankfull,6.0,91.952828,426.607727


In [15]:
# write the threshold table to a csv file in the output directory (row index included)
output_csv_path = os.path.join(output_directory, "glofas_return_periods_complete_series.csv")
results_df.to_csv(output_csv_path, index=True)

In [16]:
# reshape to one row per station and one column per threshold
# (pivot_table aggregates with the mean by default)
thresholds_by_station = results_df.pivot_table(
    values='value_reanalysis',
    index='station',
    columns='threshold_name',
)
thresholds_by_station

threshold_name,obs_bankfull,obs_moderate,obs_severe
station,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Changane_em_Chibuto,5.40625,20.687838,23.666365
Chire_em_Vila_Bocage,2123.49138,2681.555734,3199.36728
Limpopo__Combomune,383.495024,4238.880464,5839.625824
Limpopo_em_Chokwe,1187.666264,4797.012057,6356.340579
Limpopo_em_Mabalane,668.175784,3046.140923,4815.956713
Limpopo_em_Macaretane,1333.682824,4436.61299,5299.260088
Limpopo_em_Mapai,985.165442,4256.715563,6128.158339
Limpopo_em_Sicacate,426.607727,2594.874032,3953.142422
LuenhaLuenha_I,421.015142,1679.793081,2253.061178
Revubue_em_Chingodzi,885.475767,1038.767701,1615.997398
