Script to calculate return-period thresholds (the water levels associated with given return periods) from the observed water level data. It can use either the complete time series or the 2003-2023 period (i.e., the GloFAS analysis period).

In [1]:
# import relevant packages
import pandas as pd
import numpy as np
from scipy.stats import genextreme
import os

In [5]:
# country under analysis and the root directory holding the project data
country = 'zimbabwe'
directory = '/s3/scratch/jamie.towner/flood_aa'

# thresholds are written beneath <directory>/<country>/outputs/thresholds
country_directory = os.path.join(directory, country)
output_directory = os.path.join(country_directory, "outputs/thresholds")

# make sure the output directory exists (no error if it is already there)
os.makedirs(output_directory, exist_ok=True)

In [6]:
# file holding the gauging-station observations
observed_data_file = "observations.csv"  # use observations.csv for 2003-2023 period

# directory containing the observations for all stations, and the full path to the file
observed_data_directory = os.path.join(directory, country, "data/observations/gauging_stations/all_stations")
observed_data_path = os.path.join(observed_data_directory, observed_data_file)

# read the observations into a dataframe
observed_data = pd.read_csv(observed_data_path)

In [7]:
# parse the date column into datetimes; format='mixed' lets the format be inferred per value
parsed_dates = pd.to_datetime(observed_data["date"], format='mixed')
observed_data["date"] = parsed_dates

In [8]:
# check data: evaluating the dataframe as the last expression of the cell displays it
observed_data

Unnamed: 0,date,ruware,malapati,mutirikwi,makwe,runde,beitbridge,manyuchi,kwalu,bangala,ingwesi,tokwane,tokwe
0,2003-01-01,1.21,,0.20,0.0,0.75,0.0,0.1,,256.02,0.0,1.28,0.53
1,2003-01-02,1.23,,0.18,0.0,1.02,0.0,0.1,,235.23,0.0,1.10,0.39
2,2003-01-03,1.17,,0.18,0.0,1.01,0.0,0.1,,231.20,0.0,0.98,0.71
3,2003-01-04,1.08,,0.17,0.0,1.00,0.0,0.1,,229.51,0.0,1.00,1.53
4,2003-01-05,1.09,,0.16,0.0,1.00,0.0,0.1,,227.82,0.0,1.03,1.34
...,...,...,...,...,...,...,...,...,...,...,...,...,...
7665,2023-12-27,,,,,,,,,,,,
7666,2023-12-28,,,,,,,,,,,,
7667,2023-12-29,,,,,,,,,,,,
7668,2023-12-30,,,,,,,,,,,,


In [10]:
# function to calculate return period thresholds for a given station's data
def calculate_return_periods(station_data, years=(2, 5, 10, 20)):
    """Return the empirical return-period thresholds for one station.

    The thresholds are taken from the annual maximum series: for a return
    period of T years the threshold is the empirical (1 - 1/T) quantile of
    the yearly maxima, computed with ``np.percentile``.

    Parameters
    ----------
    station_data : pandas.DataFrame
        Dataframe with a datetime column named 'date' and the station's
        values in the second column. Rows containing missing values are
        ignored. The dataframe passed in is not modified.
    years : iterable of numbers, optional
        Return periods (in years) to compute thresholds for.

    Returns
    -------
    dict
        Maps each return period in ``years`` to its threshold. Every
        threshold is NaN when no valid rows remain after dropping missing
        values.
    """
    # drop NA values (rows with a missing date or a missing observation)
    valid_data = station_data.dropna()

    # the station's values are in the second column (the first one is the date)
    value_column = valid_data.columns[1]

    # group by calendar year and get the maximum value for each year
    annual_max = valid_data.groupby(valid_data['date'].dt.year)[value_column].max()

    # without any annual maximum there is nothing to take a percentile of
    if annual_max.empty:
        return {return_year: np.nan for return_year in years}

    # NOTE: the thresholds are empirical quantiles of the annual maxima; a fitted
    # extreme value distribution (e.g. scipy.stats.genextreme) is an alternative
    # for return periods longer than the record supports
    return_periods = {}
    for return_year in years:
        # an event with a T-year return period is exceeded with probability 1/T in
        # any year, so its threshold is the (1 - 1/T) quantile of the annual maxima
        non_exceedance_percent = 100 * (1 - 1 / return_year)
        return_periods[return_year] = np.percentile(annual_max, non_exceedance_percent)

    return return_periods

# thresholds for every gauging station, keyed by station name
return_periods_dict = {}

# every column other than 'date' holds the readings of one station
station_columns = [column for column in observed_data.columns if column != 'date']

for station in station_columns:
    # a station without a single valid reading cannot be analysed
    if observed_data[station].isna().all():
        continue

    # hand the dates together with this station's readings to the helper
    return_periods_dict[station] = calculate_return_periods(observed_data[['date', station]])

# build a dataframe with one row per station and one column per return period
return_periods_df = pd.DataFrame.from_dict(return_periods_dict, orient='index')

In [11]:
# check the output: evaluating the dataframe as the last expression of the cell displays it
return_periods_df

Unnamed: 0,2,5,10,20
ruware,26.41,87.808,133.668,163.292
mutirikwi,50.97,108.25,158.41,232.39
makwe,12.02,18.03,29.98,34.8725
runde,169.1,481.672,555.208,589.9605
beitbridge,951.39,2425.424,2755.428,2993.632
manyuchi,0.74,101.974,112.482,130.398
bangala,346.09,473.762,490.428,548.799
ingwesi,0.0,0.0,2.535,7.82
tokwane,121.355,463.98,752.12,992.6525
tokwe,802.91,1496.578,1759.359,1968.7915


In [None]:
# save output as a csv
# the station names are stored in the dataframe index, so the index is written out
# (under a "station" header); without it the rows could not be matched to stations
return_periods_df.to_csv(os.path.join(output_directory, "return_periods.csv"), index=True, index_label="station")