Script to calculate return period thresholds (2-, 5-, 10- and 20-year) from the observed water level data at each gauging station. It can use either the complete time series or only the 2003-2023 period (i.e., the GloFAS analysis period).

In [47]:
# import relevant packages
import pandas as pd
import numpy as np
from scipy.stats import genextreme
import os

In [48]:
# define country and base directory; both are reused below to build the input and output paths
country = 'zimbabwe'
directory = '/s3/scratch/jamie.towner/flood_aa'   
# thresholds are written to <directory>/<country>/outputs/thresholds
output_directory = os.path.join(directory, country, "outputs/thresholds")
os.makedirs(output_directory, exist_ok=True)  # create directory if it does not already exist 

In [49]:
# define paths to data
# observations are read from <directory>/<country>/data/observations/gauging_stations/all_stations
observed_data_directory = os.path.join(directory, country, "data/observations/gauging_stations/all_stations")
observed_data_file = "observations_complete_series.csv" # use observations.csv for 2003-2023 period

# load data (the cells below expect a 'date' column plus one column per station)
observed_data_path = os.path.join(observed_data_directory, observed_data_file)
observed_data = pd.read_csv(observed_data_path)

In [50]:
# convert the date column to datetime so that .dt accessors (e.g. .dt.year) can be used later;
# format='mixed' makes pandas infer the format of each value individually
observed_data["date"] = pd.to_datetime(observed_data["date"], format='mixed')

In [51]:
# check data: display the loaded observations (one row per date, one column per station)
observed_data

Unnamed: 0,date,beitbridge,chisurgwe,chitsuwa,condo,jackquinton,katiyo,manyuchi,mazowe,mutirikwi,pungwe,runde,tokwe,ypres
0,1959-10-01,0.0,,,,,,,,,,,,
1,1959-10-02,0.0,,,,,,,,,,,,
2,1959-10-03,0.0,,,,,,,,,,,,
3,1959-10-04,0.0,,,,,,,,,,,,
4,1959-10-05,0.0,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23463,2023-12-27,,,,,,,,,,,,,
23464,2023-12-28,,,,,,,,,,,,,
23465,2023-12-29,,,,,,,,,,,,,
23466,2023-12-30,,,,,,,,,,,,,


In [52]:
# function to calculate return period thresholds for a given station's data
def calculate_return_periods(station_data, years=(2, 5, 10, 20)):
    """Estimate return period thresholds from one station's observations.

    The thresholds are empirical: the annual maximum series is built from the
    observations and, for each return period T, the value with non-exceedance
    probability 1 - 1/T is taken as a percentile of those annual maxima
    (np.percentile with its default interpolation).

    Parameters
    ----------
    station_data : pandas.DataFrame
        Frame with a datetime column named 'date' and a value column for the
        station. The first column that is not 'date' is used as the value
        column, so the column order does not matter. Rows containing NA are
        dropped before the analysis.
    years : iterable of numbers, optional
        Return periods (in years) to evaluate. Each must be >= 1.

    Returns
    -------
    dict
        Maps each return period to its threshold. Every threshold is NaN when
        no valid observations remain after dropping NA rows.

    Raises
    ------
    ValueError
        If a return period is smaller than 1, or if station_data has no
        column other than 'date'.
    """
    years = list(years)

    # a return period below 1 year has no valid non-exceedance probability
    for return_year in years:
        if return_year < 1:
            raise ValueError(f"return periods must be >= 1, got {return_year}")

    # identify the value column by name rather than by position
    value_columns = [column for column in station_data.columns if column != 'date']
    if not value_columns:
        raise ValueError("station_data must contain a value column in addition to 'date'")
    value_column = value_columns[0]

    # drop NA values
    valid_data = station_data.dropna()

    # nothing left to analyse: report missing thresholds instead of failing
    if valid_data.empty:
        return {return_year: np.nan for return_year in years}

    # group by calendar year and get the maximum value for each year
    annual_max = (
        valid_data
        .assign(year=valid_data['date'].dt.year)
        .groupby('year')[value_column]
        .max()
    )

    # return period T = 1 / (1 - F(x)), so the threshold for T is the value x
    # with F(x) = 1 - 1/T; F is taken here as the empirical distribution of
    # the annual maxima (a fitted GEV distribution would be the parametric
    # alternative)
    return {
        return_year: np.percentile(annual_max, 100 * (1 - 1 / return_year))
        for return_year in years
    }

# every column other than 'date' holds the observations of one station
station_columns = [column for column in observed_data.columns if column != 'date']

# map each station to its return period thresholds, leaving out stations
# that have no observations at all (all values NaN)
return_periods_dict = {
    column: calculate_return_periods(observed_data[['date', column]])
    for column in station_columns
    if observed_data[column].notna().any()
}

# one row per station, one column per return period
return_periods_df = pd.DataFrame.from_dict(return_periods_dict, orient='index')

In [53]:
# check the output: one row per station, one column per return period (in years)
return_periods_df

Unnamed: 0,2,5,10,20
beitbridge,982.835,2075.21,3128.1605,3370.6505
chisurgwe,183.805,440.184,579.461,586.8235
chitsuwa,10.98,40.236,41.322,41.861
condo,330.48,493.014,815.518,842.1025
jackquinton,335.147,534.7576,856.4278,1315.7286
katiyo,89.6,424.432,545.1,649.527
manyuchi,10.61,115.9672,444.9896,857.1212
mazowe,401.065,506.064,609.664,770.6965
mutirikwi,57.354,111.31,136.8738,290.7805
pungwe,16.55,46.49,55.05,65.325


In [32]:
# write the thresholds to the output directory, keeping the station names (index) as the first column
# NOTE(review): the file name is fixed to the complete-series variant; if observed_data_file is
# switched to observations.csv this name no longer matches the input - confirm / rename as needed
output_path = os.path.join(output_directory, "observed_return_periods_complete_series.csv")
return_periods_df.to_csv(output_path, index=True)