Script to calculate return-period thresholds (i.e., the water levels associated with given return periods) from the observed water level data. It can use either the complete time series or the 2003-2023 period (i.e., the GloFAS analysis period).

In [1]:
# import relevant packages
import pandas as pd
import numpy as np
from scipy.stats import genextreme
import os

In [2]:
# country under analysis and root directory of the project
country = 'mozambique'
directory = '/s3/scratch/jamie.towner/flood_aa'

# thresholds are written to <directory>/<country>/outputs/thresholds;
# the folder (including missing parents) is created if it does not exist yet
country_directory = os.path.join(directory, country)
output_directory = os.path.join(country_directory, "outputs/thresholds")
os.makedirs(output_directory, exist_ok=True)

In [3]:
# file holding the observations: the complete series by default,
# use observations.csv for the 2003-2023 period
observed_data_file = "observations_complete_series.csv"

# folder containing the gauging-station observations of the selected country
observed_data_directory = os.path.join(
    directory, country, "data/observations/gauging_stations/all_stations"
)

# build the full path and read the observations into a dataframe
observed_data_path = os.path.join(observed_data_directory, observed_data_file)
observed_data = pd.read_csv(observed_data_path)

In [4]:
# parse the raw 'date' entries into pandas datetimes and store them back in place
parsed_dates = pd.to_datetime(observed_data["date"], format='mixed')
observed_data["date"] = parsed_dates

In [5]:
# check data: show the loaded observations (a 'date' column plus one column per station)
observed_data

Unnamed: 0,date,gurue,goonda,messalo,revue,franca,mocuba,nairoto,massangena,espungabera,dombe
0,1943-01-01,,,,,,0.416667,,,,
1,1943-02-01,,,,,,0.446667,,,,
2,1943-03-01,,,,,,0.500000,,,,
3,1943-04-01,,,,,,0.500000,,,,
4,1943-05-01,,,,,,0.533333,,,,
...,...,...,...,...,...,...,...,...,...,...,...
29536,2023-11-13,2.106667,2.153333,,,,4.060000,,,,1.146667
29537,2023-11-14,2.133333,2.160000,,,,4.000000,,,,1.133333
29538,2023-11-15,2.116667,2.180000,,,,4.243333,,,,1.140000
29539,2023-11-16,2.186667,,,,,4.130000,,,,


In [6]:
# function to calculate return-period thresholds for a given station's data
def calculate_return_periods(station_data, years=(2, 5, 10, 20)):
    """Estimate the threshold value associated with each return period.

    Rows containing missing values are discarded, the series is reduced to
    one maximum per calendar year, and the threshold of a T-year return
    period is taken as the empirical (1 - 1/T) quantile of those annual
    maxima (linear interpolation via ``np.percentile``).

    Args:
        station_data: DataFrame with a datetime 'date' column and the
            station's observations in its second column (position 1).
            The frame passed in is not modified.
        years: Iterable of return periods in years. The default is an
            immutable tuple so it cannot be altered between calls.

    Returns:
        dict mapping each return period to its threshold value.
    """
    # the observations are expected in the second column of the frame
    value_column = station_data.columns[1]

    # drop rows with missing values (dropna returns a new frame)
    valid_data = station_data.dropna()

    # maximum observed value of each calendar year
    annual_max = valid_data[value_column].groupby(valid_data['date'].dt.year).max()

    # a T-year event is exceeded with probability 1/T in any given year, so
    # its threshold is the (1 - 1/T) quantile of the annual maxima.
    # NOTE: this is the empirical quantile; a fitted GEV distribution
    # (genextreme.fit / genextreme.ppf) would be the parametric alternative.
    return {
        return_year: np.percentile(annual_max, 100 * (1 - 1 / return_year))
        for return_year in years
    }

# thresholds of every station, keyed by station name
return_periods_dict = {}

# every column except 'date' holds the observations of one station
station_names = [column for column in observed_data.columns if column != 'date']

for station in station_names:
    # a station without a single valid observation cannot be analysed
    if observed_data[station].isna().all():
        continue

    # pass the dates together with this station's values to the calculation
    station_data = observed_data[['date', station]]
    return_periods_dict[station] = calculate_return_periods(station_data)

# build a dataframe with one row per station and one column per return period
return_periods_df = pd.DataFrame.from_dict(return_periods_dict, orient='index')

In [7]:
# check the output: one row per station, one column per return period (in years)
return_periods_df

Unnamed: 0,2,5,10,20
gurue,4.066667,4.545333,4.884,4.980667
goonda,5.8,7.196,8.444,9.408
messalo,5.295,5.7,5.796667,5.89
revue,2.35,2.646667,2.872,3.011
franca,4.863333,7.112,7.623,7.87225
mocuba,6.563333,7.626667,8.486,9.521
nairoto,4.396667,4.99,5.184,5.474
massangena,4.91,5.357333,5.873333,6.513
espungabera,3.113333,3.8,4.21,4.99
dombe,7.973333,9.373333,9.794,10.119667


In [8]:
# save output as a csv
# the station names are stored in the dataframe index, so the index has to be
# written as well (as a "station" column); with index=False the rows of the
# file could no longer be attributed to a station
return_periods_df.to_csv(
    os.path.join(output_directory, "observed_return_periods.csv"),
    index=True,
    index_label="station",
)