In [1]:
import sys
sys.path.append("../src")

In [2]:
from data_retrieval.util import setup_output_path
from data_processing.util import (
    pre_process_ecmwf_data,
    regrid_climate_data,
    pre_process_era5_data,
    ecmwf_bias_correction,
    compute_quantile_probability
)

### Define file directories

In [3]:
# All data folders live under one root, relative to the notebook's working directory.
_data_root = "./data"

input_path = f"{_data_root}/input_data"
temp_path = f"{_data_root}/temp_data"
output_path = f"{_data_root}/output_data"

### Create sub-dirs if they do not exist

In [4]:
# Prepare the two folders this notebook writes to (temporary first, then output).
for _writable_dir in (temp_path, output_path):
    setup_output_path(_writable_dir)

### Define file paths

Define the name of every file used by the pipeline: input files, intermediary files, and output files.

In [5]:
# Base names (without extension) of the raw GRIB files in the input folder
input_era5_file_name = "era5-monthly-seasonal-forecast-total-precipitation-1981-2023-eth"
input_ecmwf_file_name = "ecmwf-monthly-seasonalforecast-1981-2023-eth"

# Base names shared by every file derived from each dataset
output_era5_file_name = "era5-ethiopia"
output_ecmwf_file_name = "ecmwf-ethiopia"

# Admin boundary shapefile (relative to the input folder) and the admin code label handed to the pipeline
admin_file_name = "admin_boundary_eth/eth_admbnda_adm1_csa_bofedb_2021.shp"
admin_code_label = "ADM1_PCODE"

# Shared "<folder>/<base name>" prefixes, so each path below only spells out its own suffix
_era5_input_prefix = f"{input_path}/{input_era5_file_name}"
_ecmwf_temp_prefix = f"{temp_path}/{output_ecmwf_file_name}"
_era5_output_prefix = f"{output_path}/{output_era5_file_name}"
_ecmwf_output_prefix = f"{output_path}/{output_ecmwf_file_name}"

# Input datasets
era5_raw_data_file_path = f"{_era5_input_prefix}.grib"
ecmwf_raw_data_file_path = f"{input_path}/{input_ecmwf_file_name}.grib"
admin_boundary_file_path = f"{input_path}/{admin_file_name}"
era5_regrid_file_path = f"{_era5_input_prefix}-regrid.nc"

# ECMWF intermediary datasets
ecmwf_processed_pixel_file_path = f"{_ecmwf_temp_prefix}-processed-pixel.parquet.gzip"
ecmwf_processed_adm_file_path = f"{_ecmwf_temp_prefix}-processed-adm.parquet.gzip"
ecmwf_bias_corr_pixel_file_path = f"{_ecmwf_temp_prefix}-bias-corrected-pixel.parquet.gzip"
ecmwf_bias_corr_adm_file_path = f"{_ecmwf_temp_prefix}-bias-corrected-adm.parquet.gzip"
ref_grid_file_path = f"{_ecmwf_temp_prefix}-reference-grid.parquet.gzip"

# ERA5 intermediary/export datasets
era5_processed_pixel_file_path = f"{_era5_output_prefix}-processed-pixel.parquet.gzip"
era5_processed_adm_file_path = f"{_era5_output_prefix}-processed-adm.parquet.gzip"

# ECMWF export datasets
ecmwf_quantile_pixel_file_path = f"{_ecmwf_output_prefix}-quantile-pixel.parquet.gzip"
ecmwf_quantile_bias_corr_pixel_file_path = f"{_ecmwf_output_prefix}-quantile-bias-corrected-pixel.parquet.gzip"
ecmwf_quantile_adm_file_path = f"{_ecmwf_output_prefix}-quantile-adm.parquet.gzip"
ecmwf_quantile_bias_corr_adm_file_path = f"{_ecmwf_output_prefix}-quantile-bias-corrected-adm.parquet.gzip"

In [6]:
def run_pipeline(perform_regrid):
    """Run the ECMWF / ERA5 processing chain on the files configured in this notebook.

    Every step is driven by the module-level ``*_file_path`` variables (and
    ``admin_code_label``); the function itself returns nothing.

    Steps, in order:
      1. Pre-process the raw ECMWF GRIB file.
      2. Optionally regrid ERA5 onto the grid used for ECMWF.
      3. Pre-process the regridded ERA5 file.
      4. Bias-correct ECMWF against ERA5, once at pixel level and once at
         admin boundary level.
      5. Compute quantile probabilities four times: pixel / admin level,
         each before and after bias correction.

    Args:
        perform_regrid: When truthy, ``regrid_climate_data`` is called with
            the raw ERA5 file, the raw ECMWF file and
            ``era5_regrid_file_path`` before ERA5 pre-processing. When falsy
            that call is skipped and ``era5_regrid_file_path`` is passed to
            ``pre_process_era5_data`` as-is -- presumably the file must then
            already exist from an earlier run (TODO confirm against
            ``pre_process_era5_data``).
    """
    # Converts the ECMWF grib file into a dataframe, starts processing the format
    # and exports it to parquet files
    pre_process_ecmwf_data(
        ecmwf_raw_data_file_path,
        admin_boundary_file_path,
        ref_grid_file_path,
        ecmwf_processed_pixel_file_path,
        ecmwf_processed_adm_file_path,
        admin_code_label,
    )

    # Convert the ERA5 grid to the same one used for ECMWF.
    # Plain truth test rather than `== True` (PEP 8); identical for bool arguments.
    if perform_regrid:
        regrid_climate_data(
            era5_raw_data_file_path,
            ecmwf_raw_data_file_path,
            era5_regrid_file_path,
        )

    # Converts the (regridded) ERA5 file into a dataframe, starts processing the format
    # and exports it to parquet files
    pre_process_era5_data(
        era5_regrid_file_path,
        admin_boundary_file_path,
        ref_grid_file_path,
        era5_processed_pixel_file_path,
        era5_processed_adm_file_path,
    )

    # Correct the bias between ECMWF and ERA5 values (for every location and month).
    # Executed twice: once at the grid point level (pixel) and once at the admin boundary level
    ecmwf_bias_correction(
        ecmwf_processed_pixel_file_path,
        era5_processed_pixel_file_path,
        ecmwf_bias_corr_pixel_file_path,
    )
    ecmwf_bias_correction(
        ecmwf_processed_adm_file_path,
        era5_processed_adm_file_path,
        ecmwf_bias_corr_adm_file_path,
    )

    # Compute quantile probabilities based on ERA5 climatology and ECMWF ensemble models.
    # Executed four times: per geospatial unit (pixel or administrative boundary level)
    # and per bias correction status (before and after bias correction)
    compute_quantile_probability(
        ecmwf_processed_pixel_file_path,
        era5_processed_pixel_file_path,
        ecmwf_quantile_pixel_file_path,
    )
    compute_quantile_probability(
        ecmwf_bias_corr_pixel_file_path,
        era5_processed_pixel_file_path,
        ecmwf_quantile_bias_corr_pixel_file_path,
    )
    compute_quantile_probability(
        ecmwf_processed_adm_file_path,
        era5_processed_adm_file_path,
        ecmwf_quantile_adm_file_path,
    )
    compute_quantile_probability(
        ecmwf_bias_corr_adm_file_path,
        era5_processed_adm_file_path,
        ecmwf_quantile_bias_corr_adm_file_path,
    )

In [7]:
%%time
run_pipeline(perform_regrid = True)

pre-processing ECMWF data...
0/50



  grid_gdf["geometry"] = grid_gdf["geometry"].buffer(0.5, cap_style=3)


10/50
20/50
30/50
40/50
50/50
pre-processing ECMWF data - done





CPU times: user 16min 39s, sys: 45.3 s, total: 17min 25s
Wall time: 17min 16s
