In [1]:
import sys
import os


sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))


from src.data_processing.custom_python_package import *

from src.data_retrieval.util import setup_output_path


In [2]:

# --- Directories used by the pipeline ---------------------------------------
input_data_path = "./data/input_data/"
temp_data_path = "./data/temp_data/"
output_data_path = "./data/output_data/"

# --- Raw input file names (no extension; '.grib' is added in create_file_name) ---
input_era5_file_name = "era5-total-precipitation-1981-2023"
input_ecmwf_file_name = "ecmwf-monthly-seasonalforecast-1981-2023-002"
# Alternative ECMWF input, kept for reference:
# input_ecmwf_file_name = "ecmwf-monthly-seasonalforecast-1981-2023-eth"

# --- Base names for the generated datasets ----------------------------------
output_era5_file_name = "era5-ethiopia"
output_ecmwf_file_name = "ecmwf-ethiopia"

# --- Administrative boundaries (path relative to input_data_path) -----------
admin_file_name = "admin_boundary_eth/eth_admbnda_adm1_csa_bofedb_2021.shp"
admin_code_label = "ADM1_PCODE"

# Prepare the temp and output locations before anything is written to them
# (presumably creates the directories if missing -- confirm in src.data_retrieval.util).
setup_output_path(temp_data_path)
setup_output_path(output_data_path)

In [3]:
def create_file_name(input_data_path, 
                     temp_data_path, 
                     output_data_path, 
                     input_era5_file_name, 
                     input_ecmwf_file_name, 
                     output_era5_file_name, 
                     output_ecmwf_file_name, 
                     admin_file_name, 
                     admin_code_label
                     ):
    """Build every file path used by the pipeline and store them as module-level globals.

    Paths are assembled with ``os.path.join`` so a directory argument works
    with or without a trailing separator. Note that ``os.path.join`` discards
    the directory if the file name is itself an absolute path.

    Parameters
    ----------
    input_data_path : str
        Directory holding the raw GRIB files and the admin boundary file.
    temp_data_path : str
        Directory for the ECMWF intermediary datasets.
    output_data_path : str
        Directory for the ERA5 datasets and the ECMWF quantile datasets.
    input_era5_file_name, input_ecmwf_file_name : str
        Raw file names without extension; ``'.grib'`` is appended.
    output_era5_file_name, output_ecmwf_file_name : str
        Base names to which a dataset suffix and ``'.parquet.gzip'`` are appended.
    admin_file_name : str
        Admin boundary file name (with extension), relative to ``input_data_path``.
    admin_code_label : str
        Not used by this function; kept so existing callers keep working.

    Returns
    -------
    tuple
        Always the empty tuple ``()``; the results are the globals assigned below.
    """
    global era5_raw_data_file_path, ecmwf_raw_data_file_path, admin_boundary_file_path, ecmwf_processed_pixel_file_path 
    global ecmwf_processed_adm_file_path, ecmwf_bias_corr_pixel_file_path, ecmwf_bias_corr_adm_file_path, ref_grid_file_path 
    global era5_processed_pixel_file_path, era5_processed_adm_file_path, ecmwf_quantile_pixel_file_path
    global ecmwf_quantile_bias_corr_pixel_file_path, ecmwf_quantile_adm_file_path, ecmwf_quantile_bias_corr_adm_file_path

    # Shared extension of every generated dataset.
    parquet_suffix = '.parquet.gzip'

    #input datasets
    era5_raw_data_file_path = os.path.join(input_data_path, input_era5_file_name + '.grib')
    ecmwf_raw_data_file_path = os.path.join(input_data_path, input_ecmwf_file_name + '.grib')
    admin_boundary_file_path = os.path.join(input_data_path, admin_file_name)

    #ECMWF intermediary datasets (temp directory)
    ecmwf_temp_base = os.path.join(temp_data_path, output_ecmwf_file_name)
    ecmwf_processed_pixel_file_path = ecmwf_temp_base + '-processed-pixel' + parquet_suffix
    ecmwf_processed_adm_file_path = ecmwf_temp_base + '-processed-adm' + parquet_suffix
    ecmwf_bias_corr_pixel_file_path = ecmwf_temp_base + '-bias-corrected-pixel' + parquet_suffix
    ecmwf_bias_corr_adm_file_path = ecmwf_temp_base + '-bias-corrected-adm' + parquet_suffix
    ref_grid_file_path = ecmwf_temp_base + '-reference-grid' + parquet_suffix

    #ERA5 intermediary/export datasets (output directory)
    era5_out_base = os.path.join(output_data_path, output_era5_file_name)
    era5_processed_pixel_file_path = era5_out_base + '-processed-pixel' + parquet_suffix
    era5_processed_adm_file_path = era5_out_base + '-processed-adm' + parquet_suffix

    #ECMWF export datasets (output directory)
    ecmwf_out_base = os.path.join(output_data_path, output_ecmwf_file_name)
    ecmwf_quantile_pixel_file_path = ecmwf_out_base + '-quantile-pixel' + parquet_suffix
    ecmwf_quantile_bias_corr_pixel_file_path = ecmwf_out_base + '-quantile-bias-corrected-pixel' + parquet_suffix
    ecmwf_quantile_adm_file_path = ecmwf_out_base + '-quantile-adm' + parquet_suffix
    ecmwf_quantile_bias_corr_adm_file_path = ecmwf_out_base + '-quantile-bias-corrected-adm' + parquet_suffix

    # Empty tuple kept for backward compatibility with the original `return()`.
    return ()

In [4]:
def run_pipeline():
    """Run the full ECMWF / ERA5 processing chain, stage by stage.

    Every path argument is read from the module-level globals assigned by
    ``create_file_name``, so that function must have been called first.
    ``admin_code_label`` is likewise read from module scope.

    Returns
    -------
    tuple
        Always the empty tuple ``()``.
    """
    # Stage 1 -- ECMWF pre-processing: converts the ECMWF grib file into a
    # dataframe, starts processing the format and exports it to parquet.
    pre_process_ecmwf_data(
        ecmwf_raw_data_file_path,
        admin_boundary_file_path,
        ref_grid_file_path,
        ecmwf_processed_pixel_file_path,
        ecmwf_processed_adm_file_path,
        admin_code_label,
    )

    # Stage 2 -- ERA5 pre-processing: same treatment for the ERA5 grib file.
    pre_process_era5_data(
        era5_raw_data_file_path,
        admin_boundary_file_path,
        ref_grid_file_path,
        era5_processed_pixel_file_path,
        era5_processed_adm_file_path,
    )

    # Stage 3 -- bias correction between ECMWF and ERA5 values (for every
    # location and month): once at the grid-point (pixel) level, then once at
    # the admin boundary level.
    bias_correction_jobs = (
        (ecmwf_processed_pixel_file_path, era5_processed_pixel_file_path, ecmwf_bias_corr_pixel_file_path),
        (ecmwf_processed_adm_file_path, era5_processed_adm_file_path, ecmwf_bias_corr_adm_file_path),
    )
    for forecast_path, reference_path, corrected_path in bias_correction_jobs:
        ecmwf_bias_correction(forecast_path, reference_path, corrected_path)

    # Stage 4 -- quantile probabilities based on ERA5 climatology and ECMWF
    # ensemble models: four runs covering pixel / admin level, each before and
    # after bias correction. The order matches the original sequence of calls.
    quantile_jobs = (
        (ecmwf_processed_pixel_file_path, era5_processed_pixel_file_path, ecmwf_quantile_pixel_file_path),
        (ecmwf_bias_corr_pixel_file_path, era5_processed_pixel_file_path, ecmwf_quantile_bias_corr_pixel_file_path),
        (ecmwf_processed_adm_file_path, era5_processed_adm_file_path, ecmwf_quantile_adm_file_path),
        (ecmwf_bias_corr_adm_file_path, era5_processed_adm_file_path, ecmwf_quantile_bias_corr_adm_file_path),
    )
    for forecast_path, reference_path, quantile_path in quantile_jobs:
        compute_quantile_probability(forecast_path, reference_path, quantile_path)

    return ()

In [5]:
# Assign the module-level file-path globals that run_pipeline() reads.
create_file_name(
    input_data_path=input_data_path,
    temp_data_path=temp_data_path,
    output_data_path=output_data_path,
    input_era5_file_name=input_era5_file_name,
    input_ecmwf_file_name=input_ecmwf_file_name,
    output_era5_file_name=output_era5_file_name,
    output_ecmwf_file_name=output_ecmwf_file_name,
    admin_file_name=admin_file_name,
    admin_code_label=admin_code_label,
)




()

In [6]:
# Measure the execution time of `run_pipeline`
import time

# perf_counter() is a monotonic, high-resolution clock meant for measuring
# intervals. time.time() follows the system clock, which can be adjusted
# while this multi-minute run is in progress and skew the result.
# Note: start_time / end_time are therefore only meaningful as a difference,
# not as calendar timestamps.
start_time = time.perf_counter()
run_pipeline()
end_time = time.perf_counter()

execution_time_seconds = end_time - start_time

execution_time_minutes = execution_time_seconds / 60
print(f"run_pipeline() execution time: {execution_time_minutes:.2f} minutes")

pre-processing ECMWF data...
0/50



  grid_df["geometry"] = grid_df["geometry"].buffer(0.5, cap_style=3)


10/50
20/50
30/50
40/50
50/50


Ignoring index file './data/input_data/era5-total-precipitation-1981-2023.grib.9093e.idx' incompatible with GRIB file


pre-processing ECMWF data - done
run_pipeline() execution time: 11.08 minutes
