# Initial Setup

In [None]:
# Standard-library modules used for path handling and import-path setup.
import sys
import os


# Add the parent of the current working directory to the import path.
# Presumably this is the repository root that holds the `src` package imported
# below, so the notebook must be launched from a direct sub-folder of it.
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "..")))


# NOTE(review): the star import hides where names come from. It presumably
# provides pre_process_ecmwf_data, pre_process_era5_data and
# ecmwf_bias_correction, which run_pipeline() calls and which are not defined
# in this notebook -- confirm, then consider importing them explicitly.
from src.data_processing.custom_python_package import *

# Helper called on the output directory in the configuration cell.
from src.data_retrieval.util import setup_output_path

In [None]:
# --- Data location -----------------------------------------------------------
# Edit the literal below to point at the folder where the data is stored.
# The value is written to the DATA_PATH environment variable and read straight
# back, so a DATA_PATH that is already set in the environment is overwritten.
os.environ["DATA_PATH"] = os.path.expanduser("./data")
base_data_path = os.environ["DATA_PATH"]

# Sub-folders for raw inputs and processed outputs (kept with a trailing "/").
output_data_path = os.path.join(base_data_path, "output_data/")
input_data_path = os.path.join(base_data_path, "input_data/")

# --- Input file names (given without the ".grib" extension) ------------------
# Alternative ECMWF input: "ecmwf-monthly-seasonalforecast-1981-2023-002"
input_ecmwf_file_name = "ecmwf-monthly-seasonalforecast-1981-2023-eth"
input_era5_file_name = "era5-total-precipitation-1981-2023"

# --- Output file name prefixes ------------------------------------------------
output_ecmwf_file_name = "ecmwf-ethiopia"
output_era5_file_name = "era5-ethiopia"

# --- Administrative boundaries -----------------------------------------------
# Shapefile location relative to the input folder, and the label of the admin
# code passed on to the ECMWF pre-processing step.
admin_code_label = "ADM1_PCODE"
admin_file_name = os.path.join(
    "admin_boundary_eth",
    "eth_admbnda_adm1_csa_bofedb_2021.shp",
)

# Prepare the output folder (presumably creates it if missing -- see
# src.data_retrieval.util.setup_output_path).
setup_output_path(output_data_path)

# Functions

In [None]:
def create_file_name(
    input_data_path,
    output_data_path,
    input_era5_file_name,
    input_ecmwf_file_name,
    output_era5_file_name,
    output_ecmwf_file_name,
    admin_file_name,
    admin_code_label,
):
    """Build every file path used by the pipeline and publish them as globals.

    The paths are stored in module-level variables (see the ``global``
    statements below) because ``run_pipeline`` reads them from there.

    Paths are assembled with ``os.path.join``, so the directory arguments work
    with or without a trailing separator. File names are expected to be
    relative: an absolute file name would replace the directory part.

    Args:
        input_data_path: Directory containing the raw input files.
        output_data_path: Directory where processed files are written.
        input_era5_file_name: ERA5 input file name without the ``.grib``
            extension.
        input_ecmwf_file_name: ECMWF input file name without the ``.grib``
            extension.
        output_era5_file_name: Prefix of the ERA5 output file names.
        output_ecmwf_file_name: Prefix of the ECMWF output file names (also
            used for the reference grid file).
        admin_file_name: Admin boundary file, relative to ``input_data_path``.
        admin_code_label: Not used by this function; accepted so the existing
            call signature keeps working.

    Returns:
        An empty tuple. The results are the global variables set as a side
        effect.
    """
    global era5_raw_data_file_path, ecmwf_raw_data_file_path, admin_boundary_file_path
    global ref_grid_file_path
    global ecmwf_processed_pixel_file_path, ecmwf_processed_adm_file_path, era5_processed_pixel_file_path, era5_processed_adm_file_path

    def _output_file(prefix, suffix):
        # All exported datasets share the same folder and extension.
        return os.path.join(output_data_path, prefix + suffix + ".parquet.gzip")

    # input datasets
    era5_raw_data_file_path = os.path.join(
        input_data_path, input_era5_file_name + ".grib"
    )
    ecmwf_raw_data_file_path = os.path.join(
        input_data_path, input_ecmwf_file_name + ".grib"
    )
    admin_boundary_file_path = os.path.join(input_data_path, admin_file_name)

    # export datasets
    ref_grid_file_path = _output_file(output_ecmwf_file_name, "-reference-grid")
    ecmwf_processed_pixel_file_path = _output_file(
        output_ecmwf_file_name, "-processed-pixel"
    )
    ecmwf_processed_adm_file_path = _output_file(
        output_ecmwf_file_name, "-processed-adm"
    )
    era5_processed_pixel_file_path = _output_file(
        output_era5_file_name, "-processed-pixel"
    )
    era5_processed_adm_file_path = _output_file(
        output_era5_file_name, "-processed-adm"
    )

    return ()

In [None]:
def run_pipeline():
    """Run the ECMWF / ERA5 processing chain on the globally configured paths.

    Takes no arguments: every file path (and ``admin_code_label``) is read
    from module-level variables, so ``create_file_name`` and the configuration
    cell must have been executed first. The processing helpers are not defined
    in this notebook; the stage descriptions below follow the original notes.

    Returns:
        An empty tuple.
    """
    # Stage 1 -- ECMWF: convert the GRIB file into a dataframe, process its
    # format and export it to parquet.
    ecmwf_stage_args = (
        ecmwf_raw_data_file_path,
        admin_boundary_file_path,
        ref_grid_file_path,
        ecmwf_processed_pixel_file_path,
        ecmwf_processed_adm_file_path,
        admin_code_label,
    )
    pre_process_ecmwf_data(*ecmwf_stage_args)

    # Stage 2 -- ERA5: convert the GRIB file into a dataframe, process its
    # format and export it to parquet.
    era5_stage_args = (
        era5_raw_data_file_path,
        admin_boundary_file_path,
        ref_grid_file_path,
        era5_processed_pixel_file_path,
        era5_processed_adm_file_path,
    )
    pre_process_era5_data(*era5_stage_args)

    # Stage 3 -- correct the bias between ECMWF and ERA5 values (for every
    # location and month): first at the grid point (pixel) level, then at the
    # admin boundary level.
    bias_correction_inputs = (
        (ecmwf_processed_pixel_file_path, era5_processed_pixel_file_path),
        (ecmwf_processed_adm_file_path, era5_processed_adm_file_path),
    )
    for ecmwf_file_path, era5_file_path in bias_correction_inputs:
        ecmwf_bias_correction(ecmwf_file_path, era5_file_path)

    return ()

# Setup file names and run pipeline

In [None]:
# Publish the pipeline's file paths as module-level globals.
# Keyword arguments make the mapping onto the parameters explicit.
create_file_name(
    input_data_path=input_data_path,
    output_data_path=output_data_path,
    input_era5_file_name=input_era5_file_name,
    input_ecmwf_file_name=input_ecmwf_file_name,
    output_era5_file_name=output_era5_file_name,
    output_ecmwf_file_name=output_ecmwf_file_name,
    admin_file_name=admin_file_name,
    admin_code_label=admin_code_label,
)

In [None]:
# Measure the execution time of `run_pipeline`.
# time.perf_counter() is a monotonic, high-resolution clock intended for
# measuring durations; time.time() follows the system clock and can jump
# (e.g. on an NTP adjustment) during a long run, skewing the result.
# Note: start_time / end_time are therefore only meaningful as a difference,
# not as timestamps.

import time

start_time = time.perf_counter()
run_pipeline()
end_time = time.perf_counter()

execution_time_seconds = end_time - start_time

execution_time_minutes = execution_time_seconds / 60
print(f"run_pipeline() execution time: {execution_time_minutes:.2f} minutes")