In [None]:
cd ../../../../

In [2]:
import asyncio
import pandas as pd
import random
from pprint import pprint as print
from tqdm.asyncio import tqdm_asyncio

from dfpp.geo_utils import get_iso3_to_official_name_map
from dfpp.transformation.source_notebooks.ilo_org.retrieve import (
    get_codebook,
    download_indicator_file,
    get_bulk_download_indicator_list
)
from dfpp.transformation.source_notebooks.ilo_org.transform import (
    sanitize_categories,
    transform_indicator,
    SOURCE_NAME
)
from dfpp.transformation.source_notebooks.ilo_org.utils import read_to_df_csv_indicator
from dfpp.publishing.publish import publish_series

# Maximum number of indicators processed at the same time; used to size the
# asyncio.Semaphore created in process_all_indicators.
MAX_CONCURRENCY = 1

In [3]:
# Lookup returned by get_iso3_to_official_name_map; in this notebook it is only
# passed through to transform_indicator (presumably ISO-3 code -> official
# country name, going by the helper's name — confirm in dfpp.geo_utils).
ISO_3_MAP = await get_iso3_to_official_name_map()
# Codebook container; its "classif1" and "classif2" entries are read below.
data_codes = get_codebook()
# Sanitize the two classification entries and store the results on the codebook
# under new "df_"-prefixed keys ("classif1"/"classif2" are not reassigned here).
df_classif1, df_classif2 = sanitize_categories(data_codes["classif1"], data_codes["classif2"])
data_codes["df_classif1"] = df_classif1
data_codes["df_classif2"] = df_classif2

In [4]:
# Indicators to process; the `id` and `n_records` fields of each row are read
# later in process_indicator.
df_annual_indicators = get_bulk_download_indicator_list()
# Guard against duplicated ids: the most frequent id must occur exactly once.
# Each id is later handed to publish_series as the series identifier, so a
# duplicate would be processed and published twice.
assert (
    df_annual_indicators.id.value_counts().max() == 1
), "Each indicator must have one record"

In [5]:
async def process_indicator(semaphore, indicator, data_codes, ISO_3_MAP):
    """Download, validate, transform and publish a single indicator.

    Args:
        semaphore: Async context manager (an ``asyncio.Semaphore`` in this
            notebook) that bounds how many indicators are processed at once.
        indicator: Mapping describing the indicator. The ``"id"`` and
            ``"n_records"`` keys are read here; the whole mapping is also
            forwarded to ``transform_indicator``.
        data_codes: Codebook forwarded unchanged to ``transform_indicator``.
        ISO_3_MAP: Lookup forwarded unchanged to ``transform_indicator``.

    Returns:
        ``None`` on success. If any ``Exception`` is raised, a 2-tuple
        ``(indicator_id, exception)`` is returned instead of propagating it;
        ``indicator_id`` is ``None`` when the failure happened before the id
        could be read from ``indicator``.
    """
    # Bind the id before entering the try block so the except clause can always
    # reference it, even when reading indicator["id"] itself is what fails.
    indicator_id = None
    try:
        indicator_id = indicator["id"]

        async with semaphore:
            # Random 1-5 second pause taken while holding the semaphore
            # (presumably to throttle requests to the data source — confirm).
            await asyncio.sleep(random.randint(1, 5))
            file = await download_indicator_file(indicator_id)
            df_source = read_to_df_csv_indicator(file)

            assert df_source.shape[0] > 0, f"The {indicator_id} DataFrame is empty"

            # The downloaded row count may deviate from the advertised
            # n_records by at most 90% of n_records.
            assert (
                abs(df_source.shape[0] - indicator["n_records"]) <= 0.9 * indicator["n_records"]
            ), f"Shape mismatch: expected {indicator['n_records']} but got {df_source.shape[0]}, which exceeds the 90% threshold."

            # Transform a copy so the frame read from disk is left untouched.
            df = df_source.copy()

            df_indicator = transform_indicator(indicator, df, data_codes, ISO_3_MAP)

            await publish_series(indicator_id, df_indicator, source_folder=SOURCE_NAME + "_bulk_download")

    except Exception as e:
        # Report the failure to the caller instead of aborting the whole batch.
        return indicator_id, e

    return None


async def process_all_indicators(
    df_annual_indicators, data_codes, ISO_3_MAP
):
    """Run process_indicator for every row of the indicator list.

    Each row of ``df_annual_indicators`` is converted to a dict and handed to
    ``process_indicator``, together with one shared semaphore sized by
    ``MAX_CONCURRENCY``. Results are consumed in completion order behind a
    tqdm progress bar.

    Returns:
        A list of the 2-tuples returned by ``process_indicator`` for failed
        indicators, i.e. ``(indicator_id, exception)``; empty when every
        indicator succeeded.
    """
    limiter = asyncio.Semaphore(MAX_CONCURRENCY)
    records = df_annual_indicators.to_dict(orient="records")

    pending = []
    for record in records:
        pending.append(process_indicator(limiter, record, data_codes, ISO_3_MAP))

    failures = []
    for finished in tqdm_asyncio.as_completed(pending):
        outcome = await finished
        # Successful runs yield None; only 2-tuples signal a failure.
        if not (isinstance(outcome, tuple) and len(outcome) == 2):
            continue
        print(outcome)
        failures.append(outcome)

    return failures

In [None]:
# Run the whole pipeline. The result holds one (indicator_id, exception) tuple
# per indicator whose processing raised; it is empty when all succeeded.
failed_indicators = await process_all_indicators(
    df_annual_indicators, data_codes, ISO_3_MAP
)

In [7]:
# Fail the run if any indicator could not be processed. The message is built as
# a string: a print()/pprint() call in this position evaluates to None and
# would leave the AssertionError without any details.
assert len(failed_indicators) == 0, (
    f"{len(failed_indicators)} indicator(s) failed: {failed_indicators}"
)