In [1]:
import asyncio
from pprint import pprint as print

import pandas as pd
from tqdm.asyncio import tqdm_asyncio

from dfpp.transformation.source_notebooks.un_org import (
    get_indicator_list,
    get_series_data_and_dimensions,
)
from dfpp.transformation.source_notebooks.un_org import transform_series
from dfpp.geo_utils import get_numeric_to_iso3_map
from dfpp.publishing import publish_series

# Default cap on how many series are processed at the same time; it sizes the
# semaphore created in `process_all_series`.
MAX_CONCURRENCY = 10

In [3]:
# Lookup fetched once up front and handed to `transform_series` for every series
# (presumably numeric country code -> ISO3 alpha code, judging by the names — TODO confirm).
ISO_3_MAP_NUMERIC_TO_ALPHA = await get_numeric_to_iso3_map()
# Indicator list loaded into a DataFrame; its `code` column supplies the series
# ids that are processed further down in the notebook.
indicators = get_indicator_list()
df_series_codes = pd.DataFrame(indicators)

In [4]:
async def process_one_series(series_id, semaphore):
    """Fetch, sanity-check, transform and publish a single series.

    The entire pipeline for the series runs while ``semaphore`` is held, so the
    semaphore bounds how many series are in flight at once.

    Args:
        series_id: Code of the series to process.
        semaphore: Async context manager (e.g. ``asyncio.Semaphore``) shared by
            all workers to limit concurrency.

    Returns:
        ``None`` when the series was published successfully, otherwise a
        ``(series_id, exception)`` tuple. Any ``Exception`` — including a failed
        sanity-check assertion — is returned instead of raised so that one bad
        series does not abort the rest of the batch.
    """
    try:
        async with semaphore:
            data_by_series, meta_by_series = await get_series_data_and_dimensions(
                [series_id]
            )
            raw_frame = pd.DataFrame(data_by_series[series_id])
            frame = raw_frame.copy()

            # Sanity checks: non-empty payload, exactly one series in it, and a
            # row count that matches what the metadata reports.
            row_count = len(frame)
            assert row_count > 0, "DataFrame is empty"
            distinct_series = frame.series.value_counts(dropna=False)
            assert len(distinct_series) == 1, "Multiple series values found"
            series_meta = meta_by_series[series_id]
            assert (
                row_count == series_meta["totalElements"]
            ), "Shape mismatch with expected dimensions"

            description = frame.seriesDescription.iloc[0]
            print(f"Series description: {description}")

            # Codebooks shipped with the series metadata, forwarded to the transform.
            dimension_codebook: list[dict[str, str]] = series_meta["dimensions"]
            attribute_codebook: list[dict[str, str]] = series_meta["attributes"]

            transformed = transform_series(
                series_id,
                frame,
                df_dimension_codebook=dimension_codebook,
                df_attribute_codebook=attribute_codebook,
                iso_3_map_numeric_to_alpha=ISO_3_MAP_NUMERIC_TO_ALPHA,
            )

            await publish_series(series_id, transformed, source_folder="unstats")
    except Exception as error:
        # Report the failure to the caller rather than propagating it.
        return series_id, error

    return None


async def process_all_series(series_codes, max_concurrent_tasks=MAX_CONCURRENCY):
    """Process every series concurrently and collect the ones that failed.

    One ``process_one_series`` coroutine is created per series code; a shared
    semaphore limits how many of them do work at the same time, and a tqdm
    progress bar advances as each one completes.

    Args:
        series_codes: Iterable of series ids to process.
        max_concurrent_tasks: Maximum number of series processed at once.
            Must be at least 1.

    Returns:
        A list of ``(series_id, exception)`` tuples, in completion order, for
        every series that failed. Empty when everything succeeded.

    Raises:
        ValueError: If ``max_concurrent_tasks`` is smaller than 1.
    """
    # A semaphore created with 0 permits would make every worker wait forever
    # on acquire(), so the run would hang silently; reject it up front.
    if max_concurrent_tasks < 1:
        raise ValueError(
            f"max_concurrent_tasks must be >= 1, got {max_concurrent_tasks!r}"
        )

    semaphore = asyncio.Semaphore(max_concurrent_tasks)
    failed_series = []

    tasks = [process_one_series(series_id, semaphore) for series_id in series_codes]

    for future in tqdm_asyncio.as_completed(tasks):
        result = await future
        # Workers return None on success and (series_id, exception) on failure.
        if isinstance(result, tuple) and len(result) == 2:
            print(result)
            failed_series.append(result)

    return failed_series

In [None]:
# Run the pipeline for every series code in `df_series_codes.code`; the result is
# the list that `process_all_series` returns for the series that failed.
failed_series = await process_all_series(
    df_series_codes.code.tolist(), max_concurrent_tasks=MAX_CONCURRENCY
)

In [None]:
# Fail the run loudly if any series could not be processed. The message is built
# as a string: calling a print function here would evaluate to None and leave
# the AssertionError without any detail.
assert (
    len(failed_series) == 0
), f"{len(failed_series)} series failed: {failed_series!r}"