In [1]:

# Move the kernel's working directory four levels up (presumably so the `dfpp`
# imports below resolve -- confirm against the repo layout).
# Explicit `%cd` instead of bare `cd`: the bare form only works while IPython
# automagic is enabled and no variable named `cd` exists in the namespace.
# NOTE: the path is relative, so re-running this cell climbs four more levels;
# run it once per kernel session.
%cd ../../../../

/Users/mykhailoslukvin/repo/dv-data-pipeline


  self.shell.db['dhist'] = compress_dhist(dhist)[-100:]


In [2]:
import asyncio
from pprint import pprint as print

import pandas as pd
from tqdm.asyncio import tqdm_asyncio

from dfpp.transformation.source_notebooks.un_org import (
    get_indicators,
    get_iso3_map,
    get_series_data_and_dimensions,
)
from dfpp.transformation.source_notebooks.un_org import transform_series
from dfpp.transformation.source_notebooks.un_org import publish_series

# Upper bound on how many series are processed at the same time; used as the
# default semaphore size in process_all_series.
MAX_CONCURRENCY = 10

In [3]:
ISO_3_MAP = await get_iso3_map()
indicators = get_indicators()
series_codes = set([i["seriesCode"] for i in indicators])

In [4]:
async def process_one_series(series_id, semaphore):
    """Fetch, validate, transform and publish a single series.

    The whole pipeline for the series runs while ``semaphore`` is held, so the
    semaphore caps how many series are in flight at once.

    Args:
        series_id: Series code passed to ``get_series_data_and_dimensions``
            and ``publish_series``.
        semaphore: Async context manager limiting concurrency (an
            ``asyncio.Semaphore`` when called from ``process_all_series``).

    Returns:
        ``None`` on success, or a ``(series_id, exception)`` tuple when any
        step raised. Errors are returned rather than propagated.
    """
    try:
        async with semaphore:
            series_data_map, dimension_map = await get_series_data_and_dimensions(
                [series_id]
            )
            df = pd.DataFrame(series_data_map[series_id])

            # Explicit raises instead of `assert`: assertions are removed when
            # Python runs with -O, which would let empty or inconsistent data
            # through to publish_series.
            if df.shape[0] == 0:
                raise ValueError("DataFrame is empty")
            # Count NaN as a value too, so a partly-missing column is rejected.
            if df["series"].nunique(dropna=False) != 1:
                raise ValueError("Multiple series values found")
            if df.shape[0] != dimension_map[series_id]["totalElements"]:
                raise ValueError("Shape mismatch with expected dimensions")

            series_description = df["seriesDescription"].iloc[0]
            print(f"Series description: {series_description}")
            dimension_columns: list[str] = dimension_map[series_id]["dimensions"]

            df_series = transform_series(df, dimension_columns, ISO_3_MAP)

            await publish_series(series_id, df_series)

    except Exception as e:
        # Hand the failure back as data; the caller collects these tuples.
        return series_id, e


async def process_all_series(series_codes, max_concurrent_tasks=MAX_CONCURRENCY):
    """Run ``process_one_series`` for every series code and collect failures.

    Args:
        series_codes: Iterable of series codes to process.
        max_concurrent_tasks: Size of the semaphore shared by all the
            per-series coroutines.

    Returns:
        A list of the two-element tuples returned by ``process_one_series``
        for series that failed; empty when every series succeeded.
    """
    limiter = asyncio.Semaphore(max_concurrent_tasks)
    failures = []

    pending = [process_one_series(code, limiter) for code in series_codes]

    # as_completed yields awaitables in completion order and drives the
    # progress bar as each one finishes.
    for finished in tqdm_asyncio.as_completed(pending):
        outcome = await finished
        if not isinstance(outcome, tuple) or len(outcome) != 2:
            # Anything other than a two-element tuple is not a failure report.
            continue
        failures.append(outcome)

    return failures

In [5]:
failed_series = await process_all_series(
    list(series_codes)[:2], max_concurrent_tasks=MAX_CONCURRENCY
)

  0%|          | 0/2 [00:00<?, ?it/s]

'Series description: Proportion of safely treated domestic wastewater flows (%)'


 50%|█████     | 1/2 [00:04<00:04,  4.16s/it]

'Series description: Annual GDP growth (%)'


100%|██████████| 2/2 [00:29<00:00, 14.70s/it]


In [7]:
# Fail the notebook if any series could not be processed. The message is built
# as a string: the previous `print(...)` returned None, so the AssertionError
# carried no information.
assert not failed_series, f"{len(failed_series)} series failed: {failed_series!r}"