In [None]:
from io import BytesIO
import pandas as pd
from dfpp.storage import StorageManager
from dfpp.transformation.geo_utils import get_iso3_to_official_name_map
from dfpp.publishing import publish_series

from dfpp.transformation.column_name_template import (
    CANONICAL_COLUMN_NAMES,
    DIMENSION_COLUMN_PREFIX,
    SERIES_PROPERTY_PREFIX,
    sort_columns_canonically,
    ensure_canonical_columns,
)
from dfpp.transformation.value_handler import handle_value

# Path of the source CSV; passed as `path` to StorageManager.read_blob when
# the data is loaded.
FILE_PATH = "backup/sources/KPAtlasDB_2024_en.csv"
# Source identifier: written to the "source" column of every row and passed
# to publish_series as `source_folder`.
SOURCE_NAME = "unaids_org"
# NOTE(review): not referenced by any of the visible cells — confirm it is
# still needed.
BASE_URL = "https://www.unaids.org/en"

In [None]:
# Fetch the ISO-3 lookup once. Its keys are used later to keep only the rows
# whose area code ("Area ID" in the source file) is one of them.
# presumably maps alpha-3 code -> official country name, going by the helper's
# name — TODO confirm.
# NOTE: the bare top-level `await` relies on the notebook kernel awaiting
# coroutines in cells; it is not valid in a plain Python script.
ISO_3_MAP = await get_iso3_to_official_name_map()

In [None]:
async with StorageManager() as storage_manager:
    series_bytes = await storage_manager.read_blob(path=FILE_PATH)
    df_source = pd.read_csv(BytesIO(series_bytes), keep_default_na=True)
df = df_source.copy()

In [None]:
def _series_id_from_name(series_name):
    """Build a series id from a series name.

    The name is upper-cased and split on whitespace. Each word is cut to its
    first four characters, after which any "(", then ")", then ":" remaining
    at the ends of that fragment are stripped. The fragments are joined
    with "_".
    """
    fragments = []
    for word in series_name.upper().split():
        fragment = word[:4].strip("(").strip(")").strip(":")
        fragments.append(fragment)
    return "_".join(fragments)


# Series label of the form "<Indicator> (<Subgroup>)".
df["series_name"] = df["Indicator"] + " " + "(" + df["Subgroup"] + ")"
df["series_id"] = df["series_name"].apply(_series_id_from_name)
# Tag every row with the source identifier.
df["source"] = SOURCE_NAME

In [None]:
# Map the source CSV headers onto the column names used by the rest of the
# notebook.
COLUMNS_TO_RENAME = {
    "Unit": SERIES_PROPERTY_PREFIX + "unit",
    "Time Period": "year",
    "Data value": "value",
    "Area ID": "alpha_3_code",
}
# Reassign instead of renaming in place so the cell reads as a plain
# input -> output step.
df = df.rename(columns=COLUMNS_TO_RENAME)

# Keep only rows whose area code is a key of ISO_3_MAP.
# drop=True discards the old row labels; without it they would be inserted as
# an extra "index" column, and re-running the cell would fail because that
# column already exists.
df = df.loc[df["alpha_3_code"].isin(list(ISO_3_MAP.keys()))].reset_index(drop=True)

# handle_value is applied to each row; with result_type="expand" its two
# results are written to the "value" column and the value-label property
# column respectively.
df[["value", SERIES_PROPERTY_PREFIX + "value_label"]] = df.apply(
    handle_value, axis=1, result_type="expand"
)

# Drop rows that have no value after handling.
df = df[df.value.notna()]

In [None]:
for indicator_id, df_indicator in df.groupby("series_id"):
    to_select_columns = [
        col
        for col in df_indicator.columns
        if any(
            [
                col.startswith(DIMENSION_COLUMN_PREFIX),
                col.startswith(SERIES_PROPERTY_PREFIX),
            ]
        )
        and col not in CANONICAL_COLUMN_NAMES
    ]
    df_indicator = ensure_canonical_columns(df_indicator)
    df_indicator = df_indicator[CANONICAL_COLUMN_NAMES + to_select_columns]
    df_indicator = sort_columns_canonically(df_indicator)
    await publish_series(indicator_id, df_indicator, source_folder=SOURCE_NAME)