# Import dependencies

In [2]:
from io import BytesIO
import pandas as pd
import re

from dfpp.storage import StorageManager
from dfpp.transformation.geo_utils import get_iso3_to_official_name_map
from dfpp.publishing import publish_series

from dfpp.transformation.column_name_template import (
    CANONICAL_COLUMN_NAMES,
    DIMENSION_COLUMN_PREFIX,
    SERIES_PROPERTY_PREFIX,
    sort_columns_canonically,
    ensure_canonical_columns,
)
from dfpp.transformation.value_handler import handle_value

# Blob path of the source CSV; passed to StorageManager.read_blob when loading.
FILE_PATH = "backup/sources/KPAtlasDB_2024_en.csv"
# Folder name handed to publish_series as its source_folder argument.
SOURCE_NAME = "unaids_org"
# URL written into the "source" column of every row.
BASE_URL = "https://kpatlas.unaids.org/dashboard"

In [3]:
# Lookup whose keys are used further down to keep only rows with a recognised
# "alpha_3_code". Presumably maps ISO 3166-1 alpha-3 codes to official country
# names (going by the helper's name) - TODO confirm.
ISO_3_MAP = await get_iso3_to_official_name_map()

# Load source file

In [4]:
async with StorageManager() as storage_manager:
    series_bytes = await storage_manager.read_blob(path=FILE_PATH)
    df_source = pd.read_csv(BytesIO(series_bytes), keep_default_na=True)
df = df_source.copy()

# Transform the data

In [5]:
def _series_id_from_name(label):
    """Turn an indicator label into an upper-case, underscore-separated id."""
    token = label.upper()
    # Punctuation used in indicator labels becomes an underscore ...
    token = re.sub(r"[-/=:()]", "_", token)
    # ... as do spaces ...
    token = token.replace(" ", "_")
    # ... and runs of underscores are collapsed into a single one.
    return re.sub(r"_+", "_", token)


# The human-readable series name is the indicator label itself; the id is
# derived from it, and every row is tagged with the dashboard URL as source.
df["series_name"] = df["Indicator"]
df["series_id"] = df["series_name"].apply(_series_id_from_name)
df["source"] = BASE_URL

In [6]:
# Rename the source CSV headers to the column names used downstream.
COLUMNS_TO_RENAME = {
    "Unit": SERIES_PROPERTY_PREFIX + "unit",
    "Time Period": "year",
    "Data value": "value",
    "Area ID": "alpha_3_code",
}
df.rename(columns=COLUMNS_TO_RENAME, inplace=True)

# Keep only rows whose area code is a key of ISO_3_MAP. drop=True discards the
# old row labels instead of inserting them as a stray "index" column.
df = df.loc[df["alpha_3_code"].isin(list(ISO_3_MAP.keys()))].reset_index(drop=True)

# handle_value is applied row by row; its two results are expanded into the
# "value" column and a value-label series property.
df[["value", SERIES_PROPERTY_PREFIX + "value_label"]] = df.apply(
    handle_value, axis=1, result_type="expand"
)

# Drop rows left without a value. The explicit copy makes df an independent
# frame, so adding columns to it later cannot trigger SettingWithCopyWarning.
df = df[df.value.notna()].copy()

In [7]:
# Known values of the source "Subgroup" column, bucketed by the kind of
# information they carry. assign_columns matches each row's subgroup against
# these lists by exact string equality, so the entries must be spelled exactly
# as they appear in the source data.
SUBGROUP_CATEGORIES = {
    # Subgroups routed to the demographic-group dimension column.
    "demographic_group": [
        "15-19",
        "15-24",
        "20-24",
        "< 25",
        "25+",
        "All ages",
        "Females",
        "Males",
        "Total",
        "Transgender",
        "Transman",
        "Transother",
        "Transwoman",
        "All sexes",
        "Men who have sex with men Domestic (Public and Private)",
        "Men who have sex with men International",
        "Men who have sex with men Total",
        "People who inject drugs Domestic (Public and Private)",
        "People who inject drugs International",
        "People who inject drugs Total",
        "Prisoners Domestic (Public and Private)",
        "Prisoners International",
        "Prisoners Total",
        "Sex workers Domestic (Public and Private)",
        "Sex workers International",
        "Sex workers Total",
        "Transgender people Domestic (Public and Private)",
        "Transgender people International",
        "Transgender people Total",
        "Hepatitis B Total",
        "Hepatitis C Total",
        "Hepatitis B < 25",
        "Hepatitis B 25+",
        "Hepatitis B Females",
        "Hepatitis B Males",
        "Hepatitis B Transgender",
        "Hepatitis C < 25",
        "Hepatitis C 25+",
        "Hepatitis C Females",
        "Hepatitis C Males",
        "Hepatitis C Transgender",
    ],
    # Subgroups routed to the observation-type series property column.
    "estimate_type": [
        "estimate",
        "lower estimate",
        "upper estimate",
        "From national authorities Total",
        "compiled",
    ],
}

In [None]:
def assign_columns(subgroup: str):
    """Classify a raw ``Subgroup`` label.

    Returns a ``(demographic_group, estimate_type)`` pair in which at most one
    slot carries ``subgroup``: the first when the label is listed under
    ``SUBGROUP_CATEGORIES["demographic_group"]``, the second when it is listed
    under ``SUBGROUP_CATEGORIES["estimate_type"]``. A label found in neither
    list yields ``(None, None)``.
    """
    # Demographic labels take precedence over estimate-type labels.
    if subgroup in SUBGROUP_CATEGORIES["demographic_group"]:
        return subgroup, None
    estimate_type = (
        subgroup if subgroup in SUBGROUP_CATEGORIES["estimate_type"] else None
    )
    return None, estimate_type


# Split the mixed "Subgroup" column into a demographic-group dimension and an
# observation-type series property. The (group, type) tuples are collected
# into a single frame rather than one pd.Series per row: that avoids a Series
# allocation for every row and still yields a two-column frame when df is empty.
subgroup_columns = [
    DIMENSION_COLUMN_PREFIX + "demographic_group",
    SERIES_PROPERTY_PREFIX + "observation_type",
]
df[subgroup_columns] = pd.DataFrame(
    [assign_columns(subgroup) for subgroup in df["Subgroup"]],
    index=df.index,  # same row labels as df, so the assignment aligns one-to-one
    columns=subgroup_columns,
)

# Publish each indicator series separately, selecting only the relevant columns

In [9]:
for indicator_id, df_indicator in df.groupby("series_id"):
    to_select_columns = [
        col
        for col in df_indicator.columns
        if any(
            [
                col.startswith(DIMENSION_COLUMN_PREFIX),
                col.startswith(SERIES_PROPERTY_PREFIX),
            ]
        )
        and col not in CANONICAL_COLUMN_NAMES
    ]
    if df_indicator.value.isna().all():
        print(indicator_id, "All values are None")
        continue
    df_indicator = ensure_canonical_columns(df_indicator)
    df_indicator = df_indicator[CANONICAL_COLUMN_NAMES + to_select_columns]
    df_indicator = sort_columns_canonically(df_indicator)
    await publish_series(indicator_id, df_indicator, source_folder=SOURCE_NAME)