In [None]:
import io
import os
import re

import country_converter as coco
import numpy as np
import pandas as pd

from dfpp.eda import print_essential_data_descriptives
from dfpp.storage import StorageManager
from dfpp.storage.postgres import AsyncPGClient

# Single CountryConverter instance, reused for the country-name -> ISO3 conversion below.
cc = coco.CountryConverter()

In [None]:
async with StorageManager() as storage_manager:
    source_file_name = os.path.join(
        storage_manager.sources_path,
        f"{source_cfg['id'].upper()}.{source_cfg['file_format']}",
    )

    data = await storage_manager.read_blob(path=source_file_name)

In [None]:
# Parse the downloaded bytes as CSV, decoding the text as Latin-1.
df = pd.read_csv(io.BytesIO(data), encoding="latin1")

In [None]:
# Number of columns per dtype in the raw frame.
df.dtypes.value_counts()

In [None]:
# Preview the first rows of the raw frame.
df.head()

In [None]:
# Print summary descriptives of the raw frame via the dfpp.eda helper.
print_essential_data_descriptives(df)

In [None]:
# Work on a copy so the raw frame `df` stays available for the ISO3 checks below.
df_source = df.copy()

In [None]:
# Convert the "country" column to ISO3 codes with country_converter and keep
# the result in a separate "cc_iso3" column for comparison with "iso3".
df["cc_iso3"] = cc.pandas_convert(series=df["country"], to="ISO3")

In [None]:
# List the rows whose converted ISO3 code disagrees with the "iso3" value
# that came with the file.
iso3_mismatch = df["cc_iso3"] != df["iso3"]
df.loc[iso3_mismatch, ["iso3", "country"]].to_dict(orient="records")

In [None]:
# Replacement codes for specific rows of the source's "iso3" column
# (presumably aggregate rows such as development groups, regions and the
# world total -- TODO confirm against the source file).
rows_to_change_mapping = {
    "ZZA.VHHD": "VHHD",
    "ZZB.HHD": "HHD",
    "ZZC.MHD": "MHD",
    "ZZD.LHD": "LHD",
    "ZZE.AS": "UNDP_AS",
    "ZZF.EAP": "UNDP_EAP",
    "ZZG.ECA": "UNDP_ECA",
    "ZZH.LAC": "UNDP_LAC",
    "ZZI.SA": "UNDP_SA",
    "ZZJ.SSA": "UNDP_SSA",
    "ZZK.WORLD": "WLD",
}

df_source = (
    df_source.rename(columns={"iso3": "alpha_3_code"})
    # Recode through the index: labels missing from the mapping stay as they
    # are, and reset_index() puts "alpha_3_code" back as the first column.
    .set_index("alpha_3_code")
    .rename(index=rows_to_change_mapping)
    .reset_index()
    # Rename columns based on source_cfg
    .rename(
        columns={
            "country": "country_or_area",
            source_cfg.get("country_iso3_column"): "alpha_3_code",
        }
    )
)

In [None]:
# Frame dimensions before placeholder values and all-empty columns are handled.
df_source.shape

In [None]:
# Treat ".." as a missing-value marker and drop columns that end up entirely
# empty. `np.nan` is used because the `np.NaN` alias was removed in NumPy 2.0.
# The result is rebound rather than mutated in place.
df_source = df_source.replace("..", np.nan).dropna(axis=1, how="all")

# Column-name stem (the column name without its 4-digit year) for this indicator.
column_substring = indicator_cfg["column_substring"]

In [None]:
# Frame dimensions after all-empty columns were dropped.
df_source.shape

In [None]:
# Show the column-name stem configured for this indicator.
column_substring

In [None]:
# Keep the columns whose name, with every 4-digit run removed, is exactly the
# configured stem -- i.e. the per-year columns of this indicator.
four_digit_run = re.compile(r"\d{4}")
columns_to_select = [
    name
    for name in df_source.columns
    if four_digit_run.sub("", name) == column_substring
]

In [None]:
# Show the columns matched for this indicator.
columns_to_select

In [None]:
# Narrow the frame to the two identifier columns plus the indicator's
# per-year columns before reshaping.
reshape_columns = ["alpha_3_code", "country_or_area", *columns_to_select]
to_reshape = df_source[reshape_columns]

In [None]:
# Reshape wide -> long: one row per (country, "<stem><year>" column).
#
# The value columns are listed explicitly instead of being picked with
# select_dtypes("number"): a column that contained the ".." placeholder is
# parsed as object dtype and stays object after the replacement, so a dtype
# filter would silently drop that whole year from the result.
indicator_value_column = indicator_cfg["indicator_id"]
df_long = pd.melt(
    to_reshape,
    id_vars=["alpha_3_code", "country_or_area"],
    value_vars=list(columns_to_select),
    var_name="indicator_year",
    value_name=indicator_value_column,
)
# Values from object-typed columns are still strings; make the value column
# numeric. Entries that cannot be parsed become NaN.
df_long[indicator_value_column] = pd.to_numeric(
    df_long[indicator_value_column], errors="coerce"
)

In [None]:
# Preview the melted (long-format) frame.
df_long.head()

In [None]:
# Split each "<stem><year>" label into its stem ("value") and its year.
# Only the first 4-digit run is stripped to obtain the stem.
df_long["value"] = df_long["indicator_year"].str.replace(r"\d{4}", "", n=1, regex=True)
# The pattern is a raw string because "\d" in a plain literal is an invalid
# escape sequence. expand=False makes extract() return a Series rather than
# a one-column DataFrame.
df_long["year"] = (
    df_long["indicator_year"].str.extract(r"(\d{4})", expand=False).astype(int)
)

In [None]:
# Preview the frame with the derived "value" and "year" columns.
df_long.head()

In [None]:
# Sanity check: every melted column must share one and the same stem.
assert df_long["value"].nunique() == 1

In [None]:
# Keep only the columns handed to the insert step: identifiers, year and
# the indicator values.
output_columns = [
    "alpha_3_code",
    "country_or_area",
    "year",
    indicator_cfg["indicator_id"],
]
df_long = df_long[output_columns]

In [None]:
# Preview the final long-format frame.
df_long.head()

In [None]:
async with AsyncPGClient() as pg_client:
    await pg_client.insert_indicator(indicator_cfg["indicator_id"], df_long)