In [None]:
import io
import os
import re

import country_converter as coco
import numpy as np
import pandas as pd

from dfpp.storage import StorageManager
from dfpp.storage.postgres import AsyncPGClient

# Shared converter instance; used further down to derive ISO3 codes from country names.
cc = coco.CountryConverter()

In [None]:
async with StorageManager() as storage_manager:
    source_file_name = os.path.join(
        storage_manager.sources_path,
        f"{source_cfg['id'].upper()}.{source_cfg['file_format']}",
    )
    data = await storage_manager.read_blob(path=source_file_name)

In [None]:
# Parse the downloaded bytes into a DataFrame. The header row offsets are
# hard-coded per format: row index 2 for CSV, row index 3 of the "Data" sheet
# for Excel.
source_file_format = source_cfg["file_format"]
if "csv" in source_file_format:
    df = pd.read_csv(io.BytesIO(data), header=2)
elif "xls" in source_file_format:  # substring test, so it also matches "xlsx"
    df = pd.read_excel(io.BytesIO(data), header=3, sheet_name="Data")
else:
    # Fail here with a clear message instead of leaving `df` undefined and
    # hitting a NameError in a later cell.
    raise ValueError(f"Unsupported source file format: {source_file_format!r}")

In [None]:
# Show the column labels that were parsed from the source file.
df.columns

In [None]:
# Print dtypes and non-null counts of the parsed frame.
df.info()

In [None]:
# Work on a copy so the freshly parsed frame `df` is left untouched by the cleaning steps.
df_source = df.copy()

In [None]:
# Swap every ".." cell for NaN (presumably the source's missing-value marker).
df_source = df_source.replace("..", np.nan)

In [None]:
# Show which source column is configured as the ISO3 country code column.
source_cfg["country_iso3_column"]

In [None]:
# Map the source-specific country columns onto the names used by the rest of
# this notebook ("country_or_area" and "alpha_3_code").
column_renames = {
    source_cfg["country_name_column"]: "country_or_area",
    source_cfg["country_iso3_column"]: "alpha_3_code",
}
df_source = df_source.rename(columns=column_renames)

In [None]:
# Re-check dtypes and non-null counts after the placeholder replacement and rename.
df_source.info()

In [None]:
# Derive ISO3 codes from the country names with country_converter and store
# them in a separate column so they can be compared with the source's own codes.
df_source["iso3_cc"] = cc.pandas_convert(series=df_source["country_or_area"], to="ISO3")

In [None]:
# Shape of the subset where the converter's ISO3 code differs from the source's
# code; the first element is the number of mismatching rows.
df_source[df_source["iso3_cc"] != df_source["alpha_3_code"]].shape

In [None]:
# The file must describe exactly one indicator: one distinct (non-null) code
# and one distinct (non-null) name.
assert df_source["Indicator Code"].nunique() == 1
assert df_source["Indicator Name"].nunique() == 1

In [None]:
# Collect the numeric columns whose label contains a run of four digits; these
# are used as the value columns when the frame is melted to long format.
# Labels go through str() because re.search raises TypeError on non-string
# labels (e.g. integer headers).
indicator_columns_to_select = [
    column
    for column in df_source.select_dtypes("number").columns
    if re.search(r"\d{4}", str(column))
]

In [None]:
# Display the selected value columns for a visual check.
indicator_columns_to_select

In [None]:
# Reshape wide -> long: one row per (country, selected column). The former
# column label goes into "year" and the cell value into a column named after
# the indicator id.
df_long = pd.melt(
    df_source,
    id_vars=["alpha_3_code", "country_or_area"],
    value_vars=indicator_columns_to_select,
    var_name="year",
    value_name=indicator_cfg["indicator_id"],
)

In [None]:
# Row and column count of the long-format frame.
df_long.shape

In [None]:
# Preview the first rows of the long-format frame.
df_long.head()

In [None]:
# Cast the "year" column to integers; all other columns are left as they are.
df_long = df_long.astype({"year": int})

In [None]:
# Keep only the output columns, in a fixed order.
output_columns = ["alpha_3_code", "country_or_area", "year", indicator_cfg["indicator_id"]]
df_long = df_long.loc[:, output_columns]

In [None]:
# Hand the long-format frame to the project's Postgres client under the
# indicator id. NOTE(review): insert_indicator is defined outside this
# notebook; presumably it writes the rows to the database -- confirm.
async with AsyncPGClient() as pg_client:
    await pg_client.insert_indicator(indicator_cfg["indicator_id"], df_long)