In [25]:
import os
import io
import pandas as pd
import numpy as np
import re

from dfpp.storage import StorageManager
from dfpp.geo_utils import change_iso3_to_system_region_iso3
from dfpp.eda import print_essential_data_descriptives
import country_converter as coco

# Single converter instance, reused below to derive ISO3 codes from country names.
cc = coco.CountryConverter()

In [5]:
async with StorageManager() as storage_manager:
    source_file_name = os.path.join(
        storage_manager.sources_path,
        f"{source_cfg['id'].upper()}.{source_cfg['file_format']}",
    )

    data = await storage_manager.read_blob(path=source_file_name)

In [6]:
# Parse the downloaded bytes as CSV, decoding the file as latin1.
df = pd.read_csv(io.BytesIO(data), encoding="latin1")

In [7]:
# Count how many columns there are of each dtype.
df.dtypes.value_counts()

float64    1072
object        4
Name: count, dtype: int64

In [8]:
# Preview the first rows of the raw frame.
df.head()

Unnamed: 0,iso3,country,hdicode,region,hdi_rank_2022,hdi_1990,hdi_1991,hdi_1992,hdi_1993,hdi_1994,...,pop_total_2013,pop_total_2014,pop_total_2015,pop_total_2016,pop_total_2017,pop_total_2018,pop_total_2019,pop_total_2020,pop_total_2021,pop_total_2022
0,AFG,Afghanistan,Low,SA,182.0,0.284,0.292,0.299,0.307,0.3,...,31.541208,32.71621,33.753499,34.636207,35.643417,36.686784,37.769498,38.972231,40.099462,41.128771
1,ALB,Albania,High,ECA,74.0,0.649,0.632,0.616,0.618,0.623,...,2.887014,2.884102,2.88248,2.881063,2.879355,2.877013,2.873883,2.866849,2.85471,2.842321
2,DZA,Algeria,High,AS,93.0,0.593,0.596,0.601,0.602,0.603,...,38.000627,38.760168,39.543154,40.339329,41.136546,41.927007,42.705368,43.451666,44.177968,44.903225
3,AND,Andorra,Very High,,35.0,,,,,,...,0.071366,0.071622,0.071746,0.07254,0.073836,0.075013,0.076343,0.0777,0.079034,0.079824
4,AGO,Angola,Medium,SSA,150.0,,,,,,...,26.147002,27.128336,28.127721,29.154746,30.208628,31.273533,32.353588,33.428486,34.503774,35.588987


In [9]:
# Print an overview of the raw frame using the dfpp.eda helper (see output below).
print_essential_data_descriptives(df)

The dataframe has 206 rows and 1076 columns

 Names and data types for each column:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 206 entries, 0 to 205
Columns: 1076 entries, iso3 to pop_total_2022
dtypes: float64(1072), object(4)
memory usage: 1.7+ MB
None

 Unique value counts for each column:
iso3                      206
le_m_2016                 206
le_m_2020                 206
le_m_2021                 206
country                   206
                         ... 
gdi_1993                   64
rankdiff_hdi_phdi_2022     52
region                      6
gdi_group_2022              5
hdicode                     4
Length: 1076, dtype: int64

 Missing value counts for each column:
gdi_1990          130
hdi_f_1990        130
hdi_m_1990        130
hdi_f_1992        129
hdi_f_1991        129
                 ... 
le_m_2020           0
le_m_2019           0
le_m_2018           0
le_m_2017           0
pop_total_2022      0
Length: 1076, dtype: int64


In [10]:
# Work on a copy: `df` receives a diagnostic `cc_iso3` column below that should
# not end up in the frame being transformed.
source_df = df.copy()

In [11]:
# Derive an ISO3 code from each country name so it can be compared with the
# file's own `iso3` column. Names the converter cannot match are reported as
# "not found in regex" in the output below.
df["cc_iso3"] = cc.pandas_convert(series=df["country"], to="ISO3")

Very high human development not found in regex
High human development not found in regex
Medium human development not found in regex
Low human development not found in regex
Arab States not found in regex
East Asia and the Pacific not found in regex
Europe and Central Asia not found in regex
Latin America and the Caribbean not found in regex
South Asia not found in regex
Sub-Saharan Africa not found in regex
World not found in regex


In [12]:
# Rows whose name-derived ISO3 code differs from the ISO3 code shipped in the file.
iso3_mismatch = df["cc_iso3"].ne(df["iso3"])
df.loc[iso3_mismatch, ["iso3", "country"]].to_dict(orient="records")

[{'iso3': 'ZZA.VHHD', 'country': 'Very high human development'},
 {'iso3': 'ZZB.HHD', 'country': 'High human development'},
 {'iso3': 'ZZC.MHD', 'country': 'Medium human development'},
 {'iso3': 'ZZD.LHD', 'country': 'Low human development'},
 {'iso3': 'ZZE.AS', 'country': 'Arab States'},
 {'iso3': 'ZZF.EAP', 'country': 'East Asia and the Pacific'},
 {'iso3': 'ZZG.ECA', 'country': 'Europe and Central Asia'},
 {'iso3': 'ZZH.LAC', 'country': 'Latin America and the Caribbean'},
 {'iso3': 'ZZI.SA', 'country': 'South Asia'},
 {'iso3': 'ZZJ.SSA', 'country': 'Sub-Saharan Africa'},
 {'iso3': 'ZZK.WORLD', 'country': 'World'}]

In [13]:
# Replacement codes for the aggregate rows, keyed by the codes used in the file.
# Codes that are not listed here are kept as they are.
rows_to_change_mapping = {
    "ZZA.VHHD": "VHHD",
    "ZZB.HHD": "HHD",
    "ZZC.MHD": "MHD",
    "ZZD.LHD": "LHD",
    "ZZE.AS": "UNDP_AS",
    "ZZF.EAP": "UNDP_EAP",
    "ZZG.ECA": "UNDP_ECA",
    "ZZH.LAC": "UNDP_LAC",
    "ZZI.SA": "UNDP_SA",
    "ZZJ.SSA": "UNDP_SSA",
    "ZZK.WORLD": "WLD",
}

# Rename the code column, then rewrite the aggregate codes directly in that column.
source_df = source_df.rename(columns={"iso3": "Alpha-3 code"})
source_df["Alpha-3 code"] = [
    rows_to_change_mapping.get(code, code) for code in source_df["Alpha-3 code"]
]

In [14]:
# Standardise column names: the country name column, plus whichever column
# source_cfg names as the ISO3 column.
# A key that is not an existing column label is ignored by `rename`.
column_renames = {"country": "Country"}
column_renames[source_cfg.get("country_iso3_column")] = "Alpha-3 code"
source_df = source_df.rename(columns=column_renames)

In [15]:
# "Country or Area" is filled with the same values as "Alpha-3 code".
# NOTE(review): presumably a downstream step expects this column name; confirm.
source_df["Country or Area"] = source_df["Alpha-3 code"]

In [16]:
# Shape before the missing-value cleanup.
source_df.shape

(206, 1077)

In [17]:
# Treat the ".." placeholder as missing, then drop columns that contain no data at all.
# `np.nan` is used because the `np.NaN` alias was removed in NumPy 2.0.
source_df = source_df.replace("..", np.nan).dropna(axis=1, how="all")

In [18]:
# Shape after the missing-value cleanup.
source_df.shape

(206, 1077)

In [19]:
# Preview the cleaned frame with the renamed columns.
source_df.head()

Unnamed: 0,Alpha-3 code,Country,hdicode,region,hdi_rank_2022,hdi_1990,hdi_1991,hdi_1992,hdi_1993,hdi_1994,...,pop_total_2014,pop_total_2015,pop_total_2016,pop_total_2017,pop_total_2018,pop_total_2019,pop_total_2020,pop_total_2021,pop_total_2022,Country or Area
0,AFG,Afghanistan,Low,SA,182.0,0.284,0.292,0.299,0.307,0.3,...,32.71621,33.753499,34.636207,35.643417,36.686784,37.769498,38.972231,40.099462,41.128771,AFG
1,ALB,Albania,High,ECA,74.0,0.649,0.632,0.616,0.618,0.623,...,2.884102,2.88248,2.881063,2.879355,2.877013,2.873883,2.866849,2.85471,2.842321,ALB
2,DZA,Algeria,High,AS,93.0,0.593,0.596,0.601,0.602,0.603,...,38.760168,39.543154,40.339329,41.136546,41.927007,42.705368,43.451666,44.177968,44.903225,DZA
3,AND,Andorra,Very High,,35.0,,,,,,...,0.071622,0.071746,0.07254,0.073836,0.075013,0.076343,0.0777,0.079034,0.079824,AND
4,AGO,Angola,Medium,SSA,150.0,,,,,,...,27.128336,28.127721,29.154746,30.208628,31.273533,32.353588,33.428486,34.503774,35.588987,AGO


In [29]:
# Prefix identifying this indicator's columns, e.g. "<prefix><year>".
# NOTE(review): `indicator_cfg` is not defined in any visible cell; presumably it is
# injected as a notebook parameter. Confirm.
column_substring = indicator_cfg["column_substring"]

# Keep a column only when removing every four-digit run from its label leaves
# exactly the indicator prefix.
four_digits = re.compile(r"\d{4}")
columns_to_select = [
    label
    for label in source_df.columns
    if four_digits.sub("", label) == column_substring
]

In [41]:
# Show the prefix used to select the indicator columns.
column_substring 

'pr_m_'

In [42]:

# Identifier columns repeated on every row of the long table.
id_columns = ["Alpha-3 code", "Country or Area", "Country"]

# Narrow the frame to the identifiers plus this indicator's columns.
to_reshape = source_df[id_columns + columns_to_select]

# Wide -> long: one row per (entity, original column label). The original label
# goes to "indicator_year"; the value goes to a column named after the indicator id.
numeric_columns = to_reshape.select_dtypes("number").columns
long_df = to_reshape.melt(
    id_vars=id_columns,
    value_vars=numeric_columns,
    var_name="indicator_year",
    value_name=indicator_cfg["indicator_id"],
)

In [43]:
# Preview the long-format frame.
long_df.head()


Unnamed: 0,Alpha-3 code,Country or Area,Country,indicator_year,share_of_seats_in_parliament_male_held_by_men_hdr
0,AFG,AFG,Afghanistan,pr_m_1990,
1,ALB,ALB,Albania,pr_m_1990,
2,DZA,DZA,Algeria,pr_m_1990,96.842105
3,AND,AND,Andorra,pr_m_1990,92.857143
4,AGO,AGO,Angola,pr_m_1990,90.454545


In [44]:

# Split each "<prefix><year>" label into its two parts.
# "value" holds the label with the first four-digit run removed (the indicator prefix).
long_df["value"] = long_df["indicator_year"].str.replace(r"\d{4}", "", n=1, regex=True)

# "year" holds the first four-digit run, as a string. The pattern is a raw string so
# `\d` is not an invalid escape sequence; expand=False returns a Series.
long_df["year"] = long_df["indicator_year"].str.extract(r"(\d{4})", expand=False)

In [45]:
# Preview the frame with the derived "value" and "year" columns.
long_df.head()

Unnamed: 0,Alpha-3 code,Country or Area,Country,indicator_year,share_of_seats_in_parliament_male_held_by_men_hdr,value,year
0,AFG,AFG,Afghanistan,pr_m_1990,,pr_m_,1990
1,ALB,ALB,Albania,pr_m_1990,,pr_m_,1990
2,DZA,DZA,Algeria,pr_m_1990,96.842105,pr_m_,1990
3,AND,AND,Andorra,pr_m_1990,92.857143,pr_m_,1990
4,AGO,AGO,Angola,pr_m_1990,90.454545,pr_m_,1990


In [47]:
# Sanity check: every melted column must share one and the same prefix.
assert long_df["value"].nunique() == 1

In [48]:
# Disabled check kept for reference:
# assert long_df["year"].value_counts().value_counts().shape[0] == 1
indicator_column = indicator_cfg["indicator_id"]

# Number of non-null indicator values per "Country or Area".
country_aggregate = long_df.groupby("Country or Area").agg(
    indicator_values=(indicator_column, "count")
)

# Disabled check kept for reference:
# are_missing_values_per_country = country_aggregate["indicator_values"].value_counts().value_counts().shape[0] > 1
# assert are_missing_values_per_country == False
# TBD: interpolate missing values per country/region before publishing

# Display the columns that make up the final long table.
long_df[["Alpha-3 code", "year", indicator_column]]

Unnamed: 0,Alpha-3 code,year,share_of_seats_in_parliament_male_held_by_men_hdr
0,AFG,1990,
1,ALB,1990,
2,DZA,1990,96.842105
3,AND,1990,92.857143
4,AGO,1990,90.454545
...,...,...,...
6793,UNDP_ECA,2022,73.987488
6794,UNDP_LAC,2022,65.899166
6795,UNDP_SA,2022,82.069920
6796,UNDP_SSA,2022,73.639784
