In [1]:
# Parameters
# Run configuration for this notebook, split into two dicts:
#   source_cfg    - describes the raw source file. The visible cells read "id",
#                   "file_format", "country_name_column" and "country_iso3_column".
#   indicator_cfg - describes the indicator to build. The visible cells read
#                   "indicator_id" only (it becomes the value column name).
# The remaining keys are not read anywhere in the visible part of this notebook.
# NOTE(review): "indicator_id" contains "5.50" while "indicator_name" says
#   "$6.85 a day" - confirm which threshold is intended.
# NOTE(review): "group_name", "column_substring" and "sheet_name" look unrelated
#   to a single-indicator csv source - confirm they are unused for this run.
source_cfg = {
    "id": "CPIA_PHR55",
    "name": "World bank data",
    "url": "https://api.worldbank.org/v2/en/indicator/SI.POV.UMIC?downloadformat=csv",
    "frequency": "Daily",
    "source_type": "Auto",
    "save_as": "CPIA_PHR55.csv",
    "file_format": "csv",
    "downloader_function": "cpia_downloader",
    "country_iso3_column": "Country Code",
    "country_name_column": "Country Name",
    "datetime_column": None,
    "year": None,
    "group_column": None,
    "downloader_params": {"file": "API_SI.POV.UMIC_DS2_en_csv_v2_4538781.csv"},
    "file_name": None,
}
indicator_cfg = {
    "indicator_id": "povertyheadcountratioat5.50_cpiaphr55",
    "indicator_name": "Poverty headcount ratio at $6.85 a day (2017 PPP) (% of population)",
    "display_name": "Poverty headcount ratio at $6.85 a day (2017 PPP) (% of population)",
    "source_id": "CPIA_PHR55",
    "data_type": "float",
    "frequency": "Daily",
    "aggregate_type": "PositiveSum",
    "preprocessing": "cpia_transform_preprocessing",
    "source_field_name": "",
    "transform_function": "type1_transform",
    "group_name": "Female",
    "year": 2021,
    "value_column": "",
    "column_substring": "eys_",
    "sheet_name": "WBentp1",
    "denominator_indicator_id": "totalpopulation_untp",
    "per_capita": 100.0,
    "min_year": None,
    "filter_sex_column": None,
    "filter_frequency_column": None,
    "filter_age_column": None,
    "filter_ste_column": None,
    "filter_eco_column": None,
    "filter_value_column": None,
    "value_col": None,
    "divisor": None,
    "dividend": None,
}

In [2]:
# Show the current working directory (IPython automagic).
# NOTE(review): the saved output embeds an absolute local path - consider clearing it before sharing.
pwd

'/Users/mykhailoslukvin/repo/dv-data-pipeline/dfpp/transformation/source_notebooks/CPIA_PHR55/indicator_execution'

In [3]:
# Move five directory levels up from the current working directory.
# Presumably this puts the repository root on the path so `dfpp` imports resolve - confirm.
# The hop is relative, so re-running this cell climbs a further five levels.
cd ../../../../../

/Users/mykhailoslukvin/repo/dv-data-pipeline


  self.shell.db['dhist'] = compress_dhist(dhist)[-100:]


In [4]:
import os
import io
import pandas as pd
import numpy as np
import re

from dfpp.storage import StorageManager
import country_converter as coco

# Single converter instance, reused further down to map country names to ISO3 codes.
cc = coco.CountryConverter()

In [50]:
async with StorageManager() as storage_manager:
    source_file_name = os.path.join(
        storage_manager.sources_path,
        f"{source_cfg['id'].upper()}.{source_cfg['file_format']}",
    )

    data = await storage_manager.read_blob(path=source_file_name)

In [None]:
# Parse the downloaded bytes as CSV. header=2 takes the column names from the
# third line of the file and ignores the two lines above it.
df = pd.read_csv(io.BytesIO(data), header=2)

In [None]:
# Inspect the parsed column labels.
df.columns

Index(['Country Name', 'Country Code', 'Indicator Name', 'Indicator Code',
       '1960', '1961', '1962', '1963', '1964', '1965', '1966', '1967', '1968',
       '1969', '1970', '1971', '1972', '1973', '1974', '1975', '1976', '1977',
       '1978', '1979', '1980', '1981', '1982', '1983', '1984', '1985', '1986',
       '1987', '1988', '1989', '1990', '1991', '1992', '1993', '1994', '1995',
       '1996', '1997', '1998', '1999', '2000', '2001', '2002', '2003', '2004',
       '2005', '2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013',
       '2014', '2015', '2016', '2017', '2018', '2019', '2020', '2021', '2022',
       '2023', 'Unnamed: 68'],
      dtype='object')

In [None]:
# Work on an independent (deep) copy so the parsed frame `df` stays untouched.
source_df = df.copy(deep=True)

In [None]:
# Turn the ".." placeholder into a real missing value (NaN).
source_df = source_df.replace("..", np.nan)

In [None]:
# Look up which source column holds the ISO3 country code.
source_cfg["country_iso3_column"]

'Country Code'

In [None]:
# Map the source-specific country columns onto the names used by the rest of
# this notebook: "Country" and "Alpha-3 code".
country_column_aliases = {
    source_cfg["country_name_column"]: "Country",
    source_cfg["country_iso3_column"]: "Alpha-3 code",
}
source_df = source_df.rename(columns=country_column_aliases)

In [None]:
# Summarise the dtype and non-null count of every column.
source_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 266 entries, 0 to 265
Data columns (total 69 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Country         266 non-null    object 
 1   Alpha-3 code    266 non-null    object 
 2   Indicator Name  266 non-null    object 
 3   Indicator Code  266 non-null    object 
 4   1960            0 non-null      float64
 5   1961            0 non-null      float64
 6   1962            0 non-null      float64
 7   1963            1 non-null      float64
 8   1964            1 non-null      float64
 9   1965            1 non-null      float64
 10  1966            1 non-null      float64
 11  1967            1 non-null      float64
 12  1968            2 non-null      float64
 13  1969            2 non-null      float64
 14  1970            3 non-null      float64
 15  1971            3 non-null      float64
 16  1972            2 non-null      float64
 17  1973            3 non-null      flo

In [None]:
# Derive an ISO3 code from the country name so it can be compared with the
# "Alpha-3 code" column shipped in the source file. Names the converter cannot
# resolve (the regional / income aggregates listed in the output) come back as
# "not found".
source_df["iso3_cc"] = cc.pandas_convert(series=source_df["Country"], to="ISO3")

Africa Eastern and Southern not found in regex
Africa Western and Central not found in regex


Arab World not found in regex
Central Europe and the Baltics not found in regex
Channel Islands not found in regex
Caribbean small states not found in regex
East Asia & Pacific ( not found in regex
Early-demographic dividend not found in regex
East Asia & Pacific not found in regex
Europe & Central Asia ( not found in regex
Europe & Central Asia not found in regex
Euro area not found in regex
European Union not found in regex
Fragile and conflict affected situations not found in regex
High income not found in regex
Heavily indebted poor countries (HIPC) not found in regex
IBRD only not found in regex
IDA & IBRD total not found in regex
IDA total not found in regex
IDA blend not found in regex
IDA only not found in regex
Not classified not found in regex
Latin America & Caribbean ( not found in regex
Latin America & Caribbean not found in regex
Least developed countries: UN classification not found in regex
Low income not found in regex
Lower middle income not found in regex
Low & middl

In [None]:
# Rows whose converter-derived code differs from the code shipped in the source.
source_df.loc[source_df["iso3_cc"].ne(source_df["Alpha-3 code"])]

Unnamed: 0,Country,Alpha-3 code,Indicator Name,Indicator Code,1960,1961,1962,1963,1964,1965,...,2016,2017,2018,2019,2020,2021,2022,2023,Unnamed: 68,iso3_cc
1,Africa Eastern and Southern,AFE,Poverty headcount ratio at $6.85 a day (2017 P...,SI.POV.UMIC,,,,,,,...,,,,,,,,,,not found
3,Africa Western and Central,AFW,Poverty headcount ratio at $6.85 a day (2017 P...,SI.POV.UMIC,,,,,,,...,,,,,,,,,,not found
7,Arab World,ARB,Poverty headcount ratio at $6.85 a day (2017 P...,SI.POV.UMIC,,,,,,,...,,,,,,,,,,not found
36,Central Europe and the Baltics,CEB,Poverty headcount ratio at $6.85 a day (2017 P...,SI.POV.UMIC,,,,,,,...,,,,,,,,,,not found
38,Channel Islands,CHI,Poverty headcount ratio at $6.85 a day (2017 P...,SI.POV.UMIC,,,,,,,...,,,,,,,,,,not found
49,Caribbean small states,CSS,Poverty headcount ratio at $6.85 a day (2017 P...,SI.POV.UMIC,,,,,,,...,,,,,,,,,,not found
61,East Asia & Pacific (excluding high income),EAP,Poverty headcount ratio at $6.85 a day (2017 P...,SI.POV.UMIC,,,,,,,...,,,,,,,,,,not found
62,Early-demographic dividend,EAR,Poverty headcount ratio at $6.85 a day (2017 P...,SI.POV.UMIC,,,,,,,...,,,,,,,,,,not found
63,East Asia & Pacific,EAS,Poverty headcount ratio at $6.85 a day (2017 P...,SI.POV.UMIC,,,,,,,...,40.3,38.1,34.9,32.3,32.5,29.6,29.2,,,not found
64,Europe & Central Asia (excluding high income),ECA,Poverty headcount ratio at $6.85 a day (2017 P...,SI.POV.UMIC,,,,,,,...,,,,,,,,,,not found


In [None]:
# Sanity check: the file must describe exactly one indicator (one code, one name).
assert source_df["Indicator Code"].nunique() == 1
assert source_df["Indicator Name"].nunique() == 1

In [None]:
# Keep the numeric columns whose label contains a run of four digits, i.e. the
# per-year value columns.
indicator_columns_to_select = list(
    filter(re.compile(r"\d{4}").search, source_df.select_dtypes("number").columns)
)

In [None]:
# Show the selected year columns.
indicator_columns_to_select

['1960',
 '1961',
 '1962',
 '1963',
 '1964',
 '1965',
 '1966',
 '1967',
 '1968',
 '1969',
 '1970',
 '1971',
 '1972',
 '1973',
 '1974',
 '1975',
 '1976',
 '1977',
 '1978',
 '1979',
 '1980',
 '1981',
 '1982',
 '1983',
 '1984',
 '1985',
 '1986',
 '1987',
 '1988',
 '1989',
 '1990',
 '1991',
 '1992',
 '1993',
 '1994',
 '1995',
 '1996',
 '1997',
 '1998',
 '1999',
 '2000',
 '2001',
 '2002',
 '2003',
 '2004',
 '2005',
 '2006',
 '2007',
 '2008',
 '2009',
 '2010',
 '2011',
 '2012',
 '2013',
 '2014',
 '2015',
 '2016',
 '2017',
 '2018',
 '2019',
 '2020',
 '2021',
 '2022',
 '2023']

In [100]:
# Reshape wide -> long: one row per (country, year), with the value stored in a
# column named after the indicator id.
long_df = pd.melt(
    source_df,
    id_vars=["Alpha-3 code", "Country"],
    value_vars=indicator_columns_to_select,
    var_name="year",
    value_name=indicator_cfg["indicator_id"],
)

In [69]:
@StorageManager.with_storage_manager
async def _get_population_data(storage_manager):
    """Load the population utility CSV and reshape it to long format.

    Declared ``async`` because the body awaits ``storage_manager.read_blob``;
    ``await`` inside a plain ``def`` is a SyntaxError, so the function could
    not be defined before. Callers must ``await`` the result.

    NOTE(review): confirm ``StorageManager.with_storage_manager`` supports
    coroutine functions and supplies the ``storage_manager`` argument.

    Args:
        storage_manager: object exposing ``utilities_path`` and an awaitable
            ``read_blob(path=...)`` that returns the file content as bytes.

    Returns:
        pandas.DataFrame in long format with the columns "Alpha-3 code",
        "variable" (original column label), "totalpopulation_untp_year"
        (the melted values), "totalpopulation_untp" (first run of non-digit
        characters in the label) and "year" (first run of digits in the
        label, kept as a string).

    NOTE(review): the column naming looks inverted - the numeric values land
    in "totalpopulation_untp_year" while "totalpopulation_untp" receives the
    text prefix of the label; confirm this is intended.
    """
    population_path = os.path.join(storage_manager.utilities_path, "population.csv")
    population_bytes = await storage_manager.read_blob(path=population_path)
    population_df = pd.read_csv(io.BytesIO(population_bytes))

    # Pass column labels explicitly rather than a whole DataFrame to value_vars.
    numeric_columns = population_df.select_dtypes("number").columns
    long_population = population_df.melt(
        id_vars=["Alpha-3 code"],
        value_vars=numeric_columns,
        value_name="totalpopulation_untp_year",
    )
    long_population["totalpopulation_untp"] = long_population["variable"].str.extract(
        r"(\D+)"
    )
    # "year" stays a string here; cast it before joining on an integer year column.
    long_population["year"] = long_population["variable"].str.extract(r"(\d+)")
    return long_population

In [79]:
# Preview the long-format frame (one row per country code and year).
long_df

Unnamed: 0,Alpha-3 code,Country,year,povertyheadcountratioat5.50_cpiaphr55
0,ABW,Aruba,1960,
1,AFE,Africa Eastern and Southern,1960,
2,AFG,Afghanistan,1960,
3,AFW,Africa Western and Central,1960,
4,AGO,Angola,1960,
...,...,...,...,...
17019,XKX,Kosovo,2023,
17020,YEM,"Yemen, Rep.",2023,
17021,ZAF,South Africa,2023,
17022,ZMB,Zambia,2023,


In [156]:
# The melted "year" values are column labels (strings); store them as integers.
long_df = long_df.astype({"year": int})

In [197]:
from dfpp.transformation.interpolation import calculate_gappiness_index, interpolate_data

In [201]:
# Compute the project's "gappiness" summary for the indicator column.
# NOTE(review): the metric is defined in dfpp.transformation.interpolation (not
# visible here); the displayed result has the columns "Alpha-3 code",
# gappiness_index, observed_years, missing_years, year_min and year_max.
gappiness_df = calculate_gappiness_index(long_df, indicator_cfg["indicator_id"])


In [205]:
# Preview the first ten rows of the gappiness table.
gappiness_df.head(10)

Unnamed: 0,Alpha-3 code,gappiness_index,observed_years,missing_years,year_min,year_max
0,ABW,,0.0,,,
1,AFE,,0.0,,,
2,AFG,,0.0,,,
3,AFW,,0.0,,,
4,AGO,2.6924,3.0,16.0,2000.0,2018.0
5,ALB,1.186086,12.0,13.0,1996.0,2020.0
6,AND,,0.0,,,
7,ARB,,0.0,,,
8,ARE,1.739625,2.0,4.0,2013.0,2018.0
9,ARG,0.499457,34.0,9.0,1980.0,2022.0


In [202]:
# Summary statistics for the numeric gappiness columns.
gappiness_df.describe()

Unnamed: 0,gappiness_index,observed_years,missing_years,year_min,year_max
count,182.0,266.0,182.0,182.0,182.0
mean,1.14796,9.819549,10.43956,1994.478022,2018.269231
std,0.973538,12.332705,9.066615,10.973955,4.814692
min,0.0,0.0,0.0,1963.0,1992.0
25%,0.01245,0.0,0.25,1986.0,2017.0
50%,1.111542,4.5,9.0,1994.0,2020.0
75%,2.026473,18.0,18.0,2003.0,2021.0
max,3.306734,59.0,31.0,2022.0,2023.0


In [222]:
# Fill gaps in the indicator column with the project's interpolate_data helper;
# the result is kept under a separate name so long_df remains available.
# NOTE(review): the fill method is defined in dfpp.transformation.interpolation
# and is not visible here - confirm how leading/trailing gaps are treated.
interpolated_long_df = interpolate_data(long_df, indicator_cfg["indicator_id"])

  group[indicator_id] = group[indicator_id].fillna(


In [234]:
# Inspect the last 20 rows of the interpolated frame.
interpolated_long_df.tail(20)

Unnamed: 0,Alpha-3 code,Country,year,povertyheadcountratioat5.50_cpiaphr55
17004,ZWE,Zimbabwe,2004,
17005,ZWE,Zimbabwe,2005,
17006,ZWE,Zimbabwe,2006,
17007,ZWE,Zimbabwe,2007,
17008,ZWE,Zimbabwe,2008,
17009,ZWE,Zimbabwe,2009,
17010,ZWE,Zimbabwe,2010,
17011,ZWE,Zimbabwe,2011,77.9
17012,ZWE,Zimbabwe,2012,77.9
17013,ZWE,Zimbabwe,2013,77.9
