##### Imports

In [3]:
#| label: imports
import requests
import pandas as pd
import os.path

from IPython.display import display

##### Data fetching functions

In [4]:
def get_from_url(url: str,
                 filename: str,
                 force_csv: bool = False,
                 verbose: bool = False,
                 timeout: float = 60.0,
                 **_) -> pd.DataFrame:
    """Download a remote csv, store it locally, and parse it with pandas.

    The response is streamed into a temporary ``<filename>.part`` file that is
    moved over ``filename`` only once the download has completed, so a failed
    or interrupted request never leaves a truncated csv behind (and never
    clobbers a previously downloaded copy).

    Args:
        url (str): The data's url.
        filename (str): The filename where the data will be written. Should be a .csv file.
        force_csv (bool): If True, make sure the url carries a ``format=csvfile``
            query parameter, appending it with the proper ``?``/``&`` separator
            when it is missing.
        verbose (bool): If True, print the url before fetching it.
        timeout (float): Timeout in seconds handed to ``requests.get``; without
            one the call could block forever on an unresponsive server.
        **_: Extra keyword arguments are accepted and ignored, so dataset
            description dicts can be splatted into this function.
    Returns:
        pd.DataFrame: The data inside a dataframe.
    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    if force_csv and "format=csvfile" not in url:
        # The first query parameter is introduced by "?", later ones by "&".
        url += ("&" if "?" in url else "?") + "format=csvfile"

    # headers = {"User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:136.0) Gecko/20100101 Firefox/136.0"}

    if verbose:
        print("fetching", url, "...")

    partial_path = f"{filename}.part"
    try:
        with requests.get(url, stream=True, timeout=timeout) as response:
            response.raise_for_status()
            with open(partial_path, "wb") as csv:
                for chunk in response.iter_content(4096):
                    csv.write(chunk)
        # Only a complete download is allowed to replace the target file.
        os.replace(partial_path, filename)
    finally:
        # Still present only when the download failed part-way through.
        if os.path.exists(partial_path):
            os.remove(partial_path)
    return pd.read_csv(filename)

def get_data(url: str, filename: str, force_refresh: bool=False, verbose: bool =False, **kwargs) -> pd.DataFrame:
    """Return a dataset as a dataframe, using the local copy in DATA/ when present.

    Args:
        url (str): The data's url, fetched only when no local copy is used.
        filename (str): Name of the csv file inside the DATA/ directory.
        force_refresh (bool): If True, download again even if a local copy exists.
        verbose (bool): If True, print what is being loaded.
        **kwargs: Forwarded to get_from_url when a download happens.
    Returns:
        pd.DataFrame: The data inside a dataframe.
    """
    # The cache lives under DATA/, so the existence check must look at the
    # very same path that is written by the download and read back below.
    local_path = "DATA/" + filename
    if verbose: print("getting dataset:", url, "file:", filename)
    if force_refresh or not os.path.isfile(local_path):
        # Make sure the target directory exists before anything is written to it.
        os.makedirs(os.path.dirname(local_path), exist_ok=True)
        return get_from_url(url, local_path, verbose=verbose, **kwargs)
    return pd.read_csv(local_path)



##### Datasets

In [5]:

# Descriptions of the remote datasets used in this notebook. Each entry is
# unpacked into get_data(**entry): "url", "filename" and "force_csv" are
# consumed by the fetching functions, while "name" is a human-readable label
# that they accept and ignore. The urls already request format=csvfile
# explicitly, and force_csv is left off.
DATASETS = [
    {
        "name": "Gender wage gap",
        "url": (
            "https://sdmx.oecd.org/public/rest/data/"
            "OECD.WISE.WDP,DSD_HSL@DF_HSL_CWB,1.1/.2_2...._T./all"
            "?startPeriod=2010&dimensionAtObservation=AllDimensions&format=csvfile"
        ),
        "force_csv": False,
        "filename": "gender_wage_gap.csv",
    },
    {
        "name": "Productivity levels",
        "url": (
            "https://sdmx.oecd.org/public/rest/data/"
            "OECD.SDD.TPS,DSD_PDB@DF_PDB_LV,1.0/.A......."
            "?startPeriod=2020&dimensionAtObservation=AllDimensions&format=csvfile"
        ),
        "force_csv": False,
        "filename": "productivity_levels.csv",
    },
]

In [49]:
#| label: tables
#| layout-ncol: 1
# First dataset: drop the listed columns and rename the observation column
# after the quantity it holds.
GENDER_WAGE_GAP = (
    get_data(**DATASETS[0])
    .drop(columns=[
        'DATAFLOW', 'MEASURE', 'AGE', 'SEX', 'DOMAIN', 'UNIT_MULT', 'DECIMALS',
    ])
    .rename(columns={'OBS_VALUE': 'GENDER_WAGE_GAP'})
)

# Second dataset: keep only the rows whose MEASURE is 'GDPPOP' and rename
# their observation column accordingly.
PRODUCTIVITY = get_data(**DATASETS[1]).drop(columns=['DATAFLOW'])
GDPPOP = (
    PRODUCTIVITY.loc[PRODUCTIVITY['MEASURE'] == 'GDPPOP']
    .rename(columns={'OBS_VALUE': 'GDPPOP'})
)

# Show the key columns of both tables; the GDPPOP rows are ordered by unit.
display(GENDER_WAGE_GAP[['TIME_PERIOD', 'REF_AREA', 'UNIT_MEASURE', 'GENDER_WAGE_GAP']])
display(
    GDPPOP[['TIME_PERIOD', 'REF_AREA', 'UNIT_MEASURE', 'GDPPOP']]
    .sort_values('UNIT_MEASURE')
)


Unnamed: 0,TIME_PERIOD,REF_AREA,UNIT_MEASURE,GENDER_WAGE_GAP
0,2010,AUS,PT_WG_SAL_M_D,14.042934
1,2011,AUS,PT_WG_SAL_M_D,15.966387
2,2012,AUS,PT_WG_SAL_M_D,13.750000
3,2013,AUS,PT_WG_SAL_M_D,18.000000
4,2014,AUS,PT_WG_SAL_M_D,15.773354
...,...,...,...,...
449,2018,ROU,PT_WG_SAL_M_D,3.496503
450,2019,ROU,PT_WG_SAL_M_D,5.855397
451,2020,ROU,PT_WG_SAL_M_D,3.304181
452,2021,ROU,PT_WG_SAL_M_D,5.755499


Unnamed: 0,TIME_PERIOD,REF_AREA,UNIT_MEASURE,GDPPOP
155,2023,SVN,USD_PPP_PS,5.395006e+04
140,2022,NLD,USD_PPP_PS,7.715212e+04
141,2020,NOR,USD_PPP_PS,6.711653e+04
142,2021,NOR,USD_PPP_PS,8.898941e+04
143,2022,NOR,USD_PPP_PS,1.231530e+05
...,...,...,...,...
2668,2024,DNK,XDC_PS,4.897060e+05
2667,2023,DNK,XDC_PS,4.687182e+05
2666,2023,DNK,XDC_PS,4.209611e+05
2679,2022,ISL,XDC_PS,9.215931e+06


In [50]:
# Pair every wage-gap observation with the GDPPOP rows of the same year and
# country; rows without a counterpart on the other side are discarded.
# NOTE(review): in the displayed result each wage-gap row is repeated once per
# GDPPOP variant (different UNIT_MEASURE_y / PRICE_BASE values for the same
# year and country) -- confirm this fan-out is intended before aggregating.
DATA = pd.merge(
    GENDER_WAGE_GAP,
    GDPPOP,
    how='inner',
    on=['TIME_PERIOD', 'REF_AREA'],
)
display(DATA)

Unnamed: 0,REF_AREA,UNIT_MEASURE_x,EDUCATION_LEV,TIME_PERIOD,GENDER_WAGE_GAP,OBS_STATUS_x,BASE_PER_x,FREQ,MEASURE,ACTIVITY,UNIT_MEASURE_y,PRICE_BASE,TRANSFORMATION,ADJUSTMENT,CONVERSION_TYPE,GDPPOP,OBS_STATUS_y,UNIT_MULT,BASE_PER_y,DECIMALS
0,AUS,PT_WG_SAL_M_D,_T,2020,10.533333,A,_Z,A,GDPPOP,_T,XDC_PS,V,_Z,_Z,_Z,81320.980638,A,0,,2
1,AUS,PT_WG_SAL_M_D,_T,2020,10.533333,A,_Z,A,GDPPOP,_T,USD_PPP_PS,V,_Z,_Z,_Z,56821.622919,A,0,,2
2,AUS,PT_WG_SAL_M_D,_T,2020,10.533333,A,_Z,A,GDPPOP,_T,XDC_PS,Q,_Z,_Z,_Z,81320.980638,A,0,2020.0,2
3,AUS,PT_WG_SAL_M_D,_T,2021,10.470245,A,_Z,A,GDPPOP,_T,XDC_PS,V,_Z,_Z,_Z,90725.778508,A,0,,2
4,AUS,PT_WG_SAL_M_D,_T,2021,10.470245,A,_Z,A,GDPPOP,_T,USD_PPP_PS,V,_Z,_Z,_Z,65002.659136,A,0,,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
398,ROU,PT_WG_SAL_M_D,_T,2021,5.755499,A,_Z,A,GDPPOP,_T,USD_PPP_PS,V,_Z,_Z,_Z,39719.244865,A,0,,2
399,ROU,PT_WG_SAL_M_D,_T,2021,5.755499,A,_Z,A,GDPPOP,_T,XDC_PS,Q,_Z,_Z,_Z,58982.373959,A,0,2020.0,2
400,ROU,PT_WG_SAL_M_D,_T,2022,13.636025,A,_Z,A,GDPPOP,_T,XDC_PS,V,_Z,_Z,_Z,72926.710087,A,0,,2
401,ROU,PT_WG_SAL_M_D,_T,2022,13.636025,A,_Z,A,GDPPOP,_T,USD_PPP_PS,V,_Z,_Z,_Z,44292.125784,A,0,,2


In [47]:
# Distinct units and base periods that occur among the GDPPOP rows
# (first occurrence of each value, original index labels kept).
GDP_UNITS = GDPPOP.loc[:, 'UNIT_MEASURE'].drop_duplicates()
GDP_BASE_PERIODS = GDPPOP.loc[:, 'BASE_PER'].drop_duplicates()
display(GDP_UNITS, GDP_BASE_PERIODS)

0         XDC_PS
87    USD_PPP_PS
Name: UNIT_MEASURE, dtype: str

0          NaN
2654    2020.0
Name: BASE_PER, dtype: float64