In [71]:
import requests
import xml.etree.ElementTree as ET # Library for parsing the XML data we get from the OECD API
import pandas as pd


# API URL with filters (countries selected).
# The query asks the OECD SDMX REST endpoint for the GENDER_WAGE_GAP dataflow,
# limited to 2005-2023, with every dimension reported at observation level
# (dimensionAtObservation=AllDimensions).
url = "https://sdmx.oecd.org/public/rest/data/OECD.ELS.SAE,DSD_EARNINGS@GENDER_WAGE_GAP,1.0/NOR+NZL+BRA+COL+JPN+GRC+DEU+FRA+USA......_T?startPeriod=2005&endPeriod=2023&dimensionAtObservation=AllDimensions"

# Upper bound, in seconds, on how long to wait for the server
REQUEST_TIMEOUT_SECONDS = 30

# Make the API request.
# requests applies no timeout by default, so without one an unresponsive
# server would leave this cell hanging indefinitely; with it the call raises
# requests.exceptions.Timeout instead.
response = requests.get(url, timeout=REQUEST_TIMEOUT_SECONDS)

# Check if the request was successful
if response.status_code == 200:
    print("Data retrieved successfully!")
else:
    print(f"Failed to retrieve data. Status code: {response.status_code}")

Data retrieved successfully!


In [73]:
# Prefix -> URI map for the SDMX-ML 2.1 namespaces; it is handed to
# find()/findall() so paths can use the short 'generic:' / 'message:' prefixes
ns = dict(
    generic='http://www.sdmx.org/resources/sdmxml/schemas/v2_1/data/generic',
    message='http://www.sdmx.org/resources/sdmxml/schemas/v2_1/message',
)

# Build the element tree from the raw bytes of the HTTP response body
root = ET.fromstring(response.content)

In [77]:
# Locate the first DataSet element anywhere below the root; it is the
# container whose Obs children are read later. The 'message' prefix is
# resolved through the ns mapping. The result is None if no DataSet exists.
dataset = root.find(
    './/message:DataSet',
    namespaces=ns,
)

In [79]:
# List to hold the extracted data: one dict per observation, mapping each
# dimension id to its value plus an 'OBS_VALUE' entry (all values are strings)
data = []

# Iterate over each Obs element in the DataSet
for obs in dataset.findall('generic:Obs', ns):
    # Extract every dimension listed in the ObsKey (TIME_PERIOD, REF_AREA,
    # MEASURE, ..., SEX). An Obs without an ObsKey contributes no dimensions
    # rather than raising TypeError from iterating over None.
    # Note: compare against None explicitly; an Element with no children is falsy.
    obs_key = obs.find('generic:ObsKey', ns)
    obs_data = {
        dimension.attrib['id']: dimension.attrib['value']
        for dimension in (obs_key if obs_key is not None else ())
    }

    # Extract the observation value. An ObsValue element that carries no
    # 'value' attribute is treated the same as a missing ObsValue instead of
    # raising KeyError.
    obs_value = obs.find('generic:ObsValue', ns)
    value = obs_value.get('value') if obs_value is not None else None
    if value is None:
        print("ObsValue not found for an observation.")
        continue

    # Append the data (OBS_VALUE goes last so it becomes the final column)
    obs_data['OBS_VALUE'] = value
    data.append(obs_data)

ObsValue not found for an observation.
ObsValue not found for an observation.
ObsValue not found for an observation.
ObsValue not found for an observation.
ObsValue not found for an observation.
ObsValue not found for an observation.
ObsValue not found for an observation.
ObsValue not found for an observation.
ObsValue not found for an observation.
ObsValue not found for an observation.
ObsValue not found for an observation.
ObsValue not found for an observation.


In [83]:
# Report how many observations were collected
print(f"Number of records extracted: {len(data)}")

if not data:
    # Nothing was parsed, so there is nothing to tabulate
    print("No data extracted to save.")
else:
    # Show the first few raw records
    print("Sample data:", data[:5])

    # Build a pandas DataFrame from the list of record dictionaries
    df = pd.DataFrame(data)

    # Preview the top rows of the table to check the data
    display(df.head())

Number of records extracted: 401
Sample data: [{'TIME_PERIOD': '2013', 'REF_AREA': 'NZL', 'MEASURE': 'GWP', 'UNIT_MEASURE': 'PT_WG_SAL_M_D', 'PAY_PERIOD': '_Z', 'PRICE_BASE': '_Z', 'AGGREGATION_OPERATION': 'D1', 'SEX': '_T', 'OBS_VALUE': '2.4064171123'}, {'TIME_PERIOD': '2015', 'REF_AREA': 'COL', 'MEASURE': 'GWP', 'UNIT_MEASURE': 'PT_WG_SAL_M_D', 'PAY_PERIOD': '_Z', 'PRICE_BASE': '_Z', 'AGGREGATION_OPERATION': 'MEDIAN', 'SEX': '_T', 'OBS_VALUE': '14.245359161'}, {'TIME_PERIOD': '2013', 'REF_AREA': 'NOR', 'MEASURE': 'GWP', 'UNIT_MEASURE': 'PT_WG_SAL_M_D', 'PAY_PERIOD': '_Z', 'PRICE_BASE': '_Z', 'AGGREGATION_OPERATION': 'D1', 'SEX': '_T', 'OBS_VALUE': '3.4420289855'}, {'TIME_PERIOD': '2006', 'REF_AREA': 'NOR', 'MEASURE': 'GWP', 'UNIT_MEASURE': 'PT_WG_SAL_M_D', 'PAY_PERIOD': '_Z', 'PRICE_BASE': '_Z', 'AGGREGATION_OPERATION': 'D9', 'SEX': '_T', 'OBS_VALUE': '22.122265385'}, {'TIME_PERIOD': '2007', 'REF_AREA': 'NOR', 'MEASURE': 'GWP', 'UNIT_MEASURE': 'PT_WG_SAL_M_D', 'PAY_PERIOD': '_Z', 'PR

Unnamed: 0,TIME_PERIOD,REF_AREA,MEASURE,UNIT_MEASURE,PAY_PERIOD,PRICE_BASE,AGGREGATION_OPERATION,SEX,OBS_VALUE
0,2013,NZL,GWP,PT_WG_SAL_M_D,_Z,_Z,D1,_T,2.4064171123
1,2015,COL,GWP,PT_WG_SAL_M_D,_Z,_Z,MEDIAN,_T,14.245359161
2,2013,NOR,GWP,PT_WG_SAL_M_D,_Z,_Z,D1,_T,3.4420289855
3,2006,NOR,GWP,PT_WG_SAL_M_D,_Z,_Z,D9,_T,22.122265385
4,2007,NOR,GWP,PT_WG_SAL_M_D,_Z,_Z,D9,_T,21.947004608


In [85]:
# Write the DataFrame to a CSV file in the working directory, leaving out
# the row index
df.to_csv(
    path_or_buf='gender_wage_gap_data.csv',
    index=False,
)

# Tell the user where the file went
print("Data saved to 'gender_wage_gap_data.csv'")

Data saved to 'gender_wage_gap_data.csv'
