# DFx ETL Pipeline

## ilo.org

An ETL pipeline for the [ILOSTAT API](https://ilostat.ilo.org/resources/sdmx-tools/) of the International Labour Organization (ILO).

### Libraries

In [None]:
import pandas as pd
from tqdm import tqdm
from dotenv import load_dotenv

load_dotenv()

from dfpp.sources import ilo_org as source
from dfpp.storage import AzureStorage as Storage

# Extraction window passed to the source as (period_from, period_to).
PERIOD: tuple[str, str] = ("2015-01-01", "2025-12-31")

# Storage backend used in the Load step to publish the transformed datasets.
storage = Storage()

### Extract

In [None]:
# Retrieve the metadata table of the series offered by the source,
# then show its dimensions and a preview of the first rows.
df_metadata = source.get_series_metadata()
print("Shape:", df_metadata.shape)
metadata_preview = df_metadata.head()
display(metadata_preview)

In [None]:
# Keep only the series whose code tokens between the second underscore and
# the final token all belong to source.DISAGGREGATIONS. A series with no such
# tokens passes trivially (the empty set is a subset of anything).
# NOTE(review): the original comment mentions "no classification (NOC)" --
# presumably that marker is part of DISAGGREGATIONS; confirm in the source module.
code_tokens = df_metadata["series_code"].str.split("_")
middle_tokens = code_tokens.str.slice(2, -1)
mask = middle_tokens.apply(
    lambda tokens: set(tokens).issubset(source.DISAGGREGATIONS)
)
df_metadata = df_metadata.loc[mask].reset_index(drop=True)
print("Shape:", df_metadata.shape)
filtered_preview = df_metadata.head()
display(filtered_preview)

In [None]:
# Download every selected series for PERIOD and stack the results into a
# single raw frame, one `series_code` label per row.
data = []
for series_code in tqdm(df_metadata["series_code"]):
    try:
        df = source.get_series_data(
            series_code, period_from=PERIOD[0], period_to=PERIOD[1]
        )
    except Exception as error:
        # Deliberately best-effort: report the failing series and carry on so
        # that a single bad request does not abort the whole extraction.
        print(series_code, error)
        continue
    if df is None:
        # Nothing was returned for this series; there is nothing to append.
        continue
    # Tag the rows so they remain attributable to their series after stacking.
    df["series_code"] = series_code
    data.append(df)
if not data:
    # pd.concat([]) would fail with the opaque "No objects to concatenate";
    # raise the same exception type with a message that explains the cause.
    raise ValueError(
        f"No data retrieved for any of the {len(df_metadata)} selected series "
        f"between {PERIOD[0]} and {PERIOD[1]}; see the errors printed above."
    )
df_raw = pd.concat(data, axis=0, ignore_index=True)
print("Shape:", df_raw.shape)
display(df_raw.head())

### Transform

In [None]:
# Run the source-specific transformation on the raw extract,
# then show the dimensions and a preview of the result.
df_transformed = source.transform(df_raw)
print("Shape:", df_transformed.shape)
transformed_preview = df_transformed.head()
display(transformed_preview)

### Load

In [None]:
# Publish one dataset per series into the folder named after the source.
series_groups = df_transformed.groupby("series_id")
for series_id, df_series in tqdm(series_groups):
    # Attach the series id to the frame as a plain attribute.
    # NOTE(review): presumably publish_dataset reads `name` to label the
    # stored dataset -- confirm against the storage implementation.
    df_series.name = series_id
    storage.publish_dataset(df_series, folder_path=source.SOURCE_NAME)