# DFx ETL Pipeline

## api.worldbank.org

An ETL pipeline for the [Indicator API](https://datahelpdesk.worldbank.org/knowledgebase/topics/125589-developer-information) from the World Bank.

### Libraries

In [None]:
import pandas as pd
from tqdm import tqdm

from dfpp.sources import worldbank_org as source
from dfpp.storage import LocalStorage as Storage

# Storage backend instance; the Load step below publishes each dataset through it.
storage = Storage()

### Extract

In [None]:
# Fetch the series metadata from the source, requesting 100 entries per page,
# then report its dimensions and preview the first rows.
df_metadata = source.get_series_metadata(per_page=100)
print(f"Shape: {df_metadata.shape}")
display(df_metadata.head())

In [None]:
# Download the data for a small random sample of series and stack the results
# into one raw frame. Downloading is best-effort: a series that fails is
# reported and skipped so that one bad series does not abort the whole run.

# Series.sample raises when asked for more rows than exist, so never request
# more than the metadata actually contains.
sample_size = min(5, len(df_metadata))

data = []
for series_id in tqdm(df_metadata["id"].sample(sample_size)):
    try:
        df = source.get_series_data(series_id)
    except Exception as error:
        # Deliberately broad: report the failing series and carry on.
        print(series_id, error)
        continue
    # Skip series for which the source returned no data (None).
    if df is not None:
        data.append(df)

if not data:
    # pd.concat on an empty list fails with an opaque "No objects to
    # concatenate"; fail with a message that explains what went wrong.
    raise ValueError(
        "No series data could be downloaded; there is nothing to concatenate."
    )

df_raw = pd.concat(data, axis=0, ignore_index=True)
print("Shape:", df_raw.shape)
display(df_raw.head())

### Transform

In [None]:
# Run the source-specific transformation over the raw data, then report the
# resulting dimensions and preview the first rows.
df_transformed = source.transform(df_raw)
print(f"Shape: {df_transformed.shape}")
display(df_transformed.head())

### Load

In [None]:
# Publish every series as its own dataset inside the source's folder.
series_groups = df_transformed.groupby("series_id")
for series_id, df_series in tqdm(series_groups):
    # Tag the frame with its series id before publishing.
    # NOTE(review): presumably publish_dataset reads the dataset name from
    # the ``name`` attribute — confirm against the storage implementation.
    df_series.name = series_id
    storage.publish_dataset(df_series, folder_path=source.SOURCE_NAME)