# DFx Indicators ETL

## Database

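This notebook reads the ETL outputs from storage and ingests them into the database that backs the DFx Indicators API. The indicator and dimension lookup tables are populated first; the series table is then written using the ids read back from the database.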

### Libraries

In [1]:
import dfx_etl as dfx
import pandas as pd

### Configuration

In [2]:
# Set to the name of the versioned folder in the storage to pull the data from
DATA_VERSION = "v26-01-06"

# Handles shared by every cell below: `engine` is passed as `con=` to all
# pandas SQL reads and writes, `storage` is used to read the ETL dataset.
# NOTE(review): when `DB_CONN` is not configured this cell reports falling back
# to an in-memory SQLite database; presumably nothing persists after the kernel
# stops, so confirm `DB_CONN` is set before a real ingest.
engine = dfx.get_engine()
storage = dfx.get_storage()

`DB_CONN` is not configured. Using an in-memory SQLite database.


In [3]:
# Make sure the database tables exist before any data is ingested.
# Does nothing when tables exist
dfx.create_tables(engine)

['country', 'indicator', 'dimension', 'series']

### Data Preparation

In [4]:
# Read and concatenate all data
# `df_data` is the single input for the rest of the notebook: the indicator
# and dimension lookup rows and the series rows are all derived from it.
df_data = storage.read_dataset(DATA_VERSION)
# Report the size and preview a few rows so a wrong DATA_VERSION is noticed
# before anything is written to the database.
print("Shape:", df_data.shape)
display(df_data.head())

Shape: (11718805, 7)


Unnamed: 0,year,value,country_code,indicator_name,provider,dimension,source
0,2017,1.7,AFG,Installed electricity capacity by country/area...,energydata_info,On-grid Solar photovoltaic; On-grid,
1,2018,1.7,AFG,Installed electricity capacity by country/area...,energydata_info,On-grid Solar photovoltaic; On-grid,
2,2019,11.7,AFG,Installed electricity capacity by country/area...,energydata_info,On-grid Solar photovoltaic; On-grid,
3,2020,11.7,AFG,Installed electricity capacity by country/area...,energydata_info,On-grid Solar photovoltaic; On-grid,
4,2011,0.12,AFG,Installed electricity capacity by country/area...,energydata_info,Off-grid Solar photovoltaic; Off-grid,


In [5]:
# Build the rows for the indicator table: one row per distinct
# (name, provider) pair, ordered by provider and then by name.
columns = {"indicator_name": "name", "provider": "provider"}
df_indicators = (
    df_data.reindex(columns=list(columns))  # keep only the source columns
    .rename(columns=columns)  # switch to the table's column names
    .drop_duplicates(ignore_index=True)
    .sort_values(["provider", "name"], ignore_index=True)
)
print("Shape:", df_indicators.shape)
display(df_indicators.head())

Shape: (4078, 2)


Unnamed: 0,name,provider
0,Installed electricity capacity by country/area...,energydata_info
1,Number of Deaths,healthdata_ghdx
2,Number of Prevalence,healthdata_ghdx
3,Rate of Deaths,healthdata_ghdx
4,Rate of Prevalence,healthdata_ghdx


In [6]:
# Build the rows for the dimension table: one row per distinct dimension
# label, sorted by that label.
columns = {"dimension": "name"}
df_dimensions = (
    df_data.reindex(columns=list(columns))  # keep only the source column
    .rename(columns=columns)  # switch to the table's column name
    .drop_duplicates(ignore_index=True)
    .sort_values(by=list(columns.values()), ignore_index=True)
)
print("Shape:", df_dimensions.shape)
display(df_dimensions.head())

Shape: (40359, 1)


Unnamed: 0,name
0,0-59; _T; BOTHSEX; ALLAREA; _T
1,1-14
2,1-4
3,10+; FEMALE; ALLAREA
4,10+; FEMALE; RURAL


### Data Ingestion

In [7]:
# Ingest the lookup tables, ignoring conflicts.
# Both writes share the same options: append to the existing table, skip the
# DataFrame index, and hand the inserts to `dfx.ignore_on_conflict`.
lookup_write_options = {
    "con": engine,
    "if_exists": "append",
    "index": False,
    "method": dfx.ignore_on_conflict,
}
df_indicators.to_sql("indicator", **lookup_write_options)
# Kept as the last expression so the cell still displays this call's result.
df_dimensions.to_sql("dimension", **lookup_write_options)

40359

In [8]:
# Prepare data for the series table.
#
# Read the `id` of every country, indicator and dimension row back from the
# database, then swap the natural keys in `df_data` for those ids.
#
# NOTE: `df_indicators` and `df_dimensions` are rebound here to id lookup
# frames whose columns differ from the frames ingested earlier. Re-run the
# preparation cells before re-running the lookup-table ingestion cell.
df_countries = pd.read_sql(
    "SELECT id AS country_id, iso_3 AS country_code FROM country;", con=engine
)
# `provider` is selected as well: indicator rows are deduplicated and stored
# per (name, provider), so the name alone does not identify an indicator.
df_indicators = pd.read_sql(
    "SELECT id AS indicator_id, name AS indicator_name, provider FROM indicator;",
    con=engine,
)
df_dimensions = pd.read_sql(
    "SELECT id AS dimension_id, name AS dimension FROM dimension;", con=engine
)

print("Shape before:", df_data.shape)
columns = ["country_id", "indicator_id", "dimension_id", "year", "value"]
# Inner joins drop rows whose key has no match in the database; compare the
# two printed shapes to see whether any rows were lost.
df_series = df_data.merge(df_countries, how="inner", on="country_code")
# Join on name AND provider so an indicator name shared by two providers
# neither duplicates rows nor attaches them to the wrong indicator id.
df_series = df_series.merge(
    df_indicators, how="inner", on=["indicator_name", "provider"]
)
df_series = df_series.merge(df_dimensions, how="inner", on="dimension")
# Keep only the series table columns and sort by every column except `value`.
df_series = df_series.reindex(columns=columns).sort_values(
    columns[:-1], ignore_index=True
)
print("Shape after:", df_series.shape)
display(df_series.head())

Shape before: (11718805, 7)
Shape after: (11718805, 5)


Unnamed: 0,country_id,indicator_id,dimension_id,year,value
0,4,1,5205,2011,295.0
1,4,1,5205,2012,295.0
2,4,1,5205,2013,295.0
3,4,1,5205,2014,295.0
4,4,1,5205,2015,295.0


In [9]:
# Write the prepared rows into the `series` table, appending to the existing
# table and leaving out the DataFrame index.
# NOTE(review): `dfx.update_on_conflict` presumably updates rows that already
# exist, unlike `dfx.ignore_on_conflict` used for the lookup tables. Confirm
# in `dfx_etl`.
df_series.to_sql(
    "series",
    con=engine,
    if_exists="append",
    index=False,
    chunksize=1024,  # Batch the writes
    method=dfx.update_on_conflict,
)

11718805

In [10]:
# Sanity check: count the ingested rows per provider. The counts should add
# up to the number of rows written to the series table.
# NOTE(review): `observation` is not among the tables reported by
# `dfx.create_tables`; presumably it is a view over `series`. Confirm.
provider_counts_query = (
    "SELECT COUNT(*), indicator_provider FROM observation GROUP BY indicator_provider;"
)
pd.read_sql(provider_counts_query, con=engine)

Unnamed: 0,COUNT(*),indicator_provider
0,92384,energydata_info
1,138040,healthdata_ghdx
2,2650192,ilo_sdmx_api
3,255745,imf_datamapper_api
4,11996,sipri_milex
5,10864,unaids_kpatlas
6,898077,unicef_sdmx_api
7,1149610,unstats_sdg_database
8,4779665,who_gho_api
9,1732232,world_bank_wdi
