# DFx ETL Pipeline

## ETL

A generic notebook to execute ETL pipelines for supported data sources.

### Libraries

In [1]:
import logging

from dfx_etl.storage import get_storage
from dfx_etl.pipelines import unaids_org as source, Pipeline  # change the source as needed

### Configuration

In [2]:
# Configure root logging for the notebook.
# `force=True` (Python 3.8+) removes any handlers already attached to the root
# logger before applying this configuration. Without it, basicConfig() silently
# does nothing once the root logger has a handler, so editing the level below
# and re-running this cell in the same kernel would have no effect.
logging.basicConfig(
    level=logging.WARNING,  # logging.INFO or logging.WARNING
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    handlers=[logging.StreamHandler()],  # StreamHandler() writes to sys.stderr by default
    force=True,
)

In [3]:
# get the storage object automatically
# NOTE(review): how get_storage() picks a backend is not visible here —
# presumably from environment configuration; confirm in dfx_etl.storage.
storage = get_storage()
# echo the object as the cell output to confirm which backend was selected
# (the recorded run shows an AzureStorage instance)
storage

AzureStorage(account_name='dfx', container_name='etl-pipeline')

In [4]:
# specify pipeline details
# `source` is the pipeline module aliased in the imports cell (`unaids_org`
# here); it supplies the Retriever and Transformer classes used below.
# NOTE(review): when switching `source`, `url` presumably needs to be updated
# to match the new data source — confirm against the Pipeline definition.
pipeline = Pipeline(
    url="https://kpatlas.unaids.org",  # URL of the data source; it reappears in the `source` column of the transformed output
    retriever=source.Retriever(storage=storage),  # source-specific retriever, given the storage object
    transformer=source.Transformer(),  # source-specific transformer, constructed with no arguments
    storage=storage,  # the same storage object that is passed to the retriever
)

### Extract

In [5]:
# Run the retrieval stage; the raw frame is then read back from `pipeline.df_raw`.
pipeline.retrieve()
# Report the frame's dimensions, then preview its first rows with rich display.
print(f"Shape: {pipeline.df_raw.shape!s}")
display(pipeline.df_raw.head())

Shape: (50977, 11)


Unnamed: 0,Indicator,Unit,Subgroup,Area,Area ID,Time Period,Source,Data value,Formatted,Data_Denominator,Footnote
0,Adolescents and young men who have sex with me...,Number,15-19,Afghanistan,AFG,2019,UNAIDS_GAM_,24600.0,,0.0,Source: Johnston LG Nguyen VK Balakrishnan S L...
1,Adolescents and young men who have sex with me...,Number,15-19,Angola,AGO,2019,UNAIDS_GAM_,22500.0,,0.0,Source: Johnston LG Nguyen VK Balakrishnan S L...
2,Adolescents and young men who have sex with me...,Number,15-19,Albania,ALB,2019,UNAIDS_GAM_,1000.0,,0.0,Source: Johnston LG Nguyen VK Balakrishnan S L...
3,Adolescents and young men who have sex with me...,Number,15-19,United Arab Emirates,ARE,2019,UNAIDS_GAM_,140.0,,0.0,Source: Johnston LG Nguyen VK Balakrishnan S L...
4,Adolescents and young men who have sex with me...,Number,15-19,Argentina,ARG,2019,UNAIDS_GAM_,25400.0,,0.0,Source: Johnston LG Nguyen VK Balakrishnan S L...


### Transform

In [6]:
# Run the transformation stage; the result is read back from `pipeline.df_transformed`.
pipeline.transform()
# Report the frame's dimensions, then preview its first rows with rich display.
print(f"Shape: {pipeline.df_transformed.shape!s}")
display(pipeline.df_transformed.head())

Shape: (10864, 5)


Unnamed: 0,indicator_name,country_code,year,value,source
0,Antiretroviral therapy coverage among men who ...,AUS,2016,78.0,https://kpatlas.unaids.org/
1,Antiretroviral therapy coverage among men who ...,AUT,2016,74.0,https://kpatlas.unaids.org/
2,Antiretroviral therapy coverage among men who ...,AUT,2020,90.2,https://kpatlas.unaids.org/
3,Antiretroviral therapy coverage among men who ...,AUT,2021,94.089974,https://kpatlas.unaids.org/
4,Antiretroviral therapy coverage among men who ...,AUT,2022,93.3,https://kpatlas.unaids.org/


### Validate

In [7]:
# Run the validation stage; the result is read back from `pipeline.df_validated`.
pipeline.validate()
# Report the frame's dimensions, then preview its first rows with rich display.
print(f"Shape: {pipeline.df_validated.shape!s}")
display(pipeline.df_validated.head())

Shape: (10864, 5)


Unnamed: 0,indicator_name,country_code,year,value,source
0,Antiretroviral therapy coverage among men who ...,AUS,2016,78.0,https://kpatlas.unaids.org/
1,Antiretroviral therapy coverage among men who ...,AUT,2016,74.0,https://kpatlas.unaids.org/
2,Antiretroviral therapy coverage among men who ...,AUT,2020,90.2,https://kpatlas.unaids.org/
3,Antiretroviral therapy coverage among men who ...,AUT,2021,94.089974,https://kpatlas.unaids.org/
4,Antiretroviral therapy coverage among men who ...,AUT,2022,93.3,https://kpatlas.unaids.org/


### Load

In [8]:
# Run the load stage of the pipeline.
# NOTE(review): the destination is decided by the Pipeline implementation —
# presumably the `storage` object passed to Pipeline(...); confirm.
# The trailing semicolon suppresses the cell's output (the call's return value).
pipeline.load();