In [2]:
from io import BytesIO
import country_converter as coco
import pandas as pd
import string

from dfpp.storage import StorageManager
from dfpp.publishing import publish_series

from dfpp.transformation.column_name_template import (
    CANONICAL_COLUMN_NAMES,
    SERIES_PROPERTY_PREFIX,
    DIMENSION_COLUMN_PREFIX,
    SexEnum,
    sort_columns_canonically,
    ensure_canonical_columns,
)

# Shared converter instance; used in the transform cell to turn location names into ISO3 codes.
cc = coco.CountryConverter()
# Written verbatim into the "source" column of every transformed row.
BASE_URL = "https://www.healthdata.org/"
# Blob path of the raw CSV export that is read through StorageManager.
FILE_PATH = "backup/sources/IHME-GBD_2021_DATA-c13547d7-1.csv"
# Passed to publish_series as `source_folder` when the series are published.
SOURCE_NAME = "healthdata_org"

In [3]:
async with StorageManager() as storage_manager:
    series_bytes = await storage_manager.read_blob(path=FILE_PATH)
    df_source = pd.read_csv(BytesIO(series_bytes))

2024-11-13 23:46:21,175 - INFO - _universal - Request URL: 'https://sdgigeneralstorage.blob.core.windows.net/dfx-etl-pipeline-dev/backup/sources/IHME-GBD_2021_DATA-c13547d7-1.csv?sv=REDACTED&st=REDACTED&se=REDACTED&sr=REDACTED&sp=REDACTED&sig=REDACTED'
Request method: 'HEAD'
Request headers:
    'x-ms-version': 'REDACTED'
    'Accept': 'application/xml'
    'x-ms-date': 'REDACTED'
    'x-ms-client-request-id': 'b8917342-a208-11ef-8bcd-acbc327b2b3d'
    'User-Agent': 'azsdk-python-storage-blob/12.21.0 Python/3.11.9 (macOS-10.16-x86_64-i386-64bit)'
No body was attached to the request
2024-11-13 23:46:21,547 - INFO - _universal - Response status: 200
Response headers:
    'Content-Length': '43483471'
    'Content-Type': 'text/csv'
    'Content-MD5': 'REDACTED'
    'Last-Modified': 'Wed, 13 Nov 2024 21:01:53 GMT'
    'Accept-Ranges': 'REDACTED'
    'Etag': '"0x8DD042666B4C00A"'
    'Server': 'Windows-Azure-Blob/1.0 Microsoft-HTTPAPI/2.0'
    'x-ms-request-id': 'a4bac7fb-901e-0033-5115-36f0

In [None]:
# Transform the raw source frame into the canonical column layout, one row per
# observation, so it can be split by `series_id` and published.
# Starting from a fresh copy keeps this cell idempotent on re-run.
df = df_source.copy()
df["source"] = BASE_URL
# NOTE(review): country_converter presumably fills names it cannot resolve
# (e.g. aggregate regions) with a "not found" placeholder rather than NaN —
# confirm whether such rows should be dropped before publishing.
df["alpha_3_code"] = cc.pandas_convert(df["location_name"], to="ISO3")

# A series is identified by the measure/cause pair.
df["series_name"] = df["measure_name"] + ", " + df["cause_name"]

# Build a slug from the series name: strip punctuation, underscores for
# spaces, lower-case. `str.translate` removes every character of
# `string.punctuation` literally; the previous regex character class was built
# from the unescaped string and therefore never stripped backslashes.
punctuation_table = str.maketrans("", "", string.punctuation)
df["series_id"] = (
    df["series_name"]
    .str.translate(punctuation_table)
    .str.replace(" ", "_", regex=False)
    .str.lower()
)

# Map source column names onto the canonical / prefixed names. Assignment is
# used instead of `inplace=True` so the step reads as a pure transformation.
df = df.rename(
    columns={
        "val": "value",
        "metric_name": SERIES_PROPERTY_PREFIX + "unit",
        "sex_name": DIMENSION_COLUMN_PREFIX + "sex",
        "age_name": DIMENSION_COLUMN_PREFIX + "age",
        "cause_name": DIMENSION_COLUMN_PREFIX + "cause",
    }
)

# Translate source sex labels into SexEnum values.
# NOTE(review): IHME GBD exports appear to label the combined category plain
# "Both"; it is accepted here as an alias of "Both sexes" — confirm against
# the file.
sex_column = DIMENSION_COLUMN_PREFIX + "sex"
sex_codes = {
    "Male": SexEnum.MALE.value,
    "Female": SexEnum.FEMALE.value,
    "Both": SexEnum.BOTH.value,
    "Both sexes": SexEnum.BOTH.value,
    "All sexes": SexEnum.TOTAL.value,
}
sex_mapped = df[sex_column].map(sex_codes)

# `.map` turns unknown labels into NaN without complaint, which would null the
# dimension silently; fail loudly instead so the mapping can be extended.
unmapped_sex = df.loc[sex_mapped.isna() & df[sex_column].notna(), sex_column].unique()
if len(unmapped_sex) > 0:
    raise ValueError(f"Unmapped sex labels in source data: {sorted(map(str, unmapped_sex))}")
df[sex_column] = sex_mapped

# Collect the non-canonical dimension / property columns created above so they
# survive the column selection below.
disagr_columns = [
    col
    for col in df.columns
    if col.startswith(DIMENSION_COLUMN_PREFIX) and col not in CANONICAL_COLUMN_NAMES
]

property_columns = [
    col
    for col in df.columns
    if col.startswith(SERIES_PROPERTY_PREFIX) and col not in CANONICAL_COLUMN_NAMES
]

df = ensure_canonical_columns(df)
df = df[CANONICAL_COLUMN_NAMES + disagr_columns + property_columns]
df = sort_columns_canonically(df)

# Every combination of non-value columns must identify exactly one observation.
duplicate_rows = int(df.drop(columns="value").duplicated().sum())
assert duplicate_rows == 0, f"{duplicate_rows} duplicated observation key(s) found"



In [7]:
for series_id, df_series in df.groupby("series_id"):
    await publish_series(series_id, df_series, source_folder=SOURCE_NAME)

2024-11-13 23:51:16,676 - INFO - _universal - Request URL: 'https://sdgigeneralstorage.blob.core.windows.net/dfx-etl-pipeline-dev/test/healthdata_org/parquet/deaths_cardiovascular_diseases.parquet?sv=REDACTED&st=REDACTED&se=REDACTED&sr=REDACTED&sp=REDACTED&sig=REDACTED'
Request method: 'PUT'
Request headers:
    'Content-Length': '654236'
    'x-ms-blob-type': 'REDACTED'
    'x-ms-version': 'REDACTED'
    'Content-Type': 'application/octet-stream'
    'Accept': 'application/xml'
    'x-ms-date': 'REDACTED'
    'x-ms-client-request-id': '68b2c118-a209-11ef-8bcd-acbc327b2b3d'
    'User-Agent': 'azsdk-python-storage-blob/12.21.0 Python/3.11.9 (macOS-10.16-x86_64-i386-64bit)'
A body is sent with the request
2024-11-13 23:51:17,834 - INFO - _universal - Response status: 201
Response headers:
    'Content-Length': '0'
    'Content-MD5': 'REDACTED'
    'Last-Modified': 'Wed, 13 Nov 2024 21:51:17 GMT'
    'Etag': '"0x8DD042D4DA073C9"'
    'Server': 'Windows-Azure-Blob/1.0 Microsoft-HTTPAPI/2.0