In [1]:

cd ../../../../

/Users/mykhailoslukvin/repo/dv-data-pipeline


  self.shell.db['dhist'] = compress_dhist(dhist)[-100:]


In [2]:
import asyncio
import io
from io import BytesIO
import json
import os
from collections import defaultdict
from pprint import pprint as print

import aiohttp
import pandas as pd
import requests
from tqdm.asyncio import tqdm_asyncio

from dfpp.storage import StorageManager
from dfpp.transformation.column_name_template import SexEnum

MAX_CONCURRENCY = 10

In [3]:
async with StorageManager() as storage_manager:
    path = os.path.join(storage_manager.utilities_path, "sep5_country_lookup.xlsx")
    data = await storage_manager.read_blob(path=path)
    df_country_map = pd.read_excel(BytesIO(data), sheet_name="country_lookup")
    df_region_map = pd.read_excel(BytesIO(data), sheet_name="region_lookup")

df_country_map = df_country_map[df_country_map["Numeric code"].notna()]
df_country_map["Numeric code"] = df_country_map["Numeric code"].astype(int)

df_region_map = df_region_map[df_region_map["Numeric code"].notna()]
df_region_map["Numeric code"] = df_region_map["Numeric code"].astype(int)

iso_3_region = dict(df_region_map[["Numeric code", "Alpha-3 code"]].values)
iso_3_country = dict(df_country_map[["Numeric code", "iso3"]].values)

iso_3_country.update(iso_3_region)

iso_3_map = iso_3_country

In [4]:
sex_remap = {
    "BOTHSEX": SexEnum.BOTH.value,
    "MALE": SexEnum.MALE.value,
    "FEMALE": SexEnum.FEMALE.value,
}

age_remap = {"ALLAGE", "all"}

In [5]:
def get_indicators():
    url = "https://unstats.un.org/sdgapi/v1/sdg/CompareTrends/GetDisaggregatedGlobalAndRegional"

    headers = {"Accept": "application/json"}

    response = requests.post(url, headers=headers)
    response.raise_for_status()

    data = response.json()
    return data

In [6]:
indicators = get_indicators()

In [7]:
series_codes = set([i["seriesCode"] for i in indicators])

In [8]:
async def fetch_data(session, series_id):
    url = f"https://unstats.un.org/sdgapi/v1/sdg/Series/Data"
    params = {"seriesCode": series_id, "page": 1, "pageSize": 10000000}
    headers = {"Accept": "application/json"}

    async with session.get(url, headers=headers, params=params) as response:
        response.raise_for_status()
        return await response.json()


async def process_series(series_id, session):
    year_data = await fetch_data(session, series_id)

    dimensions = [d["id"] for d in year_data["dimensions"]]
    size = year_data["totalElements"]
    return series_id, year_data["data"], dimensions, size


async def get_series_data_and_dimensions(series_codes):
    series_data_map = defaultdict()
    series_map = defaultdict(dict)

    async with aiohttp.ClientSession() as session:
        tasks = []
        for row in series_codes:
            series_id = row
            tasks.append(process_series(series_id, session))

        results = await asyncio.gather(*tasks)

        for series_id, data, dimensions, size in results:
            series_data_map[series_id] = data
            series_map[series_id]["dimensions"] = dimensions
            series_map[series_id]["totalElements"] = size

    return series_data_map, series_map

In [9]:
async def process_series_id(
    series_id, age_remap, sex_remap, iso_3_map, storage_manager, semaphore
):
    try:
        async with semaphore:
            series_data_map, dimension_map = await get_series_data_and_dimensions([series_id])
            df_source = pd.DataFrame(series_data_map[series_id])
            df = df_source.copy()

            assert df.shape[0] > 0, "DataFrame is empty"
            assert df.series.value_counts(dropna=False).shape[0] == 1, "Multiple series values found"

            print(f"Series description: {df.seriesDescription.iloc[0]}")

            assert df.shape[0] == dimension_map[series_id]["totalElements"], "Shape mismatch with expected dimensions"

            df_dimensions = pd.json_normalize(df["dimensions"])
            df_attributes = pd.json_normalize(df["attributes"])

            df = pd.concat([df.drop(columns=["dimensions"]), df_dimensions], axis=1)
            df = pd.concat([df.drop(columns=["attributes"]), df_attributes], axis=1)

            df.rename(
                columns={
                    "geoAreaCode": "alpha_3_code",
                    "geoAreaName": "country_or_area",
                    "timePeriodStart": "year",
                },
                inplace=True,
            )

            original_dimension_columns = df_dimensions.columns
            dimension_columns = original_dimension_columns.str.lower().str.replace(
                "\s", "_", regex=True
            )
            dimension_column_rename_map = dict(
                zip(original_dimension_columns, dimension_columns)
            )

            df.rename(columns=dimension_column_rename_map, inplace=True)

            df_selection = df.copy()[
                ["alpha_3_code", "country_or_area", "year"] + dimension_columns.tolist()
            ]

            if "age" in df_selection.columns:
                df_selection["age"] = df_selection["age"].replace(age_remap)

            if "sex" in df_selection.columns:
                df_selection["sex"] = df_selection["sex"].replace(sex_remap)

            df_selection["alpha_3_code"] = df_selection["alpha_3_code"].astype(int)
            df_selection["alpha_3_code"] = df_selection["alpha_3_code"].replace(iso_3_map)

            with io.BytesIO() as output_buffer:
                df_selection.to_excel(output_buffer, index=False, engine="openpyxl")
                output_buffer.seek(0)

                path_to_save = os.path.join(
                    storage_manager.test_path, "unstats_un_org", f"{series_id}.xlsx"
                )
                blob_client = storage_manager.container_client.get_blob_client(
                    blob=path_to_save
                )

                await blob_client.upload_blob(data=output_buffer.getvalue(), overwrite=True)
    except Exception as e:
        return series_id, e

async def process_all_series(
    series_codes, age_remap, sex_remap, iso_3_map, max_concurrent_tasks=MAX_CONCURRENCY
):

    semaphore = asyncio.Semaphore(max_concurrent_tasks)
    failed_series = []

    async with StorageManager() as storage_manager:
        tasks = [
            process_series_id(
                series_id, age_remap, sex_remap, iso_3_map, storage_manager, semaphore
            )
            for series_id in series_codes
        ]

        for future in tqdm_asyncio.as_completed(tasks):
            result = await future
            if isinstance(result, tuple) and len(result) == 2:
                failed_series.append(result)

    return failed_series


In [10]:
failed_series = await process_all_series(
    series_codes, age_remap, sex_remap, iso_3_map, max_concurrent_tasks=MAX_CONCURRENCY
)

  0%|          | 0/426 [00:00<?, ?it/s]

('Series description: Number of least developed countries and small island '
 'developing States with nationally determined contributions (Number)')
('Series description: Number of conflict-related deaths (unknown), by sex, age '
 'and cause of death (Number)')


  df_selection["age"] = df_selection["age"].replace(age_remap)


'Series description: Extent of human made wetlands (square kilometres)'
('Series description: Proportion of countries with clearly defined procedures '
 'in law or policy for participation by service users/communities in planning '
 'program in rural drinking-water supply (%)')


  df_selection["age"] = df_selection["age"].replace(age_remap)
  0%|          | 2/426 [00:07<21:33,  3.05s/it]

('Series description: Proportion of women who make their own informed '
 'decisions regarding contraceptive use (% of women aged 15-49 years)')
('Series description: Proportion of women who make their own informed '
 'decisions regarding reproductive health care (% of women aged 15-49 years)')


  df_selection["age"] = df_selection["age"].replace(age_remap)
  df_selection["age"] = df_selection["age"].replace(age_remap)
  1%|▏         | 6/426 [00:07<04:28,  1.56it/s]

('Series description: Proportion of women aged 20-24 years who were married or '
 'in a union before age 18 (%)')


  df_selection["age"] = df_selection["age"].replace(age_remap)
  2%|▏         | 8/426 [00:12<10:57,  1.57s/it]

('Series description: Number of conflict-related deaths (non-civilians), by '
 'sex, age and cause of death (Number)')


  df_selection["age"] = df_selection["age"].replace(age_remap)
  2%|▏         | 9/426 [00:12<08:41,  1.25s/it]

('Series description: Number of conflict-related deaths (civilians), by sex, '
 'age and cause of death (Number)')


  2%|▏         | 10/426 [00:14<09:54,  1.43s/it]

('Series description: Proportion of project objectives of new development '
 'interventions drawn from country-led result frameworks - data by provider '
 '(%)')
('Series description: Number of chairs of permanent committees, by age sex and '
 'focus of the committee, Joint Committees')


  df_selection["age"] = df_selection["age"].replace(age_remap)
  3%|▎         | 11/426 [00:15<08:44,  1.26s/it]

('Series description: Total financial support provided (Billions of current '
 'United States dollars)')


  3%|▎         | 12/426 [00:16<07:40,  1.11s/it]

'Series description: Electronic waste collected (Tonnes)'


  3%|▎         | 13/426 [00:18<10:07,  1.47s/it]

In [None]:
assert len(failed_series) == 0, print(failed_series) 