In [2]:
import pandas as pd
import country_converter as coco


from dfpp.transformation.column_name_template import (
    CANONICAL_COLUMN_NAMES,
    DIMENSION_COLUMN_PREFIX,
    SERIES_PROPERTY_PREFIX,
    sort_columns_canonically,
    ensure_canonical_columns,
)
from dfpp.publishing import publish_series

# Single CountryConverter instance, created once here and reused by the
# cleaning cell to map SIPRI country names to ISO3 codes.
cc = coco.CountryConverter()

In [3]:
# SIPRI Military Expenditure workbook. Only the "Share of Govt. spending"
# sheet is read; header=7 selects the row holding the column labels
# (zero-based), skipping the descriptive rows above it.
URL = "https://www.sipri.org/sites/default/files/SIPRI-Milex-data-1948-2023.xlsx"
df_source = pd.read_excel(
    io=URL,
    sheet_name="Share of Govt. spending",
    header=7,
)

In [19]:
# Work on a copy so the downloaded frame stays untouched; re-running this cell
# resets `df` before the cleaning cell without fetching the workbook again.
df = df_source.copy()

In [20]:
# Reshape the wide SIPRI sheet (one column per year) into the long, canonical
# series layout expected by the publishing step.

# Placeholder strings used in the sheet for cells without a usable number.
remap_values = {". .": pd.NA, "xxx": pd.NA, "...": pd.NA}

# Non-inplace calls: each step yields a new frame instead of mutating `df`.
df = df.replace(remap_values).dropna(axis=0, how="all")

# Rows whose "Country" cell cannot be resolved to an ISO3 code come back as
# "not found" and are dropped.
df["alpha_3_code"] = cc.pandas_convert(df["Country"], to="ISO3")
df = df.loc[df["alpha_3_code"] != "not found"].reset_index(drop=True)

# Wide -> long: every remaining column becomes a (year, value) pair.
df = df.melt(
    id_vars=["alpha_3_code", "Notes", "Country"],
    var_name="year",
    value_name="value",
)
# "Reporting year" is a descriptive column of the sheet, not a data year.
df = df.loc[df["year"] != "Reporting year"].reset_index(drop=True)

# `astype("float", errors="ignore")` silently returned the column unchanged
# (object dtype) whenever a cell could not be converted, e.g. the pd.NA
# placeholders inserted above. `to_numeric(errors="coerce")` always yields a
# float column, turning missing or unparseable cells into NaN.
df["value"] = pd.to_numeric(df["value"], errors="coerce")

# Series-level metadata, constant across all rows.
df[SERIES_PROPERTY_PREFIX + "unit"] = "percentage of government spending"
# \u00a9 is the copyright sign; the previous literal held the mojibake "Â©".
df["series_name"] = (
    "Military expenditure by country as percentage of government spending, "
    "1948-2023 \u00a9 SIPRI 2023"
)
df["source"] = "https://www.sipri.org/"
series_id = "countrywisemilitaryexpenditure_sipri"
df["series_id"] = series_id

# Keep dimension / series-property columns that are not already canonical.
# This list is built before `ensure_canonical_columns` runs, as in the
# original cell.
extra_prefixes = (DIMENSION_COLUMN_PREFIX, SERIES_PROPERTY_PREFIX)
to_select_columns = [
    col
    for col in df.columns
    if col.startswith(extra_prefixes) and col not in CANONICAL_COLUMN_NAMES
]

df = ensure_canonical_columns(df)
df = df[CANONICAL_COLUMN_NAMES + to_select_columns]
df = sort_columns_canonically(df)





In [21]:
# Upload the prepared series through the project's `publish_series` helper.
# Top-level `await` relies on the notebook kernel's async support. The captured
# log of the last run shows a PUT of
# `sipri_org/parquet/countrywisemilitaryexpenditure_sipri.parquet` to blob storage.
await publish_series(series_id=series_id, df_series=df, source_folder="sipri_org")

2024-11-16 18:44:39,950 - INFO - _universal - Request URL: 'https://sdgigeneralstorage.blob.core.windows.net/dfx-etl-pipeline-dev/test/sipri_org/parquet/countrywisemilitaryexpenditure_sipri.parquet?sv=REDACTED&st=REDACTED&se=REDACTED&sr=REDACTED&sp=REDACTED&sig=REDACTED'
Request method: 'PUT'
Request headers:
    'Content-Length': '51842'
    'x-ms-blob-type': 'REDACTED'
    'x-ms-version': 'REDACTED'
    'Content-Type': 'application/octet-stream'
    'Accept': 'application/xml'
    'x-ms-date': 'REDACTED'
    'x-ms-client-request-id': '12a2e392-a43a-11ef-9f21-acbc327b2b3d'
    'User-Agent': 'azsdk-python-storage-blob/12.21.0 Python/3.11.10 (macOS-12.7.5-x86_64-i386-64bit)'
A body is sent with the request


2024-11-16 18:44:42,775 - INFO - _universal - Response status: 201
Response headers:
    'Content-Length': '0'
    'Content-MD5': 'REDACTED'
    'Last-Modified': 'Sat, 16 Nov 2024 16:44:42 GMT'
    'Etag': '"0x8DD065DF89F93E3"'
    'Server': 'Windows-Azure-Blob/1.0 Microsoft-HTTPAPI/2.0'
    'x-ms-request-id': '1ddff9fa-a01e-0017-4746-380622000000'
    'x-ms-client-request-id': '12a2e392-a43a-11ef-9f21-acbc327b2b3d'
    'x-ms-version': 'REDACTED'
    'x-ms-content-crc64': 'REDACTED'
    'x-ms-request-server-encrypted': 'REDACTED'
    'Date': 'Sat, 16 Nov 2024 16:44:42 GMT'
