# Set up

## Check configuration
The cell below should print the path to the Python interpreter of the project's virtual environment.

In [None]:
import sys
# Print the path of the interpreter running this kernel so it can be compared
# with the expected virtual-environment interpreter.
print(sys.executable)
# Uncomment to also list the first six entries of the module search path.
# print('\n'.join(sys.path[:6]))

## Load libraries

In [None]:
# Automatically reload modules before execution of each cell
# so when you edit src/mypackage/*.py in your editor and rerun cells, 
# changes appear immediately.
%reload_ext autoreload
%autoreload 2

# python
from __future__ import annotations

# Standard library
from pathlib import Path

# Third-party
from pysdmx.model import FixedValueMap, ImplicitComponentMap, ValueMap, MultiValueMap, ComponentMap
from openpyxl import Workbook, load_workbook
import pandas as pd
import pysdmx as px
import pickle as pkl
from datetime import datetime


# Custom
## Functions
from tidysdmx import (
    filter_tidy_raw, 
    validate_dataset_local, 
    map_structures, 
    apply_fixed_value_maps, 
    apply_implicit_component_maps, 
    build_date_pattern_map,
    build_value_map_list,
    build_multi_value_map_list,
    build_representation_map,
    build_single_component_map,
    extract_component_ids,
    write_excel_mapping_template,
    build_structure_map,
    create_schema_from_table,
    build_structure_map_from_template_wb,
    standardize_output,
    parse_mapping_template_wb
)


## Define globals

In [None]:
# CAUTION! FOR TESTING ONLY. DO NOT USE IN PRODUCTION.
# (Left commented out on purpose; enabling it also requires `import os`.)
# os.environ["PYTHONHTTPSVERIFY"] = "0"

# FMR (registry) endpoint and artefact identifiers
# NOTE(review): the host name suggests a QA instance — confirm before production use.
fmr_url = "https://fmrqa.worldbank.org/FMR/sdmx/v2"
# Raw schema (currently unused in this notebook)
# raw_structure_agency = "WB"
# raw_structure_id = "IFPRI_ASTI"
# raw_structure_version = "1.0"
# Dissemination schema: agency, id and version of the target data structure
dis_structure_agency = "WB.GGH.HSP"
dis_structure_id = "DS_ASPIRE"
dis_structure_version = "1.0.0"
# Structure map (currently unused in this notebook)
# raw_structure_map = "SM_IFPRI_ASTI_TO_DATA360"

# Folder containing the raw CSV files (relative path)
path_to_raw_data = Path(
    "../TMP/data/WB_ASPIRE"
)
# Excel mapping template used to build the structure map (relative path)
path_to_xlsx_mapping = Path(
    "./data/WB_ASPIRE_MODIFIED_COPY.xlsx"
)

## Initiate API client

In [None]:
# Echo the registry URL, then create the registry client that is used later
# in the notebook to fetch the dissemination schema.
print(fmr_url)
client = px.api.fmr.RegistryClient(fmr_url)
# Display the client object as the cell output.
client

# STEP 1 - Load raw data

Here we load the raw dataset as provided by the source. In this demonstration notebook, the raw data is simply loaded from file, but in the final pipeline, the provenance of the file should be fully documented in a configuration file, and the data should be read directly from the source / DDH where possible.

In [None]:
def read_raw_data(folder_path):
    """Load every CSV file of a folder into one combined DataFrame.

    Files are processed in sorted path order. Each file's rows are tagged
    with the file name (extension removed) in a "Series code" column.

    Parameters
    ----------
    folder_path : str | pathlib.Path
        Folder that directly contains the ``*.csv`` files.

    Returns
    -------
    pandas.DataFrame
        Row-wise concatenation of all files, with a fresh 0..n-1 index and
        the added "Series code" column.

    Raises
    ------
    ValueError
        If the folder contains no ``*.csv`` file.
    """
    source_dir = Path(folder_path)
    csv_paths = sorted(source_dir.glob("*.csv"))
    if len(csv_paths) == 0:
        raise ValueError(f"No CSV files found in folder: {source_dir}")

    def _load_tagged(csv_path):
        # Read one file and record which file (series) its rows came from.
        frame = pd.read_csv(csv_path)
        frame["Series code"] = csv_path.stem
        return frame

    tagged_frames = [_load_tagged(csv_path) for csv_path in csv_paths]
    return pd.concat(tagged_frames, ignore_index=True)

# Load all raw CSV files from the configured folder and preview the first rows.
raw_df = read_raw_data(path_to_raw_data)
raw_df.head()

# STEP 2: Reshape raw data

A critical step of this opinionated pipeline framework is to systematically reshape the raw data into tidy format (one observation per row). For more information about tidy data, please refer to [Hadley Wickham's original paper](https://vita.had.co.nz/papers/tidy-data.pdf). 

This step is critical because once the data has been reshaped into a tidy format, the rest of the pipeline can be fully standardized, bringing immediate maintenance, scalability, and institutional knowledge benefits. 

This is also a good place to implement minimal data cleaning if necessary.

In [None]:
def reshape_raw_data(df: pd.DataFrame, year_prefix: str = "YR") -> pd.DataFrame:
    """Reshape wide raw data into tidy (long) format and apply basic cleaning.

    Columns whose name starts with ``year_prefix`` (e.g. 'YR2000') are melted
    (unpivoted) into two columns, 'year' and 'value'. The prefix is then
    stripped from the 'year' labels and rows without a value are dropped.
    All other columns are kept as identifier columns.

    Args:
        df: The wide input DataFrame, with one column per year
            ('YR2000', 'YR2001', ...) plus identifier columns.
        year_prefix: Prefix that marks the year columns. Defaults to 'YR'.

    Returns:
        A new DataFrame in long format with the identifier columns followed
        by 'year' (string label without the prefix) and 'value'. Rows whose
        'value' is missing are removed; the index is not reset.
    """
    # Match on the prefix only: a substring match would also unpivot
    # identifier columns that merely contain the prefix (e.g. 'MYRATE').
    year_cols = [col for col in df.columns if str(col).startswith(year_prefix)]
    year_col_set = set(year_cols)
    id_cols = [col for col in df.columns if col not in year_col_set]

    # Equivalent of R's pivot_longer: every non-identifier column is unpivoted.
    long_df = df.melt(
        id_vars=id_cols,
        var_name="year",     # New column holding the original column names
        value_name="value",  # New column holding the cell values
    )

    # Strip only the leading prefix, leaving the rest of the label untouched.
    if year_cols:
        long_df["year"] = long_df["year"].str[len(year_prefix):]

    # Drop observations without a value.
    return long_df.dropna(subset=["value"])

# Reshape the raw wide table into tidy (long) format and preview the first rows.
tidy_raw_df = reshape_raw_data(raw_df)
tidy_raw_df.head()

# STEP 3: Describe the tidy raw data input

We will describe the tidy raw data input using an SDMX schema. This description will allow for early validation of our input data during subsequent runs of the pipeline for data updates. 

The `create_schema_from_table()` helper function allows pipeline developers to create a pysdmx schema object automatically with minimal input.

In [None]:
# Build a schema from the tidy raw table, declaring which columns act as
# dimensions, time dimension and measure.
tidy_raw_schema=create_schema_from_table(
    tidy_raw_df, 
    dimensions=["Series code", "economy"],
    time_dimension="year", 
    measure="value")

# Inspect the schema component created for the "Series code" column.
tidy_raw_schema.components["Series code"]

# STEP 4: Filter out unnecessary rows (Optional)

In [None]:
# def apply_constraints(df: pd.DataFrame, constraints: Dict[str, List]) -> pd.DataFrame:
#     """Filters a DataFrame based on a dictionary of column names and valid values.
    
#     Args:
#         df (pd.DataFrame): The source dataframe.
#         constraints (dict): A dict where keys are column names and values are 
#                         lists of valid entries to keep (e.g., {'col': ['val1', 'val2']}).
                        
#     Returns:
#         pd.DataFrame: A filtered copy of the original dataframe.
#     """
#     for column, valid_values in constraints.items():
#         # strict check: ensure column exists to avoid KeyErrors
#         if column in df.columns:
#             df = df[df[column].isin(valid_values)]
#         else:
#             print(f"Warning: Column '{column}' not in DataFrame. Skipping.")
            
#     return df


# constraints = {
#     "Series code": ["per_allsp.adq_ep_preT_tot", "per_allsp.adq_ep_tot", "per_allsp.adq_pop_preT_tot"]#,
#     # "TIME_PERIOD": ["1992"],
#     # "AREA": ["GHA"]
# }


# tidy_raw_df=apply_constraints(tidy_raw_df, constraints)

# STEP 5: Create structure map

## Fetch dissemination schema

In [None]:
# Fetch the schema of the dissemination data structure (agency, id and
# version defined in the globals cell) from the registry.
dis_schema = client.get_schema("datastructure", agency=dis_structure_agency, id=dis_structure_id, version=dis_structure_version)
# Display the fetched schema as the cell output.
dis_schema

## Create structure map from mapping template

In [None]:
# Parse the Excel mapping template, then build the structure map from the
# parsed mappings.
mappings = parse_mapping_template_wb(path_to_xlsx_mapping)
sm=build_structure_map_from_template_wb(mappings)
# Display the maps held by the structure map.
sm.maps

# Map data to dissemination schema

## Implement mapping

In [None]:
# Apply the structure map to the tidy raw data.
mapped = map_structures(df = tidy_raw_df, structure_map = sm)

## Standardize output for upload

In [None]:
# Artefact reference in the form "<agency>:<id>(<version>)".
artefact_id = f"{dis_structure_agency}:{dis_structure_id}({dis_structure_version})"

# Standardize the mapped data against the dissemination schema for upload.
out = standardize_output(df=mapped, artefact_id=artefact_id, schema=dis_schema, action="I")

# STEP 6: Final validation

In [None]:
# Validate the standardized output against the dissemination schema.
dis_errors = validate_dataset_local(df = out, schema = dis_schema)#, sdmx_cols=[])
# Display the validation result as the cell output.
dis_errors

# Testing section

In [None]:
# Series codes that were left without an INDICATOR after mapping,
# one row per series code.
errors = (
    mapped.loc[mapped['INDICATOR'].isna(), ['Series code', 'INDICATOR']]
    .drop_duplicates(subset=['Series code'])
)
# Save the unmapped series codes for review.
errors.to_csv("wb_aspire_mapping_errors.csv")

In [None]:
# One row per distinct "Series code" present in the tidy raw data.
unique_indicators = tidy_raw_df[['Series code']].drop_duplicates()