# Overview

The purpose of this notebook is to understand the logic of `transform_source_to_target` developed by Tony, then to improve it and make it more robust.

**Note: This notebook is for development purposes only. It is not intended for production use, and its cells are not meant to be executed sequentially.**

# Setup

In [None]:
# import pysdmx as px

In [None]:
# from pysdmx.io.format import StructureFormat # To extract json format
# from pysdmx.api import fmr # Client to connect to FMR
# from urllib.parse import urljoin

In [1]:
import pandas as pd
import json 

# Remove the column cap so wide DataFrames are displayed with every column visible
pd.set_option('display.max_columns', None) 

In [2]:
# Load the source CSV file into `raw` and preview its first rows
raw = pd.read_csv("./wb_shp_bronze.csv")
raw.head()

Unnamed: 0,indicator,year,countrycode,period,welftype,spell,subgroup,value
0,WB.SP.meantotal,2008,ALB,2008-2012,CONS,1,National,10.03817
1,WB.SP.meantotal,2012,ALB,2008-2012,CONS,2,National,9.517231
2,WB.SP.meantotal,2014,ALB,2014-2017,CONS,1,National,10.14131
3,WB.SP.meantotal,2017,ALB,2014-2017,CONS,2,National,12.41122
4,WB.SP.meantotal,2013,ARE,2013-2018,INC,1,National,72.450951


In [8]:
# Load the master mapping (JSON) and list its top-level keys
mapping_file_path = "./master_mapping_wb_shp.json"
# Pass the encoding explicitly: without it, open() falls back to the platform's
# locale encoding, which can mis-decode non-ASCII characters in the JSON file.
with open(mapping_file_path, 'r', encoding="utf-8") as f:
    mapping = json.load(f)
mapping.keys()

dict_keys(['schema_version', 'dsd_id', 'components', 'representation'])

In [23]:
# Check which container type json.load produced for the mapping
type(mapping)

dict

# transform_source_to_target

## Line by line

In [9]:
# Corrupted mapping for the error-handling experiment:
# take a shallow copy of `mapping` and drop its 'components' entry from the copy only.
no_comp_mapping = dict(mapping)
no_comp_mapping.pop('components')
print(no_comp_mapping.keys())

dict_keys(['schema_version', 'dsd_id', 'representation'])


In [10]:
# Extract the list of SOURCE -> TARGET component entries from the mapping and display it
components_map = mapping["components"]
components_map

[{'SOURCE': 'NA', 'TARGET': 'FREQ'},
 {'SOURCE': 'countrycode', 'TARGET': 'REF_AREA'},
 {'SOURCE': 'indicator', 'TARGET': 'INDICATOR'},
 {'SOURCE': 'NA', 'TARGET': 'SEX'},
 {'SOURCE': 'NA', 'TARGET': 'AGE'},
 {'SOURCE': 'NA', 'TARGET': 'URBANISATION'},
 {'SOURCE': 'indicator', 'TARGET': 'UNIT_MEASURE'},
 {'SOURCE': 'welftype', 'TARGET': 'COMP_BREAKDOWN_1'},
 {'SOURCE': 'period', 'TARGET': 'COMP_BREAKDOWN_2'},
 {'SOURCE': 'NA', 'TARGET': 'COMP_BREAKDOWN_3'},
 {'SOURCE': 'year', 'TARGET': 'TIME_PERIOD'},
 {'SOURCE': 'NA', 'TARGET': 'AGG_METHOD'},
 {'SOURCE': 'indicator', 'TARGET': 'UNIT_TYPE'},
 {'SOURCE': 'NA', 'TARGET': 'DECIMALS'},
 {'SOURCE': 'NA', 'TARGET': 'DATABASE_ID'},
 {'SOURCE': 'NA', 'TARGET': 'TIME_FORMAT'},
 {'SOURCE': 'NA', 'TARGET': 'COMMENT_TS'},
 {'SOURCE': 'NA', 'TARGET': 'COMMENT_OBS'},
 {'SOURCE': 'NA', 'TARGET': 'UNIT_MULT'},
 {'SOURCE': 'NA', 'TARGET': 'OBS_STATUS'},
 {'SOURCE': 'NA', 'TARGET': 'DATA_SOURCE'},
 {'SOURCE': 'NA', 'TARGET': 'OBS_CONF'},
 {'SOURCE': 'v

In [14]:
# Demonstrate the failure mode: looking up 'components' in the corrupted mapping
# raises KeyError, which is turned into a readable message.
try:
    no_comp_mapping['components']
except KeyError:
    # Plain string: the message has no placeholders, so no f-string is needed.
    print("The mapping file should contain 'components' key. Please make sure the mapping file has this key.")

The mapping file should contain 'components' key. Please make sure the mapping file has this key.


In [15]:
# Display the component entries again before they are converted to a DataFrame
components_map

[{'SOURCE': 'NA', 'TARGET': 'FREQ'},
 {'SOURCE': 'countrycode', 'TARGET': 'REF_AREA'},
 {'SOURCE': 'indicator', 'TARGET': 'INDICATOR'},
 {'SOURCE': 'NA', 'TARGET': 'SEX'},
 {'SOURCE': 'NA', 'TARGET': 'AGE'},
 {'SOURCE': 'NA', 'TARGET': 'URBANISATION'},
 {'SOURCE': 'indicator', 'TARGET': 'UNIT_MEASURE'},
 {'SOURCE': 'welftype', 'TARGET': 'COMP_BREAKDOWN_1'},
 {'SOURCE': 'period', 'TARGET': 'COMP_BREAKDOWN_2'},
 {'SOURCE': 'NA', 'TARGET': 'COMP_BREAKDOWN_3'},
 {'SOURCE': 'year', 'TARGET': 'TIME_PERIOD'},
 {'SOURCE': 'NA', 'TARGET': 'AGG_METHOD'},
 {'SOURCE': 'indicator', 'TARGET': 'UNIT_TYPE'},
 {'SOURCE': 'NA', 'TARGET': 'DECIMALS'},
 {'SOURCE': 'NA', 'TARGET': 'DATABASE_ID'},
 {'SOURCE': 'NA', 'TARGET': 'TIME_FORMAT'},
 {'SOURCE': 'NA', 'TARGET': 'COMMENT_TS'},
 {'SOURCE': 'NA', 'TARGET': 'COMMENT_OBS'},
 {'SOURCE': 'NA', 'TARGET': 'UNIT_MULT'},
 {'SOURCE': 'NA', 'TARGET': 'OBS_STATUS'},
 {'SOURCE': 'NA', 'TARGET': 'DATA_SOURCE'},
 {'SOURCE': 'NA', 'TARGET': 'OBS_CONF'},
 {'SOURCE': 'v

In [16]:
# Normalise the components to a DataFrame; a value that is not a list is left as it is
components_map = pd.DataFrame(components_map) if isinstance(components_map, list) else components_map

In [17]:
# Inspect the components after the list -> DataFrame conversion (SOURCE and TARGET columns)
components_map

Unnamed: 0,SOURCE,TARGET
0,,FREQ
1,countrycode,REF_AREA
2,indicator,INDICATOR
3,,SEX
4,,AGE
5,,URBANISATION
6,indicator,UNIT_MEASURE
7,welftype,COMP_BREAKDOWN_1
8,period,COMP_BREAKDOWN_2
9,,COMP_BREAKDOWN_3


In [18]:
# Start from a frame with no rows whose columns are the TARGET names, in mapping order
result_df = pd.DataFrame(columns=components_map["TARGET"].values)
result_df

Unnamed: 0,FREQ,REF_AREA,INDICATOR,SEX,AGE,URBANISATION,UNIT_MEASURE,COMP_BREAKDOWN_1,COMP_BREAKDOWN_2,COMP_BREAKDOWN_3,TIME_PERIOD,AGG_METHOD,UNIT_TYPE,DECIMALS,DATABASE_ID,TIME_FORMAT,COMMENT_TS,COMMENT_OBS,UNIT_MULT,OBS_STATUS,DATA_SOURCE,OBS_CONF,OBS_VALUE


In [20]:
# Walk the SOURCE/TARGET pairs in mapping order and copy each raw column
# into its target column of result_df.
for source_col, target_col in zip(components_map["SOURCE"], components_map["TARGET"]):
	# Sources that are not actual columns of raw (e.g. the 'NA' placeholder) are skipped,
	# leaving the corresponding target column empty.
	if source_col not in raw.columns:
		continue
	result_df[target_col] = raw[source_col]

In [21]:
# Inspect the populated frame; target columns without a matching source column stay empty
result_df

Unnamed: 0,FREQ,REF_AREA,INDICATOR,SEX,AGE,URBANISATION,UNIT_MEASURE,COMP_BREAKDOWN_1,COMP_BREAKDOWN_2,COMP_BREAKDOWN_3,TIME_PERIOD,AGG_METHOD,UNIT_TYPE,DECIMALS,DATABASE_ID,TIME_FORMAT,COMMENT_TS,COMMENT_OBS,UNIT_MULT,OBS_STATUS,DATA_SOURCE,OBS_CONF,OBS_VALUE
0,,ALB,WB.SP.meantotal,,,,WB.SP.meantotal,CONS,2008-2012,,2008,,WB.SP.meantotal,,,,,,,,,,10.038170
1,,ALB,WB.SP.meantotal,,,,WB.SP.meantotal,CONS,2008-2012,,2012,,WB.SP.meantotal,,,,,,,,,,9.517231
2,,ALB,WB.SP.meantotal,,,,WB.SP.meantotal,CONS,2014-2017,,2014,,WB.SP.meantotal,,,,,,,,,,10.141310
3,,ALB,WB.SP.meantotal,,,,WB.SP.meantotal,CONS,2014-2017,,2017,,WB.SP.meantotal,,,,,,,,,,12.411220
4,,ARE,WB.SP.meantotal,,,,WB.SP.meantotal,INC,2013-2018,,2013,,WB.SP.meantotal,,,,,,,,,,72.450951
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6907,,ZAF,WB.SP.growthb40,,,,WB.SP.growthb40,CONS,2010-2014,,2014,,WB.SP.growthb40,,,,,,,,,,-1.360486
6908,,ZMB,WB.SP.growthb40,,,,WB.SP.growthb40,CONS,2010-2015,,2010,,WB.SP.growthb40,,,,,,,,,,-1.976578
6909,,ZMB,WB.SP.growthb40,,,,WB.SP.growthb40,CONS,2010-2015,,2015,,WB.SP.growthb40,,,,,,,,,,-1.976578
6910,,ZWE,WB.SP.growthb40,,,,WB.SP.growthb40,CONS,2011-2017,,2011,,WB.SP.growthb40,,,,,,,,,,-3.749366


In [22]:
# Compare shapes: the row counts should match, while the column count follows the mapping
print(raw.shape, result_df.shape, sep="\n")

(6912, 8)
(6912, 23)


## function

In [None]:
def transform_source_to_target(
		raw: pd.DataFrame,
		mapping: dict
	) -> pd.DataFrame:
	"""Reshape a raw DataFrame into the target columns declared in the mapping.

	The mapping's 'components' entry pairs each 'SOURCE' column name of ``raw``
	with a 'TARGET' column name. The result has one column per 'TARGET', in
	mapping order. A target column is filled with the data of its source column
	when that source is an actual column of ``raw``; otherwise (e.g. the 'NA'
	placeholder) the target column is left empty.

	Args:
		raw (pd.DataFrame): The input DataFrame with raw data.
		mapping (dict): The master mapping dictionary. Its 'components' entry is
			either a list of ``{'SOURCE': ..., 'TARGET': ...}`` dicts or a
			DataFrame with 'SOURCE' and 'TARGET' columns.

	Returns:
		pd.DataFrame: A new DataFrame whose columns are the 'TARGET' names.

	Raises:
		KeyError: If ``mapping`` has no 'components' key, or if the components
			do not provide both 'SOURCE' and 'TARGET' fields.
	"""
	# Guard only the lookup itself, so that unrelated KeyErrors raised further
	# down are not misreported as a missing 'components' key.
	try:
		components_map = mapping["components"]
	except KeyError as e:
		raise KeyError("The mapping file should contain 'components' key. Please make sure the mapping file has this key.") from e

	# If the components_map is a list, create a dataframe with source and target columns
	if isinstance(components_map, list):
		components_map = pd.DataFrame(components_map)

	# Fail with an accurate message when the components are malformed
	# (an empty list also lands here because it yields a frame with no columns).
	missing_fields = [field for field in ("SOURCE", "TARGET") if field not in components_map]
	if missing_fields:
		raise KeyError(f"Each entry in 'components' must define 'SOURCE' and 'TARGET'; missing: {missing_fields}.")

	# Create an empty DataFrame with target columns
	result_df = pd.DataFrame(columns=components_map["TARGET"].values)

	# Pair the two columns directly instead of building a Series per row with iterrows()
	for source_col, target_col in zip(components_map["SOURCE"], components_map["TARGET"]):
		# If source_col exists in raw, populate the corresponding column in result_df
		if source_col in raw.columns:
			result_df[target_col] = raw[source_col]

	return result_df

## Adding tests

In [26]:
# Toy input: two columns referenced by the mapping plus one ("col_extra") that is not
test_df = pd.DataFrame(
    {
        "col_a": [1, 2, 3],
        "col_b": list("xyz"),
        "col_extra": [10, 20, 30],
    }
)

# Toy mapping: two real source columns and one "NA" placeholder source
test_mapping = {
    "components": [
        {"SOURCE": source_name, "TARGET": target_name}
        for source_name, target_name in (
            ("col_a", "target_a"),
            ("col_b", "target_b"),
            ("NA", "target_c"),
        )
    ]
}

In [27]:
# Run the function on the toy inputs and print the outcome.
# NOTE: this rebinds `result_df`, replacing the frame built in the line-by-line walkthrough.
result_df = transform_source_to_target(test_df, test_mapping)
print(result_df)

   target_a target_b target_c
0         1        x      NaN
1         2        y      NaN
2         3        z      NaN


In [29]:
# The "NA" source has no matching column, so the target value should be missing.
# Use a single label-based .loc lookup instead of chained indexing (df[col][row]).
result_df.loc[0, "target_c"]

nan