# Mapping component design

## Mapping schema input JSON structure

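`mapping_schema.json` — each top-level key is an input column name. For that column, `match` names the corresponding global column, `values` maps raw input values to their global equivalents, and `default` is used for any value not listed in `values`: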

```json
{
    "input_column_1": {
        "match": "global_column_1",
        "default": "Unknown",
        "values": {
            "v1": "gv1",
            "v2": "gv2",
            "v3": "gv3"
        }
    },
    "input_column_2": {
        "match": "global_column_2",
        "default": "nan",
        "values": {
            "vv1": "gvv1",
            "vv2": "gvv2"
        }
    }
}

```

In [10]:
import pandas as pd

# Ground-truth table for Dou (presumably the expected column matches;
# verify against the CSV contents).
dou_groundtruth = pd.read_csv(
    "../data/table-matching-ground-truth/ground-truth/Dou.csv"
)

# Raw Dou metadata, read from one named worksheet of the datalake workbook.
dou = pd.read_excel(
    "../data/datalake/Dou.xlsx",
    sheet_name="UCEC_CPTAC3_meta_table_V2.1",
)

# Preview the column used by the mapping example further down.
dou[["FIGO_stage"]]

Unnamed: 0,FIGO_stage
0,IA
1,IA
2,IA
3,
4,IA
...,...
148,
149,
150,
151,


In [11]:
# Example schema for the ``FIGO_stage`` column: every stage code is keyed by
# its lower-cased spelling and standardized to "Stage <CODE>".
sample_mapping_schema = {
    "FIGO_stage": {
        "match": "figo_stage",
        "default": "Unknown",
        "values": {
            code.lower(): "Stage " + code
            for code in (
                "IIIA",
                "II",
                "IIIB",
                "IA",
                "IVB",
                "IIIC2",
                "IIIC1",
                "IB",
            )
        },
    }
}

class Mapping:
    """Standardize DataFrame column values according to a mapping schema.

    The schema is a dict keyed by input column name. Each entry provides:

    * ``"values"``: lookup from the lower-cased raw value to its
      standardized replacement.
    * ``"default"``: replacement for every value that has no entry in
      ``"values"`` (missing cells included).
    * ``"match"``: not read by :meth:`map`, which never renames columns.
      Presumably the name of the corresponding global column -- TODO confirm.
    """

    def __init__(self, schema):
        """Keep ``schema`` for later calls to :meth:`map`."""
        self.schema = schema

    def map(self, df):
        """Replace the values of every schema column that exists in ``df``.

        String values are lower-cased before being looked up in the column's
        ``"values"`` table; anything without a match becomes the column's
        ``"default"``. Schema columns absent from ``df`` are skipped.

        Args:
            df: DataFrame to standardize. It is modified in place.

        Returns:
            The same ``df`` object, with the mapped columns overwritten.
        """
        for column, spec in self.schema.items():
            if column not in df.columns:
                continue
            series = df[column]
            try:
                lowered = series.str.lower()
            except AttributeError:
                # The ``.str`` accessor is rejected for non-string dtypes
                # (e.g. an all-empty column loaded as float64). Look the raw
                # values up unchanged so they still fall back to the default
                # instead of aborting the whole mapping.
                lowered = series
            df[column] = lowered.map(spec["values"]).fillna(spec["default"])
        return df
    
# Apply the sample schema to the Dou table. ``Mapping.map`` assigns into the
# frame it receives and returns that same frame, so ``dou_new`` and ``dou``
# refer to one object.
mapping = Mapping(schema=sample_mapping_schema)
dou_new = mapping.map(df=dou)

# Show the standardized stage column.
dou_new[["FIGO_stage"]]

Unnamed: 0,FIGO_stage
0,Stage IA
1,Stage IA
2,Stage IA
3,Unknown
4,Stage IA
...,...
148,Unknown
149,Unknown
150,Unknown
151,Unknown
