# Transformation component design

## Transformation schema input json topology

The transformation component takes the input file `transformation_schema.json`, a nested dictionary that contains all column-to-column and value-to-value pairing schemas.

```json
{
    "FIGO_stage": {
        "match": "figo_stage",
        "default": "Unknown",
        "values": {
            "iiia": ["Stage IIIA"],
            "ii": ["Stage II"],
            "iiib": ["Stage IIIB"],
            "ia": ["Stage IA"],
            "ivb": ["Stage IVB"],
            "iiic2": ["Stage IIIC2"],
            "iiic1": ["Stage IIIC1"],
            "ib": ["Stage IB"]
        },
        "ignore_case": true
    },
    "Tumor_Site": {
        "match": "tissue_or_organ_of_origin",
        "default": "other",
        "values": {
            "specify": ["Other specified parts of pancreas",
                        "Other specified parts of female genital organs",
                        "Other specified parts of male genital organs"],
            "posterior endometrium": ["Posterior mediastinum"],
            "anterior endometrium": ["Anterior mediastinum"]
        },
        "ignore_case": true
    },
    "Gender": {
        "match": "gender",
        "default": "unknown",
        "values": {
            "female": ["female"]
        },
        "ignore_case": true
    }
}
```

In [17]:
import pandas as pd

# Ground-truth file for the Dou table (presumably the expected column
# matches -- TODO confirm against the ground-truth folder's documentation).
dou_groundtruth = pd.read_csv("../data/table-matching-ground-truth/ground-truth/Dou.csv")
# Source table: one named sheet of the Dou workbook from the data lake.
dou = pd.read_excel("../data/datalake/Dou.xlsx", sheet_name="UCEC_CPTAC3_meta_table_V2.1")

# Show the three columns that the sample transformation schema in this file covers.
dou[["FIGO_stage", "Tumor_Site", "Gender"]]

Unnamed: 0,FIGO_stage,Tumor_Site,Gender
0,IA,Anterior endometrium,Female
1,IA,Posterior endometrium,Female
2,IA,"Other, specify",Female
3,,,
4,IA,"Other, specify",Female
...,...,...,...
148,,,
149,,,
150,,,
151,,,


In [16]:
import json

# Sample transformation schema, keyed by source column name.
# Per-column entry:
#   "match"       - name paired with the source column (presumably the target
#                   column name; it is stored in the schema only).
#   "default"     - replacement for any cell that has no entry in "values".
#   "values"      - source value -> list of replacement strings.
#   "ignore_case" - whether source values are matched case-insensitively.
sample_transformation_schema = {
    "FIGO_stage": {
        "match": "figo_stage",
        "default": "Unknown",
        "values": {
            "iiia": ["Stage IIIA"],
            "ii": ["Stage II"],
            "iiib": ["Stage IIIB"],
            "ia": ["Stage IA"],
            "ivb": ["Stage IVB"],
            "iiic2": ["Stage IIIC2"],
            "iiic1": ["Stage IIIC1"],
            "ib": ["Stage IB"],
        },
        "ignore_case": True,
    },
    "Tumor_Site": {
        "match": "tissue_or_organ_of_origin",
        "default": "other",
        "values": {
            "specify": [
                "Other specified parts of pancreas",
                "Other specified parts of female genital organs",
                "Other specified parts of male genital organs",
            ],
            "posterior endometrium": ["Posterior mediastinum"],
            "anterior endometrium": ["Anterior mediastinum"],
        },
        "ignore_case": True,
    },
    "Gender": {
        "match": "gender",
        "default": "unknown",
        "values": {
            "female": ["female"],
        },
        "ignore_case": True,
    },
}

# Persist the sample schema so it can be consumed as the component's input file.
with open('transformation_schema.json', 'w', encoding='utf-8') as f:
    json.dump(sample_transformation_schema, f)


# Reload the schema from disk; the context manager closes the file handle,
# which a bare json.load(open(...)) would leave open.
with open('transformation_schema.json', encoding='utf-8') as f:
    sample_transformation_schema = json.load(f)

class Transformation:
    """Apply a value-to-value transformation schema to DataFrame columns.

    The schema is a dict keyed by source column name. Each entry provides:

    * ``"values"``: dict mapping a source value to a list of replacement
      strings; the list is joined with ``", "`` into a single replacement.
    * ``"default"``: value written wherever a cell has no entry in
      ``"values"`` (this includes missing cells).
    * ``"ignore_case"`` (optional): when truthy, source values are matched
      against the ``"values"`` keys case-insensitively.

    Other keys, such as ``"match"``, are not used by :meth:`map`.
    """

    def __init__(self, schema):
        """Store the transformation schema (see the class docstring)."""
        self.schema = schema

    @staticmethod
    def _fold_case(value):
        """Lower-case *value* if it is a string; return anything else as is."""
        return value.lower() if isinstance(value, str) else value

    def map(self, df):
        """Replace the values of every schema column present in *df*.

        Columns named in the schema but absent from *df* are skipped.

        Args:
            df: pandas DataFrame to transform. It is modified in place.

        Returns:
            The same DataFrame object that was passed in.

        Raises:
            KeyError: if a schema entry for a column present in *df* lacks
                ``"default"`` or ``"values"``.
        """
        for column, spec in self.schema.items():
            if column not in df.columns:
                continue
            default = spec["default"]
            lookup = {key: ", ".join(targets) for key, targets in spec["values"].items()}
            source = df[column]

            if spec.get("ignore_case"):
                # Fold BOTH sides: lower-casing only the data would make any
                # schema key that contains an upper-case letter unmatchable.
                lookup = {self._fold_case(key): joined for key, joined in lookup.items()}
                # Element-wise fold instead of `.str.lower()`, which raises
                # AttributeError on non-string columns (e.g. all-NaN floats).
                source = source.map(self._fold_case)
            print(lookup)

            # Cells with no entry in the lookup become NaN and then the default.
            df[column] = source.map(lookup).fillna(default)

        return df
    
# Apply the sample schema to the Dou table.
# NOTE: Transformation.map assigns into the DataFrame it receives and returns
# that same object, so `dou_new` and `dou` refer to the same, modified frame.
transformation = Transformation(sample_transformation_schema)
dou_new = transformation.map(dou)

# Show the transformed columns.
dou_new[["FIGO_stage", "Tumor_Site", "Gender"]]

{'iiia': 'Stage IIIA', 'ii': 'Stage II', 'iiib': 'Stage IIIB', 'ia': 'Stage IA', 'ivb': 'Stage IVB', 'iiic2': 'Stage IIIC2', 'iiic1': 'Stage IIIC1', 'ib': 'Stage IB'}
{'specify': 'Other specified parts of pancreas, Other specified parts of female genital organs, Other specified parts of male genital organs', 'posterior endometrium': 'Posterior mediastinum', 'anterior endometrium': 'Anterior mediastinum'}
{'female': 'female'}


Unnamed: 0,FIGO_stage,Tumor_Site,Gender
0,Stage IA,Anterior mediastinum,female
1,Stage IA,Posterior mediastinum,female
2,Stage IA,other,female
3,Unknown,other,unknown
4,Stage IA,other,female
...,...,...,...
148,Unknown,other,unknown
149,Unknown,other,unknown
150,Unknown,other,unknown
151,Unknown,other,unknown
