A demonstration of converting a [Python vtreat](https://github.com/WinVector/vtreat) transformation into a [data algebra](https://github.com/WinVector/data_algebra) pipeline, which can then in turn be converted to SQL queries.
[R vtreat](https://winvector.github.io/vtreat/) already has similar functionality with [as_rquery_plan()](https://winvector.github.io/vtreat/reference/as_rquery_plan.html).

In [1]:
import pandas as pd

from data_algebra.data_ops import *
import data_algebra.SQLite
import vtreat
from vtreat_db_adapter import as_data_algebra_pipeline

In [2]:
# Data from:
# https://archive.ics.uci.edu/ml/datasets/Diabetes+130-US+hospitals+for+years+1999-2008
data = pd.read_csv("diabetes_head.csv")

# Outcome column to be modeled.
outcome_name = "readmitted"

# Columns handed to the treatment as `cols_to_copy`: the two id columns plus the outcome.
cols_to_copy = ["encounter_id", "patient_nbr", outcome_name]

# This demo treats a single input variable. To treat every remaining column
# instead, use: [c for c in data.columns if c not in cols_to_copy]
vars = ['weight']

# Working column set: treated variables first, then the copied columns.
columns = [*vars, *cols_to_copy]

data.loc[:, columns]


Unnamed: 0,weight,encounter_id,patient_nbr,readmitted
0,,2278392,8222157,False
1,,64410,86047875,False
2,,500364,82442376,False
3,,35754,82637451,False
4,,55842,84259809,False
...,...,...,...,...
995,,8860284,94419315,False
996,,8860944,338247,False
997,,8864718,695439,False
998,[100-125),8866632,103586670,False


In [3]:
# Treatment options: sparse_indicators and filter_to_recommended both switched off.
treatment_params = vtreat.vtreat_parameters(
    {"sparse_indicators": False, "filter_to_recommended": False}
)

# Binomial-outcome treatment for `outcome_name`, with True as the outcome target;
# the columns in `cols_to_copy` are listed to be copied rather than treated.
treatment = vtreat.BinomialOutcomeTreatment(
    outcome_name=outcome_name,
    outcome_target=True,
    cols_to_copy=cols_to_copy,
    params=treatment_params,
)

# Fit the treatment on the working columns and keep the treated frame it returns.
data_treated = treatment.fit_transform(data.loc[:, columns])

data_treated

Unnamed: 0,encounter_id,patient_nbr,readmitted,weight_is_bad,weight_logit_code,weight_prevalence_code,weight_lev__NA_
0,2278392,8222157,False,1.0,0.004795,0.993,1.0
1,64410,86047875,False,1.0,0.005997,0.993,1.0
2,500364,82442376,False,1.0,0.007164,0.993,1.0
3,35754,82637451,False,1.0,0.007164,0.993,1.0
4,55842,84259809,False,1.0,0.008366,0.993,1.0
...,...,...,...,...,...,...,...
995,8860284,94419315,False,1.0,0.007164,0.993,1.0
996,8860944,338247,False,1.0,0.007164,0.993,1.0
997,8864718,695439,False,1.0,0.005997,0.993,1.0
998,8866632,103586670,False,0.0,0.000000,0.001,0.0


In [4]:
# Export the fitted treatment as a plain table. As displayed below, each row
# gives a treatment_class/treatment, the original and derived variable names,
# a value, and the replacement used for that value.
transform_as_data = treatment.description_matrix()

transform_as_data

Unnamed: 0,treatment_class,treatment,orig_var,variable,value,replacement
0,IndicateMissingTransform,missing_indicator,weight,weight_is_bad,_NA_,1.0
1,MappedCodeTransform,logit_code,weight,weight_logit_code,[0-25),0.0
2,MappedCodeTransform,logit_code,weight,weight_logit_code,[100-125),0.0
3,MappedCodeTransform,logit_code,weight,weight_logit_code,[50-75),0.0
4,MappedCodeTransform,logit_code,weight,weight_logit_code,[75-100),-2.129774
5,MappedCodeTransform,logit_code,weight,weight_logit_code,_NA_,0.006737
6,MappedCodeTransform,prevalence_code,weight,weight_prevalence_code,[0-25),0.001
7,MappedCodeTransform,prevalence_code,weight,weight_prevalence_code,[100-125),0.001
8,MappedCodeTransform,prevalence_code,weight,weight_prevalence_code,[50-75),0.001
9,MappedCodeTransform,prevalence_code,weight,weight_prevalence_code,[75-100),0.004


In [5]:
# Describe the input frame as a table named "data" (the keyword name passed to descr).
source_description = descr(data=data.loc[:, columns])

# Convert the treatment description into a data algebra pipeline. The pipeline
# refers to the description table by the name 'transform_as_data', so that name
# must be used when the tables are supplied later.
ops = as_data_algebra_pipeline(
    source=source_description,
    vtreat_descr=transform_as_data,
    treatment_table_name='transform_as_data',
)

print(ops)

(
    TableDescription(
        table_name="data",
        column_names=["weight", "encounter_id", "patient_nbr", "readmitted"],
    )
    .extend(
        {
            "weight_is_bad": "(weight.is_bad()).if_else(1.0, 0.0)",
            "weight_lev__NA_": "(weight.coalesce('_NA_') == '_NA_').if_else(1.0, 0.0)",
            "vtreat_join_key": "weight.coalesce('_NA_')",
        }
    )
    .natural_join(
        b=TableDescription(
            table_name="transform_as_data",
            column_names=[
                "treatment_class",
                "treatment",
                "orig_var",
                "variable",
                "value",
                "replacement",
            ],
        )
        .select_rows(
            "(treatment_class == 'MappedCodeTransform') and (orig_var == 'weight') and (variable == 'weight_logit_code')"
        )
        .extend({"vtreat_join_key": "variable", "weight_logit_code": "replacement"})
        .select_columns(["vtreat_join_key", "weight_lo

In [6]:
# Evaluate the pipeline on the in-memory frames; the dictionary keys are the
# table names the pipeline refers to.
# NOTE(review): in the recorded output below, weight_logit_code and
# weight_prevalence_code are missing on every displayed row, while data_treated
# has values for them. The printed pipeline builds the right-hand
# vtreat_join_key from `variable`, whereas the left-hand key holds weight levels
# (the `value` column of transform_as_data) -- looks like the join never
# matches; confirm in vtreat_db_adapter.
eval_tables = {
    'data': data.loc[:, columns],
    'transform_as_data': transform_as_data,
}
transformed = ops.eval(eval_tables)

transformed

Unnamed: 0,encounter_id,patient_nbr,readmitted,weight_is_bad,weight_lev__NA_,weight_logit_code,weight_prevalence_code
0,2278392,8222157,False,1.0,1.0,,
1,64410,86047875,False,1.0,1.0,,
2,500364,82442376,False,1.0,1.0,,
3,35754,82637451,False,1.0,1.0,,
4,55842,84259809,False,1.0,1.0,,
...,...,...,...,...,...,...,...
995,8860284,94419315,False,1.0,1.0,,
996,8860944,338247,False,1.0,1.0,,
997,8864718,695439,False,1.0,1.0,,
998,8866632,103586670,False,0.0,0.0,,


In [7]:
# Database model used to render the pipeline as SQL (SQLite dialect).
db_model = data_algebra.SQLite.SQLiteModel()

# Translate the same `ops` pipeline into a SQL query string and show it.
sql = db_model.to_sql(ops)
print(sql)


-- data_algebra SQL https://github.com/WinVector/data_algebra
--  dialect: SQLiteModel
--       string quote: '
--   identifier quote: "
WITH
 "extend_0" AS (
  SELECT  -- .extend({ 'weight_is_bad': '(weight.is_bad()).if_else(1.0, 0.0)', 'weight_lev__NA_': "(weight.coalesce('_NA_') == '_NA_').if_else(1.0, 0.0)", 'vtreat_join_key': "weight.coalesce('_NA_')"})
   "weight" ,
   "patient_nbr" ,
   "encounter_id" ,
   "readmitted" ,
   CASE WHEN is_bad("weight") THEN 1.0 WHEN NOT is_bad("weight") THEN 0.0 ELSE NULL END AS "weight_is_bad" ,
   CASE WHEN (COALESCE("weight", '_NA_') = '_NA_') THEN 1.0 WHEN NOT (COALESCE("weight", '_NA_') = '_NA_') THEN 0.0 ELSE NULL END AS "weight_lev__NA_" ,
   COALESCE("weight", '_NA_') AS "vtreat_join_key"
  FROM
   "data"
 ) ,
 "table_reference_1" AS (
  SELECT
   "variable" ,
   "replacement" ,
   "orig_var" ,
   "treatment_class"
  FROM
   "transform_as_data"
 ) ,
 "select_rows_2" AS (
  SELECT  -- .select_rows("(treatment_class == 'MappedCodeTransform')