Composition by nesting can run into issues. Recursive code may need an excessive amount of stack to examine large composed objects, and one may also run into query size limits. An alternative to the nested composition demonstrated [here](https://github.com/WinVector/pyvtreat/blob/main/Examples/Database/vtreat_db_adapter.ipynb) is composing by sequential SQL updates. [Python vtreat](https://github.com/WinVector/pyvtreat) supplies SQL update composition, which we will demonstrate here.

The idea is: it may be better to deal with a large number of joins by sequencing through `UPDATE` statements instead of composing everything into a single large query.

Let's import our packages and build our example data.

In [1]:

import datetime
import os
import sys
import numpy as np
import pandas as pd

from data_algebra.data_ops import *
import data_algebra.SQLite
import data_algebra.test_util
import vtreat
import vtreat.vtreat_db_adapter



In [2]:
# larger version of tests/test_db_adapter.py:test_db_adapter_monster()
def mk_example(n_rows:int = 100, n_vars:int = 50, rng=None):
    """Build a synthetic regression data set with two-level categorical inputs.

    Each of the ``n_vars`` columns ``v_0 .. v_{n_vars-1}`` takes values in
    ``{'a', 'b'}``. The outcome ``y`` starts as normal noise and is shifted by
    ``+step`` for every ``'a'`` and by ``-step`` for every ``'b'``, where
    ``step = 1/sqrt(n_vars)``.

    :param n_rows: number of rows to generate.
    :param n_vars: number of categorical explanatory variables.
    :param rng: optional random source exposing ``normal`` and ``choice``
        (for example ``np.random.default_rng(seed)``). When ``None`` the
        global ``np.random`` state is used, so existing callers are unaffected.
    :return: ``(d, var_names)``: the generated ``pandas.DataFrame`` (variable
        columns in creation order, followed by ``y``) and the list of
        variable names sorted lexicographically (so ``v_10`` sorts before
        ``v_2``).
    """
    if rng is None:
        # default to the legacy global generator to keep prior behavior
        rng = np.random
    # step is only used inside the loop; skip the division when there are no
    # variables so n_vars == 0 does not emit a divide-by-zero warning
    step = 1 / np.sqrt(n_vars) if n_vars > 0 else 0.0
    cols = {}
    # draw order (noise first, then one draw per variable) is kept fixed so
    # seeded runs reproduce the same data
    y = rng.normal(size=n_rows)
    for i in range(n_vars):
        vname = f'v_{i}'
        v = rng.choice(['a', 'b'], replace=True, size=n_rows)
        y = y + np.where(v == 'a', step, -step)
        cols[vname] = v
    # capture the variable names before the outcome column is added
    var_names = sorted(cols)
    cols['y'] = y
    d = pd.DataFrame(cols)
    return d, var_names


# number of categorical explanatory variables used in this example
n_vars = 5

# d: frame the treatment is fit on (mk_example's default of 100 rows);
# vars: the sorted explanatory variable names
d, vars = mk_example(n_vars=n_vars)
# d_app: small 5-row frame the fitted treatment is later applied to
d_app, _ = mk_example(n_vars=n_vars, n_rows=5)

In [3]:
# name of the outcome (dependent variable) column produced by mk_example
outcome_name = "y"
# columns handed to the treatment's cols_to_copy argument
cols_to_copy = [outcome_name]
# full column list of the source table: explanatory variables plus outcome
columns = vars + cols_to_copy

Now let's build our vtreat data treatment plan.

In [4]:
# configure a vtreat treatment for a numeric outcome, with sparse indicators
# and filtering to recommended variables both switched off
treatment = vtreat.NumericOutcomeTreatment(
    cols_to_copy=cols_to_copy,
    outcome_name=outcome_name,
    params=vtreat.vtreat_parameters(
        {"sparse_indicators": False, "filter_to_recommended": False,}
    ),
)
# fit the treatment on d and keep the treated version of d
d_train_treated = treatment.fit_transform(d)
# apply the fitted treatment in memory; this is the reference result the
# database result is compared against later
d_app_treated = treatment.transform(d_app)

# last expression in the cell: displays the treated application frame
d_app_treated

Unnamed: 0,y,v_2_impact_code,v_2_deviation_code,v_2_prevalence_code,v_2_lev_a,v_2_lev_b,v_3_impact_code,v_3_deviation_code,v_3_prevalence_code,v_3_lev_b,...,v_1_impact_code,v_1_deviation_code,v_1_prevalence_code,v_1_lev_a,v_1_lev_b,v_4_impact_code,v_4_deviation_code,v_4_prevalence_code,v_4_lev_b,v_4_lev_a
0,2.72516,0.492885,1.272021,0.52,1.0,0.0,0.562714,1.182533,0.39,0.0,...,0.468717,1.311067,0.51,1.0,0.0,0.232438,1.271815,0.48,0.0,1.0
1,-0.541291,0.492885,1.272021,0.52,1.0,0.0,-0.364053,1.35433,0.61,1.0,...,-0.489594,1.247043,0.49,0.0,1.0,0.232438,1.271815,0.48,0.0,1.0
2,-0.805834,0.492885,1.272021,0.52,1.0,0.0,-0.364053,1.35433,0.61,1.0,...,0.468717,1.311067,0.51,1.0,0.0,-0.208719,1.421408,0.52,1.0,0.0
3,-0.18788,-0.532116,1.261658,0.48,0.0,1.0,-0.364053,1.35433,0.61,1.0,...,-0.489594,1.247043,0.49,0.0,1.0,0.232438,1.271815,0.48,0.0,1.0
4,-0.277967,-0.532116,1.261658,0.48,0.0,1.0,-0.364053,1.35433,0.61,1.0,...,-0.489594,1.247043,0.49,0.0,1.0,0.232438,1.271815,0.48,0.0,1.0


We then export this data treatment as a table. This table encodes the proposed data treatment. It can be used to build a SQL query (as we showed [here](https://github.com/WinVector/pyvtreat/blob/main/Examples/Database/vtreat_db_adapter.ipynb)), or be used to drive a stored procedure, or be translated into a sequence of SQL updates. We are demonstrating the SQL update method here.

In [5]:
# export the fitted treatment as a table describing the transform; this table
# is later inserted into the database and drives the generated SQL
transform_as_data = treatment.description_matrix()

Now we adapt the description of the data treatment into the SQL steps.

In [6]:
# SQLite dialect model: passed to the SQL generator and used to quote
# identifiers when reading the result back
db_model = data_algebra.SQLite.SQLiteModel()  # model of database syntax

In [7]:
# describe the source table by name and column list; the actual rows are
# inserted into the database separately
source_descr = TableDescription(  # description of data to be transformed
    table_name='d_app',
    column_names=columns,
)


In [8]:
# table names used by the generated SQL sequence
# NOTE: the generated SQL (printed at the end) begins with DROP TABLE IF EXISTS
# for the stage-3 and result tables, so any existing tables with those names
# are replaced
treatment_table_name = 'transform_as_data'  # name to use for treatment description
stage_3_name = 'vtreat_stage_3_table'  # name for a temp table
result_name = 'data_treated'  # name for desired result

In [9]:
# build the SQL sequence: an iterable of SQL statement strings that is
# executed in order against the database and printed at the end
sql_sequence = vtreat.vtreat_db_adapter.as_sql_update_sequence(
    db_model=db_model,
    source=source_descr,
    vtreat_descr=transform_as_data,
    treatment_table_name=treatment_table_name,
    stage_3_name=stage_3_name,
    result_name=result_name)

This sequence of SQL commands can then be used in our target database.

In [10]:
# example SQLite handle supplied by data_algebra
# (presumably an in-memory database; TODO confirm); closed explicitly later
db_handle = data_algebra.SQLite.example_handle()

# simulate data already in database, by inserting it
# (the table name must match the one given in source_descr)
_ = db_handle.insert_table(d_app, table_name=source_descr.table_name)


In [11]:
# insert the data transform specification
# (under the same table name that was given to the SQL generator)
db_handle.insert_table(transform_as_data, table_name=treatment_table_name)

# execute all of the sql statements in order
# (order matters: the sequence drops and creates tables before later steps)
for sql in sql_sequence:
    db_handle.execute(sql)

At this point we have created the prepared data table. This is purely in the database with no required data transport, other than the copying of the transform description into the database (the insertion of the data is a step we assume would not be needed in an actual application).

We can then take a look at the result as follows.

In [12]:
# read the finished result table back into memory for inspection; the table
# name is quoted using the dialect's identifier quoting
db_res = db_handle.read_query(
    f'SELECT * FROM {db_model.quote_identifier(result_name)}')

# last expression in the cell: displays the database result
db_res

Unnamed: 0,v_0_lev_b,v_4_lev_b,v_0_lev_a,v_1_lev_a,v_3_lev_a,v_3_lev_b,v_1_lev_b,v_4_lev_a,v_2_lev_b,y,...,v_1_prevalence_code,v_2_deviation_code,v_2_impact_code,v_2_prevalence_code,v_3_deviation_code,v_3_impact_code,v_3_prevalence_code,v_4_deviation_code,v_4_impact_code,v_4_prevalence_code
0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,2.72516,...,0.51,1.272021,0.492885,0.52,1.182533,0.562714,0.39,1.271815,0.232438,0.48
1,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,-0.541291,...,0.49,1.272021,0.492885,0.52,1.35433,-0.364053,0.61,1.271815,0.232438,0.48
2,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,-0.805834,...,0.51,1.272021,0.492885,0.52,1.35433,-0.364053,0.61,1.421408,-0.208719,0.52
3,0.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,-0.18788,...,0.49,1.261658,-0.532116,0.48,1.35433,-0.364053,0.61,1.271815,0.232438,0.48
4,0.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,-0.277967,...,0.49,1.261658,-0.532116,0.48,1.35433,-0.364053,0.61,1.271815,0.232438,0.48


In [13]:
# confirm the database result matches the in-memory example
# NOTE(review): the two displayed frames list their columns in different
# orders, so equivalent_frames presumably tolerates column order - confirm
assert data_algebra.test_util.equivalent_frames(d_app_treated, db_res)

And that is the SQL update realization of a vtreat machine learning data preparation transform.

In [14]:
# release the database handle now that the result has been read and checked
db_handle.close()

In [15]:
# print all the SQL steps
# (each statement is followed by ';' and blank lines so the output can be
# read as a script; no database connection is needed for this step)
for sql in sql_sequence:
    print(sql + ';\n\n')

DROP TABLE IF EXISTS "vtreat_stage_3_table";


DROP TABLE IF EXISTS "data_treated";


CREATE TABLE "vtreat_stage_3_table" AS 
-- data_algebra SQL https://github.com/WinVector/data_algebra
--  dialect: SQLiteModel
--       string quote: '
--   identifier quote: "
WITH
 "extend_0" AS (
  SELECT  -- .extend({ 'v_2_lev_a': "(v_2.coalesce('_NA_') == 'a').if_else(1.0, 0.0)", 'v_2_lev_b': "(v_2.coalesce('_NA_') == 'b').if_else(1.0, 0.0)", 'v_3_lev_b': "(v_3.coalesce('_NA_') == 'b').if_else(1.0, 0.0)", 'v_3_lev_a': "(v_3.coalesce('_NA_') == 'a').if_else(1.0, 0.0)", 'v_0_lev_a': "(v_0.coalesce('_NA_') == 'a').if_else(1.0, 0.0)", 'v_0_lev_b': "(v_0.coalesce('_NA_') == 'b').if_else(1.0, 0.0)", 'v_1_lev_a': "(v_1.coalesce('_NA_') == 'a').if_else(1.0, 0.0)", 'v_1_lev_b': "(v_1.coalesce('_NA_') == 'b').if_else(1.0, 0.0)", 'v_4_lev_b': "(v_4.coalesce('_NA_') == 'b').if_else(1.0, 0.0)", 'v_4_lev_a': "(v_4.coalesce('_NA_') == 'a').if_else(1.0, 0.0)"})
   "v_0" ,
   "v_1" ,
   "v_2" ,
   "v_3" ,
   "v_4