Produce a vtreat data transform as a sequence of SQL statements instead
of one large statement. The idea: when there are many joins, it may be
better to apply them one at a time through a series of `UPDATE`
statements than to compose them all into a single query.

In [1]:

import datetime
import os
os.chdir('/Users/johnmount/Documents/work/pyvtreat/Examples/Database')
import sys
import numpy as np
import pandas as pd

from data_algebra.data_ops import *
import data_algebra.SQLite
import data_algebra.test_util
import vtreat
import vtreat.vtreat_db_adapter



In [2]:
# larger version of tests/test_db_adapter.py:test_db_adapter_monster()
def mk_example(n_rows:int = 100, n_vars:int = 50):
    """
    Build a synthetic numeric-outcome example frame.

    Each of the ``n_vars`` input columns (named ``v_0``, ``v_1``, ...) holds
    levels drawn uniformly from ``'a'`` / ``'b'``. The outcome ``y`` starts as
    standard normal noise and is shifted by ``+1/sqrt(n_vars)`` for every
    ``'a'`` and ``-1/sqrt(n_vars)`` for every ``'b'``.

    Uses the global ``np.random`` state, so results are only repeatable if
    the caller seeds it first.

    :param n_rows: number of rows to generate.
    :param n_vars: number of categorical input columns to generate.
    :return: tuple ``(frame, var_names)``: the data frame (inputs then ``y``)
             and the lexicographically sorted list of input column names.
    """
    effect = 1/np.sqrt(n_vars)
    # Draw order matters for seeded repeatability: noise first, then one
    # level vector per variable.
    outcome = np.random.normal(size=n_rows)
    frame_cols = {}
    for idx in range(n_vars):
        levels = np.random.choice(['a', 'b'], replace=True, size=n_rows)
        outcome = outcome + np.where(levels == 'a', effect, -effect)
        frame_cols[f'v_{idx}'] = levels
    # String sort, so 'v_10' orders before 'v_2'.
    var_names = sorted(frame_cols)
    frame_cols['y'] = outcome
    return pd.DataFrame(frame_cols), var_names




# Number of categorical input variables in the example data.
n_vars = 5

# Training frame (mk_example's default row count) and the sorted input column names.
# NOTE(review): `vars` shadows the Python builtin of the same name; it is kept
# because later cells refer to it. No random seed is set, so the generated
# values differ from run to run.
d, vars = mk_example(n_vars=n_vars)
# Small 5-row frame that the fitted transform is applied to.
d_app, _ = mk_example(n_vars=n_vars, n_rows=5)

In [3]:
# The outcome column is carried through the transform unchanged (cols_to_copy).
outcome_name = "y"
cols_to_copy = [outcome_name]
# Full column list of the source table: input variables followed by the outcome.
columns = vars + cols_to_copy

# Configure a numeric-outcome vtreat treatment with sparse indicators and
# recommended-variable filtering both switched off.
treatment = vtreat.NumericOutcomeTreatment(
    cols_to_copy=cols_to_copy,
    outcome_name=outcome_name,
    params=vtreat.vtreat_parameters(
        {"sparse_indicators": False, "filter_to_recommended": False,}
    ),
)
# Fit on the training frame; the treated training frame is not used further in this notebook.
d_train_treated = treatment.fit_transform(d)
# In-memory reference result; compared against the SQL result at the end of the notebook.
d_app_treated = treatment.transform(d_app)

d_app_treated

Unnamed: 0,y,v_1_impact_code,v_1_deviation_code,v_1_prevalence_code,v_1_lev_b,v_1_lev_a,v_2_impact_code,v_2_deviation_code,v_2_prevalence_code,v_2_lev_b,...,v_0_impact_code,v_0_deviation_code,v_0_prevalence_code,v_0_lev_b,v_0_lev_a,v_4_impact_code,v_4_deviation_code,v_4_prevalence_code,v_4_lev_a,v_4_lev_b
0,0.81475,0.422799,1.514048,0.45,0.0,1.0,-0.330228,1.464325,0.52,1.0,...,-0.461147,1.413975,0.51,1.0,0.0,0.67167,1.179936,0.52,1.0,0.0
1,0.007853,0.422799,1.514048,0.45,0.0,1.0,-0.330228,1.464325,0.52,1.0,...,-0.461147,1.413975,0.51,1.0,0.0,-0.719968,1.346839,0.48,0.0,1.0
2,1.162144,-0.364301,1.276633,0.55,1.0,0.0,0.36233,1.329893,0.48,0.0,...,0.483767,1.30771,0.49,0.0,1.0,0.67167,1.179936,0.52,1.0,0.0
3,0.960849,0.422799,1.514048,0.45,0.0,1.0,0.36233,1.329893,0.48,0.0,...,0.483767,1.30771,0.49,0.0,1.0,0.67167,1.179936,0.52,1.0,0.0
4,-0.290128,0.422799,1.514048,0.45,0.0,1.0,0.36233,1.329893,0.48,0.0,...,-0.461147,1.413975,0.51,1.0,0.0,-0.719968,1.346839,0.48,0.0,1.0


In [4]:
# Export the fitted treatment as a data frame; it is later loaded into the
# database as a table and read by the generated SQL.
transform_as_data = treatment.description_matrix()
# Describe the database table that holds the rows to be treated
# (name and columns only; no data is attached here).
source_descr = TableDescription(
    table_name='d_app',
    column_names=columns,
)


In [5]:
# SQL dialect model used for quoting identifiers/strings and rendering pipelines.
db_model = data_algebra.SQLite.SQLiteModel()
# Table names: the stored treatment description, the intermediate work table,
# and the final treated result.
treatment_table_name = 'transform_as_data'
stage_3_name = 'vtreat_stage_3_table'
result_name = 'data_treated'

# NOTE(review): _build_data_pipelines_stages is a private (underscore-prefixed)
# vtreat helper, so its signature may change between vtreat versions.
# As used below: `ops` populates the stage-3 table, `map_vars` names the columns
# that receive placeholder values, each entry of `mapping_steps` carries 'ov'/'vi'
# keys driving one UPDATE, and `stage_3_ops` produces the final result table.
ops, map_vars, mapping_steps, stage_3_ops = vtreat.vtreat_db_adapter._build_data_pipelines_stages(
    source=source_descr,
    vtreat_descr=transform_as_data,
    treatment_table_name=treatment_table_name,
    stage_3_name=stage_3_name,
)

# give variables pre-update values
# NOTE(review): '1.0' is presumably parsed by data_algebra as the numeric
# expression 1.0 -- confirm. Rows whose level has no match in the treatment
# table keep this placeholder after the UPDATE steps; verify that is intended.
ops = ops.extend({
    v: '1.0' for v in map_vars
})

def update_code(i):
    """
    Build the SQL ``UPDATE`` statement for mapping step ``i``.

    The statement selects, from the treatment description table, the
    ``value -> replacement`` pairs of the ``MappedCodeTransform`` rows for
    the step's original variable (``ov``) and derived variable (``vi``).
    It then overwrites column ``vi`` of the stage-3 table on every row
    whose ``ov`` value equals one of those ``value`` entries. Rows with no
    matching ``value`` are left untouched.

    Reads the notebook globals ``mapping_steps``, ``db_model``,
    ``treatment_table_name`` and ``stage_3_name``.

    Note: the ``UPDATE ... FROM`` form needs SQLite 3.33.0 or newer.

    :param i: index into ``mapping_steps``.
    :return: SQL text of the update statement (no trailing semicolon).
    """
    step_i = mapping_steps[i]
    ov = step_i['ov']
    vi = step_i['vi']
    # All equality tests use the standard `=` operator (the original mixed in
    # the SQLite-only `==` spelling for one of them).
    update_stmt = (f"""
WITH tmp_update AS (
  SELECT
    value AS {db_model.quote_identifier(ov)},
    replacement AS {db_model.quote_identifier(vi)}
  FROM
    {db_model.quote_identifier(treatment_table_name)}
  WHERE
    (treatment_class = {db_model.quote_string('MappedCodeTransform')})
    AND (orig_var = {db_model.quote_string(ov)})
    AND (variable = {db_model.quote_string(vi)})
)
UPDATE
  {db_model.quote_identifier(stage_3_name)}
SET {db_model.quote_identifier(vi)} = tmp_update.{db_model.quote_identifier(vi)}
FROM
  tmp_update
WHERE
   {db_model.quote_identifier(stage_3_name)}.{db_model.quote_identifier(ov)} = tmp_update.{db_model.quote_identifier(ov)}
""")
    return update_stmt


# Ordered list of SQL statements implementing the staged transform:
#   1. drop any leftover work/result tables,
#   2. materialize the stage-3 work table from `ops`,
#   3. run one UPDATE per mapping step,
#   4. materialize the result table from `stage_3_ops`,
#   5. drop the work table.
sql_sequence = [
    *[
        f'DROP TABLE IF EXISTS {db_model.quote_identifier(tbl)}'
        for tbl in (stage_3_name, result_name)
    ],
    f'CREATE TABLE {db_model.quote_identifier(stage_3_name)} AS \n'
    + db_model.to_sql(ops),
    *[
        update_code(step_index) + ' ;'
        for step_index, _ in enumerate(mapping_steps)
    ],
    f'CREATE TABLE {db_model.quote_identifier(result_name)} AS \n'
    + db_model.to_sql(stage_3_ops),
    f'DROP TABLE {db_model.quote_identifier(stage_3_name)}',
]

In [6]:
# Open an example SQLite handle and load the rows to be treated under the
# table name declared in source_descr.
db_handle = data_algebra.SQLite.example_handle()
# Assign to `_` so the return value is not echoed as cell output.
_ = db_handle.insert_table(d_app, table_name=source_descr.table_name)


In [7]:
# Load the treatment description table that the UPDATE statements read from.
db_handle.insert_table(transform_as_data, table_name=treatment_table_name)
# Run the statements one at a time, in order; later statements depend on
# tables created by earlier ones.
for sql in sql_sequence:
    db_handle.execute(sql)

In [8]:
# Read the whole treated result table back into a data frame.
db_res = db_handle.read_query(
    f'SELECT * FROM {db_model.quote_identifier(result_name)}')
db_res

Unnamed: 0,v_1_lev_a,v_0_lev_b,v_3_lev_b,y,v_2_lev_a,v_0_lev_a,v_2_lev_b,v_1_lev_b,v_3_lev_a,v_4_lev_b,...,v_1_prevalence_code,v_2_deviation_code,v_2_impact_code,v_2_prevalence_code,v_3_deviation_code,v_3_impact_code,v_3_prevalence_code,v_4_deviation_code,v_4_impact_code,v_4_prevalence_code
0,1.0,1.0,1.0,0.81475,0.0,0.0,1.0,0.0,0.0,0.0,...,0.45,1.464325,-0.330228,0.52,1.308183,-0.41933,0.54,1.179936,0.67167,0.52
1,1.0,1.0,0.0,0.007853,0.0,0.0,1.0,0.0,1.0,1.0,...,0.45,1.464325,-0.330228,0.52,1.441514,0.479138,0.46,1.346839,-0.719968,0.48
2,0.0,0.0,0.0,1.162144,1.0,1.0,0.0,1.0,1.0,0.0,...,0.55,1.329893,0.36233,0.48,1.441514,0.479138,0.46,1.179936,0.67167,0.52
3,1.0,0.0,1.0,0.960849,1.0,1.0,0.0,0.0,0.0,0.0,...,0.45,1.329893,0.36233,0.48,1.308183,-0.41933,0.54,1.179936,0.67167,0.52
4,1.0,1.0,1.0,-0.290128,1.0,0.0,0.0,0.0,0.0,1.0,...,0.45,1.329893,0.36233,0.48,1.308183,-0.41933,0.54,1.346839,-0.719968,0.48


In [9]:
# The database result must match vtreat's in-memory transform of the same rows.
# NOTE(review): the two frames list columns in different orders, so
# equivalent_frames presumably ignores column order -- confirm.
assert data_algebra.test_util.equivalent_frames(d_app_treated, db_res)

In [10]:
1 + 1  # sentinel output: shows execution reached the end of the notebook

2