Produce a vtreat data transform as a sequence of SQL statements instead
of one large statement. The idea: when a transform needs a large number
of joins, it may be better to apply them step by step through a sequence
of `UPDATE` statements than to compose them into a single query.

In [1]:

import datetime
import os
# Switch to the author's example directory only when it exists, so the notebook
# also runs on machines that do not have this path (it then stays in the
# current working directory instead of raising FileNotFoundError).
_example_dir = '/Users/johnmount/Documents/work/pyvtreat/Examples/Database'
if os.path.isdir(_example_dir):
    os.chdir(_example_dir)
import sys
import numpy as np
import pandas as pd

from data_algebra.data_ops import *
import data_algebra.SQLite
import data_algebra.test_util
import vtreat
import vtreat.vtreat_db_adapter



In [2]:
# larger version of tests/test_db_adapter.py:test_db_adapter_monster()
def mk_example(n_rows: int = 100, n_vars: int = 50, seed=None):
    """
    Build a synthetic numeric-outcome example with two-level categorical inputs.

    Each input column ``v_i`` is drawn from the levels ``'a'`` and ``'b'``.
    The outcome ``y`` starts as standard normal noise and is shifted by
    ``+step`` for every ``'a'`` and ``-step`` for every ``'b'``, where
    ``step = 1 / sqrt(n_vars)``.

    :param n_rows: number of rows to generate.
    :param n_vars: number of categorical input columns to generate.
    :param seed: optional seed. When None (the default) values are drawn from
        NumPy's global random state, exactly as before; otherwise a private
        ``np.random.RandomState(seed)`` is used so the result is reproducible
        and the global state is left untouched.
    :return: tuple ``(d, var_names)`` where ``d`` is a pandas DataFrame holding
        the input columns followed by ``y``, and ``var_names`` is the sorted
        list of input column names.
    """
    # Default to the module-level generator so existing callers (including
    # ones that call np.random.seed() first) see unchanged behavior.
    rng = np.random if seed is None else np.random.RandomState(seed)
    # Avoid the divide-by-zero warning when no variables are requested;
    # step is unused in that case.
    step = 1 / np.sqrt(n_vars) if n_vars > 0 else 0.0
    cols = dict()
    y = rng.normal(size=n_rows)
    for i in range(n_vars):
        vname = f'v_{i}'
        v = rng.choice(['a', 'b'], replace=True, size=n_rows)
        y = y + np.where(v == 'a', step, -step)
        cols[vname] = v
    # Collect the input names before adding the outcome column; the local is
    # named var_names so it does not shadow the builtin vars().
    var_names = sorted(cols.keys())
    cols['y'] = y
    d = pd.DataFrame(cols)
    return d, var_names




n_vars = 5

# Seed NumPy's global generator so a fresh-kernel re-run regenerates the same
# example data instead of a new random draw each time.
np.random.seed(2022)

# d: training frame; vars: sorted names of the generated input columns.
d, vars = mk_example(n_vars=n_vars)
# d_app: small 5-row frame, used later as the data the transform is applied to.
d_app, _ = mk_example(n_vars=n_vars, n_rows=5)

In [3]:
# Column roles: the outcome is listed in cols_to_copy, and the full column list
# is the generated inputs followed by the outcome.
outcome_name = "y"
cols_to_copy = [outcome_name]
columns = [*vars, *cols_to_copy]

# Numeric-outcome treatment configured with explicit (non-default) parameters.
treatment = vtreat.NumericOutcomeTreatment(
    outcome_name=outcome_name,
    cols_to_copy=cols_to_copy,
    params=vtreat.vtreat_parameters(
        {
            "sparse_indicators": False,
            "filter_to_recommended": False,
        }
    ),
)

# Fit on the training frame, then apply the fitted treatment to the small frame.
d_train_treated = treatment.fit_transform(d)
d_app_treated = treatment.transform(d_app)

d_app_treated

Unnamed: 0,y,v_4_impact_code,v_4_deviation_code,v_4_prevalence_code,v_4_lev_b,v_4_lev_a,v_1_impact_code,v_1_deviation_code,v_1_prevalence_code,v_1_lev_a,...,v_0_impact_code,v_0_deviation_code,v_0_prevalence_code,v_0_lev_a,v_0_lev_b,v_3_impact_code,v_3_deviation_code,v_3_prevalence_code,v_3_lev_a,v_3_lev_b
0,-0.816967,-0.497151,1.305699,0.53,1.0,0.0,-0.442517,1.312199,0.48,0.0,...,-0.408168,1.442042,0.47,0.0,1.0,-0.574096,1.249566,0.48,0.0,1.0
1,-0.046334,0.55755,1.293659,0.47,0.0,1.0,-0.442517,1.312199,0.48,0.0,...,0.375188,1.258093,0.53,1.0,0.0,0.529005,1.32573,0.52,1.0,0.0
2,0.644754,0.55755,1.293659,0.47,0.0,1.0,-0.442517,1.312199,0.48,0.0,...,-0.408168,1.442042,0.47,0.0,1.0,0.529005,1.32573,0.52,1.0,0.0
3,1.086406,-0.497151,1.305699,0.53,1.0,0.0,0.408814,1.359749,0.52,1.0,...,0.375188,1.258093,0.53,1.0,0.0,0.529005,1.32573,0.52,1.0,0.0
4,-0.467013,-0.497151,1.305699,0.53,1.0,0.0,0.408814,1.359749,0.52,1.0,...,-0.408168,1.442042,0.47,0.0,1.0,-0.574096,1.249566,0.48,0.0,1.0


In [4]:
# The fitted treatment expressed as a table; this is what the SQL adapter consumes.
transform_as_data = treatment.description_matrix()

# Describe the source table (name and columns) for the data_algebra SQL generator.
source_descr = TableDescription(table_name='d_app', column_names=columns)


In [5]:
# SQL dialect model; passed to the adapter and used below to quote identifiers.
db_model = data_algebra.SQLite.SQLiteModel()
# Database table names used in the rest of the notebook:
treatment_table_name = 'transform_as_data'  # table the treatment description is inserted into
stage_3_name = 'vtreat_stage_3_table'  # table dropped/created by the generated SQL
result_name = 'data_treated'  # table the final result is read back from

In [6]:
# Ask the vtreat database adapter for the transform as a sequence of SQL
# statements; they are executed one at a time, in order, further below.
sql_sequence = vtreat.vtreat_db_adapter.as_sql_update_sequence(
    db_model=db_model,
    source=source_descr,
    vtreat_descr=transform_as_data,
    treatment_table_name=treatment_table_name,
    stage_3_name=stage_3_name,
    result_name=result_name)

In [7]:
# Open an example SQLite handle (presumably an in-memory database -- confirm)
# and load the application data under the table name recorded in source_descr.
db_handle = data_algebra.SQLite.example_handle()
# Assign to _ so the return value of insert_table is not displayed as cell output.
_ = db_handle.insert_table(d_app, table_name=source_descr.table_name)


In [8]:
# Load the treatment description into the database, then run the generated
# statements one by one in the order the adapter produced them.
db_handle.insert_table(transform_as_data, table_name=treatment_table_name)
for sql in sql_sequence:
    db_handle.execute(sql)

In [9]:
# Read the whole result table back into Python; the table name is quoted
# through the dialect model rather than pasted into the SQL text raw.
db_res = db_handle.read_query(
    f'SELECT * FROM {db_model.quote_identifier(result_name)}')
db_res

Unnamed: 0,v_4_lev_a,v_0_lev_a,v_3_lev_a,v_1_lev_a,v_3_lev_b,y,v_4_lev_b,v_0_lev_b,v_1_lev_b,v_2_lev_a,...,v_1_prevalence_code,v_2_deviation_code,v_2_impact_code,v_2_prevalence_code,v_3_deviation_code,v_3_impact_code,v_3_prevalence_code,v_4_deviation_code,v_4_impact_code,v_4_prevalence_code
0,0.0,0.0,0.0,0.0,1.0,-0.816967,1.0,1.0,1.0,1.0,...,0.48,1.385877,0.378126,0.49,1.249566,-0.574096,0.48,1.305699,-0.497151,0.53
1,1.0,1.0,1.0,0.0,0.0,-0.046334,0.0,0.0,1.0,1.0,...,0.48,1.385877,0.378126,0.49,1.32573,0.529005,0.52,1.293659,0.55755,0.47
2,1.0,0.0,1.0,0.0,0.0,0.644754,0.0,1.0,1.0,1.0,...,0.48,1.385877,0.378126,0.49,1.32573,0.529005,0.52,1.293659,0.55755,0.47
3,0.0,1.0,1.0,1.0,0.0,1.086406,1.0,0.0,0.0,0.0,...,0.52,1.320701,-0.36822,0.51,1.32573,0.529005,0.52,1.305699,-0.497151,0.53
4,0.0,0.0,0.0,1.0,1.0,-0.467013,1.0,1.0,0.0,1.0,...,0.52,1.385877,0.378126,0.49,1.249566,-0.574096,0.48,1.305699,-0.497151,0.53


In [10]:
# Check that the in-database result agrees with vtreat's in-memory transform.
# NOTE(review): the two frames list their columns in different orders, so
# equivalent_frames presumably ignores column order -- confirm.
assert data_algebra.test_util.equivalent_frames(d_app_treated, db_res)

In [11]:
1 + 1  # sentinel output: reaching this cell means the cells above, including the assert, ran without raising.

2

In [12]:
# Show every generated statement, each terminated with ';' and followed by blank lines.
for sql in sql_sequence:
    print(f'{sql};\n\n')

DROP TABLE IF EXISTS "vtreat_stage_3_table";


DROP TABLE IF EXISTS "data_treated";


CREATE TABLE "vtreat_stage_3_table" AS 
-- data_algebra SQL https://github.com/WinVector/data_algebra
--  dialect: SQLiteModel
--       string quote: '
--   identifier quote: "
WITH
 "extend_0" AS (
  SELECT  -- .extend({ 'v_4_lev_b': "(v_4.coalesce('_NA_') == 'b').if_else(1.0, 0.0)", 'v_4_lev_a': "(v_4.coalesce('_NA_') == 'a').if_else(1.0, 0.0)", 'v_1_lev_a': "(v_1.coalesce('_NA_') == 'a').if_else(1.0, 0.0)", 'v_1_lev_b': "(v_1.coalesce('_NA_') == 'b').if_else(1.0, 0.0)", 'v_2_lev_b': "(v_2.coalesce('_NA_') == 'b').if_else(1.0, 0.0)", 'v_2_lev_a': "(v_2.coalesce('_NA_') == 'a').if_else(1.0, 0.0)", 'v_0_lev_a': "(v_0.coalesce('_NA_') == 'a').if_else(1.0, 0.0)", 'v_0_lev_b': "(v_0.coalesce('_NA_') == 'b').if_else(1.0, 0.0)", 'v_3_lev_a': "(v_3.coalesce('_NA_') == 'a').if_else(1.0, 0.0)", 'v_3_lev_b': "(v_3.coalesce('_NA_') == 'b').if_else(1.0, 0.0)"})
   "y" ,
   "v_0" ,
   "v_1" ,
   "v_2" ,
   "v_3" 