In [None]:
from SGShift import *
import pandas as pd
import numpy as np
# Load the processed table and split it into two domains on the `hospdead` flag:
# rows with hospdead == 0 become the source set, rows with hospdead == 1 the target set.
df = pd.read_csv('../data/support2_processed.csv')
source_mask = df['hospdead'] == 0
target_mask = df['hospdead'] == 1
source_df = df.loc[source_mask].drop(columns=['hospdead'])
target_df = df.loc[target_mask].drop(columns=['hospdead'])

# Every remaining column except the response (`charges`) is a feature.
feature_names = np.array(source_df.drop(columns=['charges']).columns)

# Responses are modelled on the log10 scale.
source_y_raw = np.log10(source_df['charges'].to_numpy())
target_y = np.log10(target_df['charges'].to_numpy())

# Standardize the source features with statistics estimated on the source rows.
scaler = StandardScaler()
source_X_raw = scaler.fit_transform(source_df[feature_names].to_numpy())

# NOTE(review): the target features are standardized with a *fresh* scaler fitted on
# the target rows, not with the source scaler above — confirm this is intended.
# After this cell `scaler` refers to the target-fitted scaler.
scaler = StandardScaler()
target_X = scaler.fit_transform(target_df[feature_names].to_numpy())

  from pandas.core.computation.check import NUMEXPR_INSTALLED


In [4]:
import warnings
warnings.filterwarnings("ignore")

# Use all source rows for fitting; split the target rows 50/50 into a fitting part
# (X_T, y_T) and a held-out part (X_val, y_val) used for every evaluation below.
X_S, y_S = source_X_raw, source_y_raw
X_T, X_val, y_T, y_val = train_test_split(target_X, target_y, test_size=0.5, random_state=42)

# Regularization grid for estimate_delta and a grid of selection thresholds.
lambdas = np.geomspace(0.001, 0.5, num=30)
sel_thrs = np.concatenate([
    np.linspace(start, stop, count)
    for start, stop, count in [(0.05, 0.25, 5), (0.3, 0.5, 3), (0.6, 1, 3)]
])

# Interaction-only degree-2 expansion applied to the selected validation columns.
val_poly_expander = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)

for solver_name in ['decision_tree_reg', 'linear_regression', 'svm_reg', 'gb_regressor']:
    print(solver_name)
    solver = fit_model('regression', solver_name, 42)

    # Two independent copies of the same unfitted solver: one per domain.
    source_only = deepcopy(solver)
    target_only = deepcopy(solver)
    source_only.fit(X_S, y_S)
    target_only.fit(X_T, y_T)

    # Source-model predictions on both target parts; these act as the offset below.
    f_x_T = source_only.predict(X_T)
    f_x_val = source_only.predict(X_val)

    # Baselines on the held-out target part.
    eval_results_source = evaluation(y_val, f_x_val, task='regression')
    eval_results_target = evaluation(y_val, target_only.predict(X_val), task='regression')
    print('source_only', eval_results_source)
    print('target_only', eval_results_target)

    for lambda_reg in lambdas:
        # Linear correction on top of the source model, evaluated on the held-out part.
        delta_estimated = estimate_delta(X_T, y_T, lambda_reg, source_only, task='regression')
        selected_naive = np.nonzero(np.abs(delta_estimated) > 1e-4)[0]
        x_delta = X_val @ delta_estimated
        eval_results_naive = evaluation(y_val, f_x_val + x_delta, task='regression')
        print('naive', lambda_reg, eval_results_naive)
        print(len(selected_naive), feature_names[selected_naive], delta_estimated[selected_naive])

        # Nothing selected: there is no polynomial correction to fit for this lambda.
        if len(selected_naive) == 0:
            continue

        # Refit on the selected columns with pairwise interactions, using the
        # source prediction as an offset, then score on the held-out part.
        clf_poly = fit_nonparametric_classifier_with_offset(
            X_T, y_T, f_x_T, selected_naive,
            lambda_reg=lambda_reg, degree=2, add_const=True,
        )
        X_val_poly = val_poly_expander.fit_transform(X_val[:, selected_naive])
        X_val_poly = sm.add_constant(X_val_poly, has_constant='add')
        p_val_new = clf_poly.predict(X_val_poly, offset=f_x_val)
        eval_results_naive_new = evaluation(y_val, p_val_new, task='regression')
        print('naive_poly', lambda_reg, len(selected_naive), eval_results_naive_new)

decision_tree_reg
source_only {'MSE': 0.20279376307930322, 'MAE': 0.3562976090055081}
target_only {'MSE': 0.07985305935279106, 'MAE': 0.21962008789469362}
naive 0.001 {'MSE': 0.18571413306816423, 'MAE': 0.35827022962378546}
56 ['age' 'sex' 'slos' 'd.time' 'num.co' 'edu' 'scoma' 'avtisst' 'sps' 'aps'
 'surv2m' 'surv6m' 'hday' 'diabetes' 'dementia' 'prg2m' 'prg6m' 'wblc'
 'hrt' 'resp' 'temp' 'pafi' 'alb' 'bili' 'crea' 'sod' 'ph' 'glucose' 'bun'
 'urine' 'adlsc' 'dzgroup_CHF' 'dzgroup_COPD' 'dzgroup_Cirrhosis'
 'dzgroup_Colon Cancer' 'dzgroup_Lung Cancer' 'dzgroup_MOSF w/Malig'
 'dzclass_ARF/MOSF' 'income_$11-$25k' 'income_$25-$50k' 'income_>$50k'
 'income_under $11k' 'race_asian' 'race_black' 'race_hispanic'
 'race_other' 'race_white' 'dnr_dnr before sadm' 'dnr_no dnr'
 'sfdm2_<2 mo. follow-up' 'sfdm2_Coma or Intub' 'sfdm2_SIP>=30'
 'sfdm2_adl>=4 (>=5 if sur)' 'sfdm2_no(M2 and SIP pres)' 'ca_metastatic'
 'ca_yes'] [-0.03155774  0.01608581 -0.05471887 -0.05471887  0.0073998   0.02183724
 

In [8]:
import warnings
warnings.filterwarnings("ignore")
X_S = source_X_raw
y_S = source_y_raw
# This cell fits on the *full* target set (no hold-out split is made here).
X_T = target_X
y_T = target_y
# NOTE(review): X_val / y_val are not defined in this cell; they are whatever an earlier
# cell left in the kernel (the train_test_split of target_X). Because X_T is the full
# target set here, those rows overlap the fitting data — confirm before reporting any
# metric computed on them.
lambdas = np.geomspace(0.001, 0.5, num=30)
sel_thrs = np.concatenate([np.linspace(0.02, 0.28, 14), np.linspace(0.3, 0.5, 5), np.linspace(0.6, 1, 5)])
for solver_name in ['decision_tree_reg', 'svm_reg', 'linear_regression', 'gb_regressor']:
    print(solver_name)
    solver = fit_model('regression', solver_name, 42)
    source_only = deepcopy(solver)
    source_only.fit(X_S, y_S)
    target_only = deepcopy(solver)
    target_only.fit(X_T, y_T)

    # Source-model predictions, used as the offset when refitting delta.
    f_x_T = source_only.predict(X_T)
    f_x_val = source_only.predict(X_val)
    eval_results_source = evaluation(y_val, f_x_val, task = 'regression')
    eval_results_target = evaluation(y_val, target_only.predict(X_val), task = 'regression')

    # `pi` is indexed by hyper-parameter value and then thresholded per feature below.
    ll_ = [0.01, 0.02, 0.05]
    pi = derandom_knock(solver, X_S, y_S, X_T, y_T, 100, ll_, task = 'regression')
    for l_ in ll_:
        print(f'hyper = {l_}')
        for sel_thr in sel_thrs:
            selected_naive_knock = np.where(pi[l_] >= sel_thr)[0]
            if len(selected_naive_knock) > 0:
                # Refit delta on the selected columns only, with the source prediction as offset.
                delta_estimated = estimate_delta(X_T[:, selected_naive_knock], y_T, l_, f_x = f_x_T, task = 'regression')
                x_delta = X_val[:, selected_naive_knock] @ np.array(delta_estimated)
                shifted_pred = f_x_val + x_delta
            else:
                # Nothing selected: report an empty coefficient vector instead of the
                # stale value left over from the previous threshold (or a NameError
                # when this happens on the very first iteration).
                delta_estimated = np.array([])
                shifted_pred = f_x_val
            # Regression task: the corrected prediction is used as-is. The previous
            # logistic transform was a classification leftover and is not applied.
            p_val_naive_knock = shifted_pred
            print('naive_knock', sel_thr, feature_names[selected_naive_knock], delta_estimated)

decision_tree_reg
hyper = 0.01
naive_knock 0.02 ['age' 'slos' 'd.time' 'edu' 'scoma' 'avtisst' 'sps' 'aps' 'surv2m'
 'surv6m' 'hday' 'wblc' 'temp' 'alb' 'bili' 'crea' 'ph' 'bun'
 'dzgroup_ARF/MOSF w/Sepsis' 'dzgroup_CHF' 'dzclass_ARF/MOSF'
 'dzclass_Cancer' 'income_>$50k' 'income_under $11k' 'race_asian'
 'race_hispanic' 'race_other' 'dnr_dnr before sadm'
 'sfdm2_<2 mo. follow-up' 'sfdm2_Coma or Intub'
 'sfdm2_adl>=4 (>=5 if sur)' 'ca_metastatic'] [-4.23331381e-02 -2.50719369e-02 -2.50719369e-02  9.18629534e-03
  3.90052327e-03  2.80413533e-02  3.17539917e-03  2.21715446e-02
 -1.71831683e-02 -6.84558559e-10  5.05825152e-02 -1.04292017e-02
  4.21648631e-03  1.79896018e-02  1.49178257e-02 -9.22515901e-03
  6.96286947e-03  4.63781869e-02  2.28025930e-02 -4.16135315e-03
  1.17086931e-02 -2.42887368e-02  1.11268276e-02 -3.70495378e-02
  2.73644554e-02  2.26154291e-02  1.02189598e-02 -7.09833662e-03
 -6.24494672e-10  1.90879797e-02  1.41059091e-02 -2.03136400e-02]
naive_knock 0.04 ['age' 'sl

In [9]:
import warnings
warnings.filterwarnings("ignore")

# Fit on all source rows and all target rows (no hold-out split in this cell).
X_S, y_S = source_X_raw, source_y_raw
X_T, y_T = target_X, target_y

# Same geometric grid as the earlier sweeps, scaled down by a factor of 8.
lambdas = np.geomspace(0.001, 0.5, num=30) / 8
sel_thrs = np.concatenate([
    np.linspace(start, stop, count)
    for start, stop, count in [(0.02, 0.28, 14), (0.3, 0.5, 5), (0.6, 1, 5)]
])

for solver_name in ['decision_tree_reg', 'svm_reg', 'linear_regression', 'gb_regressor']:
    print(solver_name)
    solver = fit_model('regression', solver_name, 42)

    # Independent copies of the unfitted solver, one fitted per domain.
    source_only = deepcopy(solver)
    target_only = deepcopy(solver)
    source_only.fit(X_S, y_S)
    target_only.fit(X_T, y_T)

    for lambda_reg in lambdas:
        # The beta penalty is tied to the delta penalty at a fixed 0.9 ratio.
        lambda_beta = 0.9 * lambda_reg
        beta_est, delta_estimated = estimate_beta_delta(
            X_S, y_S, X_T, y_T, lambda_beta, lambda_reg, source_only, task='regression'
        )
        # Report the features whose estimated delta exceeds 1e-4 in magnitude.
        selected = np.nonzero(np.abs(delta_estimated) > 1e-4)[0]
        print('absorption', lambda_reg, len(selected), feature_names[selected], delta_estimated[selected])

decision_tree_reg
absorption 0.000125 53 ['age' 'sex' 'slos' 'd.time' 'num.co' 'edu' 'scoma' 'aps' 'surv2m'
 'surv6m' 'hday' 'diabetes' 'dementia' 'prg2m' 'prg6m' 'meanbp' 'wblc'
 'hrt' 'resp' 'temp' 'alb' 'bili' 'crea' 'sod' 'ph' 'glucose' 'bun'
 'urine' 'adlsc' 'dzgroup_ARF/MOSF w/Sepsis' 'dzgroup_CHF' 'dzgroup_COPD'
 'dzgroup_Cirrhosis' 'dzgroup_Lung Cancer' 'dzclass_ARF/MOSF'
 'dzclass_Cancer' 'income_$11-$25k' 'income_$25-$50k' 'income_>$50k'
 'income_under $11k' 'race_asian' 'race_hispanic' 'race_other'
 'race_white' 'dnr_dnr after sadm' 'dnr_dnr before sadm'
 'sfdm2_<2 mo. follow-up' 'sfdm2_Coma or Intub' 'sfdm2_SIP>=30'
 'sfdm2_adl>=4 (>=5 if sur)' 'sfdm2_no(M2 and SIP pres)' 'ca_metastatic'
 'ca_yes'] [-0.03208776  0.00175959 -0.01314197 -0.01314197 -0.00106105 -0.00502445
 -0.00691918  0.01909556 -0.01905794 -0.0087039   0.01400327  0.00597718
 -0.00263662 -0.01709565 -0.00107452  0.01148747 -0.01397848  0.00056806
 -0.0045582  -0.00417278  0.00926218  0.00240874 -0.01483637 

In [10]:
import warnings
warnings.filterwarnings("ignore")
X_S = source_X_raw
y_S = source_y_raw
# Fit on the full target set (no hold-out split in this cell).
X_T = target_X
y_T = target_y
lambdas = np.geomspace(0.001, 0.5, num=30) / 8
sel_thrs = np.concatenate([np.linspace(0.02, 0.28, 14), np.linspace(0.3, 0.5, 5), np.linspace(0.6, 1, 5)])
for solver_name in ['decision_tree_reg', 'svm_reg', 'linear_regression', 'gb_regressor']:
    print(solver_name)
    solver = fit_model('regression', solver_name, 42)
    source_only = deepcopy(solver)
    source_only.fit(X_S, y_S)
    target_only = deepcopy(solver)
    target_only.fit(X_T, y_T)

    # Source-model predictions on the target rows (part of the refit offset below).
    f_x_T = source_only.predict(X_T)
    ll_ = [0.002, 0.005, 0.01, 0.02]
    # `pi` is indexed by hyper-parameter value and then thresholded per feature below.
    pi = derandom_knock_misspecified(solver, X_S, y_S, X_T, y_T, 100, ll_, task = 'regression')
    for l_ in ll_:
        print(f'hyper = {l_}')
        # Joint fit: only beta is carried forward; delta is re-estimated per threshold
        # on the selected columns, so the joint delta is deliberately not reused.
        beta_estimated, _joint_delta = estimate_beta_delta(X_S, y_S, X_T, y_T, 0.9*l_, l_, source_only, task = 'regression')
        # The offset does not depend on the threshold, so compute it once per l_.
        offset_T = f_x_T + X_T @ np.array(beta_estimated)
        for sel_thr in sel_thrs:
            selected_knock_absorption = np.where(pi[l_] >= sel_thr)[0]
            if len(selected_knock_absorption) > 0:
                delta_estimated = estimate_delta(X_T[:, selected_knock_absorption], y_T, l_, f_x = offset_T, task = 'regression')
            else:
                # Nothing selected: print an empty coefficient vector rather than the
                # stale delta from the joint fit or from the previous threshold, which
                # would not line up with the (empty) list of feature names.
                delta_estimated = np.array([])
            print('absorption_knock', sel_thr, feature_names[selected_knock_absorption], delta_estimated)

decision_tree_reg
hyper = 0.002
absorption_knock 0.02 ['age' 'slos' 'd.time' 'avtisst' 'sps' 'aps' 'surv2m' 'hday' 'prg2m' 'alb'
 'bun' 'dzgroup_ARF/MOSF w/Sepsis' 'dzgroup_CHF' 'dzgroup_Lung Cancer'
 'dzclass_ARF/MOSF' 'dzclass_Cancer' 'income_under $11k' 'race_asian'
 'dnr_dnr after sadm' 'dnr_dnr before sadm' 'sfdm2_<2 mo. follow-up'
 'sfdm2_Coma or Intub' 'sfdm2_adl>=4 (>=5 if sur)'
 'sfdm2_no(M2 and SIP pres)' 'ca_metastatic'] [-2.89991553e-02 -1.17814426e-02 -1.17814426e-02  1.94850578e-03
 -9.02023132e-10  2.22125497e-02 -1.56988173e-02  1.59652964e-02
 -1.31310727e-02  9.76511093e-03  1.93299552e-02  6.42995715e-03
 -1.35508941e-02  1.59074897e-10  1.58075657e-02 -3.08022352e-02
 -1.34471723e-02  1.11173397e-02  9.18774571e-03 -5.36312289e-03
 -7.41167822e-10  2.40105911e-02  3.14906200e-02  1.80249599e-02
 -2.21750579e-02]
absorption_knock 0.04 ['age' 'slos' 'd.time' 'avtisst' 'aps' 'surv2m' 'hday' 'alb' 'bun'
 'dzgroup_ARF/MOSF w/Sepsis' 'dzclass_ARF/MOSF' 'dzclass_Cancer'
 '