In [None]:
from SGShift import *
import pandas as pd
import numpy as np
# --- Source domain -----------------------------------------------------------
# Every column except the last one is used as a feature; the label is read from
# the 'readmitted' column.
source_df = pd.read_csv('../data/source_DiabetesReadmission.csv')
feature_names = np.array(source_df.columns.to_list()[:-1])
source_y_raw = source_df['readmitted'].to_numpy().astype('int')
# Standardise the source features with a scaler fitted on the source data.
scaler = StandardScaler()
source_X_raw = scaler.fit_transform(source_df[feature_names].values)

# --- Target domain -----------------------------------------------------------
# Same feature columns as the source table, in the same order.
target_df = pd.read_csv('../data/target_DiabetesReadmission.csv')
target_y = target_df['readmitted'].to_numpy().astype('int')
# The target features get a fresh scaler fitted on the target data itself, so
# `scaler` refers to the target scaler once this cell has run.
scaler = StandardScaler()
target_X = scaler.fit_transform(target_df[feature_names].values)

In [2]:
import warnings
warnings.filterwarnings("ignore")

# The full source set is used for fitting; the target set is split 50/50 into a
# fitting half (X_T, y_T) and a held-out half (X_val, y_val).
X_S, y_S = source_X_raw, source_y_raw
X_T, X_val, y_T, y_val = train_test_split(target_X, target_y, test_size=0.5, random_state=42)
lambdas = np.geomspace(0.0001, 0.05, num=30)
sel_thrs = np.concatenate([
    np.linspace(0.05, 0.25, 5),
    np.linspace(0.3, 0.5, 3),
    np.linspace(0.6, 1, 3),
])

for solver_name in ('decision_tree', 'logistic_regression', 'svm', 'gb_classifier'):
    print(solver_name)
    solver = fit_model('classification', solver_name, 42)

    # Two independent copies of the same solver: one fitted per domain.
    source_only = deepcopy(solver)
    source_only.fit(X_S, y_S)
    target_only = deepcopy(solver)
    target_only.fit(X_T, y_T)

    # model2p outputs are treated as probabilities: they are converted to
    # log-odds here and passed to `evaluation` below.
    p_T = model2p(source_only, X_T)
    p_source_only = model2p(source_only, X_val)
    p_target_only = model2p(target_only, X_val)
    f_x_T = np.log(p_T / (1 - p_T))
    f_x_val = np.log(p_source_only / (1 - p_source_only))

    # Baselines on the held-out half.
    eval_results_source = evaluation(y_val, p_source_only)
    eval_results_target = evaluation(y_val, p_target_only)
    print('source_only', eval_results_source)
    print('target_only', eval_results_target)

    for lambda_reg in lambdas:
        # Estimate the shift vector and keep the coordinates whose magnitude
        # exceeds 1e-4.
        delta_estimated = estimate_delta(X_T, y_T, lambda_reg, source_only)
        selected_naive = np.nonzero(np.abs(delta_estimated) > 1e-4)[0]

        # Linear correction: add X_val @ delta to the source log-odds, then
        # map back to probabilities with the logistic function.
        x_delta = X_val @ delta_estimated
        shifted_log_odds = f_x_val + x_delta
        p_val_naive = 1 / (1 + np.exp(-shifted_log_odds))
        eval_results_naive = evaluation(y_val, p_val_naive)
        print('naive', lambda_reg, eval_results_naive)
        print(len(selected_naive), feature_names[selected_naive], delta_estimated[selected_naive])

        # The polynomial refit only makes sense when something was selected.
        if selected_naive.size == 0:
            continue

        clf_poly = fit_nonparametric_classifier_with_offset(
            X_T, y_T, f_x_T, selected_naive,
            lambda_reg=lambda_reg, degree=2, add_const=True,
        )
        # Build the validation design matrix with the same degree used for the
        # fit above: pairwise interactions of the selected columns plus a constant.
        poly_expander = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
        X_val_poly = poly_expander.fit_transform(X_val[:, selected_naive])
        X_val_poly = sm.add_constant(X_val_poly, has_constant='add')
        p_val_new = clf_poly.predict(X_val_poly, offset=f_x_val)
        eval_results_naive_new = evaluation(y_val, p_val_new)
        print('naive_poly', lambda_reg, len(selected_naive), eval_results_naive_new)

decision_tree
source_only {'log_loss': 0.6636626850329255, 'AUC': 0.6267386861804033}
target_only {'log_loss': 0.6529972421150646, 'AUC': 0.6428231753674469}


naive 0.0001 {'log_loss': 0.6567981041485876, 'AUC': 0.6531097319550228}
31 ['gender' 'time_in_hospital' 'num_lab_procedures' 'num_procedures'
 'num_medications' 'number_outpatient' 'number_emergency'
 'number_inpatient' 'number_diagnoses' 'max_glu_serum' 'A1Cresult'
 'change' 'diabetesMed' 'race_Caucasian' 'race_Hispanic' 'race_Asian'
 'age>=70' 'max_glu_serum>200' 'max_glu_serum>300' 'A1Cresult>7'
 'A1Cresult>8' 'metformin_Up' 'metformin_Down' 'metformin_Steady'
 'insulin_Up' 'insulin_Down' 'insulin_Steady' 'Transferred' 'Home'
 'Emergency_admission' 'Elective_admission'] [ 0.00787843  0.03942828  0.06282306 -0.04358262 -0.12013021  0.07994654
  0.04966461 -0.02905539  0.16648578 -0.08909249 -0.1146547   0.0549816
  0.05917921  0.00737446 -0.03445294 -0.03823137 -0.02799743  0.1422415
 -0.01320393  0.08145354 -0.02655602  0.00445338  0.00028487 -0.06497785
 -0.00946608  0.05372059  0.01248747  0.01901747 -0.01901747  0.05697058
 -0.06267399]
naive_poly 0.0001 31 {'log_loss': 0.668987

In [None]:
import warnings
warnings.filterwarnings("ignore")
# Feature selection via derandom_knock followed by a refit of delta restricted
# to the selected columns, with the source model's log-odds as a fixed offset.
X_S = source_X_raw
y_S = source_y_raw
X_T, X_val, y_T, y_val = train_test_split(target_X, target_y, test_size=0.5, random_state=42)
lambdas = np.geomspace(0.0001, 0.05, num=30)
sel_thrs = np.concatenate([np.linspace(0.02, 0.28, 14), np.linspace(0.3, 0.5, 5), np.linspace(0.6, 1, 5)])
for solver_name in ['decision_tree', 'logistic_regression', 'svm', 'gb_classifier']:
    print(solver_name)
    solver = fit_model('classification', solver_name, 42)
    source_only = deepcopy(solver)
    source_only.fit(X_S, y_S)
    target_only = deepcopy(solver)
    target_only.fit(X_T, y_T)

    # Source-model outputs converted to log-odds on both halves of the target data.
    p_T = model2p(source_only, X_T)
    p_source_only = model2p(source_only, X_val)
    p_target_only = model2p(target_only, X_val)
    f_x_T = np.log(p_T / (1 - p_T))
    f_x_val = np.log(p_source_only / (1 - p_source_only))
    ll_ = [0.001, 0.002, 0.005, 0.01]
    # pi is indexed by penalty value below; pi[l_] is compared element-wise
    # against each selection threshold to pick feature indices.
    pi = derandom_knock(solver, X_S, y_S, X_T, y_T, 50, ll_)
    for l_ in ll_:
        print(f'hyper = {l_}')
        for sel_thr in sel_thrs:
            selected_naive_knock = np.where(pi[l_] >= sel_thr)[0]
            if len(selected_naive_knock) > 0:
                # Refit delta on the selected columns only.
                delta_estimated = estimate_delta(X_T[:, selected_naive_knock], y_T, l_, f_x = f_x_T)
                x_delta = X_val[:, selected_naive_knock] @ np.array(delta_estimated)
                shifted_log_odds = f_x_val + x_delta
            else:
                # Nothing selected: no correction is applied. Report an empty
                # delta instead of the stale value left over from a previous
                # threshold / penalty / solver (or a NameError on a fresh kernel).
                delta_estimated = np.array([])
                shifted_log_odds = f_x_val
            # Corrected probabilities on the held-out half (computed but not
            # evaluated in this cell).
            p_val_naive_knock = 1 / (1 + np.exp(-shifted_log_odds))
            print('naive_knock', sel_thr, feature_names[selected_naive_knock], delta_estimated)

In [4]:
import warnings
warnings.filterwarnings("ignore")

# Penalty grid: the 30-point geometric grid from 1e-4 to 0.05, divided by 5.
lambdas = np.geomspace(0.0001, 0.05, num=30) / 5
sel_thrs = np.concatenate([
    np.linspace(0.05, 0.25, 5),
    np.linspace(0.3, 0.5, 3),
    np.linspace(0.6, 1, 3),
])

# NOTE: X_S, y_S, X_T and y_T are not defined in this cell; they must already
# exist in the kernel from an earlier cell.
for solver_name in ('decision_tree', 'logistic_regression', 'svm', 'gb_classifier'):
    print(solver_name)
    solver = fit_model('classification', solver_name, 42)
    source_only = deepcopy(solver)
    source_only.fit(X_S, y_S)

    for lambda_reg in lambdas:
        # lambda_beta is tied to lambda_reg at a fixed 0.9 ratio; both are
        # passed to estimate_beta_delta.
        lambda_beta = 0.9 * lambda_reg
        beta_est, delta_estimated = estimate_beta_delta(
            X_S, y_S, X_T, y_T, lambda_beta, lambda_reg, source_only
        )
        # Indices of delta coordinates whose magnitude exceeds 1e-4.
        selected = np.nonzero(np.abs(delta_estimated) > 1e-4)[0]
        print('absorption', lambda_reg, len(selected), feature_names[selected], delta_estimated[selected])

decision_tree
absorption 2e-05 30 ['gender' 'time_in_hospital' 'num_lab_procedures' 'num_procedures'
 'num_medications' 'number_emergency' 'number_inpatient'
 'number_diagnoses' 'max_glu_serum' 'A1Cresult' 'change' 'diabetesMed'
 'race_Caucasian' 'race_Hispanic' 'race_Asian' 'age>=70'
 'max_glu_serum>200' 'max_glu_serum>300' 'A1Cresult>7' 'A1Cresult>8'
 'metformin_Up' 'metformin_Down' 'metformin_Steady' 'insulin_Up'
 'insulin_Down' 'insulin_Steady' 'Transferred' 'Home'
 'Emergency_admission' 'Elective_admission'] [ 0.03502125  0.01049858  0.11313375 -0.01576811 -0.17530816 -0.00688477
 -0.01510998  0.11427777 -0.1076481  -0.04300523  0.05084985 -0.06151879
 -0.02645209  0.01011256 -0.00908683  0.01191274  0.19644532 -0.11514178
  0.04667489 -0.03486242  0.03635356  0.00934933  0.00460258  0.03074029
  0.05930466  0.09796197  0.03507591 -0.03507591  0.09574761 -0.00863743]
absorption 2.4779807418841813e-05 30 ['gender' 'time_in_hospital' 'num_lab_procedures' 'num_procedures'
 'num_medic

In [None]:
import warnings
warnings.filterwarnings("ignore")
# Feature selection via derandom_knock_misspecified, then a refit of delta on
# the selected columns with the source log-odds plus X_T @ beta as the offset.
X_S = source_X_raw
y_S = source_y_raw
X_T, X_val, y_T, y_val = train_test_split(target_X, target_y, test_size=0.5, random_state=42)
sel_thrs = np.concatenate([np.linspace(0.02, 0.28, 14), np.linspace(0.3, 0.5, 5), np.linspace(0.6, 1, 5)])
for solver_name in ['decision_tree', 'logistic_regression', 'svm', 'gb_classifier']:
    print(solver_name)
    solver = fit_model('classification', solver_name, 42)
    source_only = deepcopy(solver)
    source_only.fit(X_S, y_S)

    # Source-model outputs converted to log-odds on both halves of the target data.
    p_T = model2p(source_only, X_T)
    p_source_only = model2p(source_only, X_val)
    f_x_T = np.log(p_T / (1 - p_T))
    f_x_val = np.log(p_source_only / (1 - p_source_only))
    ll_ = [0.0002, 0.0005, 0.001, 0.002]
    # pi is indexed by penalty value below; pi[l_] is compared element-wise
    # against each selection threshold to pick feature indices.
    pi = derandom_knock_misspecified(solver, X_S, y_S, X_T, y_T, 50, ll_)
    for l_ in ll_:
        print(f'hyper = {l_}')
        beta_estimated, delta_estimated = estimate_beta_delta(X_S, y_S, X_T, y_T, 0.9*l_, l_, source_only)
        # The offset depends only on the penalty, not on the threshold, so it is
        # computed once per penalty instead of once per threshold.
        f_x_T_absorbed = f_x_T + X_T @ np.array(beta_estimated)
        for sel_thr in sel_thrs:
            selected_knock_absorption = np.where(pi[l_] >= sel_thr)[0]
            if len(selected_knock_absorption) > 0:
                # Refit delta on the selected columns only.
                delta_estimated = estimate_delta(X_T[:, selected_knock_absorption], y_T, l_, f_x = f_x_T_absorbed)
            else:
                # Nothing selected: report an empty delta rather than the joint
                # estimate above or the refit from a previous threshold, neither
                # of which corresponds to the (empty) feature list printed here.
                delta_estimated = np.array([])
            print('absorption_knock', sel_thr, feature_names[selected_knock_absorption], delta_estimated)