In [3]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from econml.dr import DRLearner
from econml.metalearners import SLearner, TLearner
from econml.sklearn_extensions.linear_model import StatsModelsLinearRegression
from econml.sklearn_extensions.model_selection import GridSearchCVList
from sklearn.calibration import CalibratedClassifierCV
from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor
from sklearn.impute import KNNImputer
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVC
from sklift.metrics import qini_auc_score, uplift_auc_score
from sklift.viz import plot_qini_curve, plot_uplift_curve
from xgboost import XGBClassifier, XGBRegressor

from nb21 import elast, cumulative_gain

# Load the raw dataset and discard identifier / bookkeeping columns.
data = pd.read_csv('../data/bsc_project_set.csv')
data = data.drop(columns=['id', 'Unnamed: 0', 'peep'])

# Names of the treatment and outcome indicator columns produced by the
# one-hot encoding below.
treatment_column = 'peep_regime_high'
outcome_column = 'mort_28_False'

# One-hot encode the categorical columns, then drop one level of each so a
# single indicator column per variable remains.
# NOTE(review): pd.get_dummies (default dummy_na=False) encodes a missing
# category as all zeros, so after dropping a level a missing value cannot be
# told apart from the dropped level — confirm these columns contain no NaNs.
categorical_columns = ['sex', 'peep_regime', 'mort_28']
data = pd.get_dummies(data, columns=categorical_columns, drop_first=False)
data = data.drop(columns=['sex_F', 'peep_regime_low', 'mort_28_True'])

# Every column except the outcome is min-max scaled and then KNN-imputed;
# this includes the treatment indicator.
# NOTE(review): scaler and imputer are fitted on all rows here, before
# eval_learner draws its train/test splits, so held-out rows influence the
# preprocessing of the training rows — confirm this is acceptable.
numeric_columns = data.columns.difference([outcome_column])
scaler = MinMaxScaler()
data[numeric_columns] = scaler.fit_transform(data[numeric_columns])

imputer = KNNImputer(n_neighbors=11, weights='uniform')
data[numeric_columns] = imputer.fit_transform(data[numeric_columns])

# All covariates (everything except treatment and outcome), kept in the
# DataFrame's column order so the list is identical on every kernel restart
# (a set difference would order the names by randomised string hashes).
features = [
    column for column in data.columns
    if column not in (treatment_column, outcome_column)
]
selected_features = ['age', 'urea', 'weight', 'pf_ratio', 'po2', 'ph', 'fio2', 'driving_pressure', 'plateau_pressure']

X = data[selected_features]
Y = data[outcome_column]
T = data[treatment_column]

In [6]:
def _build_eval_frame(X_part, y_part, t_part, cate, selected_features):
    """Return the selected covariates plus 'outcome', 'treatment' and 'CATE' columns for one split."""
    return pd.DataFrame(X_part, columns=selected_features).assign(
        outcome=y_part, treatment=t_part, CATE=cate
    )


def _finish_figure(fig, stem, plot_dir):
    """Save `fig` as PNG and SVG under `plot_dir`, display it, then close it."""
    for extension in ('png', 'svg'):
        fig.savefig(os.path.join(plot_dir, f'{stem}.{extension}'), bbox_inches='tight')
    plt.show()
    # Closing explicitly keeps figures from piling up across iterations on
    # backends where plt.show() does not release them.
    plt.close(fig)


def _save_curve_figure(plot_fn, frame, title, stem, plot_dir):
    """Draw one sklift curve (Qini or uplift) for `frame` on its own figure and save it."""
    fig, ax = plt.subplots(1, 1)
    ax.set_title(title)
    plot_fn(frame['outcome'], frame['CATE'], frame['treatment'], perfect=False, name='S/T-Learner', ax=ax)
    ax.legend(loc='best')
    _finish_figure(fig, stem, plot_dir)


def eval_learner(learner, X, Y, T, selected_features, model_name, n_splits=10, plot_dir='plots/ST'):
    """Evaluate a CATE learner (S- or T-Learner) over repeated random train/test splits.

    For each of the `n_splits` iterations the data is split 80/20 with
    `random_state=i*768`, the learner is refitted on the training part, and
    Qini AUC / uplift AUC are computed on both parts. Qini, uplift and
    cumulative-gain figures are saved as PNG and SVG in `plot_dir` and shown.

    Parameters
    ----------
    learner : object
        Estimator exposing `fit(Y=..., T=..., X=...)` and `effect(X=...)`.
        The same instance is refitted in every iteration.
    X : pandas.DataFrame
        Covariates.
    Y : pandas.Series
        Outcome; it is cast to int for fitting and passed unchanged to the
        metrics.
    T : pandas.Series
        Treatment indicator.
    selected_features : list of str
        Covariate columns kept in the evaluation frames.
    model_name : str
        Label used in progress messages, plot titles, legends and file names.
    n_splits : int, default 10
        Number of random splits to evaluate.
    plot_dir : str, default 'plots/ST'
        Directory for the saved figures; created if it does not exist.

    Returns
    -------
    dict
        `{'train': [...], 'test': [...]}` where each list holds one
        `{'qini_auc': ..., 'uplift_auc': ...}` dict per iteration.
    """
    results = {'train': [], 'test': []}

    # savefig does not create missing directories, so make sure the target exists.
    os.makedirs(plot_dir, exist_ok=True)

    # Baseline for the cumulative-gain plot, computed on all rows passed in.
    # It does not depend on the split, so it is computed once.
    baseline = elast(pd.DataFrame({'outcome': Y, 'treatment': T}), 'outcome', 'treatment')

    for i in range(n_splits):
        print(f"Iteration {i} for {model_name}")

        X_train, X_test, y_train, y_test, t_train, t_test = train_test_split(
            X, Y, T, test_size=0.2, random_state=i*768
        )

        learner.fit(Y=y_train.astype(int), T=t_train, X=X_train)

        frames = {
            'train': _build_eval_frame(X_train, y_train, t_train, learner.effect(X=X_train), selected_features),
            'test': _build_eval_frame(X_test, y_test, t_test, learner.effect(X=X_test), selected_features),
        }

        for split, frame in frames.items():
            results[split].append({
                'qini_auc': qini_auc_score(frame['outcome'], frame['CATE'], frame['treatment']),
                'uplift_auc': uplift_auc_score(frame['outcome'], frame['CATE'], frame['treatment']),
            })

        # One figure per (curve type, split): Qini test/train, then uplift test/train.
        for curve_name, plot_fn in (('Qini', plot_qini_curve), ('Uplift', plot_uplift_curve)):
            for split in ('test', 'train'):
                _save_curve_figure(
                    plot_fn,
                    frames[split],
                    title=f'{curve_name} curve {split} - {model_name}',
                    stem=f'{curve_name}_curve_{split}_{model_name}_iter_{i}',
                    plot_dir=plot_dir,
                )

        gain_curve_test = cumulative_gain(frames['test'], "CATE", y='outcome', t='treatment')
        gain_curve_train = cumulative_gain(frames['train'], "CATE", y='outcome', t='treatment')

        # Cumulative gain gets its own figure so it can never be drawn on top
        # of the previous uplift plot; the legend names the evaluated model.
        fig, ax = plt.subplots(1, 1)
        ax.plot(gain_curve_test, color="C4", label=f"{model_name} Test")
        ax.plot(gain_curve_train, color="C5", label=f"{model_name} Train")
        # x-range [0, 100] kept from the original; presumably cumulative_gain
        # returns about 100 points — TODO confirm against nb21.
        ax.plot([0, 100], [0, baseline], linestyle="--", color="black", label="Baseline")
        ax.legend(loc='best')
        ax.set_title(f"Cumulative gain - {model_name}")
        # The file contains both the train and the test curve; the iteration
        # suffix uses the same 0-based index as the other plots of this loop.
        _finish_figure(fig, f'Cumulative_gain_curve_train_{model_name}_iter_{i}', plot_dir)

    return results

In [7]:
# Evaluate an S-Learner that uses a gradient-boosted regressor as its overall model.
print("S-Learner Gradient Boosting training")
s_learner_gb = SLearner(
    overall_model=GradientBoostingRegressor(n_estimators=100, random_state=768),
)
s_learner_gb_results = eval_learner(
    learner=s_learner_gb,
    X=X,
    Y=Y,
    T=T,
    selected_features=selected_features,
    model_name="S-Learner Gradient Boosting",
)

In [8]:
# Evaluate an S-Learner that uses ordinary linear regression as its overall model.
print("S-Learner Linear Regression training")
s_learner_lr = SLearner(
    overall_model=LinearRegression(),
)
s_learner_lr_results = eval_learner(
    learner=s_learner_lr,
    X=X,
    Y=Y,
    T=T,
    selected_features=selected_features,
    model_name="S-Learner Linear Regression",
)

In [9]:
# Evaluate a T-Learner whose models are gradient-boosted regressors.
print("T-Learner Gradient Boosting training")
t_learner_gb = TLearner(
    models=GradientBoostingRegressor(n_estimators=100, random_state=768),
)
t_learner_gb_results = eval_learner(
    learner=t_learner_gb,
    X=X,
    Y=Y,
    T=T,
    selected_features=selected_features,
    model_name="T-Learner Gradient Boosting",
)

In [10]:
# Evaluate a T-Learner whose models are ordinary linear regressions.
print("T-Learner Linear Regression training")
t_learner_lr = TLearner(
    models=LinearRegression(),
)
t_learner_lr_results = eval_learner(
    learner=t_learner_lr,
    X=X,
    Y=Y,
    T=T,
    selected_features=selected_features,
    model_name="T-Learner Linear Regression",
)

In [11]:
def print_results(model_name, results):
    """Print the mean Qini AUC and uplift AUC of one model over all its splits.

    Parameters
    ----------
    model_name : str
        Label shown in the header line.
    results : dict
        Mapping with 'train' and 'test' keys, each a list of dicts that carry
        'qini_auc' and 'uplift_auc' entries (one dict per split).
    """
    def _average(split, metric):
        # Mean of one metric across every recorded run of the given split.
        return np.mean([run[metric] for run in results[split]])

    qini = {split: _average(split, 'qini_auc') for split in ('train', 'test')}
    uplift = {split: _average(split, 'uplift_auc') for split in ('train', 'test')}

    summary_lines = (
        f"Results for {model_name}:",
        f"Average Train Qini AUC: {qini['train']:.6f}, Average Test Qini AUC: {qini['test']:.6f}",
        f"Average Train Uplift AUC: {uplift['train']:.6f}, Average Test Uplift AUC: {uplift['test']:.6f}\n",
    )
    for line in summary_lines:
        print(line)



In [12]:
# Report the averaged metrics of every evaluated learner, in evaluation order.
for learner_label, learner_results in (
    ("S-Learner Gradient Boosting", s_learner_gb_results),
    ("S-Learner Linear Regression", s_learner_lr_results),
    ("T-Learner Gradient Boosting", t_learner_gb_results),
    ("T-Learner Linear Regression", t_learner_lr_results),
):
    print_results(learner_label, learner_results)