In [2]:
import pandas as pd
import pickle
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import KNNImputer
from econml.metalearners import SLearner, TLearner
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier, XGBRegressor
from sklearn.calibration import CalibratedClassifierCV
from sklearn.svm import SVC
from econml.dr import DRLearner
from sklearn.neighbors import KNeighborsClassifier

In [3]:
# Load dataset
data = pd.read_csv('../data/bsc_project_set.csv')
# Dropping first (instead of only selecting below) makes the cell fail loudly if
# any of these expected-but-unused columns is absent from the CSV.
data = data.drop(['id', 'Unnamed: 0', 'map', 'bilirubin', 'creatinine', 'platelets', 'urea', 'diastolic_blood_pressure'], axis=1)
available_columns = ['age','weight','height','pf_ratio','po2','pco2','ph','driving_pressure','lung_compliance','fio2','hco3','heart_rate','minute_volume','peep','plateau_pressure','respiratory_rate','syst_blood_pressure', 'mort_28', 'sex', 'peep_regime']
data = data[available_columns]

# Define treatment and outcome columns
treatment_column = 'peep_regime_high'
outcome_column = 'mort_28'

# One-hot encode the categoricals and keep a single indicator per variable
# (sex_M and peep_regime_high); the dropped levels become the reference level.
# NOTE(review): get_dummies encodes a missing 'sex' / 'peep_regime' as all-zero
# indicators, i.e. as the reference level -- confirm these columns have no NaNs.
categorical_columns = ['sex', 'peep_regime']
data = pd.get_dummies(data, columns=categorical_columns, drop_first=False)
data = data.drop(['sex_F','peep_regime_low'], axis = 1)
indicator_columns = ['sex_M', treatment_column]
# Keep the indicators as 0.0/1.0 floats regardless of the dtype get_dummies
# produces (bool or uint8 depending on the pandas version).
data[indicator_columns] = data[indicator_columns].astype(float)

# Only the covariates are scaled and imputed. The outcome and the treatment /
# sex indicators are deliberately excluded: fitting the KNN imputer on them
# would select neighbours using the label (leakage into the covariates) and
# would replace missing outcomes with neighbour averages that `.astype(int)`
# later truncates. It would also make the saved scaler/imputer require the
# outcome column as an input.
# Index.difference returns the remaining columns in sorted order; this is the
# column order the fitted scaler and imputer expect.
impute_columns = data.columns.difference([outcome_column] + indicator_columns)
numeric_columns = impute_columns

# Normalize data (scaling precedes imputation so that KNN distances are
# computed on comparable [0, 1] ranges)
scaler = MinMaxScaler()
data[numeric_columns] = scaler.fit_transform(data[numeric_columns])
# Impute missing data
imputer = KNNImputer(n_neighbors=11, weights='uniform')
data[numeric_columns] = imputer.fit_transform(data[numeric_columns])

# Rows without an observed outcome cannot be used for training. They are only
# removed after the scaler/imputer were fitted, so those still see every row.
data = data.dropna(subset=[outcome_column])

selected_features = ['age', 'weight', 'pf_ratio', 'po2', 'ph', 'fio2', 'driving_pressure', 'plateau_pressure']
X = data[selected_features]
Y = data[outcome_column]
T = data[treatment_column]

# Train and save the S- and T-learners.
# Each meta-learner is built once with either a gradient-boosting or a linear
# base model, then fitted on the integer-cast outcome and pickled under the
# file name it is keyed by (S-learners first, then T-learners).
s_learner_gb = SLearner(overall_model=GradientBoostingRegressor(n_estimators=100, random_state=768))
s_learner_lr = SLearner(overall_model=LinearRegression())
t_learner_gb = TLearner(models=GradientBoostingRegressor(n_estimators=100, random_state=768))
t_learner_lr = TLearner(models=LinearRegression())

meta_learner_files = {
    's_learner_gb_model.pkl': s_learner_gb,
    's_learner_lr_model.pkl': s_learner_lr,
    't_learner_gb_model.pkl': t_learner_gb,
    't_learner_lr_model.pkl': t_learner_lr,
}

for pickle_path, meta_learner in meta_learner_files.items():
    meta_learner.fit(Y=Y.astype(int), T=T, X=X)
    with open(pickle_path, 'wb') as f:
        pickle.dump(meta_learner, f)

# Train and save DR-learners with calibrated propensity models
def save_dr_learner(dr_learner, filename):
    """Serialize ``dr_learner`` with pickle into the file ``filename``.

    The file is opened in binary write mode, so an existing file at that
    path is overwritten. Nothing is returned.
    """
    with open(filename, 'wb') as sink:
        pickle.Pickler(sink).dump(dr_learner)

# Fit and calibrate propensity models
# The uncalibrated classifiers are declared first; each one is then wrapped in
# the same isotonic, 5-fold CalibratedClassifierCV.
base_propensity_estimators = {
    'svm': SVC(C=1.0, kernel='linear', probability=True, random_state=768),
    'logreg': LogisticRegression(C=10.0, solver='lbfgs', random_state=768),
    'gb': GradientBoostingClassifier(learning_rate=0.1, max_depth=3, n_estimators=50, random_state=768),
    'knn': KNeighborsClassifier(n_neighbors=7),
}

calibrated_propensity_models = {
    key: CalibratedClassifierCV(estimator, method='isotonic', cv=5)
    for key, estimator in base_propensity_estimators.items()
}

# Fit every calibrated model on the treatment assignment and pickle it under
# calibrated_propensity_model_<key>.pkl.
for key, calibrated in calibrated_propensity_models.items():
    calibrated.fit(X, T)
    with open(f'calibrated_propensity_model_{key}.pkl', 'wb') as f:
        pickle.dump(calibrated, f)

# DR-learners
# Each row is (key, propensity model key, outcome model, final model); the key
# spells out that combination as <propensity>_<outcome>_<final>. All learners
# share discrete_outcome=True and 3-fold cross-fitting, and every outcome
# model is a classifier.
dr_learner_specs = [
    ('svm_xgb_xgb', 'svm',
     XGBClassifier(learning_rate=0.01, max_depth=3, n_estimators=50, random_state=768),
     XGBRegressor(n_estimators=30, random_state=768)),
    ('svm_logreg_xgb', 'svm',
     LogisticRegression(C=1.0, solver='liblinear', random_state=768),
     XGBRegressor(n_estimators=30, random_state=768)),
    ('svm_gb_xgb', 'svm',
     GradientBoostingClassifier(learning_rate=0.01, max_depth=3, n_estimators=50, random_state=768),
     XGBRegressor(n_estimators=30, random_state=768)),
    ('logreg_logreg_linear', 'logreg',
     LogisticRegression(C=1.0, solver='liblinear', random_state=768),
     LinearRegression()),
    ('gb_gb_gb', 'gb',
     GradientBoostingClassifier(learning_rate=0.01, max_depth=3, n_estimators=50, random_state=768),
     GradientBoostingRegressor(n_estimators=30, random_state=768)),
    ('knn_gb_gb', 'knn',
     GradientBoostingClassifier(learning_rate=0.01, max_depth=3, n_estimators=50, random_state=768),
     GradientBoostingRegressor(n_estimators=30, random_state=768)),
]

dr_learners = {
    label: DRLearner(
        model_propensity=calibrated_propensity_models[propensity_key],
        model_regression=outcome_model,
        model_final=final_model,
        discrete_outcome=True,
        cv=3
    )
    for label, propensity_key, outcome_model, final_model in dr_learner_specs
}

# Fit each learner on the integer-cast outcome (no extra controls W) and
# pickle it as dr_learner_<key>.pkl.
outcome_as_int = Y.astype(int)
for label, dr_model in dr_learners.items():
    dr_model.fit(Y=outcome_as_int, T=T, X=X, W=None)
    save_dr_learner(dr_model, f'dr_learner_{label}.pkl')

# Save preprocessing pipeline and column order
# The fitted scaler, the fitted imputer and the list of model input features
# are each pickled to their own file in the working directory.
preprocessing_artifacts = {
    'scaler.pkl': scaler,
    'imputer.pkl': imputer,
    'column_order.pkl': selected_features,
}

for artifact_path, artifact in preprocessing_artifacts.items():
    with open(artifact_path, 'wb') as f:
        pickle.dump(artifact, f)

A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
A column-vector y was passed when a 1d array was expected. Please cha