In [1]:
import warnings
# Blanket-suppress every Python warning for the rest of the session.
# NOTE(review): this also hides any convergence or deprecation warnings the
# models below may raise — confirm that is intended.
warnings.filterwarnings("ignore")
import numpy as np
# One RandomState instance (seed 42) that later cells pass as `random_state`
# to the train/test split and the iterative imputer, and that the model
# specification strings refer to by the name `rs`.
rs=np.random.RandomState(42)
import pandas as pd
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.compose import make_column_transformer,make_column_selector
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, IterativeImputer,KNNImputer
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression, RidgeClassifier, LogisticRegressionCV, RidgeClassifierCV
from sklearn.neighbors import KNeighborsClassifier, RadiusNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier, RandomForestClassifier, HistGradientBoostingClassifier
from sklearn.ensemble import StackingClassifier,VotingClassifier
from sklearn.metrics import classification_report, f1_score, recall_score, precision_score

In [2]:
# Load the raw table; the relative path is resolved against the kernel's working directory.
df = pd.read_csv(filepath_or_buffer="data01.csv")

In [3]:
#df.info(verbose=False)

In [4]:
# Drop the identifier column, discard rows without a label, and fix the dtypes
# of the label ("outcome" -> int) and of "EF" (-> float) in one chained pass.
df = (
    df.drop(columns="ID")
      .dropna(subset=["outcome"])
      .astype({"outcome": "int", "EF": "float"})
)
#df.info(verbose=False)

In [5]:
# Separate the label column from the features, then hold out 20% of the rows.
y = df["outcome"]
x = df.drop(columns="outcome")
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, shuffle=True, random_state=rs
)
# Mean of the integer label == share of rows whose label is 1 (assuming 0/1 coding).
print(f"mortality ratio in train set: {y_train.mean():.3f}")
print(f"mortality ratio in test set: {y_test.mean():.3f}")

mortality ratio in train set: 0.127
mortality ratio in test set: 0.169


In [6]:
# Re-join label and features of the training split (label column first) and
# separate the rows by class, so the positive rows can be duplicated afterwards.
# TODO: use more complex approaches to augment the data e.g. LinearInterpolation, SMOTE, CTGAN
yx_train = pd.concat([y_train, x_train], axis="columns")
yx_train_pos = yx_train.loc[yx_train["outcome"].eq(1)]
yx_train_neg = yx_train.loc[yx_train["outcome"].eq(0)]
print(yx_train_pos.shape, yx_train_neg.shape)

(119, 50) (821, 50)


In [7]:
# Build nine training variants: variant n (n = 1..9) stacks n copies of every
# positive row on top of all negative rows. n = 1 is the split without duplication.
rs_y_train_set, rs_x_train_set = [], []
for n in range(1, 10):
    oversampled_yx_train_pos = pd.concat([yx_train_pos] * n)
    resampled_yx_train = pd.concat([oversampled_yx_train_pos, yx_train_neg])
    # Split the stacked frame back into label and feature parts.
    rs_y_train = resampled_yx_train.loc[:, "outcome"]
    rs_x_train = resampled_yx_train.drop(columns="outcome")
    rs_y_train_set += [rs_y_train]
    rs_x_train_set += [pd.DataFrame(rs_x_train)]

In [8]:
# TODO: try more preprocessing strategies and combinations
# Candidate imputers; only imputer3 and imputer4 are wired into the transformer below.
imputer1 = SimpleImputer(strategy="mean")
imputer2 = SimpleImputer(strategy="median")
imputer3 = SimpleImputer(strategy="most_frequent")
imputer4 = IterativeImputer(max_iter=10, random_state=rs)
imputer5 = KNNImputer(n_neighbors=2, weights="uniform")

# Candidate scalers.
scaler1 = StandardScaler()
scaler2 = MinMaxScaler(feature_range=(-1, 1))

# float64 columns: standardise, then iterative imputation.
# int64 columns:   rescale to [-1, 1], then most-frequent imputation.
# NOTE(review): columns of any other dtype are not selected by either branch —
# confirm that leaving them out is intended.
preprocess_pipe = make_column_transformer(
    (make_pipeline(scaler1, imputer4), make_column_selector(dtype_include=np.float64)),
    (make_pipeline(scaler2, imputer3), make_column_selector(dtype_include=np.int64)),
)

In [9]:
# Fit the preprocessing on the first variant only (index 0: positive rows not
# duplicated), then apply the fitted transformer to every training variant and
# to the test split.
# NOTE: this cell overwrites rs_x_train_set and x_test with their transformed
# versions, so running it a second time would feed already-transformed data back in.
x_train_sample = rs_x_train_set[0]
preprocess_pipe.fit(x_train_sample)
rs_x_train_set = list(map(preprocess_pipe.transform, rs_x_train_set))
x_test = preprocess_pipe.transform(pd.DataFrame(x_test))

In [10]:
def evaluate(model,x=x_test,y=y_test):
    """Print a classification report for ``model`` on ``(x, y)``.

    The model is used as-is (no fitting happens here). ``x`` and ``y`` default
    to the ``x_test`` / ``y_test`` objects that existed when this function was
    defined.
    """
    predictions = model.predict(x)
    report = classification_report(y, predictions)
    print(report)

def evaluate_label_ratio(model,x=x_test,y=y_test):
    """Refit ``model`` on every oversampled training variant and print its scores.

    For each (labels, features) pair in ``rs_y_train_set`` / ``rs_x_train_set``
    the same ``model`` object is fitted again and scored on ``(x, y)``.
    Three lines are printed: the F1 scores (followed by the largest one),
    the recalls, and the precisions, each formatted to three decimals.
    """
    scorers = (("f1", f1_score), ("rec", recall_score), ("pre", precision_score))
    scores = {label: [] for label, _ in scorers}
    for labels, features in zip(rs_y_train_set, rs_x_train_set):
        model.fit(features, labels)
        predicted = model.predict(x)
        for label, scorer in scorers:
            scores[label].append(f"{scorer(y,predicted):.3f}")
    # Scores are stored as formatted strings, hence key=float for the maximum.
    print("f1", scores["f1"], max(scores["f1"], key=float))
    print("rec", scores["rec"])
    print("pre", scores["pre"])
import time
def statistic_evaluate(model0,os_ration=3,rs=rs,n=42):
    """Fit a freshly built model repeatedly and summarise its test-set F1.

    Parameters
    ----------
    model0 : str or callable
        Either Python source text that evaluates to an unfitted estimator
        (it may refer to the name ``rs``), or a zero-argument callable that
        returns one. A new estimator is built for every repetition.
    os_ration : int
        Index into ``rs_x_train_set`` / ``rs_y_train_set`` selecting which
        oversampled training variant to fit on.
    rs
        Random state exposed under the name ``rs`` to source-text specs.
    n : int
        Maximum number of repetitions.

    Returns
    -------
    tuple
        ``(mean F1, sample standard deviation of F1 (ddof=1), mean seconds per fit)``.
        The standard deviation is NaN when fewer than two scores were collected.
    """
    # After this many + 1 repetitions, stop if the scores show practically no spread.
    early_stop_index = 5
    early_stop_std = 1e-3

    t=[]
    outcome=[]
    for i in range(n):
        if isinstance(model0, str):
            # SECURITY: eval executes arbitrary code; only pass trusted,
            # notebook-authored strings. It must stay a bare eval in this
            # frame so that the `rs` parameter is visible to the expression.
            model = eval(model0)
        else:
            model = model0()
        # perf_counter is monotonic and high-resolution, unlike the wall clock.
        t0=time.perf_counter()
        model.fit(rs_x_train_set[os_ration],rs_y_train_set[os_ration])
        t.append(time.perf_counter()-t0)
        y_pred = model.predict(x_test)
        outcome.append(f1_score(y_test,y_pred))
        if i==early_stop_index and np.std(outcome,ddof=1)<early_stop_std:
            print("early stop")
            break
    # A sample standard deviation needs at least two observations.
    spread = np.std(outcome,ddof=1) if len(outcome)>1 else np.nan
    return np.mean(outcome),spread,np.mean(t)

In [11]:
# Catalogue of candidate classifiers. Every value is Python source text for an
# estimator constructor; the name `rs` inside the text is resolved only when
# the text is evaluated (presumably by statistic_evaluate — verify against callers).
models = {
    # linear models
    "LR1": 'LogisticRegression(penalty="l1",solver="saga",random_state=rs)',
    "LR2": 'LogisticRegression(penalty="l2",solver="saga",random_state=rs)',
    "LR3": 'LogisticRegression(penalty="elasticnet",l1_ratio=0.5,solver="saga",random_state=rs)',
    # nearest neighbours
    "KNN1": 'KNeighborsClassifier(n_neighbors=3,weights="uniform")',
    "KNN2": 'KNeighborsClassifier(n_neighbors=7,weights="uniform")',
    # multi-layer perceptrons with one, two and three hidden layers
    "MLP1": 'MLPClassifier([24],activation="tanh",solver="adam",alpha=1,random_state=rs,early_stopping=True,validation_fraction=0.2,n_iter_no_change=10)',
    "MLP2": 'MLPClassifier([24,12],activation="tanh",solver="adam",alpha=1,random_state=rs,early_stopping=True,validation_fraction=0.2,n_iter_no_change=10)',
    "MLP3": 'MLPClassifier([24,12,6],activation="tanh",solver="adam",alpha=1,random_state=rs,early_stopping=True,validation_fraction=0.2,n_iter_no_change=10)',
    # support vector machines, one per kernel
    "SVM1": 'SVC(C=1.0,kernel="rbf",random_state=rs)',
    "SVM2": 'SVC(C=1.0,kernel="linear",random_state=rs)',
    "SVM3": 'SVC(C=1.0,kernel="poly",degree=3,random_state=rs)',
    "SVM4": 'SVC(C=1.0,kernel="sigmoid",random_state=rs,)',
    # trees and ensembles
    "DT": 'DecisionTreeClassifier(criterion="entropy",max_depth=18,min_samples_split=5,random_state=rs)',
    "RF": 'RandomForestClassifier(n_estimators=30,max_depth=18,min_samples_split=5,random_state=rs)',
    "AdaB": 'AdaBoostClassifier(estimator=LogisticRegression(random_state=rs),random_state=rs)',
    "Bag": 'BaggingClassifier(estimator=LogisticRegression(random_state=rs),random_state=rs)',
}


In [13]:
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.naive_bayes import CategoricalNB

# NOTE(review): `outcomes` is not read anywhere in the visible part of the notebook.
outcomes={}
# Source text for a hard-coded voting ensemble. statistic_evaluate() eval()s
# this text once per repetition, so `rs` inside it is resolved within that
# function rather than here.
# NOTE(review): "LR1" and "LR2" carry identical settings, as do "MLP1".."MLP3" —
# confirm the duplication is intentional.
model='''VotingClassifier([("LR1",LogisticRegression(penalty="l2",solver="saga",random_state=rs)),
                          ("LR2",LogisticRegression(penalty="l2",solver="saga",random_state=rs)),
                          ("MLP1",MLPClassifier([24],activation="tanh",solver="adam",alpha=1,random_state=rs,early_stopping=True,validation_fraction=0.2,n_iter_no_change=10,max_iter=80)),
                          ("MLP2",MLPClassifier([24],activation="tanh",solver="adam",alpha=1,random_state=rs,early_stopping=True,validation_fraction=0.2,n_iter_no_change=10,max_iter=80)),
                          ("MLP3",MLPClassifier([24],activation="tanh",solver="adam",alpha=1,random_state=rs,early_stopping=True,validation_fraction=0.2,n_iter_no_change=10,max_iter=80)),
                          
                          ("SVM1",SVC(C=1.0,kernel="rbf",random_state=rs)),
                          ("SVM2",SVC(C=1.0,kernel="poly",degree=3,random_state=rs)),
                          ("SVM3",SVC(C=1.0,kernel="poly",degree=4,random_state=rs)),
                          ("NB",make_pipeline(KBinsDiscretizer(n_bins=5,encode="ordinal"),CategoricalNB()))
                          ]
                         )'''
# Up to 24 repetitions on training variant index 3 (four copies of each positive row).
# m = mean F1, s = sample std of F1, tm = mean seconds per fit.
m,s,tm=statistic_evaluate(model,os_ration=3,rs=rs,n=24)
print(m,s,tm)

early stop
0.5205479452054794 0.0 0.7701276540756226


In [None]:
# Mean test-set F1 over the repetitions returned by statistic_evaluate above.
m

In [None]:
# Mean time in seconds spent in model.fit per repetition, as returned by statistic_evaluate above.
tm