In [1]:
import warnings
warnings.filterwarnings("ignore")
import numpy as np
rs=np.random.RandomState(42)
import pandas as pd
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.compose import make_column_transformer,make_column_selector
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, IterativeImputer,KNNImputer
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression, RidgeClassifier, LogisticRegressionCV, RidgeClassifierCV
from sklearn.neighbors import KNeighborsClassifier, RadiusNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier, RandomForestClassifier, HistGradientBoostingClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.metrics import classification_report, f1_score, recall_score, precision_score

<span style="color:orange">
    This is a base line model. Following works will be optimizations or researches of only one aspect of its settings.<br>
</span>

In [2]:
df = pd.read_csv("data01.csv")

In [3]:
df.info(verbose=False)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1177 entries, 0 to 1176
Columns: 51 entries, group to EF
dtypes: float64(37), int64(14)
memory usage: 469.1 KB


In [4]:
df = df.drop(columns="ID")
df = df.dropna(subset=["outcome"])
df["outcome"] = df["outcome"].astype("int")
df["EF"] = df["EF"].astype("float")
df.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
Index: 1176 entries, 0 to 1176
Data columns (total 50 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   group                     1176 non-null   int64  
 1   outcome                   1176 non-null   int32  
 2   age                       1176 non-null   int64  
 3   gendera                   1176 non-null   int64  
 4   BMI                       962 non-null    float64
 5   hypertensive              1176 non-null   int64  
 6   atrialfibrillation        1176 non-null   int64  
 7   CHD with no MI            1176 non-null   int64  
 8   diabetes                  1176 non-null   int64  
 9   deficiencyanemias         1176 non-null   int64  
 10  depression                1176 non-null   int64  
 11  Hyperlipemia              1176 non-null   int64  
 12  Renal failure             1176 non-null   int64  
 13  COPD                      1176 non-null   int64  
 14  heart rate   

In [5]:
y = df["outcome"]
x = df.drop(columns="outcome")
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=rs,shuffle=True)
print(f"mortality ratio in train set: {y_train.sum()/len(y_train):.3f}")
print(f"mortality ratio in test set: {y_test.sum()/len(y_test):.3f}")

mortality ratio in train set: 0.127
mortality ratio in test set: 0.169


In [6]:
# resample the train dataset by simply oversampling the mortality data 
# TODO: use more complex approaches to augment the data e.g. LinearInterpolation, SMOTE, CTGAN 
yx_train = pd.concat([y_train,x_train],axis=1)
yx_train_pos = yx_train[yx_train["outcome"]==1]
yx_train_neg = yx_train[yx_train["outcome"]==0]
print(yx_train_pos.shape,yx_train_neg.shape)

(119, 50) (821, 50)


In [7]:
rs_y_train_set=[]
rs_x_train_set=[]
for n in range(1,10):
    oversampled_yx_train_pos = pd.concat([yx_train_pos for i in range(n)])
    resampled_yx_train = pd.concat([oversampled_yx_train_pos,yx_train_neg])
    rs_y_train = resampled_yx_train["outcome"]
    rs_x_train = resampled_yx_train.drop(columns="outcome")
    rs_y_train_set.append(rs_y_train)
    rs_x_train_set.append(pd.DataFrame(rs_x_train))

In [8]:
# TODO: try more preprocessing strategies and combinations
imputer1 = SimpleImputer(strategy='mean')
imputer2 = SimpleImputer(strategy='median')
imputer3 = SimpleImputer(strategy='most_frequent')
imputer4 = IterativeImputer(max_iter=10, random_state=rs)
imputer5 = KNNImputer(n_neighbors=2, weights="uniform")

scaler1 = StandardScaler()
scaler2 = MinMaxScaler(feature_range=(-1,1))

preprocess_pipe = make_column_transformer((make_pipeline(scaler1,imputer4), make_column_selector(dtype_include=np.float64)),
                                          ((make_pipeline(scaler2,imputer3), make_column_selector(dtype_include=np.int64))))

In [9]:
x_train_sample = rs_x_train_set[0] # ???
preprocess_pipe.fit(x_train_sample)
rs_x_train_set = [preprocess_pipe.transform(x_sample) for x_sample in rs_x_train_set]
x_test = preprocess_pipe.transform(pd.DataFrame(x_test))

In [10]:
def evaluate(model,x=x_test,y=y_test):
    y_pred = model.predict(x)
    print(classification_report(y,y_pred))

def evaluate_label_ratio(model,x=x_test,y=y_test):
    f1=[]
    rec=[]
    pre=[]
    for y_train,x_train in zip(rs_y_train_set,rs_x_train_set):
        model.fit(x_train,y_train)
        y_pred = model.predict(x)
        f1.append(f"{f1_score(y,y_pred):.3f}")
        rec.append(f"{recall_score(y,y_pred):.3f}")
        pre.append(f"{precision_score(y,y_pred):.3f}")
    print("f1",f1,max(f1,key=float))
    print("rec",rec)
    print("pre",pre)

def statistic_evaluate(model0,kwargs,os_ration=3,rs=rs,n=42):
    outcome=[]
    for _ in range(n):
        model = model0(random_state=rs,**kwargs)
        model.fit(rs_x_train_set[os_ration],rs_y_train_set[os_ration])
        y_pred = model.predict(x_test)
        outcome.append(f1_score(y_test,y_pred))
    return np.mean(outcome),np.std(outcome,ddof=1)

In [11]:
baseline_model = LogisticRegression
baseline_kwargs={'penalty':"l2",'solver':"saga"}

In [12]:
m,s=statistic_evaluate(baseline_model,baseline_kwargs,os_ration=3,rs=rs,n=42)

In [13]:
bmodel=LogisticRegression(penalty="l2",class_weight=None,solver="saga",random_state=rs)
bmodel.fit(rs_x_train_set[3],rs_y_train_set[3])
evaluate(bmodel)

              precision    recall  f1-score   support

           0       0.91      0.84      0.88       196
           1       0.44      0.60      0.51        40

    accuracy                           0.80       236
   macro avg       0.67      0.72      0.69       236
weighted avg       0.83      0.80      0.81       236



In [14]:
base_pred = bmodel.predict(x_test)

In [15]:
model = MLPClassifier([32,16],activation="tanh",solver="adam",alpha=0.0003,max_iter=200,random_state=42)
model.fit(rs_x_train_set[3],rs_y_train_set[3])
evaluate(model,x=x_test,y=y_test)
evaluate(model,x=x_test,y=base_pred)

              precision    recall  f1-score   support

           0       0.90      0.92      0.91       196
           1       0.56      0.50      0.53        40

    accuracy                           0.85       236
   macro avg       0.73      0.71      0.72       236
weighted avg       0.84      0.85      0.84       236

              precision    recall  f1-score   support

           0       0.88      0.97      0.92       181
           1       0.86      0.56      0.68        55

    accuracy                           0.88       236
   macro avg       0.87      0.77      0.80       236
weighted avg       0.88      0.88      0.87       236



In [16]:
model = LogisticRegression(penalty="elasticnet",l1_ratio=0.8,solver="saga",random_state=rs)
model.fit(rs_x_train_set[3],rs_y_train_set[3])
evaluate(model,x=x_test,y=y_test)
evaluate(model,x=x_test,y=base_pred)

              precision    recall  f1-score   support

           0       0.91      0.84      0.87       196
           1       0.43      0.57      0.49        40

    accuracy                           0.80       236
   macro avg       0.67      0.71      0.68       236
weighted avg       0.83      0.80      0.81       236

              precision    recall  f1-score   support

           0       0.99      0.99      0.99       181
           1       0.98      0.96      0.97        55

    accuracy                           0.99       236
   macro avg       0.99      0.98      0.98       236
weighted avg       0.99      0.99      0.99       236

