In [1]:
import warnings
warnings.filterwarnings("ignore")
import numpy as np
rs=42 # np.random.RandomState(42)
import pandas as pd
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, IterativeImputer,KNNImputer
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.neighbors import KNeighborsClassifier, RadiusNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier, RandomForestClassifier, HistGradientBoostingClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.metrics import classification_report

In [2]:
df = pd.read_csv("data01.csv")

In [3]:
df.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1177 entries, 0 to 1176
Data columns (total 51 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   group                     1177 non-null   int64  
 1   ID                        1177 non-null   int64  
 2   outcome                   1176 non-null   float64
 3   age                       1177 non-null   int64  
 4   gendera                   1177 non-null   int64  
 5   BMI                       962 non-null    float64
 6   hypertensive              1177 non-null   int64  
 7   atrialfibrillation        1177 non-null   int64  
 8   CHD with no MI            1177 non-null   int64  
 9   diabetes                  1177 non-null   int64  
 10  deficiencyanemias         1177 non-null   int64  
 11  depression                1177 non-null   int64  
 12  Hyperlipemia              1177 non-null   int64  
 13  Renal failure             1177 non-null   int64  
 14  COPD    

In [4]:
df = df.drop(columns="ID")
df = df.dropna(subset=["outcome"])
df["outcome"] = df["outcome"].astype("int")
df.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
Index: 1176 entries, 0 to 1176
Data columns (total 50 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   group                     1176 non-null   int64  
 1   outcome                   1176 non-null   int32  
 2   age                       1176 non-null   int64  
 3   gendera                   1176 non-null   int64  
 4   BMI                       962 non-null    float64
 5   hypertensive              1176 non-null   int64  
 6   atrialfibrillation        1176 non-null   int64  
 7   CHD with no MI            1176 non-null   int64  
 8   diabetes                  1176 non-null   int64  
 9   deficiencyanemias         1176 non-null   int64  
 10  depression                1176 non-null   int64  
 11  Hyperlipemia              1176 non-null   int64  
 12  Renal failure             1176 non-null   int64  
 13  COPD                      1176 non-null   int64  
 14  heart rate   

In [5]:
y = df["outcome"]
x = df.drop(columns="outcome")
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=rs,shuffle=True)
print(f"mortality rate in train set: {y_train.sum()/len(y_train):.3f}")
print(f"mortality rate in test set: {y_test.sum()/len(y_test):.3f}")

mortality rate in train set: 0.127
mortality rate in test set: 0.169


In [6]:
# TODO: visualize the data distribution using PCA or AutoEncoder

In [7]:
# resample the train dataset by simply oversampling the mortality data 
# TODO: use more complex approaches to augment the data e.g. LinearInterpolation, SMOTE, CTGAN 
yx_train = pd.concat([y_train,x_train],axis=1)
yx_train_pos = yx_train[yx_train["outcome"]==1]
yx_train_neg = yx_train[yx_train["outcome"]==0]
print(yx_train_pos.shape,yx_train_neg.shape)

(119, 50) (821, 50)


In [8]:
oversampled_yx_train_pos = pd.concat([yx_train_pos for i in range(5)])
resampled_yx_train = pd.concat([oversampled_yx_train_pos,yx_train_neg])
rs_y_train = resampled_yx_train["outcome"]
rs_x_train = resampled_yx_train.drop(columns="outcome")
print(f"mortality rate in resampled train set: {rs_y_train.sum()/len(rs_y_train):.3f}")

mortality rate in resampled train set: 0.420


In [9]:
y_train,x_train = rs_y_train,rs_x_train

In [10]:
# TODO: try more preprocessing strategies and combinations
imputer1 = SimpleImputer(strategy='median')
imputer2 = IterativeImputer(max_iter=10, random_state=rs)
imputer3 = KNNImputer(n_neighbors=2, weights="uniform")

scaler1 = StandardScaler()
scaler2 = MinMaxScaler(feature_range=(0,1))

preprocess_pipe = make_pipeline(imputer1,scaler2)

In [11]:
preprocess_pipe.fit(x_train)
x_train = preprocess_pipe.transform(x_train)
x_test = preprocess_pipe.transform(x_test)

In [12]:
def evaluate(model,x=x_test,y=y_test):
    y_pred = model.predict(x)
    print(classification_report(y,y_pred))

In [13]:
# level 1 models (they work independently to each other)
# TODO: optimize the models modifying hyperparams / using customized training data
# TODO: try other models
models={
       "LR1":LogisticRegression(penalty="l1",class_weight=None,solver="saga",random_state=rs),
       "LR2":LogisticRegression(penalty="l2",class_weight=None,solver="saga",random_state=rs),
       "LR3":LogisticRegression(penalty="elasticnet",l1_ratio=0.1,class_weight=None,solver="saga",random_state=rs),
       "RC":RidgeClassifier(alpha=0.0005,random_state=rs),
       "KNC":KNeighborsClassifier(n_neighbors=1,weights="uniform"),
       #"RNC":RadiusNeighborsClassifier(radius=10,weights="uniform"),
       "MLPC1":MLPClassifier([25],activation="logistic",solver="adam",alpha=0.0001,max_iter=200,random_state=rs),
       "MLPC2":MLPClassifier([32],activation="logistic",solver="adam",alpha=0.0003,max_iter=200,random_state=rs),
       "SVC1":SVC(kernel="rbf",random_state=rs),
       "SVC2":SVC(kernel="linear",random_state=rs),
       "SVC3":SVC(kernel="poly",random_state=rs),
       "SVC4":SVC(kernel="sigmoid",random_state=rs),
       "DTC1":DecisionTreeClassifier(criterion="gini",max_depth=18,min_samples_split=2,random_state=rs),
       "DTC2":DecisionTreeClassifier(criterion="entropy",max_depth=18,min_samples_split=2,random_state=rs),
       "ABC":AdaBoostClassifier(estimator=None,random_state=rs),
       "BC":BaggingClassifier(estimator=None,random_state=rs),
       "RFC":RandomForestClassifier(max_depth=18,random_state=rs),
       "HGBC":HistGradientBoostingClassifier(random_state=rs)
       }

In [14]:
# TODO: adopt level 2 model to build ensemble models e.g. Stacking

In [15]:
# TODO: visualizing model contribution / importances
# TODO(?): drop some unimportant / bad-behaving models, and train the ensemble model again

In [16]:
# TODO: visualizing feature importances
# TODO(?): drop some unimportant features according to the visualization, and train the models again

In [17]:
for name,model in models.items():
    print(name)
    model.fit(x_train,y_train)
    evaluate(model,x_test,y_test)
    print("")

LR1
              precision    recall  f1-score   support

           0       0.91      0.85      0.88       196
           1       0.45      0.60      0.52        40

    accuracy                           0.81       236
   macro avg       0.68      0.73      0.70       236
weighted avg       0.83      0.81      0.82       236


LR2
              precision    recall  f1-score   support

           0       0.91      0.86      0.88       196
           1       0.45      0.57      0.51        40

    accuracy                           0.81       236
   macro avg       0.68      0.72      0.69       236
weighted avg       0.83      0.81      0.82       236


LR3
              precision    recall  f1-score   support

           0       0.91      0.86      0.88       196
           1       0.45      0.57      0.51        40

    accuracy                           0.81       236
   macro avg       0.68      0.72      0.69       236
weighted avg       0.83      0.81      0.82       236


RC
 