In [1]:
# import tensorflow as tf
import warnings

import numpy as np
import pandas as pd
import sklearn
import sklearn.ensemble
import sklearn.impute
import sklearn.linear_model
import sklearn.metrics
import sklearn.neighbors
import sklearn.neural_network
import sklearn.pipeline
import sklearn.preprocessing
import sklearn.svm
import sklearn.tree

# Shared NumPy random generator with a fixed seed; used later to shuffle the rows.
rng = np.random.default_rng(42)

# Suppress every warning for the remainder of the session.
warnings.filterwarnings(action="ignore")

In [2]:
# Load the raw table from the working directory.
df = pd.read_csv("data01.csv")
# NOTE(review): assigning a DataFrame to a list of column labels appears to copy the
# right-hand columns over positionally, so this swaps the *values* held by "group"
# and "outcome" while the column labels themselves stay in place — confirm.
# The split cell later uses the first column (still labelled "group") as the target.
df[["group", "outcome"]] = df[["outcome", "group"]]

In [3]:
# Display the column labels of the loaded frame.
df.columns

Index(['group', 'ID', 'outcome', 'age', 'gendera', 'BMI', 'hypertensive',
       'atrialfibrillation', 'CHD with no MI', 'diabetes', 'deficiencyanemias',
       'depression', 'Hyperlipemia', 'Renal failure', 'COPD', 'heart rate',
       'Systolic blood pressure', 'Diastolic blood pressure',
       'Respiratory rate', 'temperature', 'SP O2', 'Urine output',
       'hematocrit', 'RBC', 'MCH', 'MCHC', 'MCV', 'RDW', 'Leucocyte',
       'Platelets', 'Neutrophils', 'Basophils', 'Lymphocyte', 'PT', 'INR',
       'NT-proBNP', 'Creatine kinase', 'Creatinine', 'Urea nitrogen',
       'glucose', 'Blood potassium', 'Blood sodium', 'Blood calcium',
       'Chloride', 'Anion gap', 'Magnesium ion', 'PH', 'Bicarbonate',
       'Lactic acid', 'PCO2', 'EF'],
      dtype='object')

In [4]:
# Work on a plain array: column 0 is used as the label, the rest as features.
data = df.drop(columns="ID").to_numpy()

# Discard rows whose label (column 0) is missing.
data = data[~np.isnan(data[:, 0])]

# Shuffle rows in place with the seeded generator, then cut at 80 % for training.
rng.shuffle(data, axis=0)
n_train = int(0.8 * data.shape[0])

train_x, train_y = data[:n_train, 1:], data[:n_train, 0]
test_x, test_y = data[n_train:, 1:], data[n_train:, 0]

In [5]:
# Sanity check: (rows, features) of the training and test feature matrices.
print(train_x.shape, test_x.shape)

(940, 49) (236, 49)


In [6]:
# Side-effect import: switches on the experimental IterativeImputer used below.
from sklearn.experimental import enable_iterative_imputer

# Candidate scalers: standardisation, or min-max scaling into [-1, 1].
scaler1 = sklearn.preprocessing.StandardScaler()
scaler2 = sklearn.preprocessing.MinMaxScaler(feature_range=(-1, 1))

# Candidate imputers: column median, iterative (seeded), and 2-nearest-neighbours.
imputer1 = sklearn.impute.SimpleImputer(strategy="median")
imputer2 = sklearn.impute.IterativeImputer(random_state=42, max_iter=10)
imputer3 = sklearn.impute.KNNImputer(weights="uniform", n_neighbors=2)

# Only the median imputer and the [-1, 1] scaler are wired into the pipeline;
# the other candidates are constructed but not used in this cell.
preprocess_pipe = sklearn.pipeline.make_pipeline(imputer1, scaler2)

In [7]:
# Learn imputation and scaling statistics from the training features only ...
_ = preprocess_pipe.fit(train_x)
# ... then apply the fitted pipeline to both splits (training split first).
train_x, test_x = (preprocess_pipe.transform(split) for split in (train_x, test_x))

In [8]:
from imblearn.over_sampling import ADASYN, SMOTE

# Oversampler instance. Despite the variable name this is ADASYN, not SMOTE.
# Seeded with the same value used elsewhere in the notebook so that enabling the
# resampling step below yields the same synthetic samples on every run.
smote = ADASYN(random_state=42)
# Resampling is currently disabled; uncomment to rebalance the training split.
# train_x,train_y = smote.fit_resample(train_x,train_y)

In [9]:
def evaluate(model, x, y, threshold=0.5):
    """Print precision, recall and F1 of ``model`` on the data ``(x, y)``.

    ``model.predict(x)`` is reshaped to a single column, binarized with
    ``sklearn.preprocessing.Binarizer(threshold=threshold)`` and flattened
    again before being compared with ``y``.

    Args:
        model: fitted estimator exposing ``predict``.
        x: feature matrix passed to ``model.predict``.
        y: reference labels.
        threshold: cut-off handed to the binarizer (default 0.5).

    Returns:
        None. The three scores are printed, one per line.
    """
    raw_predictions = model.predict(x)
    binarizer = sklearn.preprocessing.Binarizer(threshold=threshold)
    hard_labels = binarizer.transform(raw_predictions.reshape(-1, 1)).flatten()

    # Same captions and order as before: precision, recall, then F1.
    report = (
        ("precision score:", sklearn.metrics.precision_score),
        ("recall score:", sklearn.metrics.recall_score),
        ("f1 score:", sklearn.metrics.f1_score),
    )
    for caption, metric in report:
        print(caption, metric(y, hard_labels))

In [10]:
# Baseline: logistic regression with default settings, fitted on the training split.
model1 = sklearn.linear_model.LogisticRegression().fit(train_x, train_y)
# Held-out scores only.
evaluate(model1, test_x, test_y)

precision score: 0.8181818181818182
recall score: 0.4090909090909091
f1 score: 0.5454545454545455


In [11]:
# Ridge classifier with a very small regularisation strength.
model2 = sklearn.linear_model.RidgeClassifier(alpha=0.0005).fit(train_x, train_y)
# NOTE(review): predict() on a classifier presumably returns hard class labels, in
# which case a 0.7 cut-off behaves the same as the default 0.5 — confirm.
evaluate(model2, test_x, test_y, threshold=0.7)

precision score: 1.0
recall score: 0.36363636363636365
f1 score: 0.5333333333333333


In [12]:
# Lasso regression on the label column; evaluate() binarizes its continuous
# predictions at the default 0.5 cut-off.
model3 = sklearn.linear_model.Lasso(alpha=0.0005).fit(train_x, train_y)
evaluate(model3, test_x, test_y)

precision score: 1.0
recall score: 0.3181818181818182
f1 score: 0.4827586206896552


In [13]:
# RANSAC fits on randomly drawn subsets, so it is seeded to make repeated runs
# of this cell print the same scores.
model4 = sklearn.linear_model.RANSACRegressor(random_state=42)
model4.fit(train_x, train_y)
# Continuous predictions are binarized at 0.5 inside evaluate().
evaluate(model4, test_x, test_y, threshold=0.5)

precision score: 0.0625
recall score: 0.09090909090909091
f1 score: 0.07407407407407407


In [14]:
# Linear classifier trained with SGD; the data is shuffled between epochs, so the
# estimator is seeded to keep the reported scores repeatable.
model5 = sklearn.linear_model.SGDClassifier(random_state=42)
model5.fit(train_x, train_y)
evaluate(model5, train_x, train_y)  # training-split scores
evaluate(model5, test_x, test_y)  # held-out scores

precision score: 0.6883116883116883
recall score: 0.38686131386861317
f1 score: 0.4953271028037384
precision score: 0.42105263157894735
recall score: 0.36363636363636365
f1 score: 0.3902439024390244


In [15]:
# One hidden layer of 28 logistic units. Weight initialisation and mini-batching
# are random, so the network is seeded to make the scores repeatable.
model6 = sklearn.neural_network.MLPClassifier(
    [28], max_iter=280, activation="logistic", alpha=0.0002, random_state=42
)
model6.fit(train_x, train_y)
evaluate(model6, train_x, train_y)  # training-split scores
evaluate(model6, test_x, test_y)  # held-out scores

precision score: 0.75
recall score: 0.3284671532846715
f1 score: 0.4568527918781726
precision score: 0.6666666666666666
recall score: 0.36363636363636365
f1 score: 0.4705882352941177


In [16]:
# Same architecture with ReLU units, fewer iterations and a larger L2 penalty.
# Seeded so that repeated runs of the cell report identical scores.
model6 = sklearn.neural_network.MLPClassifier(
    [28], max_iter=200, activation="relu", alpha=0.0009, random_state=42
)
model6.fit(train_x, train_y)
evaluate(model6, train_x, train_y)  # training-split scores
evaluate(model6, test_x, test_y)  # held-out scores

precision score: 0.8390804597701149
recall score: 0.5328467153284672
f1 score: 0.6517857142857143
precision score: 0.5714285714285714
recall score: 0.36363636363636365
f1 score: 0.4444444444444444


In [17]:
# Support vector classifier, RBF kernel, default hyper-parameters.
model6 = sklearn.svm.SVC(kernel="rbf").fit(train_x, train_y)
# Report training-split scores first, then held-out scores.
evaluate(model6, train_x, train_y)
evaluate(model6, test_x, test_y)

precision score: 1.0
recall score: 0.13138686131386862
f1 score: 0.23225806451612904
precision score: 1.0
recall score: 0.13636363636363635
f1 score: 0.24000000000000002


In [18]:
# Support vector classifier, linear kernel, default hyper-parameters.
model6 = sklearn.svm.SVC(kernel="linear").fit(train_x, train_y)
# Report training-split scores first, then held-out scores.
evaluate(model6, train_x, train_y)
evaluate(model6, test_x, test_y)

precision score: 0.8181818181818182
recall score: 0.26277372262773724
f1 score: 0.39779005524861877
precision score: 0.7777777777777778
recall score: 0.3181818181818182
f1 score: 0.45161290322580644


In [19]:
# Support vector classifier, polynomial kernel, default hyper-parameters.
model6 = sklearn.svm.SVC(kernel="poly").fit(train_x, train_y)
# Report training-split scores first, then held-out scores.
evaluate(model6, train_x, train_y)
evaluate(model6, test_x, test_y)

precision score: 0.9821428571428571
recall score: 0.40145985401459855
f1 score: 0.5699481865284974
precision score: 0.7142857142857143
recall score: 0.22727272727272727
f1 score: 0.3448275862068965


In [20]:
# Support vector classifier, sigmoid kernel, default hyper-parameters.
model6 = sklearn.svm.SVC(kernel="sigmoid").fit(train_x, train_y)
# Report training-split scores first, then held-out scores.
evaluate(model6, train_x, train_y)
evaluate(model6, test_x, test_y)

precision score: 0.4074074074074074
recall score: 0.08029197080291971
f1 score: 0.13414634146341461
precision score: 0.42857142857142855
recall score: 0.13636363636363635
f1 score: 0.20689655172413793


In [21]:
# Deep Gini tree (depth up to 16). Seeded so that split selection, and therefore
# the reported scores, are identical on every run of the cell.
model6 = sklearn.tree.DecisionTreeClassifier(
    criterion="gini", max_depth=16, min_samples_split=2, random_state=42
)
model6.fit(train_x, train_y)
evaluate(model6, train_x, train_y)  # training-split scores
evaluate(model6, test_x, test_y)  # held-out scores

precision score: 1.0
recall score: 0.9708029197080292
f1 score: 0.9851851851851852
precision score: 0.24242424242424243
recall score: 0.36363636363636365
f1 score: 0.2909090909090909


In [22]:
# Shallower entropy tree (depth up to 6, at least 3 samples to split). Seeded so
# that repeated runs of the cell report identical scores.
model6 = sklearn.tree.DecisionTreeClassifier(
    criterion="entropy", max_depth=6, min_samples_split=3, random_state=42
)
model6.fit(train_x, train_y)
evaluate(model6, train_x, train_y)  # training-split scores
evaluate(model6, test_x, test_y)  # held-out scores

precision score: 0.9540229885057471
recall score: 0.6058394160583942
f1 score: 0.7410714285714286
precision score: 0.36
recall score: 0.4090909090909091
f1 score: 0.3829787234042554


In [23]:
# 1-nearest-neighbour classifier with uniform weighting.
model6 = sklearn.neighbors.KNeighborsClassifier(
    n_neighbors=1, weights="uniform"
).fit(train_x, train_y)
# Report training-split scores first, then held-out scores.
evaluate(model6, train_x, train_y)
evaluate(model6, test_x, test_y)

precision score: 1.0
recall score: 1.0
f1 score: 1.0
precision score: 0.4090909090909091
recall score: 0.4090909090909091
f1 score: 0.4090909090909091


In [24]:
# AdaBoost ensemble built on default logistic-regression base learners.
emodel = sklearn.ensemble.AdaBoostClassifier(
    sklearn.linear_model.LogisticRegression()
).fit(train_x, train_y)
# Report training-split scores first, then held-out scores.
evaluate(emodel, train_x, train_y)
evaluate(emodel, test_x, test_y)

precision score: 0.7551020408163265
recall score: 0.27007299270072993
f1 score: 0.3978494623655914
precision score: 0.75
recall score: 0.2727272727272727
f1 score: 0.39999999999999997


In [25]:
# Bagging ensemble of logistic regressions. Each member is trained on a random
# resample of the data, so the ensemble is seeded to keep the scores repeatable.
emodel = sklearn.ensemble.BaggingClassifier(
    sklearn.linear_model.LogisticRegression(), random_state=42
)
emodel.fit(train_x, train_y)
evaluate(emodel, train_x, train_y)  # training-split scores
evaluate(emodel, test_x, test_y)  # held-out scores

precision score: 0.8235294117647058
recall score: 0.30656934306569344
f1 score: 0.44680851063829785
precision score: 0.875
recall score: 0.3181818181818182
f1 score: 0.4666666666666667


In [26]:
# AdaBoost (SAMME variant) built on ridge-classifier base learners.
emodel = sklearn.ensemble.AdaBoostClassifier(
    sklearn.linear_model.RidgeClassifier(alpha=0.0002), algorithm="SAMME"
).fit(train_x, train_y)
# Report training-split scores first, then held-out scores.
evaluate(emodel, train_x, train_y)
evaluate(emodel, test_x, test_y)

precision score: 0.7727272727272727
recall score: 0.24817518248175183
f1 score: 0.37569060773480667
precision score: 1.0
recall score: 0.3181818181818182
f1 score: 0.4827586206896552


In [27]:
# Bagging ensemble of ridge classifiers. Seeded because each member is trained
# on a random resample of the data.
emodel = sklearn.ensemble.BaggingClassifier(
    sklearn.linear_model.RidgeClassifier(alpha=0.0002), random_state=42
)
emodel.fit(train_x, train_y)
evaluate(emodel, train_x, train_y)  # training-split scores
evaluate(emodel, test_x, test_y)  # held-out scores

precision score: 0.8
recall score: 0.23357664233576642
f1 score: 0.36158192090395486
precision score: 1.0
recall score: 0.3181818181818182
f1 score: 0.4827586206896552


In [28]:
# Bagging ensemble of small MLPs (one hidden layer of 28 logistic units).
# The ensemble-level seed fixes the random resampling; presumably it is also
# propagated to the unseeded base networks — confirm.
emodel = sklearn.ensemble.BaggingClassifier(
    sklearn.neural_network.MLPClassifier(
        [28], max_iter=240, activation="logistic", alpha=0.0002
    ),
    random_state=42,
)
emodel.fit(train_x, train_y)
evaluate(emodel, train_x, train_y)  # training-split scores
evaluate(emodel, test_x, test_y)  # held-out scores

precision score: 0.7192982456140351
recall score: 0.29927007299270075
f1 score: 0.4226804123711341
precision score: 0.7272727272727273
recall score: 0.36363636363636365
f1 score: 0.4848484848484849


In [29]:
# Random forest of 40 trees, each up to depth 18. Bootstrapping and feature
# sub-sampling are random, so the forest is seeded to keep the scores repeatable.
emodel = sklearn.ensemble.RandomForestClassifier(
    n_estimators=40, max_depth=18, random_state=42
)
emodel.fit(train_x, train_y)
evaluate(emodel, train_x, train_y)  # training-split scores
evaluate(emodel, test_x, test_y)  # held-out scores

precision score: 1.0
recall score: 1.0
f1 score: 1.0
precision score: 1.0
recall score: 0.22727272727272727
f1 score: 0.37037037037037035
