In [1]:
import numpy as np
import os
import random
import pandas as pd
from sklearn.metrics import accuracy_score
from joblib import load
from sklearn.metrics import classification_report , matthews_corrcoef

In [2]:
# Fix both global random number generators (Python's stdlib generator and
# NumPy's legacy global generator) to the same seed so that repeated runs of
# the notebook start from the same random state.
random.seed(a=42)
np.random.seed(seed=42)

In [3]:
# Column names for the header-less data file, in file order; `target` is the
# label column and the remaining thirteen columns are the model inputs.
all_columns = ['age','sex','cp','trestbps','chol','fbs','restecg','thalach','exang','oldpeak','slope','ca','thal','target']

# The file marks missing values with '?'. Parsing that marker as NaN at read
# time keeps every column numeric; filtering on the literal string after the
# fact would leave the affected columns typed as strings.
df = pd.read_csv(
    'processed.cleveland.data',
    header=None,
    names=all_columns,
    na_values='?',
)

# Drop every row that has at least one missing value.
df = df.dropna()

# Ground-truth labels and the list of feature columns used by the cells below.
true_labels = df['target'].values
input_columns = df.columns.drop('target')

# Raw

In [4]:
# Persisted classifiers of the "raw" variant, one per stage key. Each file
# name is derived from its key: raw_models/model_<key>.pkl
raw_models = {
    stage: load(f"raw_models/model_{stage}.pkl")
    for stage in ("binary", "14_23", "1_4", "2_3")
}

# Matching persisted scalers, stored as raw_scaler/scaler_<key>.pkl
raw_scalers = {
    stage: load(f"raw_scaler/scaler_{stage}.pkl")
    for stage in ("binary", "14_23", "1_4", "2_3")
}

# Feature columns that each stage's model receives after scaling.
raw_features_dict = {
    "binary": [
        "thal", "exang", "thalach", "ca", "chol", "slope",
        "fbs", "sex", "restecg", "age", "oldpeak",
    ],
    "1_4": [
        "sex", "slope", "fbs", "thalach", "trestbps",
        "restecg", "thal", "cp", "oldpeak", "exang",
    ],
    "2_3": ["trestbps", "slope", "oldpeak"],
    "14_23": [
        "trestbps", "slope", "restecg", "sex",
        "fbs", "thal", "age", "oldpeak",
    ],
}



# Smote

In [5]:
# Persisted classifiers of the "smote" variant, one per stage key. Each file
# name is derived from its key: smote_models/model_<key>.pkl
smote_models = {
    stage: load(f"smote_models/model_{stage}.pkl")
    for stage in ("binary", "14_23", "1_4", "2_3")
}

# Matching persisted scalers, stored as smote_scaler/scaler_<key>.pkl
smote_scalers = {
    stage: load(f"smote_scaler/scaler_{stage}.pkl")
    for stage in ("binary", "14_23", "1_4", "2_3")
}

# Feature columns that each stage's model receives after scaling.
smote_features_dict = {
    "binary": [
        "fbs", "slope", "ca", "restecg",
        "exang", "sex", "thalach", "chol",
    ],
    "1_4": [
        "chol", "fbs", "thal", "restecg", "ca", "sex", "exang",
        "age", "oldpeak", "trestbps", "thalach", "cp", "slope",
    ],
    "2_3": [
        "oldpeak", "thal", "cp", "fbs", "chol", "age",
        "ca", "sex", "slope", "trestbps", "restecg",
    ],
    "14_23": [
        "ca", "cp", "slope", "fbs",
        "age", "restecg", "trestbps",
    ],
}

# Adasyn

In [6]:
# Persisted classifiers of the "adasyn" variant, one per stage key. Each file
# name is derived from its key: adasyn_models/model_<key>.pkl
adasyn_models = {
    stage: load(f"adasyn_models/model_{stage}.pkl")
    for stage in ("binary", "14_23", "1_4", "2_3")
}

# Matching persisted scalers, stored as adasyn_scaler/scaler_<key>.pkl
adasyn_scalers = {
    stage: load(f"adasyn_scaler/scaler_{stage}.pkl")
    for stage in ("binary", "14_23", "1_4", "2_3")
}

# Feature columns that each stage's model receives after scaling.
adasyn_features_dict = {
    "binary": [
        "slope", "thalach", "chol", "ca", "restecg",
        "fbs", "sex", "age", "exang",
    ],
    "1_4": [
        "slope", "oldpeak", "restecg", "age",
        "trestbps", "cp", "fbs",
    ],
    "2_3": [
        "oldpeak", "sex", "fbs", "slope",
        "restecg", "trestbps", "age",
    ],
    "14_23": [
        "ca", "chol", "oldpeak", "trestbps", "thal", "fbs",
        "restecg", "age", "cp", "exang", "slope",
    ],
}

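Note: the `.pkl` model and scaler files in this notebook are loaded with `joblib.load`. See the scikit-learn documentation on the security and maintainability limitations of persisted models: https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations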


# Smote Tomek

In [7]:
# Persisted classifiers of the "smote_tomek" variant, one per stage key. Each
# file name is derived from its key: smote_tomek_models/model_<key>.pkl
smote_tomek_models = {
    stage: load(f"smote_tomek_models/model_{stage}.pkl")
    for stage in ("binary", "14_23", "1_4", "2_3")
}

# Matching persisted scalers, stored as smote_tomek_scaler/scaler_<key>.pkl
smote_tomek_scalers = {
    stage: load(f"smote_tomek_scaler/scaler_{stage}.pkl")
    for stage in ("binary", "14_23", "1_4", "2_3")
}

# Feature columns that each stage's model receives after scaling.
smote_tomek_features_dict = {
    "binary": [
        "thalach", "sex", "fbs",
        "exang", "restecg", "oldpeak",
    ],
    "1_4": [
        "trestbps", "cp", "ca", "oldpeak",
        "chol", "slope", "thal", "exang",
    ],
    "2_3": ["exang", "restecg", "trestbps", "slope"],
    "14_23": [
        "chol", "cp", "exang", "thal", "age", "oldpeak",
        "fbs", "slope", "sex", "ca", "trestbps", "thalach",
    ],
}



# Kmeans Smote

In [8]:
# Persisted classifiers of the "kmeans_smote" variant, one per stage key. Each
# file name is derived from its key: kmeans_smote_models/model_<key>.pkl
kmeans_smote_models = {
    stage: load(f"kmeans_smote_models/model_{stage}.pkl")
    for stage in ("binary", "14_23", "1_4", "2_3")
}

# Matching persisted scalers, stored as kmeans_smote_scaler/scaler_<key>.pkl
kmeans_smote_scalers = {
    stage: load(f"kmeans_smote_scaler/scaler_{stage}.pkl")
    for stage in ("binary", "14_23", "1_4", "2_3")
}

# Feature columns that each stage's model receives after scaling.
kmeans_smote_features_dict = {
    "binary": [
        "age", "ca", "sex", "exang",
        "thalach", "restecg", "chol", "thal",
    ],
    "1_4": [
        "oldpeak", "fbs", "age", "thalach", "restecg",
        "trestbps", "exang", "ca", "cp", "chol",
    ],
    "2_3": ["slope", "restecg", "fbs", "trestbps"],
    "14_23": [
        "ca", "cp", "fbs", "chol", "restecg", "thal", "oldpeak",
        "trestbps", "exang", "slope", "age", "thalach", "sex",
    ],
}



In [9]:
def _hierarchical_predictions(df, models, scalers, features_dict, input_columns, heuristics):
    """Return an array with one predicted class (0, 1, 2, 3 or 4) per row of ``df``.

    Every stage scales the full ``input_columns`` matrix with its own scaler,
    selects that stage's feature subset and compares the two weighted class
    probabilities. All rows are pushed through each stage in a single batch
    call; the routing between stages is applied afterwards.
    """
    features = df[input_columns]

    def first_class_wins(stage, weight_first, weight_second):
        # Boolean mask: True where the stage's first class beats its second
        # class once both probabilities are multiplied by their weights.
        scaled = pd.DataFrame(scalers[stage].transform(features), columns=input_columns)
        proba = np.asarray(models[stage].predict_proba(scaled[features_dict[stage]]))
        return proba[:, 0] * weight_first > proba[:, 1] * weight_second

    is_class_0 = first_class_wins("binary", heuristics[0], heuristics[1])
    is_group_14 = first_class_wins("14_23", heuristics[2], heuristics[3])
    is_class_1 = first_class_wins("1_4", heuristics[4], heuristics[5])
    is_class_2 = first_class_wins("2_3", heuristics[6], heuristics[7])

    # Same decision tree as a per-row cascade: binary -> (14 vs 23) -> leaf.
    # Ties fall to the second class at every stage (strict ">").
    return np.where(
        is_class_0,
        0,
        np.where(
            is_group_14,
            np.where(is_class_1, 1, 4),
            np.where(is_class_2, 2, 3),
        ),
    )


def predict_all(df, models, scalers, features_dict, input_columns, heuristics=None, y_true=None):
    """Run the four-stage cascade on ``df`` and print a classification report.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows to classify. Must contain ``input_columns`` and, unless
        ``y_true`` is given, a ``target`` column with the true labels.
    models, scalers, features_dict : dict
        Mappings keyed by "binary", "14_23", "1_4" and "2_3" holding the
        stage's classifier (``predict_proba``), scaler (``transform``) and
        list of feature column names.
    input_columns : sequence of str
        Columns passed to every scaler, in the order the scalers expect.
    heuristics : sequence of 8 numbers, optional
        Weights multiplied into the two class probabilities of each stage, in
        the order binary, 14_23, 1_4, 2_3. Defaults to all ones (unweighted).
    y_true : array-like, optional
        True labels aligned with the rows of ``df``. Defaults to
        ``df["target"]`` so that the labels always match the frame that was
        actually passed in, rather than a notebook-level variable.

    Raises
    ------
    ValueError
        If ``heuristics`` has fewer than eight entries.

    Returns
    -------
    None
        The report is printed; nothing is returned.
    """
    if heuristics is None:
        # Built per call instead of using a shared mutable default list.
        heuristics = (1,) * 8
    elif len(heuristics) < 8:
        raise ValueError(
            f"heuristics needs 8 weights (two per stage), got {len(heuristics)}"
        )

    if y_true is None:
        y_true = df["target"].to_numpy()

    final_predictions = _hierarchical_predictions(
        df, models, scalers, features_dict, input_columns, heuristics
    )

    report = classification_report(y_true, final_predictions, digits=6)
    print(report)

In [10]:
# Weight vectors for predict_all: eight numbers, two per stage, multiplied into
# that stage's two class probabilities.
# NOTE(review): the names suggest these are precision / recall / F1 scores of
# the SMOTE stage models - confirm where the values come from.
heuristics_precision = [0.89, 0.96, 0.87, 0.92, 1.0, 1.0, 0.86, 0.86]
heuristics_recall = [0.97, 0.86, 0.93, 0.86, 1.0, 1.0, 0.86, 0.86]
heuristics_f1 = [0.93, 0.91, 0.90, 0.89, 1.0, 1.0, 0.86, 0.86]

# Element-wise product of the three lists above, written out to six decimals.
heuristics_multiply = [0.802869, 0.751296, 0.728190, 0.704168, 1.0, 1.0, 0.636056, 0.636056]

# (title, extra positional arguments) for each evaluation; the "plain" run
# passes no weights so predict_all falls back to its default.
smote_runs = (
    ("plain", ()),
    ("heuristics precision", (heuristics_precision,)),
    ("heuristics recall", (heuristics_recall,)),
    ("heuristics f1", (heuristics_f1,)),
    ("heuristics all", (heuristics_multiply,)),
)

for run_index, (run_title, run_weights) in enumerate(smote_runs):
    if run_index > 0:
        # Separator between consecutive reports (not printed after the last one).
        print("<=========================================================================>")
    print(run_title)
    predict_all(df, smote_models, smote_scalers, smote_features_dict, input_columns, *run_weights)


plain
              precision    recall  f1-score   support

           0   0.852071  0.900000  0.875380       160
           1   0.588235  0.555556  0.571429        54
           2   0.625000  0.571429  0.597015        35
           3   0.677419  0.600000  0.636364        35
           4   0.642857  0.692308  0.666667        13

    accuracy                       0.754209       297
   macro avg   0.677117  0.663858  0.669371       297
weighted avg   0.747602  0.754209  0.750010       297

heuristics precision
              precision    recall  f1-score   support

           0   0.852071  0.900000  0.875380       160
           1   0.617021  0.537037  0.574257        54
           2   0.600000  0.600000  0.600000        35
           3   0.687500  0.628571  0.656716        35
           4   0.642857  0.692308  0.666667        13

    accuracy                       0.757576       297
   macro avg   0.679890  0.671583  0.674604       297
weighted avg   0.751078  0.757576  0.753274       

In [11]:
# Weight vectors for predict_all: eight numbers, two per stage, multiplied into
# that stage's two class probabilities.
# NOTE(review): the names suggest these are precision / recall / F1 scores of
# the ADASYN stage models - confirm where the values come from.
heuristics_precision = [0.937500, 0.928571, 0.812500, 0.916667, 1.0, 1.0, 0.857143, 0.857143]
heuristics_recall = [0.937500, 0.928571, 0.928571, 0.785714, 1.0, 1.0, 0.857143, 0.857143]
heuristics_f1 = [0.937500, 0.928571, 0.866667, 0.846154, 1.0, 1.0, 0.857143, 0.857143]

# Element-wise product of the three lists above, written out to six decimals.
heuristics_multiply = [0.823975, 0.800655, 0.653869, 0.609432, 1.0, 1.0, 0.629738, 0.629738]

# (title, extra positional arguments) for each evaluation; the "plain" run
# passes no weights so predict_all falls back to its default.
adasyn_runs = (
    ("plain", ()),
    ("heuristics precision", (heuristics_precision,)),
    ("heuristics recall", (heuristics_recall,)),
    ("heuristics f1", (heuristics_f1,)),
    ("heuristics all", (heuristics_multiply,)),
)

for run_index, (run_title, run_weights) in enumerate(adasyn_runs):
    if run_index > 0:
        # Separator between consecutive reports (not printed after the last one).
        print("<=========================================================================>")
    print(run_title)
    predict_all(df, adasyn_models, adasyn_scalers, adasyn_features_dict, input_columns, *run_weights)


plain
              precision    recall  f1-score   support

           0   0.857143  0.975000  0.912281       160
           1   0.789474  0.277778  0.410959        54
           2   0.571429  0.685714  0.623377        35
           3   0.608696  0.800000  0.691358        35
           4   0.625000  0.384615  0.476190        13

    accuracy                       0.767677       297
   macro avg   0.690348  0.624621  0.622833       297
weighted avg   0.771730  0.767677  0.741963       297

heuristics precision
              precision    recall  f1-score   support

           0   0.857143  0.975000  0.912281       160
           1   0.833333  0.277778  0.416667        54
           2   0.581395  0.714286  0.641026        35
           3   0.583333  0.800000  0.674699        35
           4   0.500000  0.230769  0.315789        13

    accuracy                       0.764310       297
   macro avg   0.671041  0.599567  0.592092       297
weighted avg   0.772419  0.764310  0.736096       

In [12]:
# Weight vectors for predict_all: eight numbers, two per stage, multiplied into
# that stage's two class probabilities.
# NOTE(review): the names suggest these are precision / recall / F1 scores of
# the SMOTE-Tomek stage models - confirm where the values come from.
heuristics_precision = [0.909091, 0.925926, 0.923077, 0.866667, 1.0, 1.0, 1.0, 0.777778]
heuristics_recall = [0.937500, 0.892857, 0.857143, 0.928571, 1.0, 1.0, 0.714286, 1.0]
heuristics_f1 = [0.923077, 0.909091, 0.888889, 0.896552, 1.0, 1.0, 0.833333, 0.875000]

# Element-wise product of the three lists above, written out to six decimals.
heuristics_multiply = [0.786713, 0.751563, 0.703297, 0.721511, 1.0, 1.0, 0.595238, 0.680556]

# (title, extra positional arguments) for each evaluation; the "plain" run
# passes no weights so predict_all falls back to its default.
smote_tomek_runs = (
    ("plain", ()),
    ("heuristics precision", (heuristics_precision,)),
    ("heuristics recall", (heuristics_recall,)),
    ("heuristics f1", (heuristics_f1,)),
    ("heuristics all", (heuristics_multiply,)),
)

for run_index, (run_title, run_weights) in enumerate(smote_tomek_runs):
    if run_index > 0:
        # Separator between consecutive reports (not printed after the last one).
        print("<=========================================================================>")
    print(run_title)
    predict_all(df, smote_tomek_models, smote_tomek_scalers, smote_tomek_features_dict, input_columns, *run_weights)


plain
              precision    recall  f1-score   support

           0   0.824561  0.881250  0.851964       160
           1   0.630435  0.537037  0.580000        54
           2   0.758621  0.628571  0.687500        35
           3   0.641026  0.714286  0.675676        35
           4   0.916667  0.846154  0.880000        13

    accuracy                       0.767677       297
   macro avg   0.754262  0.721460  0.735028       297
weighted avg   0.763898  0.767677  0.763587       297

heuristics precision
              precision    recall  f1-score   support

           0   0.829412  0.881250  0.854545       160
           1   0.645833  0.574074  0.607843        54
           2   0.685714  0.685714  0.685714        35
           3   0.677419  0.600000  0.636364        35
           4   0.846154  0.846154  0.846154        13

    accuracy                       0.767677       297
   macro avg   0.736907  0.717438  0.726124       297
weighted avg   0.761921  0.767677  0.763716       

In [13]:
# Weight vectors for predict_all: eight numbers, two per stage, multiplied into
# that stage's two class probabilities.
# NOTE(review): the names suggest these are precision / recall / F1 scores of
# the KMeans-SMOTE stage models - confirm where the values come from.
heuristics_precision = [0.885714, 0.960000, 0.916667, 0.812500, 1.0, 1.0, 1.0, 0.777778]
heuristics_recall = [0.968750, 0.857143, 0.785714, 0.928571, 1.0, 1.0, 0.714286, 1.0]
heuristics_f1 = [0.925373, 0.905660, 0.846154, 0.866667, 1.0, 1.0, 0.833333, 0.875000]

# Element-wise product of the three lists above, written out to six decimals.
heuristics_multiply = [0.794003, 0.745229, 0.609432, 0.653869, 1.0, 1.0, 0.595238, 0.680556]

# (title, extra positional arguments) for each evaluation; the "plain" run
# passes no weights so predict_all falls back to its default.
kmeans_smote_runs = (
    ("plain", ()),
    ("heuristics precision", (heuristics_precision,)),
    ("heuristics recall", (heuristics_recall,)),
    ("heuristics f1", (heuristics_f1,)),
    ("heuristics all", (heuristics_multiply,)),
)

for run_index, (run_title, run_weights) in enumerate(kmeans_smote_runs):
    if run_index > 0:
        # Separator between consecutive reports (not printed after the last one).
        print("<=========================================================================>")
    print(run_title)
    predict_all(df, kmeans_smote_models, kmeans_smote_scalers, kmeans_smote_features_dict, input_columns, *run_weights)


plain
              precision    recall  f1-score   support

           0   0.868263  0.906250  0.886850       160
           1   0.520833  0.462963  0.490196        54
           2   0.475000  0.542857  0.506667        35
           3   0.500000  0.485714  0.492754        35
           4   0.875000  0.538462  0.666667        13

    accuracy                       0.717172       297
   macro avg   0.647819  0.587249  0.608627       297
weighted avg   0.715647  0.717172  0.713848       297

heuristics precision
              precision    recall  f1-score   support

           0   0.888889  0.900000  0.894410       160
           1   0.476190  0.555556  0.512821        54
           2   0.500000  0.571429  0.533333        35
           3   0.500000  0.342857  0.406780        35
           4   0.875000  0.538462  0.666667        13

    accuracy                       0.717172       297
   macro avg   0.648016  0.581661  0.602802       297
weighted avg   0.721588  0.717172  0.715046       

In [14]:
# Weight vectors for predict_all: eight numbers, two per stage, multiplied into
# that stage's two class probabilities.
# NOTE(review): the names suggest these are precision / recall / F1 scores of
# the raw stage models - confirm where the values come from.
heuristics_precision = [0.86, 0.960000, 0.86, 0.86, 0.92, 1.0, 0.78, 1.0]
heuristics_recall = [0.97, 0.82, 0.86, 0.96, 1.0, 0.67, 1.0, 0.71]
heuristics_f1 = [0.91, 0.88, 0.86, 0.86, 0.96, 0.8, 0.88, 0.83]

# Element-wise product f1 * recall * precision of the three lists above, kept
# at full float precision.
heuristics_multiply = [
    0.7591220000000001,
    0.6927359999999999,
    0.636056,
    0.710016,
    0.8832,
    0.536,
    0.6864,
    0.5892999999999999,
]

# (title, extra positional arguments) for each evaluation; the "plain" run
# passes no weights so predict_all falls back to its default.
raw_runs = (
    ("plain", ()),
    ("heuristics precision", (heuristics_precision,)),
    ("heuristics recall", (heuristics_recall,)),
    ("heuristics f1", (heuristics_f1,)),
    ("heuristics all", (heuristics_multiply,)),
)

for run_index, (run_title, run_weights) in enumerate(raw_runs):
    if run_index > 0:
        # Separator between consecutive reports (not printed after the last one).
        print("<=========================================================================>")
    print(run_title)
    predict_all(df, raw_models, raw_scalers, raw_features_dict, input_columns, *run_weights)


plain
              precision    recall  f1-score   support

           0   0.894737  0.956250  0.924471       160
           1   0.564103  0.407407  0.473118        54
           2   0.406250  0.371429  0.388060        35
           3   0.400000  0.457143  0.426667        35
           4   0.333333  0.384615  0.357143        13

    accuracy                       0.703704       297
   macro avg   0.519685  0.515369  0.513892       297
weighted avg   0.694180  0.703704  0.695697       297

heuristics precision
              precision    recall  f1-score   support

           0   0.898204  0.937500  0.917431       160
           1   0.560976  0.425926  0.484211        54
           2   0.384615  0.285714  0.327869        35
           3   0.382979  0.514286  0.439024        35
           4   0.312500  0.384615  0.344828        13

    accuracy                       0.693603       297
   macro avg   0.507855  0.509608  0.502673       297
weighted avg   0.690012  0.693603  0.687745       

In [15]:
# Element-wise product f1 * recall * precision of the heuristic lists that are
# currently defined in the kernel (i.e. those of the most recently run cell).
result = list(
    map(
        lambda f1_w, recall_w, precision_w: f1_w * recall_w * precision_w,
        heuristics_f1,
        heuristics_recall,
        heuristics_precision,
    )
)
print(result)

[0.7591220000000001, 0.6927359999999999, 0.636056, 0.710016, 0.8832, 0.536, 0.6864, 0.5892999999999999]
