In [43]:
# Setup
import numpy as np 
import pandas as pd 
import os
import pickle

# to make this notebook's output stable across runs
np.random.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# Where to save the figures
PROJECT_ROOT_DIR = "."
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images/ml_modeling_images")
os.makedirs(IMAGES_PATH, exist_ok=True)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    path = os.path.join(IMAGES_PATH, fig_id + "." + fig_extension)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format=fig_extension, dpi=resolution)

def save_model(model, folder_path="models", file_name="untitled_model.sav"):
    pickle.dump(model, open(os.path.join(folder_path, file_name), 'wb'))

HEARTDISEASE_PATH = "dataset/afterpreprocessing"

heartdisease_data = pd.read_csv(os.path.join(HEARTDISEASE_PATH, f'heartDisease.csv'))
heartdisease_data.head()

Unnamed: 0,heartDisease,BMI,smoking,alcoholDrinking,stroke,physicalHealth,mentalHealth,diffWalking,sex,ageCategory,race,diabetic,physicalActivity,genHealth,sleepTime,asthma,kidneyDisease,skinCancer
0,2.0,1660.0,1.0,1,2.0,3.0,30.0,2.0,2,8,1,1.0,1,2.0,5.0,1.0,2.0,1.0
1,2.0,2034.0,2.0,1,1.0,88.0,88.0,2.0,2,13,1,3.0,1,2.0,7.0,2.0,2.0,2.0
2,2.0,2658.0,1.0,1,2.0,20.0,30.0,2.0,1,10,1,1.0,1,4.0,8.0,1.0,2.0,2.0
3,2.0,2421.0,2.0,1,2.0,88.0,88.0,2.0,2,12,1,3.0,2,3.0,6.0,2.0,2.0,1.0
4,2.0,2657.0,1.0,9,2.0,15.0,10.0,2.0,2,10,2,1.0,1,4.0,6.0,2.0,2.0,2.0


In [44]:
from sklearn.model_selection import train_test_split

features = heartdisease_data.drop(columns =['heartDisease'], axis = 1)

target = heartdisease_data['heartDisease']

x_train, x_test, y_train, y_test = train_test_split(features, target, shuffle = True, test_size = 0.3, random_state = 44)

x_train, y_train

(           BMI  smoking  alcoholDrinking  stroke  physicalHealth   
 293404  3515.0      1.0                1     2.0             2.0  \
 263990  3712.0      2.0                1     2.0            88.0   
 251372  3193.0      1.0                1     2.0            88.0   
 156314  1958.0      2.0                1     2.0             3.0   
 324366  3328.0      2.0                1     2.0             1.0   
 ...        ...      ...              ...     ...             ...   
 49723   4288.0      2.0                1     2.0             1.0   
 156845  2658.0      1.0                1     2.0            88.0   
 256753  3109.0      2.0                1     2.0            88.0   
 200099  1967.0      2.0                1     2.0             1.0   
 14100   3087.0      2.0                1     2.0             3.0   
 
         mentalHealth  diffWalking  sex  ageCategory  race  diabetic   
 293404          88.0          2.0    1           11     1       3.0  \
 263990           1.0     

In [45]:
# Load models
MODELS_PATH = os.path.join(PROJECT_ROOT_DIR, "models")

DECISION_TREE_PATH = os.path.join(MODELS_PATH, 'decision_tree')
KNN_PATH = os.path.join(MODELS_PATH, 'knn')
LOGREG_PATH = os.path.join(MODELS_PATH, 'log_regression')
KMEANS_PATH = os.path.join(MODELS_PATH, 'kmeans')
RANDOMFOREST_PATH = os.path.join(MODELS_PATH, 'randomforest')

full_decisiontree_model = pickle.load(open(os.path.join(DECISION_TREE_PATH, "full_decision_tree_model.sav"), 'rb'))
reduced_decisiontree_model = pickle.load(open(os.path.join(DECISION_TREE_PATH, "reduced_decision_tree_model.sav"), 'rb'))
reduced_kmeans_model = pickle.load(open(os.path.join(KMEANS_PATH, "reduced_kmeans_model.sav"), 'rb'))
full_knn_model = pickle.load(open(os.path.join(KNN_PATH, "full_knn_model.sav"), 'rb'))
full_logregression_model = pickle.load(open(os.path.join(LOGREG_PATH, "full_logregression_model.sav"), 'rb'))
reduced_randomforest_model = pickle.load(open(os.path.join(RANDOMFOREST_PATH, "reduced_randomforest_model.sav"), 'rb'))

full_models_names = ["Full Decision Tree", "Full KNN", "Full Logistic Regression"]
reduced_models_names = ["Reduced Decision Tree", "Reduced KMeans", "Reduced Random Forest"]
full_models = [full_decisiontree_model, full_knn_model, full_logregression_model]
reduced_models = [reduced_decisiontree_model, reduced_kmeans_model, reduced_randomforest_model]

In [51]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_curve, roc_auc_score, confusion_matrix, mean_absolute_error

# Evaluate model
def evaluate_model(model, x_test, y_test):
    y_prediction = model.predict(x_test)
    print(f"Number of Positive Predictions (Heart Disease): {np.sum(y_prediction == 1.0)}\nNumber of Negative Predictions (No Heart Disease): {np.sum(y_prediction == 2.0)}")
    # Calc accuracy, precision, recall, f1-score
    accuracy = accuracy_score(y_test, y_prediction)
    precision = precision_score(y_test, y_prediction)
    recall = recall_score(y_test, y_prediction)
    f1 = f1_score(y_test, y_prediction)

    # Calc AUC
    y_prediction_prob = model.predict_proba(x_test)[::, 1]
    fpr, tpr, _ = roc_curve(y_test, y_prediction_prob, pos_label={1.0, 2.0})
    auc = roc_auc_score(y_test, y_prediction_prob)

    # Confusion Matrix
    matrix = confusion_matrix(y_test, y_prediction)

    # Mean Absolute Error
    mae = mean_absolute_error(y_test, y_prediction)
    return {"Accuracy" : accuracy, "Precision" : precision, "Recall" : recall, "F1" : f1, 
            "False Positive Rate" : fpr, "True Postitive Rate" : tpr, "Area Under Curve" : auc,
            "MAE" : mae}, matrix


In [52]:
for i in range(len(full_models)):
    print(full_models_names[i], ":\n")
    metrics, matrix= evaluate_model(full_models[i], x_test, y_test)
    metrics_df = pd.DataFrame(metrics.items())
    print(metrics_df, "\n", matrix, "\n")

Full Decision Tree :

Number of Positive Predictions (Heart Disease): 0
Number of Negative Predictions (No Heart Disease): 105660
                     0                                                  1
0             Accuracy                                           0.911651
1            Precision                                                0.0
2               Recall                                                0.0
3                   F1                                                0.0
4  False Positive Rate  [0.0, 0.5555555555555556, 0.6026026878667424, ...
5  True Postitive Rate                          [nan, nan, nan, nan, nan]
6     Area Under Curve                                           0.752767
7                  MAE                                           0.088349 
 [[    0  9335]
 [    0 96325]] 

Full KNN :



  _warn_prf(average, modifier, msg_start, len(result))


Number of Positive Predictions (Heart Disease): 1880
Number of Negative Predictions (No Heart Disease): 103780




                     0                                                  1
0             Accuracy                                           0.903171
1            Precision                                           0.261702
2               Recall                                           0.052705
3                   F1                                            0.08774
4  False Positive Rate  [0.0, 0.7068616316486844, 0.9168654173764906, ...
5  True Postitive Rate                [nan, nan, nan, nan, nan, nan, nan]
6     Area Under Curve                                           0.651885
7                  MAE                                           0.096829 
 [[  492  8843]
 [ 1388 94937]] 

Full Logistic Regression :

Number of Positive Predictions (Heart Disease): 1481
Number of Negative Predictions (No Heart Disease): 104179
                     0                                                  1
0             Accuracy                                           0.911963
1           