# Import dependencies

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import  train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

import the models

In [2]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

Loading the dataset

In [3]:
heart_failure_data = pd.read_csv('E:\dev\ML\IBM\Data science\Data\Predicting heart failure\patientdata.csv')
heart_failure_data

Unnamed: 0,AVGHEARTBEATSPERMIN,PALPITATIONSPERDAY,CHOLESTEROL,BMI,AGE,SEX,FAMILYHISTORY,SMOKERLAST5YRS,EXERCISEMINPERWEEK,HEARTFAILURE
0,93,22,163,25,49,F,N,N,110,N
1,108,22,181,24,32,F,N,N,192,N
2,86,0,239,20,60,F,N,N,121,N
3,80,36,164,31,45,F,Y,N,141,Y
4,66,36,185,23,39,F,N,N,63,N
...,...,...,...,...,...,...,...,...,...,...
10795,122,27,203,30,31,F,N,N,246,N
10796,67,15,186,25,54,F,N,N,189,N
10797,103,6,211,20,40,F,N,N,205,N
10798,99,30,152,27,32,M,N,N,211,N


In [4]:
heart_failure_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10800 entries, 0 to 10799
Data columns (total 10 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   AVGHEARTBEATSPERMIN  10800 non-null  int64 
 1   PALPITATIONSPERDAY   10800 non-null  int64 
 2   CHOLESTEROL          10800 non-null  int64 
 3   BMI                  10800 non-null  int64 
 4   AGE                  10800 non-null  int64 
 5   SEX                  10800 non-null  object
 6   FAMILYHISTORY        10800 non-null  object
 7   SMOKERLAST5YRS       10800 non-null  object
 8   EXERCISEMINPERWEEK   10800 non-null  int64 
 9   HEARTFAILURE         10800 non-null  object
dtypes: int64(6), object(4)
memory usage: 843.9+ KB


In [5]:
heart_failure_data['HEARTFAILURE'].value_counts()

N    9012
Y    1788
Name: HEARTFAILURE, dtype: int64

In [6]:
heart_failure_data['SEX'].replace({'M':1, 'F':0}, inplace=True)
heart_failure_data['FAMILYHISTORY'].replace({'Y':1, 'N':0}, inplace=True)
heart_failure_data['SMOKERLAST5YRS'].replace({'Y':1, 'N':0}, inplace=True)                                        
heart_failure_data.head()

Unnamed: 0,AVGHEARTBEATSPERMIN,PALPITATIONSPERDAY,CHOLESTEROL,BMI,AGE,SEX,FAMILYHISTORY,SMOKERLAST5YRS,EXERCISEMINPERWEEK,HEARTFAILURE
0,93,22,163,25,49,0,0,0,110,N
1,108,22,181,24,32,0,0,0,192,N
2,86,0,239,20,60,0,0,0,121,N
3,80,36,164,31,45,0,1,0,141,Y
4,66,36,185,23,39,0,0,0,63,N


In [7]:
X = heart_failure_data.drop(columns = 'HEARTFAILURE', axis = 1)
Y = heart_failure_data['HEARTFAILURE']

In [8]:
#convert X and Y into np array
X = np.asarray(X)
Y = np.asarray(Y)

In [9]:
X

array([[ 93,  22, 163, ...,   0,   0, 110],
       [108,  22, 181, ...,   0,   0, 192],
       [ 86,   0, 239, ...,   0,   0, 121],
       ...,
       [103,   6, 211, ...,   0,   0, 205],
       [ 99,  30, 152, ...,   0,   0, 211],
       [ 50,  31, 232, ...,   0,   0,  54]], dtype=int64)

In [10]:
Y

array(['N', 'N', 'N', ..., 'N', 'N', 'N'], dtype=object)

In [11]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=3)

In [12]:
models = [LogisticRegression(max_iter=1000),
         SVC(kernel = 'linear'),
         KNeighborsClassifier(),
         RandomForestClassifier()
         ]

In [13]:
def compare_models_with_train_test_split():
    for model in models:
        model.fit(X_train, Y_train)
        test_data_predict = model.predict(X_test)
        accuracy = accuracy_score(Y_test, test_data_predict)
        print('Accuracy score of ', model , '=', accuracy)

In [14]:
compare_models_with_train_test_split()

Accuracy score of  LogisticRegression(max_iter=1000) = 0.8583333333333333
Accuracy score of  SVC(kernel='linear') = 0.8388888888888889
Accuracy score of  KNeighborsClassifier() = 0.8296296296296296
Accuracy score of  RandomForestClassifier() = 0.8768518518518519


In [15]:
def compare_models_with_cross_validation():
    for model in models:
        cv_score = cross_val_score(model, X, Y, cv=5)
        mean_accuracy = sum(cv_score)/ len(cv_score)
        mean_accuracy = mean_accuracy * 100
        mean_accuracy = round(mean_accuracy, 2)
        print('Accuracy for', model, '=', cv_score)
        print('Mean % accuracy for ', model, '=', mean_accuracy)
        print('---------------------------------------------------------')

In [16]:
compare_models_with_cross_validation()

Accuracy for LogisticRegression(max_iter=1000) = [0.77962963 0.88611111 0.8787037  0.87268519 0.85046296]
Mean % accuracy for  LogisticRegression(max_iter=1000) = 85.35
---------------------------------------------------------
Accuracy for SVC(kernel='linear') = [0.77777778 0.83472222 0.83425926 0.83425926 0.83425926]
Mean % accuracy for  SVC(kernel='linear') = 82.31
---------------------------------------------------------
Accuracy for KNeighborsClassifier() = [0.79953704 0.83888889 0.83194444 0.83194444 0.83287037]
Mean % accuracy for  KNeighborsClassifier() = 82.7
---------------------------------------------------------
Accuracy for RandomForestClassifier() = [0.7625     0.90972222 0.9        0.89675926 0.86574074]
Mean % accuracy for  RandomForestClassifier() = 86.69
---------------------------------------------------------
