In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
df = pd.read_csv("Processed_heart.csv")

In [3]:
df.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,0,0,140,289,0,0,172,0,0.0,0,0
1,49,1,1,160,180,0,0,156,0,1.0,1,1
2,37,0,0,130,283,0,1,98,0,0.0,0,0
3,48,1,2,138,214,0,0,108,1,1.5,1,1
4,54,0,1,150,195,0,0,122,0,0.0,0,0


In [4]:
X=df[['Age','Sex','ChestPainType','Cholesterol','MaxHR','RestingBP','FastingBS','RestingECG','ExerciseAngina', 'Oldpeak', 'ST_Slope']]
Y=df['HeartDisease']

In [5]:
from sklearn.preprocessing import StandardScaler
sc=StandardScaler()
X_std=sc.fit_transform(X)

In [6]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_std, Y, test_size=0.30, random_state=0)

# Decision Tree

In [7]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import KFold,cross_val_score,GridSearchCV

DT = DecisionTreeClassifier(random_state=0)

params={'max_depth':np.arange(2,10),'criterion':['entropy','gini']}
kf=KFold(n_splits=5,shuffle=True,random_state=0)
GS=GridSearchCV(DT,params,cv=kf,scoring='f1_weighted')
GS.fit(X_std,Y)

GridSearchCV(cv=KFold(n_splits=5, random_state=0, shuffle=True),
             estimator=DecisionTreeClassifier(random_state=0),
             param_grid={'criterion': ['entropy', 'gini'],
                         'max_depth': array([2, 3, 4, 5, 6, 7, 8, 9])},
             scoring='f1_weighted')

In [8]:
GS.best_estimator_

DecisionTreeClassifier(criterion='entropy', max_depth=5, random_state=0)

In [9]:
dt_reg=DecisionTreeClassifier(criterion='entropy', max_depth=5, random_state=0)
dt_reg.fit(X_train,y_train)

y_train_pred = dt_reg.predict(X_train)
y_test_pred = dt_reg.predict(X_test)

from sklearn import metrics

acc_train=metrics.accuracy_score(y_train,y_train_pred)
print("Train Accuracy : ",acc_train*100)
acc_test=metrics.accuracy_score(y_test,y_test_pred)
print("Test Accuracy : ",acc_test*100)

from sklearn.model_selection import KFold,cross_val_score,GridSearchCV

#validate model performance using K-fold Cross Validation using roc scoring
kf=KFold(n_splits=5,shuffle=True,random_state=0)
scores=cross_val_score(dt_reg,X_std,Y,cv=kf,scoring='f1_weighted')

print("Bias Error : ",np.mean(scores))

print("Varience : ",np.std(scores, ddof= 1))

Train Accuracy :  89.8753894080997
Test Accuracy :  81.15942028985508
Bias Error :  0.8440378571748874
Varience :  0.03962329984900019


# Random Forest with RandomSearchCV

In [10]:
n_estimators = range(3,200,5) # number of trees in the random forest
max_features = ['auto', 'sqrt'] # number of features in consideration at every split
max_depth = [int(x) for x in np.linspace(10, 120, num = 12)] # maximum number of levels allowed in each decision tree
min_samples_split = [2, 6, 10] # minimum sample number to split a node
min_samples_leaf = [1, 3, 4] # minimum sample number that can be stored in a leaf node
bootstrap = [True, False] # method used to sample data points

random_grid = {'n_estimators': n_estimators,

'max_features': max_features,

'max_depth': max_depth,

'min_samples_split': min_samples_split,

'min_samples_leaf': min_samples_leaf,

'bootstrap': bootstrap}

In [11]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
RF=RandomForestClassifier(random_state=0)
rf_random = RandomizedSearchCV(estimator = RF,param_distributions = random_grid,
               n_iter = 100, cv = 5, verbose=2, random_state=0, n_jobs = -1)

In [12]:
rf_random.fit(X_std, Y)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


RandomizedSearchCV(cv=5, estimator=RandomForestClassifier(random_state=0),
                   n_iter=100, n_jobs=-1,
                   param_distributions={'bootstrap': [True, False],
                                        'max_depth': [10, 20, 30, 40, 50, 60,
                                                      70, 80, 90, 100, 110,
                                                      120],
                                        'max_features': ['auto', 'sqrt'],
                                        'min_samples_leaf': [1, 3, 4],
                                        'min_samples_split': [2, 6, 10],
                                        'n_estimators': range(3, 200, 5)},
                   random_state=0, verbose=2)

In [13]:
print('Random grid: ', random_grid, '\n')
# print the best parameters
print('Best Parameters: ', rf_random.best_params_, ' \n')
print('Best Score: ', rf_random.best_score_, ' \n')
print('Best Estimatore : ', rf_random.best_estimator_ ,'\n')

Random grid:  {'n_estimators': range(3, 200, 5), 'max_features': ['auto', 'sqrt'], 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120], 'min_samples_split': [2, 6, 10], 'min_samples_leaf': [1, 3, 4], 'bootstrap': [True, False]} 

Best Parameters:  {'n_estimators': 113, 'min_samples_split': 10, 'min_samples_leaf': 4, 'max_features': 'sqrt', 'max_depth': 120, 'bootstrap': True}  

Best Score:  0.83976003801378  

Best Estimatore :  RandomForestClassifier(max_depth=120, max_features='sqrt', min_samples_leaf=4,
                       min_samples_split=10, n_estimators=113, random_state=0) 



In [14]:
rf_reg = RandomForestClassifier(max_depth=120, max_features='sqrt', min_samples_leaf=4,
                       min_samples_split=10, n_estimators=113, random_state=0) 

rf_reg.fit(X_train,y_train)

y_train_pred = rf_reg.predict(X_train)
y_test_pred = rf_reg.predict(X_test)


from sklearn import metrics

acc_train=metrics.accuracy_score(y_train,y_train_pred)
print("Train Accuracy : ",acc_train*100)
acc_test=metrics.accuracy_score(y_test,y_test_pred)
print("Test Accuracy : ",acc_test*100)

from sklearn.model_selection import KFold,cross_val_score,GridSearchCV

#validate model performance using K-fold Cross Validation using roc scoring
kf=KFold(n_splits=5,shuffle=True,random_state=0)
scores=cross_val_score(rf_reg,X_std,Y,cv=kf,scoring='f1_weighted')

print("Bias Error : ",np.mean(scores))

print("Varience : ",np.std(scores, ddof= 1))

Train Accuracy :  91.74454828660437
Test Accuracy :  85.14492753623189
Bias Error :  0.8731077556083943
Varience :  0.0186854168611879


# Random Forest with GridSearch CV

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
RF=RandomForestClassifier(random_state=0)
CV_rfc = GridSearchCV(estimator=RF, param_grid=random_grid, cv= 5)
CV_rfc.fit(X_std, Y)

In [None]:
print('Random grid: ', random_grid, '\n')
# print the best parameters
print('Best Parameters: ', CV_rfc.best_params_, ' \n')
print('Best Score: ', CV_rfc.best_score_, ' \n')
print('Best Estimatore : ', CV_rfc.best_estimator_ ,'\n')

In [None]:
rf_reg = RandomForestClassifier(max_depth=120, max_features='sqrt', min_samples_leaf=4,
                       min_samples_split=10, n_estimators=113, random_state=0) 

rf_reg.fit(X_train,y_train)

y_train_pred = rf_reg.predict(X_train)
y_test_pred = rf_reg.predict(X_test)


from sklearn import metrics

acc_train=metrics.accuracy_score(y_train,y_train_pred)
print("Train Accuracy : ",acc_train*100)
acc_test=metrics.accuracy_score(y_test,y_test_pred)
print("Test Accuracy : ",acc_test*100)

from sklearn.model_selection import KFold,cross_val_score,GridSearchCV

#validate model performance using K-fold Cross Validation using roc scoring
kf=KFold(n_splits=5,shuffle=True,random_state=0)
scores=cross_val_score(rf_reg,X_std,Y,cv=kf,scoring='f1_weighted')

print("Bias Error : ",np.mean(scores))

print("Varience : ",np.std(scores, ddof= 1))

# AdaBoosting with DecisionTree

In [65]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import AdaBoostClassifier
ada = AdaBoostClassifier(random_state=0)
params={'n_estimators':np.arange(10,250), 'learning_rate':[0.0001, 0.001, 0.01, 0.1, 1.0]}
kf=KFold(n_splits=5,shuffle=True,random_state=0)
GS=GridSearchCV(ada,params,cv=kf,scoring='f1_weighted')
GS.fit(X_std,Y)

GridSearchCV(cv=KFold(n_splits=5, random_state=0, shuffle=True),
             estimator=AdaBoostClassifier(random_state=0),
             param_grid={'n_estimators': array([ 2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
       19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35,
       36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52,
       53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69,
       70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86,
       87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99])},
             scoring='f1_weighted')

In [66]:
GS.best_score_

0.8657987879450927

# AdaBoosting with GridSearch Tunning

In [None]:
model = AdaBoostClassifier()
# define the grid of values to search
ada_grid = dict()
ada_grid['n_estimators'] = [10, 50, 100, 500]
ada_grid['learning_rate'] = [0.0001, 0.001, 0.01, 0.1, 1.0]
# define the evaluation procedure
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
# define the grid search procedure
ada_grid_search = GridSearchCV(estimator=model, param_grid=ada_grid, n_jobs=-1, cv=cv, scoring='accuracy')
# execute the grid search
ada_grid_search.fit(X, y)

In [None]:
print('Random grid: ', ada_grid, '\n')
# print the best parameters
print('Best Parameters: ', ada_grid_search.best_params_, ' \n')
print('Best Score: ', ada_grid_search.best_score_, ' \n')
print('Best Estimatore : ', ada_grid_search.best_estimator_ ,'\n')

ada_reg = RandomForestClassifier(max_depth=120, max_features='sqrt', min_samples_leaf=4,
                       min_samples_split=10, n_estimators=113, random_state=0) 

ada_reg.fit(X_train,y_train)

y_train_pred = ada_reg.predict(X_train)
y_test_pred = ada_reg.predict(X_test)


from sklearn import metrics

acc_train=metrics.accuracy_score(y_train,y_train_pred)
print("Train Accuracy : ",acc_train*100)
acc_test=metrics.accuracy_score(y_test,y_test_pred)
print("Test Accuracy : ",acc_test*100)

from sklearn.model_selection import KFold,cross_val_score,GridSearchCV

#validate model performance using K-fold Cross Validation using roc scoring
kf=KFold(n_splits=5,shuffle=True,random_state=0)
scores=cross_val_score(ada_reg,X_std,Y,cv=kf,scoring='f1_weighted')

print("Bias Error : ",np.mean(scores))

print("Varience : ",np.std(scores, ddof= 1))

# AdaBoosting with SVC

In [None]:
from sklearn.ensemble import AdaBoostClassifier
# Import Support Vector Classifier
from sklearn.svm import SVC
#Import scikit-learn metrics module for accuracy calculation
from sklearn import metrics
svc=SVC(probability=True, kernel='linear')

# Create adaboost classifer object
abc =AdaBoostClassifier(n_estimators=50, base_estimator=svc,learning_rate=1)
# Train Adaboost Classifer
abc.fit(X_train, y_train)

y_train_pred = abc.predict(X_train)
y_test_pred = abc.predict(X_test)

from sklearn import metrics

acc_train=metrics.accuracy_score(y_train,y_train_pred)
print("Train Accuracy : ",acc_train*100)
acc_test=metrics.accuracy_score(y_test,y_test_pred)
print("Test Accuracy : ",acc_test*100)

from sklearn.model_selection import KFold,cross_val_score,GridSearchCV

#validate model performance using K-fold Cross Validation using roc scoring
kf=KFold(n_splits=5,shuffle=True,random_state=0)
scores=cross_val_score(rf_reg,X_std,Y,cv=kf,scoring='f1_weighted')

print("Bias Error : ",np.mean(scores))

print("Varience : ",np.std(scores, ddof= 1))