In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
df = pd.read_csv("Processed_heart.csv")

In [3]:
df.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,0,0,140,289,0,0,172,0,0.0,0,0
1,49,1,1,160,180,0,0,156,0,1.0,1,1
2,37,0,0,130,283,0,1,98,0,0.0,0,0
3,48,1,2,138,214,0,0,108,1,1.5,1,1
4,54,0,1,150,195,0,0,122,0,0.0,0,0


In [4]:
import sklearn

In [5]:
def remove_outliner(df,col):
    Q1 = np.percentile(df[col], 25,
                           interpolation = 'midpoint')

    Q3 = np.percentile(df[col], 75,
                           interpolation = 'midpoint')
    IQR = Q3 - Q1

    print(f"{col} Old Shape: ", df.shape)

    # Upper bound
    upper = np.where(df[col] >= (Q3+1.5*IQR))
    # Lower bound
    lower = np.where(df[col] <= (Q1-1.5*IQR))

    ''' Removing the Outliers '''
    df.drop(upper[0], inplace = True)
    df.drop(lower[0], inplace = True)

    print(f"{col} New Shape: ", df.shape)
    
    return df

In [6]:
df = remove_outliner(df,'Cholesterol')

Cholesterol Old Shape:  (918, 12)
Cholesterol New Shape:  (735, 12)


In [7]:
df = df.reset_index(drop=True)
df = remove_outliner(df,'RestingBP')

RestingBP Old Shape:  (735, 12)
RestingBP New Shape:  (703, 12)


In [8]:
df = df.reset_index(drop=True)
df = remove_outliner(df,'MaxHR')

MaxHR Old Shape:  (703, 12)
MaxHR New Shape:  (703, 12)


In [9]:
df.to_csv("process_outlier_data.csv",header=True,index=False)

In [10]:
df = pd.read_csv("process_outlier_data.csv")
df.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,0,0,140,289,0,0,172,0,0.0,0,0
1,49,1,1,160,180,0,0,156,0,1.0,1,1
2,37,0,0,130,283,0,1,98,0,0.0,0,0
3,48,1,2,138,214,0,0,108,1,1.5,1,1
4,54,0,1,150,195,0,0,122,0,0.0,0,0


In [11]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn import metrics
from sklearn.model_selection import KFold,cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold


def train_model_with_tunning(model,param, X_std, Y):
    
    cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
    
    X_train, X_test, y_train, y_test = train_test_split(X_std, Y, test_size=0.30, random_state=0)
    reg_random_cv = RandomizedSearchCV(estimator = model,param_distributions = param,
               n_iter = 100, cv = cv, verbose=2, random_state=0, n_jobs = -1,scoring='f1_weighted')
    reg_random_cv.fit(X_std, Y)
    
    print('Random grid: ', reg_random_cv, '\n')
    print('Best Parameters: ', reg_random_cv.best_params_, ' \n')
    print('Best Score: ', reg_random_cv.best_score_, ' \n')
    print('Best Estimatore : ', reg_random_cv.best_estimator_ ,'\n')
    
    reg_model = reg_random_cv.best_estimator_ 

    reg_model.fit(X_train,y_train)

    y_train_pred = reg_model.predict(X_train)
    y_test_pred = reg_model.predict(X_test)
    
    acc_train=metrics.accuracy_score(y_train,y_train_pred)
    print("Train Accuracy : ",acc_train*100)
    acc_test=metrics.accuracy_score(y_test,y_test_pred)
    print("Test Accuracy : ",acc_test*100)


    #validate model performance using K-fold Cross Validation using roc scoring
    kf=KFold(n_splits=5,shuffle=True,random_state=0)
    scores=cross_val_score(reg_model,X_std,Y,cv=cv,scoring='f1_weighted')
    print("Bias Error : ",np.mean(scores))

    print("Varience : ",np.std(scores, ddof= 1))

In [12]:
X=df[['Age','Sex','ChestPainType','Cholesterol','MaxHR','RestingBP','FastingBS','RestingECG','ExerciseAngina', 'Oldpeak', 'ST_Slope']]
Y=df['HeartDisease']

In [13]:
from sklearn.preprocessing import StandardScaler
sc=StandardScaler()
X_std=sc.fit_transform(X)

In [14]:
from sklearn.tree import DecisionTreeClassifier
DT = DecisionTreeClassifier(random_state=0)
DT_params={'max_depth':np.arange(2,10),
        'criterion':['entropy','gini']}

In [15]:
from sklearn.ensemble import RandomForestClassifier
RF=RandomForestClassifier(random_state=0)

RF_params = {'n_estimators': range(3,200,5), # number of trees in the random forest
'max_features': ['auto', 'sqrt'], # number of features in consideration at every split
'max_depth': [int(x) for x in np.linspace(10, 120, num = 12)], # maximum number of levels allowed in each decision tree
'min_samples_split': [2, 6, 10], # minimum sample number to split a node
'min_samples_leaf': [1, 3, 4], # minimum sample number that can be stored in a leaf node
'bootstrap': [True, False] }# method used to sample data points

In [16]:
from sklearn.ensemble import AdaBoostClassifier
ada = AdaBoostClassifier(random_state=0)
ada_params={'n_estimators':np.arange(10,250), 'learning_rate':[0.0001, 0.001, 0.01, 0.1, 1.0]}

In [17]:
train_model_with_tunning(model=DT,param=DT_params, X_std=X_std, Y=Y)

Fitting 30 folds for each of 16 candidates, totalling 480 fits




Random grid:  RandomizedSearchCV(cv=RepeatedStratifiedKFold(n_repeats=3, n_splits=10, random_state=1),
                   estimator=DecisionTreeClassifier(random_state=0), n_iter=100,
                   n_jobs=-1,
                   param_distributions={'criterion': ['entropy', 'gini'],
                                        'max_depth': array([2, 3, 4, 5, 6, 7, 8, 9])},
                   random_state=0, verbose=2) 

Best Parameters:  {'max_depth': 3, 'criterion': 'entropy'}  

Best Score:  0.8321327967806841  

Best Estimatore :  DecisionTreeClassifier(criterion='entropy', max_depth=3, random_state=0) 

Train Accuracy :  85.97560975609755
Test Accuracy :  83.88625592417061
Bias Error :  0.8317302589859347
Varience :  0.03782710725306688


In [18]:
train_model_with_tunning(model=RF,param=RF_params, X_std=X_std, Y=Y)

Fitting 30 folds for each of 100 candidates, totalling 3000 fits
Random grid:  RandomizedSearchCV(cv=RepeatedStratifiedKFold(n_repeats=3, n_splits=10, random_state=1),
                   estimator=RandomForestClassifier(random_state=0), n_iter=100,
                   n_jobs=-1,
                   param_distributions={'bootstrap': [True, False],
                                        'max_depth': [10, 20, 30, 40, 50, 60,
                                                      70, 80, 90, 100, 110,
                                                      120],
                                        'max_features': ['auto', 'sqrt'],
                                        'min_samples_leaf': [1, 3, 4],
                                        'min_samples_split': [2, 6, 10],
                                        'n_estimators': range(3, 200, 5)},
                   random_state=0, verbose=2) 

Best Parameters:  {'n_estimators': 63, 'min_samples_split': 10, 'min_samples_leaf': 3, 'max_featur

In [19]:
train_model_with_tunning(model=ada,param=ada_params, X_std=X_std, Y=Y)

Fitting 30 folds for each of 100 candidates, totalling 3000 fits
Random grid:  RandomizedSearchCV(cv=RepeatedStratifiedKFold(n_repeats=3, n_splits=10, random_state=1),
                   estimator=AdaBoostClassifier(random_state=0), n_iter=100,
                   n_jobs=-1,
                   param_distributions={'learning_rate': [0.0001, 0.001, 0.01,
                                                          0.1, 1.0],
                                        'n_estimators': array([ 10,  11,  12,  13,  14,  15,  16,  17,  18,  19,  20,  21,  22,
        23,  24,  25,  26,  27,  28,  29,  30,  31,  32,  33,  34,  35,
        36,  37,  38,  39,  40,  41,  42,  43,  44,  45,  46,...
       166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178,
       179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191,
       192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204,
       205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217,
       218, 219, 220, 22