### All imports go into this field

In [56]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler, OneHotEncoder

age - **age**

sex - 1:male 0:female

chest pain type (4 values) - **cp**

resting blood pressure - **trestbps**

serum cholestoral in mg/dl - **chol**

fasting blood sugar > 120 mg/dl - **fbs**

resting electrocardiographic results (values 0,1,2) - **restecg**

maximum heart rate achieved - **thalach**

exercise induced angina - **exang**

oldpeak = ST depression induced by exercise relative to rest - **oldpeak***

the slope of the peak exercise ST segment - **slope**

number of major vessels (0-3) colored by flourosopy - **ca**

thal: 3 = normal; 6 = fixed defect; 7 = reversable defect - **thal**

In [35]:
df = pd.read_csv('heart.csv')
df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [3]:
df.describe()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
count,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0
mean,54.366337,0.683168,0.966997,131.623762,246.264026,0.148515,0.528053,149.646865,0.326733,1.039604,1.39934,0.729373,2.313531,0.544554
std,9.082101,0.466011,1.032052,17.538143,51.830751,0.356198,0.52586,22.905161,0.469794,1.161075,0.616226,1.022606,0.612277,0.498835
min,29.0,0.0,0.0,94.0,126.0,0.0,0.0,71.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,47.5,0.0,0.0,120.0,211.0,0.0,0.0,133.5,0.0,0.0,1.0,0.0,2.0,0.0
50%,55.0,1.0,1.0,130.0,240.0,0.0,1.0,153.0,0.0,0.8,1.0,0.0,2.0,1.0
75%,61.0,1.0,2.0,140.0,274.5,0.0,1.0,166.0,1.0,1.6,2.0,1.0,3.0,1.0
max,77.0,1.0,3.0,200.0,564.0,1.0,2.0,202.0,1.0,6.2,2.0,4.0,3.0,1.0


In [4]:
df.dtypes

age           int64
sex           int64
cp            int64
trestbps      int64
chol          int64
fbs           int64
restecg       int64
thalach       int64
exang         int64
oldpeak     float64
slope         int64
ca            int64
thal          int64
target        int64
dtype: object

In [5]:
df.isnull().sum()

age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
target      0
dtype: int64

In [89]:
rfc = RandomForestClassifier()

In [101]:
X = df.drop(['target'],axis=1)
y = df.target

In [102]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.4)
X_eval, X_test, y_eval, y_test = train_test_split(X_test,y_test,test_size=0.5)

In [103]:
print(X_train.shape,X_eval.shape,X_test.shape)

(181, 13) (61, 13) (61, 13)


In [104]:
rfc.fit(X_train,y_train)
rfc.score(X_eval,y_eval)

0.7540983606557377

In [105]:
params = {
    'n_estimators': [50,100,200,1000],
    'max_depth': [3,5,7,9]
}


In [106]:
cv = GridSearchCV(rfc,params,cv=5)
cv.fit(X_test,y_test)

GridSearchCV(cv=5, estimator=RandomForestClassifier(),
             param_grid={'max_depth': [3, 5, 7, 9],
                         'n_estimators': [50, 100, 200, 1000]})

In [107]:
cv.best_params_

{'max_depth': 3, 'n_estimators': 100}

In [166]:
rfc_best = RandomForestClassifier(n_estimators=100,max_depth=3)
rfc_best.fit(X_train,y_train)
rfc_best.score(X_eval,y_eval)

0.7868852459016393

In [167]:
y_pred = rfc_best.predict(X_test)
confusion_matrix(y_test,y_pred)

array([[21, 12],
       [ 0, 28]], dtype=int64)

In [168]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       1.00      0.64      0.78        33
           1       0.70      1.00      0.82        28

    accuracy                           0.80        61
   macro avg       0.85      0.82      0.80        61
weighted avg       0.86      0.80      0.80        61



In [169]:
rfc_best.score(X_test,y_test)

0.8032786885245902

#### Base Model : 80%

### SVM

In [145]:
X_clarity = X_test.copy()
X_clarity['y_test'] = y_test
X_clarity['y_pred'] = y_pred

In [36]:
df_copy = df.copy()
df_copy.head(1)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1


In [157]:
Randomer = df.sample(100)
Randomer_X = Randomer.drop(['target'],axis=1)
Randomer_Y = rfc_best.predict(Randomer_X)
confusion_matrix(Randomer.target,Randomer_Y)

array([[38, 13],
       [ 3, 46]], dtype=int64)

In [67]:
ss = StandardScaler()
one = OneHotEncoder(sparse=False)

In [66]:
df.columns

Index(['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach',
       'exang', 'oldpeak', 'slope', 'ca', 'thal', 'target'],
      dtype='object')

In [68]:
SVM_X = df.drop(['target'],axis=1)
SVM_y = df.target

In [69]:
category = ['cp','fbs','restecg','exang','slope','thal','ca']
contin = SVM_X.columns.to_list()
continuous = list(set(contin) - set(category))
cat_scaled = one.fit_transform(SVM_X[category])
cont_scaled = ss.fit_transform(SVM_X[continuous])
SVM_X = np.concatenate([cat_scaled,cont_scaled],axis=1)
SVM_X

array([[ 0.        ,  0.        ,  0.        , ...,  0.76395577,
         1.08733806, -0.25633371],
       [ 0.        ,  0.        ,  1.        , ..., -0.09273778,
         2.12257273,  0.07219949],
       [ 0.        ,  1.        ,  0.        , ..., -0.09273778,
         0.31091206, -0.81677269],
       ...,
       [ 1.        ,  0.        ,  0.        , ...,  0.70684287,
         2.03630317, -1.029353  ],
       [ 1.        ,  0.        ,  0.        , ..., -0.09273778,
         0.13837295, -2.2275329 ],
       [ 0.        ,  1.        ,  0.        , ..., -0.09273778,
        -0.89686172, -0.19835726]])

In [70]:
SVM_X.shape

(303, 29)

In [71]:
SVM_X_train, SVM_X_test, SVM_y_train, SVM_y_test = train_test_split(SVM_X,SVM_y,test_size=0.4,random_state=42)
SVM_X_eval, SVM_X_test, SVM_y_eval, SVM_y_test = train_test_split(SVM_X_test,SVM_y_test,test_size=0.5,random_state=42)
print(SVM_X_train.shape,SVM_X_eval.shape,SVM_X_test.shape)

(181, 29) (61, 29) (61, 29)


In [72]:
svm = SVC()
svm.fit(SVM_X_train,SVM_y_train)
svm.score(SVM_X_eval,SVM_y_eval)

0.9180327868852459

In [73]:
params2 = {
    'C':[0.001,0.01,0.1,1,10],
    'kernel':['rbf','poly','linear']
}


In [74]:
cv2 = GridSearchCV(svm,params2,cv=5)
cv2.fit(SVM_X_train,SVM_y_train)

GridSearchCV(cv=5, estimator=SVC(),
             param_grid={'C': [0.001, 0.01, 0.1, 1, 10],
                         'kernel': ['rbf', 'poly', 'linear']})

In [75]:
cv2.best_params_

{'C': 0.1, 'kernel': 'linear'}

In [76]:
svm_best = SVC(C=0.1,kernel='linear')
svm_best.fit(SVM_X_train,SVM_y_train)
svm_best.score(SVM_X_eval,SVM_y_eval)

0.8688524590163934

In [77]:
SVM_y_pred = svm_best.predict(SVM_X_test)
confusion_matrix(SVM_y_test,SVM_y_pred)

array([[20,  4],
       [ 6, 31]], dtype=int64)

In [78]:
print(classification_report(SVM_y_test,SVM_y_pred))

              precision    recall  f1-score   support

           0       0.77      0.83      0.80        24
           1       0.89      0.84      0.86        37

    accuracy                           0.84        61
   macro avg       0.83      0.84      0.83        61
weighted avg       0.84      0.84      0.84        61



In [79]:
svm_best.score(SVM_X_test,SVM_y_test)

0.8360655737704918

In [80]:
mpl = MLPClassifier()
mpl.fit(SVM_X_train,SVM_y_train)
mpl.score(SVM_X_eval,SVM_y_eval)



0.9180327868852459

In [81]:
params3 = {
    'hidden_layer_sizes':[(50,),(50,2),(100,),(100,3),(100,2)],
    'activation':['relu','tanh','logistic'],
    'learning_rate':['constant','invscaling','adaptive']
}

In [82]:
cv3 = GridSearchCV(mpl,params3,cv=5)
cv3.fit(SVM_X_train,SVM_y_train)















GridSearchCV(cv=5, estimator=MLPClassifier(),
             param_grid={'activation': ['relu', 'tanh', 'logistic'],
                         'hidden_layer_sizes': [(50,), (50, 2), (100,),
                                                (100, 3), (100, 2)],
                         'learning_rate': ['constant', 'invscaling',
                                           'adaptive']})

In [84]:
cv3.best_params_

{'activation': 'relu',
 'hidden_layer_sizes': (50,),
 'learning_rate': 'invscaling'}

In [85]:
mlp_best = MLPClassifier(activation='relu',hidden_layer_sizes=(50,),learning_rate='invscaling')
mlp_best.fit(SVM_X_train,SVM_y_train)
mlp_best.score(SVM_X_eval,SVM_y_eval)



0.9180327868852459

In [86]:
mlp_y_pred = mlp_best.predict(SVM_X_test)
confusion_matrix(SVM_y_test,mlp_y_pred)

array([[19,  5],
       [ 3, 34]], dtype=int64)

In [87]:
mlp_best.score(SVM_X_test,SVM_y_test)

0.8688524590163934

In [88]:
print(classification_report(SVM_y_test,mlp_y_pred))

              precision    recall  f1-score   support

           0       0.86      0.79      0.83        24
           1       0.87      0.92      0.89        37

    accuracy                           0.87        61
   macro avg       0.87      0.86      0.86        61
weighted avg       0.87      0.87      0.87        61



In [162]:
gbc = GradientBoostingClassifier()
gbc.fit(X_test,y_test)
gbc.score(X_eval,y_eval)

0.7868852459016393

In [163]:
params4 = {
    'learning_rate':[0.01,0.1,1,10,100],
    'n_estimators':[50,100,200,1000],
    'max_depth':[3,5,7,9]
}

In [164]:
cv4 = GridSearchCV(gbc,params4,cv=5)
cv4.fit(X_train,y_train)

GridSearchCV(cv=5, estimator=GradientBoostingClassifier(),
             param_grid={'learning_rate': [0.01, 0.1, 1, 10, 100],
                         'max_depth': [3, 5, 7, 9],
                         'n_estimators': [50, 100, 200, 1000]})

In [165]:
cv4.best_params_

{'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 200}

In [171]:
gbc_best = GradientBoostingClassifier(learning_rate = 0.1,n_estimators= 200,max_depth= 5)
gbc_best.fit(X_train,y_train)
gbc_best.score(X_eval,y_eval)

0.8524590163934426

In [172]:
gbc_y_pred = gbc_best.predict(X_test)
confusion_matrix(y_test,gbc_y_pred)

array([[17,  7],
       [ 9, 28]], dtype=int64)

In [173]:
print(classification_report(y_test,gbc_y_pred))

              precision    recall  f1-score   support

           0       0.65      0.71      0.68        24
           1       0.80      0.76      0.78        37

    accuracy                           0.74        61
   macro avg       0.73      0.73      0.73        61
weighted avg       0.74      0.74      0.74        61



In [174]:
gbc_best.score(X_test,y_test)

0.7377049180327869

## Rankings

1) Random Forest Classifier - 92%

2) SVM - 83.6%

3) MLP - 86.9%

4) GBC - 73.7%

Simple model Triumph