### All imports go into this field

In [159]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.svm import SVC


age - **age**

sex - 1:male 0:female

chest pain type (4 values) - **cp**

resting blood pressure - **trestbps**

serum cholestoral in mg/dl - **chol**

fasting blood sugar > 120 mg/dl - **fbs**

resting electrocardiographic results (values 0,1,2) - **restecg**

maximum heart rate achieved - **thalach**

exercise induced angina - **exang**

oldpeak = ST depression induced by exercise relative to rest - **oldpeak***

the slope of the peak exercise ST segment - **slope**

number of major vessels (0-3) colored by flourosopy - **ca**

thal: 3 = normal; 6 = fixed defect; 7 = reversable defect - **thal**

In [126]:
df = pd.read_csv('heart.csv')
df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [14]:
df.describe()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
count,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0
mean,54.366337,0.683168,0.966997,131.623762,246.264026,0.148515,0.528053,149.646865,0.326733,1.039604,1.39934,0.729373,2.313531,0.544554
std,9.082101,0.466011,1.032052,17.538143,51.830751,0.356198,0.52586,22.905161,0.469794,1.161075,0.616226,1.022606,0.612277,0.498835
min,29.0,0.0,0.0,94.0,126.0,0.0,0.0,71.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,47.5,0.0,0.0,120.0,211.0,0.0,0.0,133.5,0.0,0.0,1.0,0.0,2.0,0.0
50%,55.0,1.0,1.0,130.0,240.0,0.0,1.0,153.0,0.0,0.8,1.0,0.0,2.0,1.0
75%,61.0,1.0,2.0,140.0,274.5,0.0,1.0,166.0,1.0,1.6,2.0,1.0,3.0,1.0
max,77.0,1.0,3.0,200.0,564.0,1.0,2.0,202.0,1.0,6.2,2.0,4.0,3.0,1.0


In [17]:
df.dtypes

age           int64
sex           int64
cp            int64
trestbps      int64
chol          int64
fbs           int64
restecg       int64
thalach       int64
exang         int64
oldpeak     float64
slope         int64
ca            int64
thal          int64
target        int64
dtype: object

In [22]:
df.isnull().sum()

age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
target      0
dtype: int64

In [53]:
rfc = RandomForestClassifier()

In [56]:
X = df.drop(['target'],axis=1)
y = df.target

In [58]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.4,random_state=42)
X_eval, X_test, y_eval, y_test = train_test_split(X_test,y_test,test_size=0.5,random_state=42)

In [60]:
print(X_train.shape,X_eval.shape,X_test.shape)

(181, 13) (61, 13) (61, 13)


In [61]:
rfc.fit(X_train,y_train)
rfc.score(X_eval,y_eval)

0.8524590163934426

In [65]:
params = {
    'n_estimators': [50,100,200,1000],
    'max_depth': [3,5,7,9]
}


In [69]:
cv = GridSearchCV(rfc,params,cv=5)
cv.fit(X_test,y_test)

GridSearchCV(cv=5, estimator=RandomForestClassifier(),
             param_grid={'max_depth': [3, 5, 7, 9],
                         'n_estimators': [50, 100, 200, 1000]})

In [70]:
cv.best_params_

{'max_depth': 3, 'n_estimators': 100}

In [73]:
rfc_best = RandomForestClassifier(n_estimators=100,max_depth=3)
rfc_best.fit(X_test,y_test)
rfc_best.score(X_eval,y_eval)

0.8524590163934426

In [75]:
y_pred = rfc_best.predict(X_test)
confusion_matrix(y_test,y_pred)

array([[21,  3],
       [ 1, 36]], dtype=int64)

In [77]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.95      0.88      0.91        24
           1       0.92      0.97      0.95        37

    accuracy                           0.93        61
   macro avg       0.94      0.92      0.93        61
weighted avg       0.94      0.93      0.93        61



In [78]:
rfc_best.score(X_test,y_test)

0.9344262295081968

#### Base Model : 93%

### SVM

In [127]:
df_copy = df.copy()
df_copy.head(1)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1


In [128]:
def encode(df,row_name):
    dv1 = pd.get_dummies(df[row_name])
    func = lambda x : row_name + '_' + str(x)
    dv1.columns = dv1.columns.to_series().apply(func)
    df = pd.concat([df,dv1],axis=1)
    df = df.drop(columns=row_name)
    return df

In [129]:
category = ['cp','fbs','restecg','exang','slope','thal','ca']
for c in category:
    df = encode(df,c)
df.head()

Unnamed: 0,age,sex,trestbps,chol,thalach,oldpeak,target,cp_0,cp_1,cp_2,...,slope_2,thal_0,thal_1,thal_2,thal_3,ca_0,ca_1,ca_2,ca_3,ca_4
0,63,1,145,233,150,2.3,1,0,0,0,...,0,0,1,0,0,1,0,0,0,0
1,37,1,130,250,187,3.5,1,0,0,1,...,0,0,0,1,0,1,0,0,0,0
2,41,0,130,204,172,1.4,1,0,1,0,...,1,0,0,1,0,1,0,0,0,0
3,56,1,120,236,178,0.8,1,0,1,0,...,1,0,0,1,0,1,0,0,0,0
4,57,0,120,354,163,0.6,1,1,0,0,...,1,0,0,1,0,1,0,0,0,0


In [134]:
SVM_X = df.drop('target',axis=1)
SVM_y = df.target
SVM_X_train, SVM_X_test, SVM_y_train, SVM_y_test = train_test_split(SVM_X,SVM_y,test_size=0.4,random_state=42)
SVM_X_eval, SVM_X_test, SVM_y_eval, SVM_y_test = train_test_split(SVM_X_test,SVM_y_test,test_size=0.5,random_state=42)
print(SVM_X_train.shape,SVM_X_eval.shape,SVM_X_test.shape)

(181, 29) (61, 29) (61, 29)


In [135]:
svm = SVC()
svm.fit(SVM_X_train,SVM_y_train)
svm.score(SVM_X_eval,SVM_y_eval)

0.6885245901639344

In [137]:
params2 = {
    'C':[0.001,0.01,0.1,1,10],
    'kernel':['rbf','poly','linear']
}


In [138]:
cv2 = GridSearchCV(svm,params2,cv=5)
cv2.fit(SVM_X_train,SVM_y_train)

GridSearchCV(cv=5, estimator=SVC(),
             param_grid={'C': [0.001, 0.01, 0.1, 1, 10],
                         'kernel': ['rbf', 'poly', 'linear']})

In [139]:
cv2.best_params_

{'C': 0.1, 'kernel': 'linear'}

In [140]:
svm_best = SVC(C=0.1,kernel='linear')
svm_best.fit(SVM_X_train,SVM_y_train)
svm_best.score(SVM_X_eval,SVM_y_eval)

0.8524590163934426

In [141]:
SVM_y_pred = svm_best.predict(SVM_X_test)
confusion_matrix(SVM_y_test,SVM_y_pred)

array([[19,  5],
       [ 6, 31]], dtype=int64)

In [143]:
print(classification_report(SVM_y_test,SVM_y_pred))

              precision    recall  f1-score   support

           0       0.76      0.79      0.78        24
           1       0.86      0.84      0.85        37

    accuracy                           0.82        61
   macro avg       0.81      0.81      0.81        61
weighted avg       0.82      0.82      0.82        61



In [176]:
svm_best.score(SVM_X_test,SVM_y_test)

0.819672131147541

In [177]:
mpl = MLPClassifier()
mpl.fit(SVM_X_train,SVM_y_train)
mpl.score(SVM_X_eval,SVM_y_eval)

0.5737704918032787

In [178]:
params3 = {
    'hidden_layer_sizes':[(50,),(50,2),(100,),(100,3),(100,2)],
    'activation':['relu','tanh','logistic'],
    'learning_rate':['constant','invscaling','adaptive']
}

In [179]:
cv3 = GridSearchCV(mpl,params3,cv=5)
cv3.fit(SVM_X_train,SVM_y_train)













GridSearchCV(cv=5, estimator=MLPClassifier(),
             param_grid={'activation': ['relu', 'tanh', 'logistic'],
                         'hidden_layer_sizes': [(50,), (50, 2), (100,),
                                                (100, 3), (100, 2)],
                         'learning_rate': ['constant', 'invscaling',
                                           'adaptive']})

In [180]:
cv3.best_params_

{'activation': 'relu',
 'hidden_layer_sizes': (100,),
 'learning_rate': 'adaptive'}

In [182]:
mlp_best = MLPClassifier(activation='relu',hidden_layer_sizes=(100,),learning_rate='adaptive')
mlp_best.fit(SVM_X_train,SVM_y_train)
mlp_best.score(SVM_X_eval,SVM_y_eval)



0.8360655737704918

In [183]:
mlp_y_pred = mlp_best.predict(SVM_X_test)
confusion_matrix(SVM_y_test,mlp_y_pred)

array([[16,  8],
       [ 6, 31]], dtype=int64)

In [184]:
mlp_best.score(SVM_X_test,SVM_y_test)

0.7704918032786885

In [185]:
print(classification_report(SVM_y_test,mlp_y_pred))

              precision    recall  f1-score   support

           0       0.73      0.67      0.70        24
           1       0.79      0.84      0.82        37

    accuracy                           0.77        61
   macro avg       0.76      0.75      0.76        61
weighted avg       0.77      0.77      0.77        61



In [162]:
gbc = GradientBoostingClassifier()
gbc.fit(X_test,y_test)
gbc.score(X_eval,y_eval)

0.7868852459016393

In [163]:
params4 = {
    'learning_rate':[0.01,0.1,1,10,100],
    'n_estimators':[50,100,200,1000],
    'max_depth':[3,5,7,9]
}

In [164]:
cv4 = GridSearchCV(gbc,params4,cv=5)
cv4.fit(X_train,y_train)

GridSearchCV(cv=5, estimator=GradientBoostingClassifier(),
             param_grid={'learning_rate': [0.01, 0.1, 1, 10, 100],
                         'max_depth': [3, 5, 7, 9],
                         'n_estimators': [50, 100, 200, 1000]})

In [165]:
cv4.best_params_

{'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 200}

In [171]:
gbc_best = GradientBoostingClassifier(learning_rate = 0.1,n_estimators= 200,max_depth= 5)
gbc_best.fit(X_train,y_train)
gbc_best.score(X_eval,y_eval)

0.8524590163934426

In [172]:
gbc_y_pred = gbc_best.predict(X_test)
confusion_matrix(y_test,gbc_y_pred)

array([[17,  7],
       [ 9, 28]], dtype=int64)

In [173]:
print(classification_report(y_test,gbc_y_pred))

              precision    recall  f1-score   support

           0       0.65      0.71      0.68        24
           1       0.80      0.76      0.78        37

    accuracy                           0.74        61
   macro avg       0.73      0.73      0.73        61
weighted avg       0.74      0.74      0.74        61



In [174]:
gbc_best.score(X_test,y_test)

0.7377049180327869

## Rankings

1) Random Forest Classifier - 93%

2) SVM - 82%

3) MLP - 77%

4) GBC - 74%

Simple model Triumph