In this kernel we shall look into the 'Heart-healthcare' dataset, in which we shall predict the possibility of a heart attack given the different predictors. 
I have used 4 different classification algorithms namely:
1. Logistic Regression
2. Gaussian Naive Bayes
3. Support Vector Machines
4. Random Forests Classifier

I have also used extensive Grid Search CV for hyperparameter tuning.
Hope my notebook helps. 🤗🤗

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file found under the Kaggle input
# directory, then print the paths one per line.
input_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for input_path in input_paths:
    print(input_path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

Sneak Peek

In [None]:
# Load the heart-attack CSV from the Kaggle input directory into a DataFrame.
df = pd.read_csv('/kaggle/input/health-care-data-set-on-heart-attack-possibility/heart.csv')
# Preview the first rows; as the cell's last expression it is rendered by the notebook.
df.head()

In [None]:
# Print a structural summary of the frame (columns, non-null counts, dtypes).
df.info()

Absolutely Clean data

In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max) per numeric column.
df.describe()

* We shall need to normalize/scale the data, since the summary statistics above suggest it is highly skewed

In [None]:
# Class balance of the label column.
# The column is named by keyword: recent seaborn releases treat the first
# positional argument as `data`, so passing the Series positionally no longer
# produces a per-class count plot.
sns.countplot(x='target', data=df)

In [None]:
# Number of rows per value of the `sex` column.
# The column is named by keyword: recent seaborn releases treat the first
# positional argument as `data`, so passing the Series positionally no longer
# produces a per-category count plot.
sns.countplot(x='sex', data=df)

In [None]:
# Grid of pairwise plots over the columns of `df` (one panel per column pair).
sns.pairplot(data=df)

In [None]:
# Pairwise correlation between all columns, drawn as a blue-shaded heatmap
# on a 12x10 inch figure.
corr_matrix = df.corr()
plt.figure(figsize=(12, 10))
sns.heatmap(corr_matrix, cmap='Blues')

In [None]:
# Count the missing (NaN) entries in each column.
df.isna().sum()

In [None]:
# Separate the predictors (every column except `target`) from the label.
X = df.drop(columns='target')
y = df['target']

Train/Test Split

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
partitions = train_test_split(X, y, test_size=0.20, random_state=0)
X_train, X_test, y_train, y_test = partitions

In [None]:
from sklearn.preprocessing import StandardScaler
# Learn the per-feature mean and standard deviation from the training rows
# only, then apply that same transformation to both partitions.
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

In [None]:
# example of grid searching key hyperparametres for logistic regression
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

# Candidate settings for an L2-penalised logistic regression.
solvers = ['newton-cg', 'lbfgs', 'liblinear']
penalty = ['l2']
c_values = [100, 10, 1.0, 0.1, 0.01]
grid = {'solver': solvers, 'penalty': penalty, 'C': c_values}

# Exhaustive 3-fold search over the grid, scored on accuracy. A candidate
# whose fit raises is given a score of 0 instead of aborting the search.
model = LogisticRegression()
grid_search = GridSearchCV(
    estimator=model,
    param_grid=grid,
    scoring='accuracy',
    cv=3,
    n_jobs=-1,
    error_score=0,
)
grid_result = grid_search.fit(X_train, y_train)

In [None]:
# Report the winning configuration, then the mean and standard deviation of
# the cross-validated score for every candidate that was tried.
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
cv_results = grid_result.cv_results_
means = cv_results['mean_test_score']
stds = cv_results['std_test_score']
params = cv_results['params']
for candidate in zip(means, stds, params):
    # `candidate` is a (mean, std, params) tuple, matching the three placeholders.
    print("%f (%f) with: %r" % candidate)

In [None]:
# Build the final logistic regression from the configuration the grid search
# actually selected, rather than from hand-copied values that can drift out
# of sync with the search result (e.g. after a library or data change).
best_model1 = LogisticRegression(**grid_result.best_params_)
best_model1.fit(X_train, y_train)
# Predictions on the held-out test partition; later cells overwrite `y_pred`.
y_pred = best_model1.predict(X_test)

In [None]:
from sklearn.metrics import confusion_matrix,accuracy_score,roc_curve

# Evaluate the current predictions against the held-out labels.
cm = confusion_matrix(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)
print(cm)
print('Accuracy of our model is: ', accuracy)

In [None]:
# Test accuracy as a percentage, kept for the final model comparison.
# NOTE: `y_pred` is reassigned by each model's cell, so this must run while it
# still holds the logistic-regression predictions.
lr_acc = accuracy_score(y_test, y_pred)*100
lr_acc

In [None]:
from sklearn.naive_bayes import GaussianNB, MultinomialNB

In [None]:
# Fit a Gaussian Naive Bayes classifier with default settings and score it
# on the held-out partition.
nb1 = GaussianNB().fit(X_train, y_train)
y_pred = nb1.predict(X_test)

nb_confusion = confusion_matrix(y_test, y_pred)
nb_accuracy = accuracy_score(y_test, y_pred)
print(nb_confusion)
print('Accuracy of our model is: ', nb_accuracy)

In [None]:
# Test accuracy as a percentage, kept for the final model comparison.
# NOTE: relies on `y_pred` still holding the Naive Bayes predictions.
nb_acc = accuracy_score(y_test, y_pred)*100
nb_acc

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Random forest with library-default settings; it serves as the base estimator
# for the grid search that follows. No random_state is set, so results can
# vary between runs.
rf1 = RandomForestClassifier()

In [None]:
# Candidate values for the forest size, tree depth and split threshold.
n_estimators = [200, 300, 500]
max_depth = [4, 6, 5, 8]
min_samples_split = [8, 9, 7]

hyperF = {
    'n_estimators': n_estimators,
    'max_depth': max_depth,
    'min_samples_split': min_samples_split,
}

# Exhaustive 3-fold search over every combination, using all CPU cores.
gridF = GridSearchCV(estimator=rf1, param_grid=hyperF, cv=3, verbose=1, n_jobs=-1)
bestF = gridF.fit(X_train, y_train)

In [None]:
# Report the winning configuration, then the mean and standard deviation of
# the cross-validated score for every candidate that was tried.
print("Best: %f using %s" % (bestF.best_score_, bestF.best_params_))
rf_cv_results = bestF.cv_results_
means = rf_cv_results['mean_test_score']
stds = rf_cv_results['std_test_score']
params = rf_cv_results['params']
for candidate in zip(means, stds, params):
    # `candidate` is a (mean, std, params) tuple, matching the three placeholders.
    print("%f (%f) with: %r" % candidate)

In [None]:
# Build the final forest from the configuration the grid search actually
# selected. Hand-copied values can silently disagree with the search result,
# which changes from run to run because the base forest is unseeded.
rf_best = RandomForestClassifier(**bestF.best_params_)

In [None]:
# Train the tuned forest and score it on the held-out partition.
# `fit` returns the estimator itself, so the two calls can be chained.
y_pred = rf_best.fit(X_train, y_train).predict(X_test)

rf_confusion = confusion_matrix(y_test, y_pred)
rf_accuracy = accuracy_score(y_test, y_pred)
print(rf_confusion)
print('Accuracy of our model is: ', rf_accuracy)

In [None]:
# Test accuracy as a percentage, kept for the final model comparison.
# NOTE: relies on `y_pred` still holding the random-forest predictions.
rf_acc = accuracy_score(y_test, y_pred)*100
rf_acc

In [None]:
# Show the column labels of the original frame (including `target`).
df.columns

In [None]:
# Bar chart of the fitted forest's feature importances, most important first.
importance = rf_best.feature_importances_
# Permutation that sorts the importances in descending order.
indices = np.argsort(importance)[::-1]
feature_names = X.columns # e.g. ['A', 'B', 'C', 'D', 'E']

f, ax = plt.subplots(figsize=(11, 9))
plt.title("Feature ranking", fontsize = 20)
plt.bar(range(X.shape[1]), importance[indices],
    color="b", 
    align="center")
# The bars are drawn in sorted order, so the tick labels must be reordered
# with the same permutation; otherwise each bar carries another feature's name.
plt.xticks(range(X.shape[1]), feature_names[indices])
plt.xlim([-1, X.shape[1]])
plt.ylabel("importance", fontsize = 18)
plt.xlabel("index of the feature", fontsize = 18)


In [None]:
from sklearn.svm import SVC 

from sklearn.model_selection import GridSearchCV 

# Search space for the support vector classifier: regularisation strength,
# kernel coefficient and kernel family.
param_grid = {
    'C': [1, 10, 100],
    'gamma': [0.1, 0.01, 0.001],
    'kernel': ['rbf', 'poly'],
}

# 3-fold exhaustive search; refit=True retrains the best candidate on the
# whole training set so `grid` can be used directly for prediction.
grid = GridSearchCV(estimator=SVC(), param_grid=param_grid, cv=3, refit=True, verbose=3)

# Run the search (the fitted search object is the cell's displayed value).
grid.fit(X_train, y_train)


In [None]:
# Show the winning hyper-parameters first, then the refitted estimator
# that was built from them.
for tuned in (grid.best_params_, grid.best_estimator_):
    print(tuned)


In [None]:
# Sanity check: dimensions of the held-out label vector.
y_test.shape

In [None]:
# Predict with the search object (it delegates to the refitted best estimator)
# and score the predictions on the held-out partition.
y_pred = grid.predict(X_test)

svc_confusion = confusion_matrix(y_test, y_pred)
svc_accuracy = accuracy_score(y_test, y_pred)
print(svc_confusion)
print('Accuracy of our model is: ', svc_accuracy)



In [None]:
# Test accuracy as a percentage, kept for the final model comparison.
# NOTE: relies on `y_pred` still holding the SVC predictions.
svm_acc = accuracy_score(y_test, y_pred)*100
svm_acc

In [None]:
# Display names of the four models, in the order they appear in the comparison chart.
m1, m2, m3, m4 = (
    'Logistic Regression',
    'Gaussian Naive Bayes',
    'Random Forests',
    'Support Vector Classifiers',
)

In [None]:
# Bar chart comparing the test accuracy (in percent) of the four models.
m = [m1, m2, m3, m4]
acc = [lr_acc, nb_acc, rf_acc, svm_acc]
colors = ["orange", "green", "magenta", "red"]

plt.figure(figsize=(10, 5))
plt.title("barplot Represent Accuracy of different models")
plt.xlabel("Algorithms")
plt.ylabel("Accuracy %")
# Tick marks every 10 percentage points from 0 to 90.
plt.yticks(np.arange(0, 100, 10))
sns.barplot(x=m, y=acc, palette=colors)
plt.show()

In [None]:
# Raw accuracy percentages, in the order [logistic regression, naive Bayes, random forest, SVC].
print(acc)

1. Both SVM and Random Forests have the same accuracy score; their predictions differ by only one data point.

**Credits - https://www.kaggle.com/nareshbhat/complete-data-analysis-a-to-z/notebook**

Please upvote if you like and comment for any explanations 