In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import warnings
# Suppress every warning for the rest of the session.
# Note: this also hides deprecation notices raised by pandas / seaborn / scikit-learn.
warnings.simplefilter('ignore')

# Dataset description

Age : Age of the patient

Sex : Sex of the patient

exng : exercise induced angina (1 = yes; 0 = no)

caa : number of major vessels (0-3)

cp : chest pain type

Value 1: typical angina

Value 2: atypical angina

Value 3: non-anginal pain

Value 4: asymptomatic

trtbps : resting blood pressure (in mm Hg)

chol : cholesterol in mg/dl fetched via BMI sensor

fbs : (fasting blood sugar > 120 mg/dl) (1 = true; 0 = false)

restecg : resting electrocardiographic results

Value 0: normal

Value 1: having ST-T wave abnormality (T wave inversions and/or ST elevation or depression of > 0.05 mV)

Value 2: showing probable or definite left ventricular hypertrophy by Estes' criteria

thalachh : maximum heart rate achieved

output : 0 = less chance of heart attack; 1 = more chance of heart attack


In [None]:
# Load the heart-attack dataset from the Kaggle input directory into a DataFrame.
data=pd.read_csv('/kaggle/input/heart-attack-analysis-prediction-dataset/heart.csv')


In [None]:
# Preview the first rows to sanity-check the load.
data.head()

In [None]:
# (number of rows, number of columns)
data.shape

In [None]:
# Column names as they appear in the CSV; these are the names used by the code below.
data.columns

In [None]:
# Per-column dtype and non-null count.
data.info()

In [None]:
# Count missing values in each column.
data.isna().sum()

In [None]:
# Summary statistics, transposed so that each feature is a row.
data.describe().transpose()

In [None]:
# Plotting histogram for the entire dataset
# Let pandas build the subplot grid itself: handing DataFrame.hist a single
# pre-made Axes for several columns makes pandas clear that figure and warn.
g = data.hist(figsize=(15, 15))
fig = plt.gcf()

In [None]:
# Target counts broken down by sex.
sns.countplot(data=data, x='output', hue='sex')

In [None]:
women=data.loc[data.sex==1]['output']
rate_women=sum(women)/len(women) * 100
print(' % of women got heart attack : ', rate_women)

In [None]:
men=data.loc[data.sex==0]['output']
rate_men=sum(men)/len(men) * 100
print(' % of men got heart attack : ', rate_men)

In [None]:
# Class-balance check: number of rows for each value of the target.
sns.countplot(data=data, x='output')


In [None]:
# Target counts broken down by chest pain type.
sns.countplot(data=data, x='output', hue='cp')

In [None]:
# Target counts broken down by exercise induced angina.
sns.countplot(data=data, x='output', hue='exng')

In [None]:
# Target counts broken down by fbs (fasting blood sugar > 120 mg/dl; 1 = true, 0 = false).
sns.countplot(data=data, x='output', hue='fbs')


In [None]:
# Distribution of cholesterol (mg/dl).
# distplot is deprecated in seaborn; histplot with a density-scaled histogram
# plus a KDE curve gives the equivalent view.
sns.histplot(data['chol'], kde=True, stat='density', kde_kws={'cut': 3})

In [None]:
# Distribution of patient age.
# distplot is deprecated in seaborn; histplot with a density-scaled histogram
# plus a KDE curve gives the equivalent view.
sns.histplot(data['age'], kde=True, stat='density', kde_kws={'cut': 3})

In [None]:
# Pairwise correlation between all columns, shown as a table.
data.corr()

In [None]:
# Inspecting feature correlations using a heatmap

# Compute the correlation matrix once and reuse it for the plot
corr_matrix = data.corr()
top_corr_features = corr_matrix.index  # every column of the matrix, kept under its original name

# Plotting the heatmap
plt.figure(figsize=(20,20))
sns.heatmap(data=corr_matrix, annot=True, cmap='RdYlGn')

# Outliers Detection

In [None]:
# One box plot per column to eyeball outliers.
for column in data.columns:
    fig, ax = plt.subplots()
    # Pass the values as x= explicitly: a positional Series is interpreted
    # differently across seaborn versions, which changes the orientation.
    sns.boxplot(x=data[column], ax=ax)
    ax.set_title(column)
    plt.show()

# Handling Outliers

In [None]:
def outlinefree(dataCol, whisker=1.5):
    """Return the IQR-based lower and upper fences for a numeric column.

    Parameters
    ----------
    dataCol : array-like
        Numeric values, e.g. a pandas Series or a list. It is not modified.
    whisker : float, default 1.5
        Multiple of the interquartile range added beyond the quartiles.

    Returns
    -------
    tuple
        ``(LowerRange, UpperRange)`` where ``LowerRange = Q1 - whisker * IQR``
        and ``UpperRange = Q3 + whisker * IQR``.
    """
    # np.percentile does not need pre-sorted input, so no explicit sort is done here.
    # 25th and 75th percentiles bound the interquartile range (IQR).
    Q1, Q3 = np.percentile(dataCol, [25, 75])
    IQR = Q3 - Q1
    LowerRange = Q1 - (whisker * IQR)
    UpperRange = Q3 + (whisker * IQR)
    return LowerRange, UpperRange

In [None]:
# IQR fences for the three continuous columns that get capped in the next cell.
(lwtrtbps, uptrtbps), (lwchol, upchol), (lwoldpeak, upoldpeak) = (
    outlinefree(data[column]) for column in ('trtbps', 'chol', 'oldpeak')
)

In [None]:
# Cap values above the upper IQR fence at the fence itself (only the upper side is capped).
# The result is assigned back to the frame: replace(..., inplace=True) on a column
# selection is chained in-place mutation, which pandas Copy-on-Write does not
# propagate to `data`.
data['trtbps'] = data['trtbps'].clip(upper=uptrtbps)
data['chol'] = data['chol'].clip(upper=upchol)
data['oldpeak'] = data['oldpeak'].clip(upper=upoldpeak)

In [None]:
# One-hot encode the categorical columns; all other columns are carried over unchanged.
categorical_columns = ['sex', 'cp', 'fbs', 'restecg', 'exng', 'slp', 'caa', 'thall']
dataset = pd.get_dummies(data, columns=categorical_columns)

In [None]:
# Column names after one-hot encoding.
dataset.columns

In [None]:
from sklearn.preprocessing import StandardScaler
# Standardise the continuous columns in place on `dataset`.
# NOTE(review): the scaler is fitted on the whole dataset, before the train/test
# split done later in the notebook, so test-set statistics leak into the scaling.
columns_to_scale = ['age', 'trtbps', 'chol', 'thalachh', 'oldpeak']
standScaler = StandardScaler()
dataset[columns_to_scale] = standScaler.fit_transform(dataset[columns_to_scale])

In [None]:
# Preview the encoded and scaled frame.
dataset.head()

# Feature Engineering

In [None]:
# Separate the target column from the feature matrix, both as NumPy arrays.
Y = dataset['output'].values
X = dataset.drop(columns='output').values

## Splitting data into train and test sets

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 25% of the rows for testing; the fixed random_state makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.25,
    random_state=100,
)

# Model Building

## Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
# Logistic regression with scikit-learn's default hyperparameters, fitted on the training split.
reg= LogisticRegression()  
reg.fit(x_train, y_train)

In [None]:
# Predicted class labels for the held-out test rows.
predict1=reg.predict(x_test)

In [None]:
from sklearn.metrics import accuracy_score
# accuracy_score takes (y_true, y_pred); use the same order as the other metric calls.
acc1=accuracy_score(y_test,predict1)
acc1

In [None]:
from sklearn.metrics import confusion_matrix
# Confusion matrix of true labels (first argument) against logistic-regression predictions.
cm = confusion_matrix(y_test, predict1)
# Heatmap annotates every cell as a share of all test samples.
sns.heatmap(cm / cm.sum(), annot=True, fmt='.2%', cmap='Blues')
cm

## KNeighbors Classifier

In [None]:
from sklearn.neighbors import KNeighborsClassifier
# k-nearest-neighbours classifier with scikit-learn's default settings, fitted on the training split.
classifier= KNeighborsClassifier() 
classifier.fit(x_train, y_train) 


In [None]:
# Predicted class labels for the held-out test rows.
predict2=classifier.predict(x_test)

In [None]:
from sklearn.metrics import accuracy_score
# accuracy_score takes (y_true, y_pred); use the same order as the other metric calls.
acc2=accuracy_score(y_test,predict2)
acc2

In [None]:
from sklearn.metrics import confusion_matrix
# Confusion matrix of true labels (first argument) against KNN predictions.
cm = confusion_matrix(y_test, predict2)
# Heatmap annotates every cell as a share of all test samples.
sns.heatmap(cm / cm.sum(), annot=True, fmt='.2%', cmap='Blues')
cm

## Decision Tree Classifier

In [None]:
from sklearn.tree import DecisionTreeClassifier
# Entropy-based decision tree limited to depth 3; random_state fixed for repeatable results.
tree_classifier=DecisionTreeClassifier(criterion = 'entropy',max_depth=3,random_state=2)
tree_classifier.fit(x_train,y_train)

In [None]:
# Predicted class labels for the held-out test rows.
predict3=tree_classifier.predict(x_test)


In [None]:
from sklearn.metrics import accuracy_score
# accuracy_score takes (y_true, y_pred); use the same order as the other metric calls.
acc3=accuracy_score(y_test,predict3)
acc3

In [None]:
from sklearn.metrics import confusion_matrix
# Confusion matrix of true labels (first argument) against decision-tree predictions.
cm = confusion_matrix(y_test, predict3)
# Heatmap annotates every cell as a share of all test samples.
sns.heatmap(cm / cm.sum(), annot=True, fmt='.2%', cmap='Blues')
cm

## Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Random forest with trees limited to depth 5; random_state fixed for repeatable results.
randomforest_model = RandomForestClassifier(max_depth=5, random_state=2)
randomforest_model.fit(x_train, y_train)

In [None]:
# Predicted class labels for the held-out test rows.
predict4=randomforest_model.predict(x_test)


In [None]:
from sklearn.metrics import accuracy_score
# accuracy_score takes (y_true, y_pred); use the same order as the other metric calls.
acc4=accuracy_score(y_test,predict4)
acc4

In [None]:
from sklearn.metrics import confusion_matrix
# Confusion matrix of true labels (first argument) against random-forest predictions.
cm = confusion_matrix(y_test, predict4)
# Heatmap annotates every cell as a share of all test samples.
sns.heatmap(cm / cm.sum(), annot=True, fmt='.2%', cmap='Blues')
cm

## Gradient Boosting

In [None]:
from sklearn import model_selection
from sklearn.ensemble import GradientBoostingClassifier

# Random seed passed to both the KFold splitter and the gradient-boosting model.
seed = 7
# Number of boosting stages (passed as n_estimators).
num_trees = 50

In [None]:
# KFold only accepts random_state together with shuffle=True; current scikit-learn
# raises a ValueError when a seed is given without shuffling.
kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=seed)
# Gradient boosting with num_trees boosting stages, fitted on the training split.
model = GradientBoostingClassifier(n_estimators=num_trees, random_state=seed)
model.fit(x_train,y_train)
# Optional 10-fold cross-validation of the same estimator (left disabled):
# results = model_selection.cross_val_score(model, X, Y, cv=kfold)
# results

In [None]:
# Predicted class labels for the held-out test rows.
predict5=model.predict(x_test)

In [None]:
from sklearn.metrics import accuracy_score
# accuracy_score takes (y_true, y_pred); use the same order as the other metric calls.
acc5=accuracy_score(y_test,predict5)
acc5

In [None]:
from sklearn.metrics import confusion_matrix
# Confusion matrix of true labels (first argument) against gradient-boosting predictions.
cm = confusion_matrix(y_test, predict5)
# Heatmap annotates every cell as a share of all test samples.
sns.heatmap(cm / cm.sum(), annot=True, fmt='.2%', cmap='Blues')
cm

# Evaluation

**Precision**

It is the number of correct positive results divided by the number of positive results predicted by the classifier.

**Recall**

It is the number of correct positive results divided by the number of all relevant samples (all samples that should have been identified as positive).

**f1_score**

F1 Score is used to measure a test’s accuracy

F1 Score is the Harmonic Mean between precision and recall. It tells you how precise your classifier is (how many instances it classifies correctly), as well as how robust it is (it does not miss a significant number of instances).

In [None]:
from sklearn.metrics import precision_score,recall_score,f1_score,mean_absolute_error,roc_auc_score
def metric(y_test,y_predict):
    """Return (precision, recall, F1 score) for true labels versus predicted labels."""
    scorers = (precision_score, recall_score, f1_score)
    return tuple(score(y_test, y_predict) for score in scorers)

## for logistic regression

In [None]:
# Logistic regression: precision, recall and F1 from metric(), followed by accuracy.
lr = pd.Series(
    [*metric(y_test, predict1), acc1],
    index=['precession', 'Recall', 'F1_score', 'accuracy_score'],
)
lr

## for KNN Classifier

In [None]:
# KNN: precision, recall and F1 from metric(), followed by accuracy.
knn = pd.Series(
    [*metric(y_test, predict2), acc2],
    index=['precession', 'Recall', 'F1_score', 'accuracy_score'],
)
knn

## for Decision Tree

In [None]:
# Decision tree: precision, recall and F1 from metric(), followed by accuracy.
dt = pd.Series(
    [*metric(y_test, predict3), acc3],
    index=['precession', 'Recall', 'F1_score', 'accuracy_score'],
)
dt

## For Random Forest

In [None]:
# Random forest: precision, recall and F1 from metric(), followed by accuracy.
rf = pd.Series(
    [*metric(y_test, predict4), acc4],
    index=['precession', 'Recall', 'F1_score', 'accuracy_score'],
)
rf

## For Gradient Boosting

In [None]:
# Gradient boosting: precision, recall and F1 from metric(), followed by accuracy.
gb = pd.Series(
    [*metric(y_test, predict5), acc5],
    index=['precession', 'Recall', 'F1_score', 'accuracy_score'],
)
gb

In [None]:
eval=pd.DataFrame([lr,knn,dt,rf,gb],index=['LogisticRegression','KNN_Classifier','Decision_Tree','Random Forest','Gradient Boosting'])

In [None]:
eval

# From the evaluation table above, we can select Logistic Regression or Random Forest as our best model