# Heart Attack Prediction
- Objective: predict which people are likely to have heart disease

### Import library

In [8]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
import seaborn as sns
# Apply seaborn's default theme to all matplotlib figures.
# `set_theme` is the preferred name; `sns.set` is only kept as an alias for it.
sns.set_theme()

# Compatibility fix for statsmodels: expose `chisqprob` on `scipy.stats`,
# implemented with the chi-square survival function.
from scipy import stats


def _chisqprob(chisq, df):
    """Return the upper-tail chi-square probability of `chisq` for `df` degrees of freedom."""
    return stats.chi2.sf(chisq, df)


stats.chisqprob = _chisqprob

### Load the data

In [2]:
# Read the dataset from `heart.csv` (path is relative to the notebook's
# working directory) and preview the first five rows.
raw_data = pd.read_csv('heart.csv')
raw_data.head(5)

Unnamed: 0,age,sex,cp,trtbps,chol,fbs,restecg,thalachh,exng,oldpeak,slp,caa,thall,output
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


### Data description
1. age - age in years

2. sex - sex (1 = male; 0 = female)

3. cp - chest pain type (1 = typical angina; 2 = atypical angina; 3 = non-anginal pain; 0 = asymptomatic)

4. trtbps - resting blood pressure (in mm Hg on admission to the hospital)

5. chol - serum cholesterol in mg/dl

6. fbs - fasting blood sugar > 120 mg/dl (1 = true; 0 = false)

7. restecg - resting electrocardiographic results (1 = normal; 2 = having ST-T wave abnormality; 0 = hypertrophy)

8. thalachh - maximum heart rate achieved

9. exng - exercise induced angina (1 = yes; 0 = no)

10. oldpeak - ST depression induced by exercise relative to rest

11. slp - the slope of the peak exercise ST segment (2 = upsloping; 1 = flat; 0 = downsloping)

12. caa - number of major vessels (0-3) colored by fluoroscopy

13. thall - 2 = normal; 1 = fixed defect; 3 = reversible defect

14. output - the predicted attribute - diagnosis of heart disease (angiographic disease status) (Value 0 = < 50% diameter narrowing; Value 1 = > 50% diameter narrowing)

### Checking the missing values

In [3]:
# Count the missing entries in each column.
raw_data.isna().sum()

age         0
sex         0
cp          0
trtbps      0
chol        0
fbs         0
restecg     0
thalachh    0
exng        0
oldpeak     0
slp         0
caa         0
thall       0
output      0
dtype: int64

### Variables

In [5]:
# Target: the `output` column.
y = raw_data['output']
# Features: every remaining column. A later cell attaches `output_predict` to
# `raw_data`; dropping it here when present keeps this cell safe to re-run,
# so the model's own predictions are never fed back in as a feature.
x1 = raw_data.drop(columns=['output', 'output_predict'], errors='ignore')

### Regression

In [6]:
# Add the intercept column to the features, build the logit model of
# `y` on those regressors, and fit it.
x = sm.add_constant(x1)
reg_log = sm.Logit(endog=y, exog=x)
result_log = reg_log.fit()

Optimization terminated successfully.
         Current function value: 0.348904
         Iterations 7


### Summary

In [9]:
# Show the fitted model's summary table (fit statistics and per-variable coefficients).
result_log.summary()

0,1,2,3
Dep. Variable:,output,No. Observations:,303.0
Model:,Logit,Df Residuals:,289.0
Method:,MLE,Df Model:,13.0
Date:,"Fri, 01 Sep 2023",Pseudo R-squ.:,0.4937
Time:,16:30:49,Log-Likelihood:,-105.72
converged:,True,LL-Null:,-208.82
Covariance Type:,nonrobust,LLR p-value:,7.262e-37

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,3.4505,2.571,1.342,0.180,-1.590,8.490
age,-0.0049,0.023,-0.212,0.832,-0.050,0.041
sex,-1.7582,0.469,-3.751,0.000,-2.677,-0.839
cp,0.8599,0.185,4.638,0.000,0.496,1.223
trtbps,-0.0195,0.010,-1.884,0.060,-0.040,0.001
chol,-0.0046,0.004,-1.224,0.221,-0.012,0.003
fbs,0.0349,0.529,0.066,0.947,-1.003,1.073
restecg,0.4663,0.348,1.339,0.181,-0.216,1.149
thalachh,0.0232,0.010,2.219,0.026,0.003,0.044


### Prediction

In [10]:
# Print every float in NumPy array output with exactly two decimals.
# This changes NumPy's global print options, so later cells are affected too.
np.set_printoptions(formatter={'float': "{0:0.2f}".format})
# Predicted values from the fitted model; no new data is passed to `predict`.
result_log.predict()

array([0.81, 0.66, 0.96, 0.91, 0.82, 0.76, 0.83, 0.85, 0.79, 0.92, 0.62,
       0.98, 0.88, 0.73, 0.98, 0.97, 0.99, 0.84, 0.60, 0.94, 0.38, 0.91,
       0.84, 0.50, 0.85, 0.78, 0.87, 0.93, 0.88, 0.69, 0.98, 0.54, 0.92,
       0.53, 0.59, 0.82, 0.99, 0.64, 0.97, 0.92, 0.86, 0.82, 0.13, 0.79,
       0.93, 0.90, 0.95, 0.91, 0.99, 0.92, 0.98, 0.43, 0.11, 0.99, 0.99,
       0.74, 0.85, 0.84, 0.99, 0.82, 0.96, 0.79, 0.99, 0.88, 0.93, 0.95,
       0.75, 0.95, 0.94, 0.97, 0.67, 0.74, 0.95, 0.55, 0.99, 0.88, 0.72,
       0.76, 0.95, 0.53, 0.98, 0.85, 0.98, 0.82, 0.79, 0.67, 0.65, 0.88,
       0.98, 0.76, 0.88, 0.43, 0.52, 0.81, 0.97, 0.09, 0.59, 0.15, 0.73,
       0.64, 0.83, 0.09, 0.94, 0.84, 0.97, 0.90, 0.68, 0.66, 0.97, 0.95,
       0.63, 0.85, 0.93, 0.74, 0.88, 1.00, 0.78, 0.80, 0.99, 0.68, 0.21,
       0.74, 0.98, 0.99, 1.00, 0.99, 0.81, 0.97, 0.98, 0.66, 0.98, 0.96,
       0.90, 0.92, 0.98, 0.95, 0.98, 0.81, 0.41, 0.03, 0.98, 0.65, 0.99,
       0.81, 0.95, 0.67, 0.96, 0.99, 0.97, 0.96, 0.

In [11]:
# Round each predicted probability to the nearest integer to get a 0/1 class label.
result_log.predict().round()

array([1.00, 1.00, 1.00, 1.00, 1.00, 1.00, 1.00, 1.00, 1.00, 1.00, 1.00,
       1.00, 1.00, 1.00, 1.00, 1.00, 1.00, 1.00, 1.00, 1.00, 0.00, 1.00,
       1.00, 1.00, 1.00, 1.00, 1.00, 1.00, 1.00, 1.00, 1.00, 1.00, 1.00,
       1.00, 1.00, 1.00, 1.00, 1.00, 1.00, 1.00, 1.00, 1.00, 0.00, 1.00,
       1.00, 1.00, 1.00, 1.00, 1.00, 1.00, 1.00, 0.00, 0.00, 1.00, 1.00,
       1.00, 1.00, 1.00, 1.00, 1.00, 1.00, 1.00, 1.00, 1.00, 1.00, 1.00,
       1.00, 1.00, 1.00, 1.00, 1.00, 1.00, 1.00, 1.00, 1.00, 1.00, 1.00,
       1.00, 1.00, 1.00, 1.00, 1.00, 1.00, 1.00, 1.00, 1.00, 1.00, 1.00,
       1.00, 1.00, 1.00, 0.00, 1.00, 1.00, 1.00, 0.00, 1.00, 0.00, 1.00,
       1.00, 1.00, 0.00, 1.00, 1.00, 1.00, 1.00, 1.00, 1.00, 1.00, 1.00,
       1.00, 1.00, 1.00, 1.00, 1.00, 1.00, 1.00, 1.00, 1.00, 1.00, 0.00,
       1.00, 1.00, 1.00, 1.00, 1.00, 1.00, 1.00, 1.00, 1.00, 1.00, 1.00,
       1.00, 1.00, 1.00, 1.00, 1.00, 1.00, 0.00, 0.00, 1.00, 1.00, 1.00,
       1.00, 1.00, 1.00, 1.00, 1.00, 1.00, 1.00, 0.

### Comparing the prediction with the actual outcomes

In [13]:
# Store the rounded predictions beside the actual `output` column for comparison.
raw_data['output_predict'] = result_log.predict().round()

In [14]:
# Display the frame so `output` and `output_predict` can be compared row by row.
raw_data

Unnamed: 0,age,sex,cp,trtbps,chol,fbs,restecg,thalachh,exng,oldpeak,slp,caa,thall,output,output_predict
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1,1.0
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1,1.0
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1,1.0
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1,1.0
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,57,0,0,140,241,0,1,123,1,0.2,1,0,3,0,0.0
299,45,1,3,110,264,0,1,132,0,1.2,1,0,3,0,1.0
300,68,1,0,144,193,1,1,141,0,3.4,1,2,3,0,0.0
301,57,1,0,130,131,0,1,115,1,1.2,1,1,3,0,0.0


### Confusion Matrix

In [15]:
# Wrap the model's prediction table in a labelled DataFrame:
# rows are the actual classes, columns are the predicted classes.
cm_df = pd.DataFrame(
    result_log.pred_table(),
    index=['Actual 0', 'Actual 1'],
    columns=['Predicted 0', 'Predicted 1'],
)
cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,106.0,32.0
Actual 1,13.0,152.0


In [17]:
# The off-diagonal cells of the confusion matrix are the misclassified rows.
# Derive the rate from `cm_df` rather than retyping the counts by hand, so it
# stays correct if the data or the model changes.
confusion = cm_df.to_numpy()
misclassified = confusion.sum() - np.trace(confusion)
print('Misclassification rate : ' + str(float(misclassified / confusion.sum())))

Missclasification rate : 0.1485148514851485


### Calculate the accuracy

In [20]:
# Accuracy is the share of correct predictions: the diagonal of the confusion
# matrix divided by the total count. It is computed from `cm` instead of
# hard-coded counts so it follows the fitted model.
cm = np.array(cm_df)
accuracy_train = float(np.trace(cm) / cm.sum())
accuracy_train

0.8514851485148515