In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns
sns.set()

import warnings
warnings.filterwarnings("ignore") 

#### Below is brief information about each column of the dataset:
`age` - Age of the patient

`sex` - Sex of the patient

`cp` - Chest pain type ~ 0 = Typical Angina, 1 = Atypical Angina, 2 = Non-anginal Pain, 3 = Asymptomatic

`trtbps` - Resting blood pressure (in mm Hg)

`chol` - Cholesterol in mg/dl fetched via BMI sensor

`fbs` - (fasting blood sugar > 120 mg/dl) ~ 1 = True, 0 = False

`restecg` - Resting electrocardiographic results ~ 0 = Normal, 1 = ST-T wave abnormality, 2 = Left ventricular hypertrophy

`thalachh`  - Maximum heart rate achieved

`oldpeak` - Previous peak (ST depression induced by exercise relative to rest)

`slp` - Slope

`caa` - Number of major vessels 

`thall` - Thallium Stress Test result ~ (0,3)

`exng` - Exercise induced angina ~ 1 = Yes, 0 = No

`output` - Target variable

#### Task 
To perform EDA and predict if a person is prone to a heart attack or not.

In [None]:
# Load the heart-attack dataset from the Kaggle input directory.
df = pd.read_csv(
    "../input/heart-attack-analysis-prediction-dataset/heart.csv"
)

In [None]:
# (number of rows, number of columns) of the loaded frame.
df.shape

In [None]:
# Preview ten random rows; the fixed seed keeps the preview identical
# across "Restart & Run All" executions.
df.sample(10, random_state=42)

In [None]:
# Summary statistics for every column (include='all' also covers non-numeric columns, if any).
df.describe(include='all')

##### Checking the number of unique values in each column

In [None]:
# Number of distinct (non-null) values per column, shown as a one-column table.
# DataFrame.nunique() replaces the manual loop and avoids rebinding the
# builtin name `dict`, which would break any later `dict(...)` call in the kernel.
df.nunique().to_frame("unique count")

#### Check whether there is missing data or not

In [None]:
# Count of missing entries in each column.
df.isna().sum()


#### In this section, we analyze the relationship between the different features and the **Output** (heart attack). We see how different feature values correspond to different chances of a heart attack. We also plot different kinds of diagrams to visualize our data and findings.

### We will refer to **heart attack** with the keyword **Output**

#### Check proportions between people who had heart attack and those who did not

In [None]:
# Split the records by target class.
output_pos = df.loc[df["output"] == 1]
output_neg = df.loc[df["output"] == 0]

# Share of each class as a percentage, rounded to two decimals.
output_pos_percentage = round(float(len(output_pos) / len(df) * 100), 2)
output_neg_percentage = round(float(len(output_neg) / len(df) * 100), 2)

print(f'People who had heart attack: {len(output_pos)} ({output_pos_percentage} %) ')
print(f'People who had not heart attack: {len(output_neg)} ({output_neg_percentage} %)')

### Age vs Output

In [None]:
# Wide canvas so the two per-class age densities are easy to compare.
plt.figure(figsize=(15, 5))
sns.kdeplot(x="age", hue="output", data=df)

#### NOTE:

>People between age 40-55 are more likely to have heart disease 

>whereas people between age 55-60 are less likely to have it. 

### Sex vs Output

In [None]:
# How many records fall into each sex category.
df["sex"].value_counts()

In [None]:
# Contingency table: output class (rows) by sex (columns).
pd.crosstab(index=df["output"], columns=df["sex"])

In [None]:
# Mean of `output` (i.e. share of positive cases) per sex category.
plt.title('Distribution of cases (heart attacks) between Sex 0 and 1')
sns.barplot(data=df, x="sex", y="output")

In [None]:
# Same contingency table as above, drawn as grouped bars (one group per output class).
pd.crosstab(index=df["output"], columns=df["sex"]).plot.bar()

In [None]:
# Mean of `output` within each sex category = fraction of positive cases.
sex_output_pos_average = (
    df.loc[:, ["sex", "output"]]
    .groupby("sex", as_index=False)
    .mean()
)
sex_output_pos_average

#### NOTE:

>People in category **sex = 1** are less likely to have a heart attack

>The chance is around 45% for people in that category

>The chance of a heart attack is around 75% for people in the category **sex = 0**

### CP vs Output

In [None]:
# How many records fall into each chest-pain type.
df["cp"].value_counts()

In [None]:
# Contingency table: output class (rows) by chest-pain type (columns).
pd.crosstab(index=df["output"], columns=df["cp"])

In [None]:
# Fraction of positive cases within each chest-pain type.
cp_output_pos_average = (
    df.loc[:, ["cp", "output"]]
    .groupby("cp", as_index=False)
    .mean()
)
cp_output_pos_average

In [None]:
# Share of positive cases per chest-pain type; ci=None hides the error bars.
sns.barplot(data=df, x="cp", y="output", ci=None)

#### NOTE:<br>
- For people in the group **cp = 0**, the chances of having a heart attack are much lower.
- The remaining 3 groups peak at around 75% on average.

### CHOL vs Output & TRTBPS vs Output

In [None]:
# Side-by-side per-class densities: resting blood pressure (left) and
# cholesterol (right).
fig, axs = plt.subplots(1, 2, figsize=(15, 5))

sns.kdeplot(data=df, x="trtbps", hue="output", ax=axs[0]).set_title('Blood pressure')
sns.kdeplot(data=df, x="chol", hue="output", ax=axs[1]).set_title('Cholesterol')

# plt.show() is the supported way to render under the inline backend;
# Figure.show() needs a GUI backend and only emits a (silenced) warning here.
plt.show()

#### NOTE:<br>
- People with cholesterol between 150 and 250 and blood pressure between 110 and 140 are more likely to have a heart attack.

### THALACHH vs Output

In [None]:
# Per-class density of the maximum heart rate achieved.
plt.figure(figsize=(15, 5))
sns.kdeplot(x="thalachh", hue="output", data=df)

#### NOTE:<br>
- People with a higher maximum heart rate (150 - 180) are much more likely to suffer a heart attack.

### EXNG vs Output

In [None]:
# How many records have / do not have exercise-induced angina.
df["exng"].value_counts()

In [None]:
# Contingency table: output class (rows) by exercise-induced angina (columns).
pd.crosstab(index=df["output"], columns=df["exng"])

In [None]:
# Share of positive cases for each exng value.
sns.barplot(data=df, x="exng", y="output")

In [None]:
# Fraction of positive cases within each exng group.
exng_output_pos_average = (
    df.loc[:, ["exng", "output"]]
    .groupby("exng", as_index=False)
    .mean()
)
exng_output_pos_average

#### NOTE:<br>
- People without exercise induced angina are more likely to suffer from heart disease.
- Their chances are around 70%
- and only around 30% for those with exercise induced angina

### OLDPEAK vs Output

In [None]:
# Per-class density of `oldpeak`.
plt.figure(figsize=(15, 5))
sns.kdeplot(x="oldpeak", hue="output", data=df)

#### NOTE:<br>
- People with a lower previous peak have higher chances of a heart attack.

### SLP vs Output

In [None]:
# Contingency table: output class (rows) by slope category (columns).
pd.crosstab(index=df["output"], columns=df["slp"])

In [None]:
# Fraction of positive cases within each slp group.
slp_output_pos_average = (
    df.loc[:, ["slp", "output"]]
    .groupby("slp", as_index=False)
    .mean()
)
slp_output_pos_average

In [None]:
# Visual check of the table above: number of records per output class,
# with one bar per slp value inside each output group.
# (These are raw counts of heart-attack outcomes, not probabilities.)

slp_output = df.groupby('slp')['output'].value_counts()
slp_output.unstack(level=0).plot.bar(subplots=False)

#### NOTE:<br>
- The chances of a heart attack for people in the group **slp == 2** are much higher than for the groups 0 and 1.

### CAA vs Output

In [None]:
# Contingency table: output class (rows) by number of major vessels (columns).
pd.crosstab(index=df["output"], columns=df["caa"])

In [None]:
# Fraction of positive cases within each caa group.
caa_output_pos_average = (
    df.loc[:, ["caa", "output"]]
    .groupby("caa", as_index=False)
    .mean()
)
caa_output_pos_average

In [None]:
# Visual check of the table above: number of records per output class,
# with one bar per caa value inside each output group.
# (These are raw counts of heart-attack outcomes, not probabilities.)

caa_output = df.groupby('caa')['output'].value_counts()
caa_output.unstack(level=0).plot.bar(subplots=False)

#### NOTE:<br>
- There is a lack of data for people in the groups **caa = 4, 3 and 2**
- Yet, it's clear that for the group 0, the chances are about 75%

### THALL vs Output

In [None]:
# Contingency table: output class (rows) by thall category (columns).
pd.crosstab(index=df["output"], columns=df["thall"])

In [None]:
# Fraction of positive cases within each thall group.
thall_output_pos_average = (
    df.loc[:, ["thall", "output"]]
    .groupby("thall", as_index=False)
    .mean()
)
thall_output_pos_average

In [None]:
# Visual check of the table above: number of records per output class,
# with one bar per thall value inside each output group.
# (These are raw counts of heart-attack outcomes, not probabilities.)

thall_output = df.groupby('thall')['output'].value_counts()
thall_output.unstack(level=0).plot.bar(subplots=False)

#### NOTE:<br>
- Due to the lack of data for the groups **thall = 0 and 1**,
- only the groups 2 and 3 should be taken into account
- And the chances of a heart attack are much higher for the group 2

### Features that don't affect the Output

In [None]:
# Per-class density of resting blood pressure.
plt.figure(figsize=(15, 5))
sns.kdeplot(x="trtbps", hue="output", data=df)

In [None]:
# Fraction of positive cases within each fasting-blood-sugar group.
fbs_output_pos_average = (
    df.loc[:, ["fbs", "output"]]
    .groupby("fbs", as_index=False)
    .mean()
)
fbs_output_pos_average

In [None]:
# Contingency table: output class (rows) by resting ECG result (columns).
pd.crosstab(index=df["output"], columns=df["restecg"])

In [None]:
# Fraction of positive cases within each resting-ECG group.
restecg_output_pos_average = (
    df.loc[:, ["restecg", "output"]]
    .groupby("restecg", as_index=False)
    .mean()
)
restecg_output_pos_average

#### NOTE <br>
- In the case of **trtbps**, the distributions for positive and negative **Output** are similar
- In the case of **fbs**, the same holds - 55% vs 51%
- In the case of **restecg**, there is a lack of data for the option **2**. The remaining options **0 and 1** give 46% vs 63%, which is not far from 50%.

### Correlation

In [None]:
# Annotated heatmap of the pairwise correlation matrix of all columns;
# the colour scale is capped at 0.6 (vmax).
plt.figure(figsize=(15, 12))
sns.heatmap(data=df.corr(), annot=True, square=True, vmax=0.6)

NOTE:<br>Heatmap of Correlation between different features:

>Positive numbers = Positive correlation, i.e. the two features tend to increase (or decrease) together.<br>
>Negative numbers = Negative correlation, i.e. when one feature increases the other tends to decrease, and vice versa.

In our case, we focus on which features have strong positive or negative correlation with the *Output* feature.

### Checking for Outliers

In [None]:
# One histogram per column; the trailing ';' suppresses the textual repr of the returned axes.
df.hist(figsize=(15,10));

#### Nothing that could affect the process

### Conclusions from the EDA

1. There are no NaN values in the data.
2. All the data is numeric, yet some columns need to be **scaled**
3. The data contains more than twice as many people with `sex` = 1 as with `sex` = 0. So the accuracy of the models for people corresponding to the group **sex = 0** might be slightly different / lower.
4. It is intuitive that elder people might have higher chances of heart attack but according to the distribution plot of `age` vs `output`, it is evident that this isn't the case.



### Feature Selection

In [None]:
# Train Test Split
from sklearn.model_selection import train_test_split

# Since the [outliers] aren't an issue, we can skip RobustScaler
from sklearn.preprocessing import StandardScaler


# Importing Classifier Modules
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Work on a real copy so the raw frame used by the EDA cells is never touched
# (a plain `df1 = df` would only create a second name for the same object).
df1 = df.copy()

# define the columns to be encoded and scaled
cat_cols = ['sex','exng','caa','cp','fbs','restecg','slp','thall']
con_cols = ["age","trtbps","chol","thalachh","oldpeak"]

# one-hot encode the categorical columns (first level dropped as the baseline)
df1 = pd.get_dummies(df1, columns = cat_cols, drop_first = True)

# defining the features and target
# NOTE: X itself is left unscaled; scaling is applied to the splits below.
X = df1.drop('output', axis=1)
y = df1['output']

# Split BEFORE scaling: fitting the scaler on the full data set would let the
# test rows influence the mean/std used to transform the training rows
# (data leakage) and make the held-out score optimistic.
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.2, random_state = 42)

# Independent copies so the column assignments below do not write into
# slices of X.
X_train = X_train.copy()
X_test = X_test.copy()

# instantiating the scaler
scaler = StandardScaler()

# fit on the training split only, then apply the same transform to the test split
X_train[con_cols] = scaler.fit_transform(X_train[con_cols])
X_test[con_cols] = scaler.transform(X_test[con_cols])


### Predictions

#### 1) Logistic Regression

[Logistic regression](https://en.wikipedia.org/wiki/Logistic_regression), or logit regression, or logit model is a regression model where the dependent variable (DV) is categorical. This article covers the case of a binary dependent variable—that is, where it can take only two values, "0" and "1", which represent outcomes such as pass/fail, win/lose, alive/dead or healthy/sick. Cases where the dependent variable has more than two outcome categories may be analysed in multinomial logistic regression, or, if the multiple categories are ordered, in ordinal logistic regression.

In [None]:
# Fit a logistic-regression classifier on the training split.
clf = LogisticRegression()
clf.fit(X_train, y_train)
y_pred_log_reg = clf.predict(X_test)

# Training accuracy: how well the model fits data it has already seen.
acc_log_reg = round( clf.score(X_train, y_train) * 100, 2)
print ("Train Accuracy: " + str(acc_log_reg) + '%')

# Held-out accuracy from the test predictions; this is the number that
# reflects generalisation and should be used to compare models.
test_acc_log_reg = round(float((y_pred_log_reg == y_test).mean()) * 100, 2)
print ("Test Accuracy: " + str(test_acc_log_reg) + '%')

#### 2) Support Vector Machine (SVM)

[Support Vector Machine (SVM)](https://en.wikipedia.org/wiki/Support_vector_machine) model is a Supervised Learning model used for classification and regression analysis. It is a representation of the examples as points in space, mapped so that the examples of the separate categories are divided by a clear gap that is as wide as possible. New examples are then mapped into that same space and predicted to belong to a category based on which side of the gap they fall.

In addition to performing linear classification, SVMs can efficiently perform a non-linear classification using what is called the kernel trick, implicitly mapping their inputs into high-dimensional feature spaces. Suppose some given data points each belong to one of two classes, and the goal is to decide which class a new data point will be in. In the case of support vector machines, a data point is viewed as a $p$-dimensional vector (a list of $p$ numbers), and we want to know whether we can separate such points with a $(p-1)$-dimensional hyperplane.

When data are not labeled, supervised learning is not possible, and an unsupervised learning approach is required, which attempts to find natural clustering of the data to groups, and then map new data to these formed groups. The clustering algorithm which provides an improvement to the support vector machines is called **support vector clustering** and is often used in industrial applications either when data are not labeled or when only some data are labeled as a preprocessing for a classification pass.

In the below code, [SVC](http://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html) stands for Support Vector Classification.

In [None]:
# Fit a support-vector classifier (default kernel) on the training split.
clf = SVC()
clf.fit(X_train, y_train)
y_pred_svc = clf.predict(X_test)

# Training accuracy: how well the model fits data it has already seen.
acc_svc = round(clf.score(X_train, y_train) * 100, 2)
print ("Train Accuracy: " + str(acc_svc) + '%')

# Held-out accuracy from the test predictions.
test_acc_svc = round(float((y_pred_svc == y_test).mean()) * 100, 2)
print ("Test Accuracy: " + str(test_acc_svc) + '%')

#### 3) Linear SVM

Linear SVM is a SVM model with linear kernel.<br>
In the below code, [LinearSVC](http://scikit-learn.org/stable/modules/generated/sklearn.svm.LinearSVC.html) stands for Linear Support Vector Classification.

In [None]:
# Fit a linear support-vector classifier; random_state pins the solver's
# internal shuffling so repeated runs give the same model.
clf = LinearSVC(random_state=42)
clf.fit(X_train, y_train)
y_pred_linear_svc = clf.predict(X_test)

# Training accuracy: how well the model fits data it has already seen.
acc_linear_svc = round(clf.score(X_train, y_train) * 100, 2)
print ("Train Accuracy: " + str(acc_linear_svc) + '%')

# Held-out accuracy from the test predictions.
test_acc_linear_svc = round(float((y_pred_linear_svc == y_test).mean()) * 100, 2)
print ("Test Accuracy: " + str(test_acc_linear_svc) + '%')

#### 4) $k$-Nearest Neighbors

[$k$-nearest neighbors algorithm (k-NN)](https://en.wikipedia.org/wiki/K-nearest_neighbors_algorithm) is one of the simplest machine learning algorithms and is used for classification and regression. In both cases, the input consists of the $k$ closest training examples in the feature space. The output depends on whether $k$-NN is used for classification or regression:

- In *$k$-NN classification*, the output is a class membership. An object is classified by a majority vote of its neighbors, with the object being assigned to the class most common among its $k$ nearest neighbors ($k$ is a positive integer, typically small). If $k = 1$, then the object is simply assigned to the class of that single nearest neighbor.


- In *$k$-NN regression*, the output is the property value for the object. This value is the average of the values of its $k$ nearest neighbors.

In [None]:
# Fit a 3-nearest-neighbours classifier on the training split.
clf = KNeighborsClassifier(n_neighbors = 3)
clf.fit(X_train, y_train)
y_pred_knn = clf.predict(X_test)

# Training accuracy: each training point is among its own neighbours, so
# this figure is optimistic by construction.
acc_knn = round(clf.score(X_train, y_train) * 100, 2)
print ("Train Accuracy: " + str(acc_knn) + '%')

# Held-out accuracy from the test predictions.
test_acc_knn = round(float((y_pred_knn == y_test).mean()) * 100, 2)
print ("Test Accuracy: " + str(test_acc_knn) + '%')

#### 5) Decision Tree

A [decision tree](https://en.wikipedia.org/wiki/Decision_tree) is a flowchart-like structure in which each internal node represents a "test" on an attribute (e.g. whether a coin flip comes up heads or tails), each branch represents the outcome of the test, and each leaf node represents a class label (decision taken after computing all attributes). The paths from root to leaf represent classification rules.

In [None]:
# Fit a decision tree; random_state makes tie-breaking between equally good
# splits reproducible across runs.
clf = DecisionTreeClassifier(random_state=42)
clf.fit(X_train, y_train)
y_pred_decision_tree = clf.predict(X_test)

# Training accuracy: an unpruned tree can memorise the training set, so this
# number says little about generalisation.
acc_decision_tree = round(clf.score(X_train, y_train) * 100, 2)
print ("Train Accuracy: " + str(acc_decision_tree) + '%')

# Held-out accuracy from the test predictions.
test_acc_decision_tree = round(float((y_pred_decision_tree == y_test).mean()) * 100, 2)
print ("Test Accuracy: " + str(test_acc_decision_tree) + '%')

#### 6) Random Forest

[Random forests](https://en.wikipedia.org/wiki/Random_forest) or **random decision forests** are an **ensemble learning method** for classification, regression and other tasks, that operate by constructing a multitude of decision trees at training time and outputting the class that is the mode of the classes (classification) or mean prediction (regression) of the individual trees. Random decision forests correct for *decision trees' habit of overfitting to their training set*.

[Ensemble methods](https://en.wikipedia.org/wiki/Ensemble_learning) use multiple learning algorithms to obtain better predictive performance than could be obtained from any of the constituent learning algorithms alone.

In [None]:
# Fit a 100-tree random forest; random_state fixes the bootstrap samples and
# feature sub-sampling so results are reproducible.
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)
y_pred_random_forest = clf.predict(X_test)

# Training accuracy: how well the model fits data it has already seen.
acc_random_forest = round(clf.score(X_train, y_train) * 100, 2)
print ("Train Accuracy: " + str(acc_random_forest) + '%')

# Held-out accuracy from the test predictions.
test_acc_random_forest = round(float((y_pred_random_forest == y_test).mean()) * 100, 2)
print ("Test Accuracy: " + str(test_acc_random_forest) + '%')

In [None]:
# Comparison table for all fitted models.
#   'Score'      - training accuracy (%), as computed in the cells above.
#   'Test Score' - held-out accuracy (%), derived here from the stored test
#                  predictions so the ranking reflects generalisation rather
#                  than how well each model memorised the training data.
models = pd.DataFrame({
    'Model': ['LR', 'SVM', 'L-SVC',
              'KNN', 'DTree', 'RF',],

    'Score': [acc_log_reg, acc_svc, acc_linear_svc,
              acc_knn,  acc_decision_tree, acc_random_forest],

    'Test Score': [
        round(float((pred == y_test).mean()) * 100, 2)
        for pred in (y_pred_log_reg, y_pred_svc, y_pred_linear_svc,
                     y_pred_knn, y_pred_decision_tree, y_pred_random_forest)
    ],
    })

# Rank by held-out accuracy; training accuracy is kept for reference only.
models = models.sort_values(by='Test Score', ascending=False)
models