In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats 
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
input_files = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the heart-disease table from the Kaggle input directory and preview it.
heart_csv = '/kaggle/input/heart-disease-uci/heart.csv'
data = pd.read_csv(heart_csv)
data.head()

## About Feature
    |
    
    1. Age      :  Age of patient
    2. Sex      :  Gender
    3. cp       :  Chest Pain Type
    4. trestbps :  Resting Blood Pressure
                   Blood pressure measured after resting for a few minutes.
                   High resting BP can be an indication of some disease.
    5. chol     :  Cholesterol
    6. fbs      :  Fasting Blood Sugar
                   High fasting blood sugar levels point to insulin resistance or diabetes,
                   while abnormally low fasting blood sugar could be due to diabetes medications.
    7. restecg  :  A resting ECG is administered when the patient is at rest.
                   The dimensions and the regularity or irregularity of the traced lines
                   communicate the nature of the patient’s heart activity.
    8. thalach  :  Max Heart Rate
    9. exang    :  Exercise-induced angina. Angina is generally caused by not getting enough blood through the arteries.
                   Angina can be caused by blockage, injury or spasms.
    10. oldpeak :  ST depression refers to a finding on an electrocardiogram,
                   wherein the trace in the ST segment is abnormally low below the baseline.
                   Measured during exercise relative to rest.
    11. slope   :  The slope of the peak exercise ST segment
    12. ca      :  Number of major vessels (0-3) colored by fluoroscopy
    13. thal    :  A thallium stress test is a nuclear imaging test that shows how well blood flows into your heart.
    
    |

In [None]:
# Replace the abbreviated headers with readable names (assigned by position, 14 columns).
readable_columns = [
    "Age",
    "Sex",
    "Chest Pain",
    "Resting BP",
    "Cholestrol",
    "Fasting BS",
    "Resting ECG",
    "Max HR",
    "Exercise Angina",
    "ST Depression",
    "ST Seg Slope",
    "Ca",
    "Th Stress Test",
    "Disease",
]
data.columns = readable_columns

# Data Analysis

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) of the columns.
data.describe()

In [None]:
# Column dtypes and non-null counts -- a quick check for missing values.
data.info()

### Correlation

In [None]:
# Absolute pairwise correlations: only the strength of a relationship matters here, not its sign.
corr = data.corr().abs()
# Rank every column by how strongly it correlates with the target.
corr_with_sorted = corr[['Disease']].sort_values(by='Disease', ascending=False)
# Reorder rows and columns by that ranking so the heatmap reads from strongest to weakest.
corr_matrix = corr.loc[corr_with_sorted.index, corr_with_sorted.index]
# Mask the upper triangle (diagonal included) because the matrix is symmetric.
# Use the builtin `bool`: the `np.bool` alias was deprecated in NumPy 1.20 and
# removed in 1.24, where it raises AttributeError.
mask = np.triu(np.ones_like(corr_matrix, dtype=bool))

In [None]:
# Lower-triangle heatmap of the absolute correlations, annotated as percentages.
plt.figure(figsize=(16, 8))
heatmap = sns.heatmap(
    corr_matrix,
    mask=mask,
    annot=True,
    fmt='.1%',
    vmin=0,
    vmax=1,
)
heatmap.set_title('Features Correlating with Disease', fontdict={'fontsize':18}, pad=16);

* Exercise Angina, Chest Pain, Maximum Heart Rate, ST Depression etc. are highly related to Disease.
* Age and gender are also factors which can contribute to heart problems.
* Features commonly guessed to contribute to Disease, such as Fasting Blood Sugar & Cholesterol, show less impact in this data.
* There is also multicollinearity between features, especially ST Seg Slope & ST Depression, which is expected.

In [None]:
# Explore on a copy so that `data` keeps the original numeric codes.
df = data.copy()

For details about the dataset features, refer to this [guide](http://www.kaggle.com/ronitf/heart-disease-uci/discussion/105877).

### Target Feature

In [None]:
# Treat the target as categorical and show its raw category codes.
df = df.astype({'Disease': 'category'})
df['Disease'].cat.categories

In [None]:
# Give the two target codes readable labels (first sorted code -> 'Yes', second -> 'No').
# Assigning to `.cat.categories` was deprecated in pandas 1.5 and removed in 2.0,
# so rename through the supported `rename_categories` API instead.
df['Disease'] = df['Disease'].cat.rename_categories(['Yes','No'])

In [None]:
# Class balance of the target.
fig, ax = plt.subplots(figsize=(10, 5))
sns.countplot(data=df, x='Disease', ax=ax);

* Data is quite Balanced and we have only 0/1 binary target

## Relation With Features

### Cholesterol

In [None]:
# Cholesterol distribution split by disease status, with a KDE overlay.
sns.displot(
    data=df,
    x='Cholestrol',
    hue='Disease',
    element='step',
    kde=True,
);

* The overall trend here is that the chance of heart disease increases with the cholesterol level, but within a particular interval people may or may not have the disease, because a higher cholesterol level can be caused by diet or by not fasting before the test.

> Let's check the records with cholesterol levels of 400+. This level of cholesterol is very high and a known cause of heart disease, yet some of these cases show no disease.

In [None]:
# Inspect the patients whose cholesterol exceeds 400.
very_high_chol = df['Cholestrol'] > 400
df[very_high_chol]

### ECG

In [None]:
# Treat the resting ECG result as categorical and show its raw category codes.
df = df.astype({'Resting ECG': 'category'})
df['Resting ECG'].cat.categories

In [None]:
# Label the ECG codes in their sorted order.
# The `.cat.categories` setter was deprecated in pandas 1.5 and removed in 2.0;
# `rename_categories` is the supported replacement.
df['Resting ECG'] = df['Resting ECG'].cat.rename_categories(
    ['Prob left ven. hypertrophy','normal','ST-T wave abnormality']
)

In [None]:
# Resting ECG outcome counts, split by disease status.
fig, ax = plt.subplots(figsize=(10, 5))
sns.countplot(data=df, x='Resting ECG', hue='Disease', ax=ax);

### Sex

In [None]:
# Treat sex as categorical and show its raw category codes.
df = df.astype({'Sex': 'category'})
df['Sex'].cat.categories

In [None]:
# Label the two sex codes in their sorted order.
# The `.cat.categories` setter was deprecated in pandas 1.5 and removed in 2.0;
# `rename_categories` is the supported replacement.
df['Sex'] = df['Sex'].cat.rename_categories(['female','male'])

In [None]:
# Patient counts per sex, split by disease status.
fig, ax = plt.subplots(figsize=(10, 5))
sns.countplot(data=df, x='Sex', hue='Disease', ax=ax);

* Higher rate of heart disease in the male gender.
> Let's see whether this trend is caused by class imbalance or by some solid reason.

In [None]:
# Per-column aggregation: discrete columns are summarised by their mode,
# the remaining ones by their median. The tuple fixes the column order of the summary.
summary_columns = (
    'Age', 'Resting BP', 'Cholestrol', 'Resting ECG', 'Max HR', 'ST Depression',
    'ST Seg Slope', 'Ca', 'Chest Pain', 'Exercise Angina', 'Fasting BS', 'Th Stress Test',
)
mode_columns = {
    'Resting ECG', 'ST Seg Slope', 'Ca', 'Chest Pain', 'Fasting BS', 'Th Stress Test',
}
mapping = {
    column: pd.Series.mode if column in mode_columns else 'median'
    for column in summary_columns
}

In [None]:
# Typical (median / mode) profile of each sex.
patients_by_sex = df.groupby('Sex')
patients_by_sex.agg(mapping)

In [None]:
# Ratio of abnormal to normal resting ECGs, first for males and then for females.
male_abnormal = df.query("Sex == 'male' and `Resting ECG` != 'normal'").shape[0]
male_normal = df.query("Sex == 'male' and `Resting ECG` == 'normal'").shape[0]
print(male_abnormal / male_normal)

female_abnormal = df.query("Sex == 'female' and `Resting ECG` != 'normal'").shape[0]
female_normal = df.query("Sex == 'female' and `Resting ECG` == 'normal'").shape[0]
print(female_abnormal / female_normal)

* Comparing ECGs, we can see that males have a greater ratio of abnormal ECGs than females.

### Exercise Induced Angina

In [None]:
# Exercise-angina counts, split by disease status.
fig, ax = plt.subplots(figsize=(10, 5))
sns.countplot(data=df, x='Exercise Angina', hue='Disease', ax=ax);

* If angina is induced during exercise relative to rest, it is an indicator of heart disease.

### Chest Pain

In [None]:
# Treat the chest-pain type as categorical and show its raw category codes.
df = df.astype({'Chest Pain': 'category'})
df['Chest Pain'].cat.categories

In [None]:
# Label the chest-pain codes in their sorted order.
# The `.cat.categories` setter was deprecated in pandas 1.5 and removed in 2.0;
# `rename_categories` is the supported replacement.
df['Chest Pain'] = df['Chest Pain'].cat.rename_categories(
    ['asymptomatic','atypical angina','non-anginal pain','typical angina']
)

In [None]:
# Left: chest-pain types within each disease class. Right: overall chest-pain counts.
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(10, 5), sharey=True)
by_disease_ax, overall_ax = ax
sns.countplot(data=df, x='Disease', hue='Chest Pain', ax=by_disease_ax)
sns.countplot(data=df, x='Chest Pain', ax=overall_ax)
overall_ax.tick_params(axis="x", rotation=50)

*  Here we can see that most cases are asymptomatic heart patients; this is also due to class dominance to some extent.
*  Thus it can be said that it is difficult to tell about heart disease from patient symptoms alone.

### Max Heart Rate

In [None]:
# Max heart rate: both classes overlaid, then each class in its own panel.
f, ax = plt.subplots(1, 3, figsize=(18, 6), sharex=True, sharey=True)
max_hr_disease = df.loc[df['Disease'] == 'Yes', 'Max HR']
max_hr_healthy = df.loc[df['Disease'] == 'No', 'Max HR']
no_disease_colour = sns.color_palette("rocket")[-1]

sns.histplot(data=df, x='Max HR', hue='Disease', element='step', ax=ax[0])
sns.histplot(x=max_hr_disease, element='step', ax=ax[1])
sns.histplot(x=max_hr_healthy, element='step', ax=ax[2], color=no_disease_colour)
ax[1].legend(['Disease'])
ax[2].legend(['No Disease']);

* It may seem unusual that a higher heart rate goes with fewer heart problems, but max HR depends on age, and these high heart rates come from younger people.

In [None]:
# Median age per max-heart-rate band, split by disease status.
hr_bin_edges = [0, 160, 220]
hr_cat = pd.cut(df['Max HR'], bins=hr_bin_edges)
df.pivot_table(
    index=hr_cat,
    columns='Disease',
    values='Age',
    aggfunc='median',
)

* We can see that median age for 160 < Heart Rate < 220 is 48

### ST Depression

In [None]:
# Raw ST depression distribution, split by disease status.
sns.displot(
    data=df,
    x='ST Depression',
    hue='Disease',
);

In [None]:
# Reduce the skew of ST depression with log(1 + x).
# NOTE: the column is overwritten in place, so re-running this cell transforms it again.
st_depression_raw = df['ST Depression']
print(f"Skew Before: {st_depression_raw.skew()}")
df['ST Depression'] = np.log1p(st_depression_raw)
print(f"Skew After: {df['ST Depression'].skew()}")

In [None]:
# Stacked histogram of the log-transformed ST depression by disease status.
fig, ax = plt.subplots(figsize=(10, 5))
sns.histplot(data=df, x='ST Depression', hue='Disease', multiple='stack', ax=ax);

* Significant displacement of the ST segment is an indicator of heart disease.

### Age

In [None]:
# Age distribution by disease status.
fig, ax = plt.subplots(figsize=(10, 5))
sns.histplot(data=df, x='Age', hue='Disease', element='step', ax=ax);

* With an increase in age, the risk of heart-related problems also increases.
* The rise in heart disease in the 55 - 65 group could be due to the fact that people with an unhealthy lifestyle during their 30s and 40s may develop problems at this age.

### CA

In [None]:
# Remove the rows whose 'Ca' value is 4 (the feature is described as ranging 0-3).
invalid_ca_rows = df.loc[df['Ca'] == 4].index
df.drop(index=invalid_ca_rows, inplace=True)

In [None]:
# Number of coloured major vessels, split by disease status.
fig, ax = plt.subplots(figsize=(10, 5))
sns.countplot(data=df, x='Ca', hue='Disease', ax=ax);

* A higher count of coloured vessels, i.e. blocked/narrowed ones, means a greater risk of heart disease.

### Th Stress Test

In [None]:
# Remove the rows whose 'Th Stress Test' value is 0
# (presumably a placeholder for a missing result -- confirm against the dataset guide).
invalid_th_rows = df.loc[df['Th Stress Test'] == 0].index
df.drop(index=invalid_th_rows, inplace=True)

In [None]:
# Treat the thallium stress-test result as categorical and show its raw category codes.
df = df.astype({'Th Stress Test': 'category'})
df['Th Stress Test'].cat.categories

In [None]:
# Label the remaining stress-test codes in their sorted order.
# The `.cat.categories` setter was deprecated in pandas 1.5 and removed in 2.0;
# `rename_categories` is the supported replacement.
df['Th Stress Test'] = df['Th Stress Test'].cat.rename_categories(
    ['fixed defect','normal','reversable defect']
)

In [None]:
# Left: overall stress-test outcome counts. Right: outcomes within each disease class.
f, ax = plt.subplots(1, 2, sharey=True, figsize=(10, 5))
overall_ax, by_disease_ax = ax
sns.countplot(data=df, x='Th Stress Test', ax=overall_ax)
sns.countplot(data=df, x='Disease', hue='Th Stress Test', ax=by_disease_ax)
overall_ax.tick_params(axis='x', rotation=30);

*  If the thallium stress test is abnormal, there is a greater chance of heart disease.

### ST Segment Slope

In [None]:
# Treat the ST segment slope as categorical and show its raw category codes.
df = df.astype({'ST Seg Slope': 'category'})
df['ST Seg Slope'].cat.categories

In [None]:
# Label the slope codes in their sorted order.
# The `.cat.categories` setter was deprecated in pandas 1.5 and removed in 2.0;
# `rename_categories` is the supported replacement.
df['ST Seg Slope'] = df['ST Seg Slope'].cat.rename_categories(
    ['downsloping', 'flat','upsloping']
)

In [None]:
# ST segment slope counts, split by disease status.
fig, ax = plt.subplots(figsize=(10, 5))
sns.countplot(data=df, x='ST Seg Slope', hue='Disease', ax=ax);

### Collinear Features
> ST Depression & ST Seg Slope

In [None]:
# ST depression per slope type: overall (left) and split by disease status (right).
f, ax = plt.subplots(1, 2, figsize=(18, 6))
overall_ax, by_disease_ax = ax
sns.boxplot(data=df, x='ST Seg Slope', y='ST Depression', ax=overall_ax)
sns.boxplot(data=df, x='ST Seg Slope', y='ST Depression', hue='Disease', ax=by_disease_ax);

*  From this we can note that if a person has a downsloping ST segment, then heart disease can be decided using ST depression (the log1p-transformed values plotted here).
> If Depression < 1.25, the person has a greater chance of no heart disease.

   > If Depression > 1.25, the person has a greater chance of heart disease.

# Feature Engineering

*Nominal Categorical Columns*
* Sex
* Fasting BS
* Slope

*Ordinal Categorical Columns*

* Chest Pain
* Rest ECG
* Th Stress Test
* Ca

In [None]:
from sklearn.preprocessing import OneHotEncoder,StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin

In [None]:
# Apply to `data` the same row filter the exploration applied to `df`:
# discard rows with Ca == 4 or Th Stress Test == 0.
invalid_rows = data.query('Ca == 4 or `Th Stress Test` == 0').index
data.drop(index=invalid_rows, inplace=True)

In [None]:
# Nominal features: one-hot encode each category into its own column.
nominal_pip = make_pipeline(OneHotEncoder())

In [None]:
class ReorderOrdinalFeatures(BaseEstimator, TransformerMixin):
    """Re-code ordinal columns so that 0 means "normal" and larger values mean more abnormal.

    Parameters
    ----------
    ecg : bool, default=True
        Re-code the 'Resting ECG' column.
    th : bool, default=True
        Re-code the 'Th Stress Test' column.
    """

    # raw value -> ordinal rank
    # 'Resting ECG' raw values: 0 = prob. left ven. hypertrophy, 1 = normal,
    # 2 = ST-T wave abnormality (2 is already in place, so it is not listed).
    ECG_ORDER = {0: 1, 1: 0}
    # 'Th Stress Test' raw values once the invalid 0 rows are dropped:
    # 1 = fixed defect, 2 = normal, 3 = reversable defect.
    # The previous mapping {1:0, 0:2, 2:1} assumed values 0-2 and therefore produced
    # fixed=0, normal=1, reversable=3 instead of the intended fixed=2, normal=0, reversable=1.
    TH_ORDER = {1: 2, 2: 0, 3: 1}

    def __init__(self,ecg=True,th=True):
        self.ecg = ecg
        self.th = th

    def fit(self,X,y=None):
        """Nothing to learn; present for scikit-learn API compatibility."""
        return self

    def transform(self,X):
        """Return a re-coded copy of X; the caller's frame is left untouched."""
        # Copy first: assigning into the frame received from ColumnTransformer would
        # mutate the caller's data.
        X = X.copy()
        if self.ecg:
            X['Resting ECG'] = X['Resting ECG'].replace(self.ECG_ORDER)

        if self.th:
            X['Th Stress Test'] = X['Th Stress Test'].replace(self.TH_ORDER)
        return X

In [None]:
# Ordinal features: only re-order the codes, no further encoding or scaling.
ordianl_pip = make_pipeline(ReorderOrdinalFeatures())

In [None]:
# Numerical features: standardise to zero mean and unit variance.
numarical_pip = make_pipeline(StandardScaler())

In [None]:
# Every feature has to appear in exactly one of these lists: a ColumnTransformer with the
# default remainder='drop' discards any column that is not named.
# 'Exercise Angina' was missing from all three lists and was therefore silently dropped,
# although the correlation analysis ranks it among the strongest predictors. It is a
# binary flag, so it is added to the nominal group (appended to keep the existing order).
nominal_att = ['Sex','Fasting BS', 'ST Seg Slope', 'Exercise Angina']
ordinal_att = ['Chest Pain','Resting ECG','Th Stress Test','Ca']
numarical_att = ['Age','ST Depression','Max HR','Cholestrol','Resting BP']

In [None]:
# One (name, transformer, columns) triple per feature group.
transformer_steps = [
    ('nom_cat', nominal_pip, nominal_att),
    ('ord_cat', ordianl_pip, ordinal_att),
    ('numarical', numarical_pip, numarical_att),
]
full_pipeline = ColumnTransformer(transformer_steps)

In [None]:
# Features: every column except the target.
X = data.drop(columns='Disease')
# `data` still holds the raw numeric target, so the previous
# `.replace({"YES":1,'NO':0})` matched nothing and left the codes unchanged.
# The exploration labels code 0 as 'Yes' (disease) and code 1 as 'No', so swap the
# codes to get the intended encoding: 1 = disease, 0 = no disease.
y = data['Disease'].replace({0: 1, 1: 0})

In [None]:
# Fit all three sub-pipelines on X and build the model-ready feature matrix.
X_prep = full_pipeline.fit_transform(X)

In [None]:
# Preview the transformed features (wrapped in a DataFrame only for display; columns are positional).
pd.DataFrame(X_prep).head()