In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
#import required library

In [None]:
#reading the dataset

df = pd.read_csv('../input/heart-disease-dataset-uci/HeartDiseaseTrain-Test.csv',index_col=False)
df

# EDA

In [None]:
df.info()

## Initial Observation

- No null data

- A mix of numerical and categorical data

In [None]:
##pairplot for the numerical variables
sns.pairplot(data=df,hue='target')
plt.show()

In [None]:
# Numerical feature columns; this list is reused by later cells.
num = ['age','resting_blood_pressure' ,'cholestoral','Max_heart_rate','oldpeak']

# One panel per numerical column. The panel count follows len(num) so the grid
# stays in sync if the list changes; squeeze=False always returns a 2-D array
# of axes, so ravel() also works when there is only one column.
fig, ax = plt.subplots(1, len(num), figsize=(20, 15), squeeze=False)
ax = ax.ravel()

# Distribution of each numerical feature, split by target class.
for index, col in enumerate(num):
    sns.boxplot(x='target', y=col, data=df, ax=ax[index])

## Numerical Values


### From Pairplot
- There seems to be a pretty obvious difference when it comes to heart rate and target -> *higher heart rates seem to be associated with more heart disease*

- Seems for oldpeak, most heart disease that occur are concentrated around 0

- Other factors do not seem to provide much explanatory values

### From boxplots
- Older age median surprisingly results in less heart disease

- Low oldpeak seems to be more common with heart disease

- heart disease seems more common with higher max heart rate

- No significant difference when it comes to resting blood pressure and cholesterol

### Going forward

- Some insights may be derived if we split blood pressure and cholesterol into high, medium and low according to medical guidelines

## Categorical Values

- Looking at the pairplot itself, the numerical variables do not seem to explain much, with the exception of oldpeak and max heart rate

- Let's have a look at categorical values

In [None]:
df.info()

In [None]:
cat = ['sex','chest_pain_type','fasting_blood_sugar','rest_ecg', 'exercise_induced_angina','slope','vessels_colored_by_flourosopy','thalassemia']
df[cat]

In [None]:
for col in cat:
    print(f'For {col}, the unique values are: {df[col].unique()}')
    print('\n')
    
#to see each unique value in each categorical variable

In [None]:
# Two panels per row; the row count is derived from len(cat) by ceiling
# division so the grid always has room for every categorical column.
n_cols = 2
n_rows = (len(cat) + n_cols - 1) // n_cols
fig, ax = plt.subplots(n_rows, n_cols, figsize=(20, 15), squeeze=False)
ax = ax.ravel()

# Count of each category value, split by target class.
for index, col in enumerate(cat):
    sns.countplot(x=col, hue='target', data=df, ax=ax[index])

# Hide the leftover panel when the number of categorical columns is odd.
for spare_ax in ax[len(cat):]:
    spare_ax.set_visible(False)

## Insights from categorical values

- Female seems more at risk of heart disease

- Slight increased risk of heart disease when fasting blood sugar is lower than 120mg/ml

- Increased risk of heart disease when exercise induced angina is not present

- Highest risk of heart disease when vessels colored by flourosopy is zero

- Typical angina has the lowest risk of heart disease

- ST-T wave abnormality presents the highest heart disease risk

- Downsloping and flat represents the highest and lowest risk of heart disease respectively

- Fixed defect and reverseable defect of thalassemia presents the highest and lowest risk of heart disease respectively

## Advanced EDA

- See if we can extract more insights from some of the numerical variables when we categorize them as low, medium and high

In [None]:
sns.displot(data=df,x='cholestoral',kde=True)
plt.show()

## Cholesterol Research

### According to https://www.medicalnewstoday.com/articles/315900:


*Cholesterol levels for adults*

- *Total cholesterol levels less than 200 milligrams per deciliter (mg/dL) are considered desirable for adults. A reading between 200 and 239 mg/dL is considered borderline high and a reading of 240 mg/dL and above is considered high.*

Assuming the column is indeed referring to total cholesterol levels.  Let's try to explore this in relation to heart disease

In [None]:
def chol_level(x):
    """Bucket a total-cholesterol reading (mg/dL) into an ordinal level.

    The cut-offs follow the guideline quoted in this notebook: below 200 is
    desirable, 200-239 is borderline high, and 240 and above is high. Both
    boundaries are inclusive on the upper bucket, so a reading of exactly
    200 is borderline and exactly 240 is high.

    Parameters
    ----------
    x : number
        Total cholesterol reading.

    Returns
    -------
    int
        2 for high, 1 for borderline high, 0 for desirable.
    """
    if x >= 240:
        return 2 #high
    elif x >= 200:
        return 1 #medium (borderline high)
    else:
        return 0 #desirable

In [None]:
df['cholestoral_level'] = df['cholestoral'].apply(chol_level)
df

In [None]:
df[['cholestoral_level','cholestoral']]

#looks like we have what we want

In [None]:
sns.countplot(x='cholestoral_level',hue='target',data=df)
plt.title('Breakdown by cholestoral level')
plt.show()

Seems that higher cholestoral does make you less likely to have heart disease

## Blood pressure research

### According to https://www.cdc.gov/bloodpressure/about.htm:

- Normal 	systolic: less than 120 mm Hg

- At Risk (prehypertension) 	systolic: 120–139 mm Hg	

- High Blood Pressure (hypertension) 	systolic: 140 mm Hg or higher

In [None]:
sns.displot(data=df,x='resting_blood_pressure',kde=True)
plt.show()

In [None]:
df['resting_blood_pressure'].describe()

In [None]:
def get_blood_level(x):
    """Bucket a systolic blood-pressure reading (mm Hg) into an ordinal level.

    The cut-offs follow the ranges quoted in this notebook: below 120 is
    normal, 120-139 is at risk (prehypertension), and 140 or higher is high
    (hypertension). Both boundaries are inclusive on the upper bucket, so a
    reading of exactly 120 is at risk and exactly 140 is high.

    Parameters
    ----------
    x : number
        Systolic blood-pressure reading.

    Returns
    -------
    int
        2 for high, 1 for at risk, 0 for normal.
    """
    if x >= 140:
        return 2 #high
    elif x >= 120:
        return 1 #at risk
    else:
        return 0 #normal

In [None]:
df['blood_pressure_level'] = df['resting_blood_pressure'].apply(get_blood_level)
df[['blood_pressure_level','resting_blood_pressure']]

In [None]:
sns.countplot(x='blood_pressure_level',hue='target',data=df)
plt.title('Breakdown by blood pressure level')
plt.show()

Interestingly, it seems that for those with low blood pressure, there are proportionally more occurrences of heart disease than for those with high blood pressure levels

# Classification Problem

There are some classification models we can consider using:

- Ensemble of trees (RandomForestClassifier).  From our EDA, there does not seem to be a strong trend among each of the variables.  D.Tree may help us discover some of the rules.

- Linear SVC

- kNearestNeighbor

## Workflow

For each of the models, we will split the data into 70/15/15 for train, test and validation (as stated in the task).  We will use predict_proba and a custom function to determine which probability threshold to classify at.  My main metric will be accuracy, while keeping recall in mind (since we want to capture all those at risk of heart disease)

In [None]:
#import required library

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

from sklearn.model_selection import GridSearchCV

from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC

from sklearn.preprocessing import LabelEncoder #for kNN
from sklearn.preprocessing import StandardScaler #for kNN
from sklearn.preprocessing import MinMaxScaler #for SVC

## kNN

In [None]:
cat

In [None]:
num

In [None]:
le = LabelEncoder()
scaler = StandardScaler()

In [None]:
df_knn = df.copy()

In [None]:
# Standardize the numerical columns of the kNN working copy in place.
# NOTE(review): the scaler is fitted on every row of df_knn, and the
# train/test/validation split only happens in a later cell, so the scaling
# statistics also include the held-out rows. Consider fitting the scaler on
# the training rows only and applying it to the other two splits.
scaler.fit(df_knn[num].values)
df_knn[num] = scaler.transform(df_knn[num].values)

#scaled numerical values

In [None]:
# Replace each categorical column with integer codes. The single shared
# encoder is re-fitted on every column, so afterwards it only remembers the
# classes of the last column processed.
for col in cat:
    df_knn[col] = le.fit_transform(df_knn[col].values)

In [None]:
X_features = ['age',
              'sex',
              'chest_pain_type',
              'resting_blood_pressure',
              'cholestoral',
              'fasting_blood_sugar',
              'rest_ecg',
              'Max_heart_rate',
              'exercise_induced_angina',
              'oldpeak',
              'slope',
              'vessels_colored_by_flourosopy',
              'thalassemia']

y_outcome = 'target'

X = df_knn[X_features]
y = df_knn[y_outcome]

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3,random_state=42)
X_test, X_validate, y_test, y_validate = train_test_split(X_test,y_test,test_size=0.5,random_state=42)

In [None]:
print(X_train.shape)
print(X_test.shape)
print(X_validate.shape)

In [None]:
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)
print(f'Default parameter accuracy score {accuracy_score(y_test, y_pred)}')
print(f'\nConfusion Matrix:\n{confusion_matrix(y_test,y_pred)}')

In [None]:
#hyperparameter tuning
knn = KNeighborsClassifier()

param = {'n_neighbors':[1,2,3,6,9,12,15],
        }

cv = GridSearchCV(estimator=knn, param_grid=param, scoring='accuracy',
                  verbose=5,cv=5,n_jobs=-1)

cv.fit(X_train, y_train)


In [None]:
print(cv.best_params_)
print(cv.best_score_)

In [None]:
y_pred = cv.predict(X_validate)
print(f'Tuned parameter accuracy score {accuracy_score(y_validate, y_pred)}')
print(f'\nConfusion Matrix:\n{confusion_matrix(y_validate,y_pred)}')

With n_neighbors = 1, the model yields the best result of 96.7% accuracy.  Looking at the confusion matrix, there are no false negatives, only false positives.  Let's see what some more advanced models give us

## Ensemble trees

Tree models work well without scaling, so we can simply split our data and fit our model, but it may still be more appropriate to encode the categorical variables

In [None]:
cat

In [None]:
df_rf = df.copy()
df_rf

In [None]:
# Replace each categorical column of the random-forest working copy with
# integer codes, re-fitting the shared encoder on one column at a time.
for col in cat:
    df_rf[col] = le.fit_transform(df_rf[col].values)

In [None]:
df_rf

In [None]:
X_features = ['age',
              'sex',
              'chest_pain_type',
              'resting_blood_pressure',
              'cholestoral',
              'fasting_blood_sugar',
              'rest_ecg',
              'Max_heart_rate',
              'exercise_induced_angina',
              'oldpeak',
              'slope',
              'vessels_colored_by_flourosopy',
              'thalassemia']

y_outcome = 'target'

X = df_rf[X_features]
y = df_rf[y_outcome]

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3,random_state=42)
X_test, X_validate, y_test, y_validate = train_test_split(X_test,y_test,test_size=0.5,random_state=42)

In [None]:
# Baseline forest: 50 trees, about sqrt(n_features) candidate features per split.
# int(...) guarantees an integer feature count whatever type round() returns
# for a NumPy float (a float max_features would be read as a fraction).
# random_state pins the forest's randomness so the baseline score below is
# reproducible on a fresh run (same seed as the data split).
rf = RandomForestClassifier(n_estimators=50,
                            max_features=int(round(np.sqrt(len(X_features)))),
                            random_state=42)

In [None]:
rf.fit(X_train,y_train)
y_pred = rf.predict(X_test)
print(f'Default parameter accuracy score {accuracy_score(y_test, y_pred)}')
print(f'\nConfusion Matrix:\n{confusion_matrix(y_test,y_pred)}')

Even without much tuning, our RandomForestClassifier beats the kNN model

In [None]:
#hyperparameter tuning
# Base estimator for the search. It gets its own name so that the earlier kNN
# model is not overwritten by a random forest, and a fixed seed so that the
# selected parameters and scores are reproducible on a fresh run.
rf_base = RandomForestClassifier(random_state=42)

# Search space: number of trees, features considered per split, and tree depth.
param = {'n_estimators':[50,75,100,125],
         'max_features':[3,4,5,6,8,10],
         'max_depth':[5,10,15,20,25,30]
        }

# Exhaustive 5-fold cross-validated search over the grid, scored on accuracy.
cv = GridSearchCV(estimator=rf_base, param_grid=param, scoring='accuracy',
                  verbose=5,cv=5,n_jobs=-1)

cv.fit(X_train, y_train)

In [None]:
print(cv.best_params_)
print(cv.best_score_)

In [None]:
y_pred = cv.predict(X_validate)
print(f'Tuned parameter accuracy score {accuracy_score(y_validate, y_pred)}')
print(f'\nConfusion Matrix:\n{confusion_matrix(y_validate,y_pred)}')

In [None]:
y_pred = cv.predict_proba(X_validate)
y_pred_1_proba = y_pred[:,1]
y_pred_1_proba

## Probability Threshold adjustment

The above is our baseline result with the best parameters. Maybe we can use predict_proba and a function that outputs a class prediction according to some threshold to further improve the score

In [None]:
#function to get class based on probability
def get_y_pred(p, proba_pred):
    """Turn positive-class probabilities into 0/1 labels using cut-off ``p``.

    A position is labelled 1 only when its probability is strictly greater
    than ``p``; otherwise it is labelled 0. ``proba_pred`` is read by integer
    position (0 .. len-1) and the labels are returned as a pandas Series.
    """
    labels = [1 if proba_pred[pos] > p else 0
              for pos in range(len(proba_pred))]
    return pd.Series(labels)

In [None]:
prob = [0.1,0.3,0.4,0.6,0.75,0.8,0.875]

for p in prob:
    y_pred = get_y_pred(p,y_pred_1_proba)
    print(f'At P = {p}, Accuracy score = {accuracy_score(y_validate,y_pred)}')
    print(confusion_matrix(y_validate,y_pred))
    print('\n')

## 100% accuracy
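At 0.3 probability, we get 100% accuracy. Note that this threshold was picked by looking at the validation set itself, so the figure is optimistic; checking it on data that was not used to choose the threshold would give a fairer estimate.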

## Feature Importance

In [None]:
# Reuse the forest that the grid search already refitted on X_train with the
# winning parameters (GridSearchCV refits the best estimator by default).
# Retyping the parameter values by hand goes stale as soon as the search
# selects a different combination, and would train a second, different forest.
rf = cv.best_estimator_ #best parameters

plt.barh(X_features, rf.feature_importances_)
plt.xlabel('Feature importance')
plt.title('Random forest feature importance')
plt.show()

This seems to confirm what we suspected during the EDA: most high-importance features are categorical in nature, with the exception of oldpeak and max heart rate

## LinearSVC

In [None]:
svc = LinearSVC()

In [None]:
df_svc = df.copy()

We used LabelEncoder before, now let's try dummies since SVC is more sensitive to the scale of the feature values

In [None]:
dum_df = pd.get_dummies(df_svc,columns=cat)
dum_df

In [None]:
# Pick the dummy columns by name: any column of dum_df that was not already a
# column of df_svc was created by get_dummies. This replaces the positional
# slice [8:], which only works while exactly eight non-categorical columns
# precede the dummies. Column order is preserved.
X_features = [col for col in dum_df.columns if col not in df_svc.columns] #saving dummies into new X_features

In [None]:
for col in num:
    X_features.append(col)

X_features #for svc

In [None]:
scaler = MinMaxScaler()

In [None]:
# Min-max scale the numerical columns of the dummy-encoded frame in place.
# NOTE(review): the scaler is fitted on every row of dum_df, and the
# train/test/validation split only happens in a later cell, so the min/max
# used for scaling also come from the held-out rows. Consider fitting the
# scaler on the training rows only and applying it to the other two splits.
scaler.fit(dum_df[num].values)
dum_df[num] = scaler.transform(dum_df[num].values)

In [None]:
dum_df[X_features]

In [None]:
X = dum_df[X_features]
y = dum_df[y_outcome]

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3,random_state=42)
X_test, X_validate, y_test, y_validate = train_test_split(X_test,y_test,test_size=0.5,random_state=42)

In [None]:
svc.fit(X_train,y_train)
y_pred = svc.predict(X_test)
print(f'Default parameter accuracy score {accuracy_score(y_test, y_pred)}')
print(f'\nConfusion Matrix:\n{confusion_matrix(y_test,y_pred)}')

In [None]:
#hyperparameter tuning

svc = LinearSVC()

param = {
    'C':[0.1,1,10,100,1000],
    'max_iter':[1000,2000]
}

cv = GridSearchCV(estimator=svc, param_grid=param, scoring='accuracy',
                  verbose=5,cv=5,n_jobs=-1)

cv.fit(X_train, y_train)

In [None]:
print(cv.best_params_)
print(cv.best_score_)

In [None]:
y_pred = cv.predict(X_validate)
print(f'Tuned parameter accuracy score {accuracy_score(y_validate, y_pred)}')
print(f'\nConfusion Matrix:\n{confusion_matrix(y_validate,y_pred)}')

Based on the warning and the results, it seems that SVC does not work very well with our problem

## Conclusion

- We have done some basic and advanced EDA to enhance our understanding of the problem

- We have tried various models, with the most success with the RandomForestClassifier, reaching 100% accuracy on validation