# Predicting Heart Disease

## Preparing Tools

In [None]:
# Import all the Tools

#Regular EDA(Exploratory Data Analysis) & Plotting Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

#Models from scikit learn
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

#Model Evaluation
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import precision_score, recall_score
from sklearn.metrics import f1_score, plot_roc_curve

## Load Data

In [None]:
df = pd.read_csv("heart-disease.csv")
df

FileNotFoundError: ignored

In [None]:
df.shape

In [None]:
# Data Exploration (Exploratory Data Analysis)
df.tail()

In [None]:
df["target"].value_counts()

In [None]:
df["target"].value_counts().plot(kind="bar");

In [None]:
df.info()

In [None]:
# Are there any missing Values
df.isna().sum()

In [None]:
df.describe()

## Heart Disease Frequency according to Sex

In [None]:
df["sex"].value_counts()

In [None]:
#Compare target column with sex column
pd.crosstab(df["target"], df["sex"])

In [None]:
pd.crosstab(df["target"],df["sex"]).plot(kind="bar", color=["salmon", "lightblue"],figsize=(10,8));
plt.title("Heart Disease with Sex related")
plt.legend(['Female', 'Male'])
plt.xlabel("0 = No Disease, 1 = Disease")
plt.xticks();

In [None]:
df.corr()

In [None]:
df.hist(figsize=(20,10), bins=50, color="red");

In [None]:
from pandas.plotting import scatter_matrix
attributes = ["target","thal","sex","chol","age"]
scatter_matrix(df[attributes], figsize=(20,10),color="black");

### Age vs Max Heart Rate for Heart Disease

In [None]:
# Create another Figure
plt.figure(figsize=(10,8))

# Row masks for the two classes (1 = Disease, 0 = No Disease)
has_disease = df.target == 1
no_disease = df.target == 0

#Scatter with Positive Examples
plt.scatter(df.age[has_disease], df.thalach[has_disease], c='salmon', label="Target=1");

#Scatter with Negative Examples
plt.scatter(df.age[no_disease], df.thalach[no_disease], c="Black", label="Target=0");

#Add Some Info
plt.title("Heart Disease in Function of Age and Max Heart Rate", fontstyle="italic", color="green")
plt.xlabel("age", fontstyle='oblique')
# Label matches the plotted column name (`thalach`)
plt.ylabel("thalach", fontstyle='oblique')
# Legend entries come from the label= given to each scatter call, so they
# cannot get out of step with the plotting order
plt.legend();

In [None]:
# Check distribution of age with histogram
df.age.plot.hist();

In [None]:
# Heart Disease frequency per Chest Pain Type
pd.crosstab(df['cp'],df['target'])

In [None]:
#make the crosstab more visual

pd.crosstab(df['cp'],df['target']).plot(kind='bar',figsize=(10,8));
plt.title('Heart Disease per chest pain', fontstyle='italic', color='red')
plt.xlabel('Chest Pain')
plt.ylabel('No. of People')
plt.xticks()
plt.legend(['No-Disease','Disease']);

In [None]:
#Correlation Heatmap

corr_matrix = df.corr()
fig , ax = plt.subplots(figsize=(10,8))
ax = sns.heatmap(corr_matrix,
                annot=True,
                linewidths=0.5,
                fmt='.2f')

## Modelling

In [None]:
df.head()

In [None]:
# Split data into train & test set
# Stratified on the combination of fbs and exang, so both splits keep the
# same mix of the two flags (80% train / 20% test, fixed seed).
from sklearn.model_selection import StratifiedShuffleSplit
splits = StratifiedShuffleSplit(n_splits=1, test_size = 0.2, random_state=42)

# split(X, y, groups) ignores `groups`, so passing exang as the third argument
# had no effect. Fold both columns into one stratification label instead.
# NOTE(review): stratifying on `target` is the more usual choice - confirm intent.
strata = df['fbs'].astype(str) + "_" + df['exang'].astype(str)

for train_index, test_index in splits.split(df, strata):
    # split() yields positional row numbers, so select with iloc rather than loc
    strat_train_set = df.iloc[train_index]
    strat_test_Set = df.iloc[test_index]

In [None]:
strat_train_set

In [None]:
strat_test_Set

In [None]:
X_train = strat_train_set.drop('target',axis=1)
X_train

In [None]:
Y_train = strat_train_set['target']
Y_train

In [None]:
X_test = strat_test_Set.drop('target',axis=1)
X_test

In [None]:
Y_test = strat_test_Set['target']
Y_test

In [None]:
#Put models in a dictionary
models= {"Logistic Regression":LogisticRegression(),
        "KNN":KNeighborsClassifier(),
        "Random Forest": RandomForestClassifier()}

#Create a function to fit & score models
def fit_and_score(models, X_train, X_test, Y_train, Y_test):
    """Fit every model on the training data and score it on the test data.

    models  : dict mapping a display name to an object exposing
              ``fit(X, y)`` and ``score(X, y)``
    X_train : training features
    X_test  : testing features
    Y_train : training labels
    Y_test  : testing labels

    Returns a dict mapping each model name to the value returned by that
    model's ``score(X_test, Y_test)``. The models are fitted in place.
    """
    # Seed NumPy's global RNG once, before any model is fitted
    np.random.seed(42)

    def _fit_then_score(estimator):
        # Train first, then evaluate on the held-out split
        estimator.fit(X_train, Y_train)
        return estimator.score(X_test, Y_test)

    # Dict order is preserved, so models are fitted in the order they were given
    return {label: _fit_then_score(estimator)
            for label, estimator in models.items()}

In [None]:
model_scores = fit_and_score(models, X_train, X_test, Y_train, Y_test)

In [None]:
model_scores

In [None]:
model_df = pd.DataFrame(model_scores, index=['accuracy'])
model_df.T.plot(kind='bar', color='red');

## Hyperparameter Tuning

In [None]:
# Tuning KNN

# One score per candidate n_neighbors, for the train and the test split
train_Scores, test_scores = [], []

#Candidate neighbour counts: 1 through 19
neighbours = range(1,20)
KNN = KNeighborsClassifier()

#Try every candidate on the same estimator instance
for k in neighbours:
    #Reconfigure the estimator, then refit it on the training split
    KNN.set_params(n_neighbors=k).fit(X_train, Y_train)

    #Record how this setting scores on both splits
    train_Scores.append(KNN.score(X_train,Y_train))
    test_scores.append(KNN.score(X_test,Y_test))

In [None]:
test_scores

In [None]:
plt.plot(neighbours, train_Scores, label='Train_Scores')
plt.plot(neighbours, test_scores, label='Test_Scores')
plt.xticks(np.arange(1,20,1))
plt.xlabel("Number of Neighbours")
plt.ylabel("Model Score")
plt.legend();

print(f"Max. KNN score {max(test_scores)*100:.2f}%")

**Randomised Search CV**

In [None]:
#Create a hyperparameter grid for Logistic Regression
# Not every solver supports every penalty, so the search space is a list of
# sub-grids that only contain valid solver/penalty pairs. RandomizedSearchCV
# accepts a list of dicts: it picks one dict, then samples from it.
_log_reg_common = {"C" : np.logspace(-4,4,20),
                   "class_weight":[None, 'balanced']}

log_reg_grid = [
    # newton-cg only handles the l2 penalty
    {**_log_reg_common,
     "solver":['newton-cg'],
     'penalty':['l2']},
    # liblinear and saga both handle l1 and l2
    {**_log_reg_common,
     "solver":['liblinear','saga'],
     'penalty':['l1','l2']},
    # elasticnet is saga-only and needs an explicit l1_ratio
    {**_log_reg_common,
     "solver":['saga'],
     'penalty':['elasticnet'],
     "l1_ratio":[0.25, 0.5, 0.75]},
]

#Create a hyperparameter grid for RandomForestClassifier
random_grid = {"n_estimators":np.arange(10,1000,100),
              "max_depth":[None,3,5,10],
              "min_samples_split":np.arange(2,20,2),
              "min_samples_leaf":np.arange(1,20,2)}


In [None]:
np.random.seed(42)

#Tune logistic Regression
rs_log_reg = RandomizedSearchCV(LogisticRegression(),
                               param_distributions=log_reg_grid,
                               cv=5,
                               verbose=True,
                               n_iter=20)

#Tune Random Forest
rs_random_reg = RandomizedSearchCV(RandomForestClassifier(),
                               param_distributions=random_grid,
                               cv=5,
                               verbose=True,
                               n_iter=20)

In [None]:
rs_log_reg.fit(X_train,Y_train)

In [None]:
rs_log_reg

In [None]:
rs_log_reg.best_params_

In [None]:
rs_log_reg.score(X_test,Y_test)

In [None]:
np.random.seed(42)
rs_random_reg.fit(X_train,Y_train)

In [None]:
rs_random_reg.best_params_

In [None]:
rs_random_reg.score(X_test,Y_test)

**Grid Search**

In [None]:
gs_random_reg = GridSearchCV(RandomForestClassifier(),
                            param_grid=random_grid,
                            cv=5,
                            verbose=2)

In [None]:
gs_random_reg.fit(X_train,Y_train)

In [None]:
gs_random_reg.best_params_

In [None]:
gs_random_reg.score(X_test,Y_test)

In [None]:
## Evaluating the best classifier
y_preds = rs_random_reg.predict(X_test)

In [None]:
plot_roc_curve(rs_random_reg,X_test,Y_test);

In [None]:
print(confusion_matrix(Y_test,y_preds))

In [None]:
sns.set(font_scale=1.5)
def plot_confusion(y_test,y_preds):
    """Draw the confusion matrix of y_test vs y_preds as an annotated heatmap.

    y_test  : true labels
    y_preds : predicted labels

    Creates a new 3x3-inch figure; nothing is returned.
    """
    # Open a fresh figure so the heatmap gets its own axes
    plt.subplots(figsize=(3,3))
    matrix = confusion_matrix(y_test,y_preds)
    # Cell counts are written on the tiles; the colour bar is hidden
    heat_ax = sns.heatmap(matrix, annot=True, cbar=False)
    heat_ax.set_xlabel("Predicted Label")
    heat_ax.set_ylabel("True Label")
    
plot_confusion(Y_test,y_preds)

In [None]:
print(classification_report(Y_test,y_preds))

## Calculate Evaluation Metrics Using Cross-Validation

In [None]:
clf = RandomForestClassifier(n_estimators= 610,
     min_samples_split= 4,
     min_samples_leaf= 11,
     max_depth= None)

In [None]:
#Cross Validated Accuracy
cv_acc = cross_val_score(clf,X_train,Y_train,cv=5,scoring="accuracy")
accuracy = np.mean(cv_acc)
accuracy

In [None]:
#Cross Validated Precision
cv_acc = cross_val_score(clf,X_train,Y_train,cv=5,scoring="precision")
precison = np.mean(cv_acc)
precison

In [None]:
#Cross Validated Recall
cv_acc = cross_val_score(clf,X_train,Y_train,cv=5,scoring="recall")
recall = np.mean(cv_acc)
recall

In [None]:
#Cross Validated f1_Score
cv_acc = cross_val_score(clf,X_train,Y_train,cv=5,scoring="f1")
f1_score = np.mean(cv_acc)
f1_score

In [None]:
#Visualise our Cross Validated Metrics
cv_metrics = pd.DataFrame({"Accuracy":accuracy,
                          "Precision":precison,
                          "Recall":recall,
                          "f1_score":f1_score},
                         index=[0])

cv_metrics

In [None]:
cv_metrics.T.plot.bar(title="Cross Validated Classification Metrics",legend=False);

## Feature Importance

In [None]:
clf.fit(X_train,Y_train)

In [None]:
clf.feature_importances_

In [None]:
# Pair each importance with the column the model was fitted on. X_train has
# `target` dropped, so the names line up one-to-one with feature_importances_
# (df.columns has one extra entry and only matched if `target` came last).
feature_dict = dict(zip(X_train.columns, clf.feature_importances_))

In [None]:
feature_dict

In [None]:
#Visualise Feature Importances
feature_df = pd.DataFrame(feature_dict,index=[0])
feature_df.T.plot(kind='bar',legend=False,color='black');


In [None]:
from joblib import dump

#Save Model to File
dump(clf, filename="Heart_Disease-Model.joblib")