In [None]:
# Import Libraries needed
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as st
import os
import time

In [None]:
# Load the heart-disease records from the CSV in the working directory
df = pd.read_csv(filepath_or_buffer='heart.csv')

In [None]:
# Preview the first five rows (5 is the default for head())
df.head()

In [None]:
# Preliminary Analysis
# 1. Dataset information: column names, non-null counts and dtypes
df.info()

In [None]:
# 2. Data description: summary statistics (count, mean, std, quartiles, min/max)
#    for the numeric columns
df.describe()

In [None]:
# 3. Count missing values per column (isna is the canonical name of isnull)
df.isna().sum()

In [None]:
# 4. Number of rows that exactly repeat an earlier row
df.duplicated(keep='first').sum()

In [None]:
## Segregating Columns into groups
# Numerical   -> Age, RestingBP, Cholesterol, MaxHR, Oldpeak, HeartDisease, FastingBS
#                (HeartDisease is the prediction target; FastingBS is a 0/1 flag)
# Categorical -> Sex, ChestPainType, RestingECG, ExerciseAngina, ST_Slope
df.dtypes

In [None]:
# 1. Analysis on Age
# Summary statistics for the Age column
df['Age'].describe()

In [None]:
# Age distribution: histogram on the left, KDE with rug marks on the right
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(12, 5))
hist_ax, kde_ax = ax

sns.histplot(data=df, x='Age', color='#f7d794', edgecolor='black', linewidth=1, ax=hist_ax)
hist_ax.set_title('Age Distribution - Histogram')

sns.kdeplot(data=df, x='Age', color='#e15f41', edgecolor='black', fill=True, ax=kde_ax)
sns.rugplot(data=df, x='Age', color='#574b90', ax=kde_ax)
kde_ax.set_title('Age Distribution - KDE Plot')

plt.show()

In [None]:
# Age skewness (0 = symmetric; negative = longer left tail, positive = longer right tail)
df['Age'].skew()

In [None]:
## Checking for outliers: box plot with the individual observations overlaid
fig, ax = plt.subplots(figsize=(10, 3))
sns.boxplot(data=df, x='Age', color='#7ed6df', ax=ax)
sns.stripplot(data=df, x='Age', color='#be2edd', size=3, linewidth=0.6, ax=ax)
ax.set_title('Age Distribution', fontsize=14)
fig.tight_layout()
plt.show()


In [None]:
#####Conclusions:
#1.No missing values
#2.Data is almost normal
#3.No outliers

In [None]:
# 2. Analysis on RestingBP
# Summary statistics for the resting blood pressure column
df['RestingBP'].describe()

In [None]:
# Resting blood pressure distribution: histogram (left) and KDE with rug marks (right)
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(12, 5))
hist_ax, kde_ax = ax

sns.histplot(data=df, x='RestingBP', color='#D980FA', edgecolor='black', linewidth=1, ax=hist_ax)
hist_ax.set_title('Resting Blood Pressure Distribution - Histogram')

sns.kdeplot(data=df, x='RestingBP', color='#9980FA', edgecolor='black', fill=True, ax=kde_ax)
sns.rugplot(data=df, x='RestingBP', color='#574b90', ax=kde_ax)
kde_ax.set_title('Resting Blood Pressure Distribution - KDE Plot')

plt.show()

In [None]:
# RestingBP skewness (0 = symmetric; the sign gives the direction of the longer tail)
df['RestingBP'].skew()

In [None]:
## Checking for outliers: box plot with the individual observations overlaid
fig, ax = plt.subplots(figsize=(15, 3))
sns.boxplot(data=df, x='RestingBP', color='#ED4C67', ax=ax)
sns.stripplot(data=df, x='RestingBP', color='#833471', size=3, linewidth=0.6, ax=ax)
ax.set_title('Resting Blood Pressure Distribution', fontsize=14)
fig.tight_layout()
plt.show()

In [None]:
#####Conclusions:
#1.No missing values
#2.Almost normal distribution
#3.There are some outliers

In [None]:
# 3. Analysis on Cholesterol
# Summary statistics for the Cholesterol column
df['Cholesterol'].describe()

In [None]:
# Cholesterol distribution: histogram (left) and KDE with rug marks (right)
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(12, 5))
hist_ax, kde_ax = ax

sns.histplot(data=df, x='Cholesterol', color='#EA2027', edgecolor='black', linewidth=1, ax=hist_ax)
hist_ax.set_title('Cholesterol Distribution - Histogram')

sns.kdeplot(data=df, x='Cholesterol', color='#F79F1F', edgecolor='black', fill=True, ax=kde_ax)
sns.rugplot(data=df, x='Cholesterol', color='#FFC312', ax=kde_ax)
kde_ax.set_title('Cholesterol Distribution - KDE Plot')

plt.show()


In [None]:
# Cholesterol skewness (0 = symmetric; the sign gives the direction of the longer tail)
df['Cholesterol'].skew()

In [None]:
## Checking for outliers: box plot with the individual observations overlaid
fig, ax = plt.subplots(figsize=(15, 3))
sns.boxplot(data=df, x='Cholesterol', color='#C4E538', ax=ax)
sns.stripplot(data=df, x='Cholesterol', linewidth=0.6, size=3, color='#009432', ax=ax)
# Title spelling corrected ("Cholestrol" -> "Cholesterol")
ax.set_title('Cholesterol Distribution', fontsize=14)
plt.tight_layout()
plt.show()

In [None]:
## Exploring the outliers
# Each comparison needs its own parentheses: `&` binds tighter than `==`,
# so `a & b == 1` is evaluated as `(a & b) == 1`.
zero_chol_with_disease = df[(df['Cholesterol'] == 0) & (df['HeartDisease'] == 1)]
very_high_chol = df[df['Cholesterol'] > 450]

# A cell renders only its last expression, so show both tables explicitly.
display(zero_chol_with_disease)
display(very_high_chol)

In [None]:
#####Conclusions:
#No missing values
#This follows a Bimodal distribution
#Often, high cholesterol is related to the risk of heart disease. But in our data, even people with a recorded cholesterol of 0 (and otherwise normal parameters) have heart disease - this could be an error in entering values (for example, missing measurements recorded as 0).
#On the other side of the spectrum, values above 400 are high, but they are not necessarily outliers because the range goes as high as 800. These people also have heart disease together with high cholesterol, so these values seem plausible.

In [None]:
# 4. Analysis on Max Heart Rate
# Summary statistics for the MaxHR column
df['MaxHR'].describe()

In [None]:
# Max heart rate distribution: histogram (left) and KDE (right)
fig, ax = plt.subplots(1, 2, figsize=(12, 5))

sns.histplot(data=df, x='MaxHR', linewidth=1, edgecolor='black', ax=ax[0], color='#00a8ff')
# Title capitalised to match the 'Max Heart Rate Distribution' box-plot title
ax[0].set(title='Max Heart Rate Distribution - Histogram')

sns.kdeplot(data=df, x='MaxHR', fill=True, edgecolor='black', ax=ax[1], color='#9c88ff')
ax[1].set(title='Max Heart Rate Distribution - KDE Plot')

plt.show()

In [None]:
# MaxHR skewness (0 = symmetric; the sign gives the direction of the longer tail)
df['MaxHR'].skew()

In [None]:
## Checking for outliers: box plot with the individual observations overlaid
fig, ax = plt.subplots(figsize=(15, 3))
sns.boxplot(data=df, x='MaxHR', color='#52b69a', ax=ax)
sns.stripplot(data=df, x='MaxHR', color='#d9ed92', size=3, linewidth=0.6, ax=ax)
ax.set_title('Max Heart Rate Distribution', fontsize=14)
fig.tight_layout()
plt.show()

In [None]:
# Rows whose maximum heart rate is below 70
df[df['MaxHR'] < 70]

In [None]:
###Conclusions:
#1.No null values
#2.Almost normally distributed
#3.There are a handful of outliers
#4.Most people have a max heart rate of 150, which seems okay as most of the people in this dataset are aged between 50 - 65

In [None]:
#### Univariate Analysis on Categorical Features

## Analysis on Heart Disease
# Name both columns explicitly: the names produced by
# value_counts().reset_index() differ between pandas 1.x ('index', '<col>')
# and pandas 2.x ('<col>', 'count'), so relying on them breaks on upgrade.
heart_disease_counts = (
    df['HeartDisease']
    .value_counts()
    .rename_axis('HeartDisease')
    .reset_index(name='Count')
)

fig, ax = plt.subplots(figsize=(6, 4.5))
sns.barplot(data=heart_disease_counts, x='HeartDisease', y='Count',
            linewidth=1, edgecolor='black', palette='Set3', ax=ax)
ax.set(title='Count of Heart Disease Vs No Heart Disease', ylabel='Count', xlabel='Heart Disease')
plt.tight_layout()
plt.show()

In [None]:
##Conclusions:
# A higher number of patients in the dataset have risk of suffering from a heart disease

In [None]:
## Analysis on Sex
# Name both columns explicitly: the names produced by
# value_counts().reset_index() differ between pandas 1.x and 2.x.
sex_counts = (
    df['Sex']
    .value_counts()
    .rename_axis('Sex')
    .reset_index(name='Count')
)

fig, ax = plt.subplots(figsize=(6, 4.5))
sns.barplot(data=sex_counts, x='Sex', y='Count',
            linewidth=1, edgecolor='black', palette='Set2', ax=ax)
ax.set(title='Count of Male Vs Female', ylabel='Count', xlabel='Sex')
plt.tight_layout()
plt.show()

In [None]:
####Conclusions:
# The dataset predominantly consists of records from males, outnumbering female records.
# The male records significantly outnumber the female records in this dataset, with a ratio of more than three to one.
# It is important to be mindful of this gender imbalance in our dataset to avoid introducing bias when making assumptions or drawing conclusions.

In [None]:
## Analysis on Chest Pain Type

### ASY : ASY = "Asymptomatic" chest pain. It indicates that the individuals in the dataset did not experience any noticeable chest pain symptoms. 
#They may have been included in the dataset for other reasons, such as a medical examination or routine check-up.

### NAP : NAP = "Non-Anginal Pain" chest pain. Non-anginal pain refers to chest discomfort or pain that is not related to a reduced blood supply to the heart. 
#It is typically not caused by underlying heart disease but may still be a cause of concern and require further evaluation.

### ATA : ATA = "Atypical Angina" chest pain. Atypical angina refers to chest pain that does not fit the typical pattern of symptoms associated with angina. 
#It may have different characteristics or be triggered by factors other than physical exertion or emotional stress.

### TA : TA stands for "Typical Angina" chest pain. Typical angina refers to chest pain that follows a predictable pattern and is commonly associated with coronary artery disease. 
#It is typically described as a squeezing or pressure-like sensation in the chest that is brought on by physical exertion or emotional stress and is relieved with rest or nitroglycerin medication.

# Plot
# Name both columns explicitly: the names produced by
# value_counts().reset_index() differ between pandas 1.x and 2.x.
chest_pain_counts = (
    df['ChestPainType']
    .value_counts()
    .rename_axis('ChestPainType')
    .reset_index(name='Count')
)

fig, ax = plt.subplots(figsize=(6, 4.5))
sns.barplot(data=chest_pain_counts, x='ChestPainType', y='Count',
            linewidth=1, edgecolor='black', palette='Set1', ax=ax)
ax.set(title='Count of Chest Pain Types', ylabel='Count', xlabel='Chest Pain Type')
plt.tight_layout()
plt.show()

In [None]:
#### Conclusions:
#The majority of individuals in the dataset did not report any noticeable symptoms of chest pain, this suggests that noticeable chest pain may not be the primary symptom associated with heart disease.
#We can be sure of this once we do the bivariate analysis of the Chest Pain Type and Heart Disease columns

In [None]:
## BIVARIATE ANALYSIS

#Bivariate analysis is the examination of the relationship between two variables. Here are three key points about bivariate analysis:

#1. Relationship Exploration: Bivariate analysis helps uncover patterns, associations, or connections between two variables. 
#It allows us to understand how changes in one variable correspond to changes in another variable.

#2. Correlation Assessment: Bivariate analysis helps us assess the strength and direction of the relationship between two variables. 
#It enables us to determine if the variables are positively, negatively, or not correlated at all.

#3. Visual Representation: Bivariate analysis often involves the use of visualizations such as scatter plots, line graphs, or heatmaps. 
#These visual representations provide a clear and concise way to understand the relationship between the two variables.

In [None]:
## Analysis between Heart Disease and Age:
# Which age group is most prone to heart disease?
# HeartDisease == 0 is the healthy group and == 1 the diseased group; the
# filters were previously swapped relative to their legend labels.
plt.figure(figsize=(6, 4))
sns.kdeplot(data=df[df['HeartDisease'] == 0], x='Age', fill=True, color='#0683c9', label='No Heart Disease')
sns.kdeplot(data=df[df['HeartDisease'] == 1], x='Age', fill=True, color='#f5e840', label='Has Heart Disease')
plt.legend()
plt.title('Age and Heart Disease')
plt.tight_layout()
plt.show()

In [None]:
#####Conclusions:
#1. Individuals under the age of 50 have a lower likelihood of experiencing heart disease.
#2. Once individuals reach the age of 50, the risk of developing heart disease significantly increases.
#3. The age group between 50 and 55 shows the highest susceptibility to heart disease.

In [None]:
## Analysis between Heart Disease and Gender:
# Which gender is most prone to heart disease?
# Contingency table: rows = HeartDisease, columns = Sex
temp_df = pd.crosstab(index=df['HeartDisease'], columns=df['Sex'])

fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(20, 7))
heat_ax, count_ax = ax

sns.heatmap(data=temp_df, annot=True, fmt='0g', cmap='summer_r',
            linewidths=0.01, linecolor='#95d5b2', ax=heat_ax)
heat_ax.set_title('Heatmap of Gender Vs Heart Disease')
heat_ax.set_ylabel('Heart Disease')

sns.countplot(data=df, x="Sex", hue="HeartDisease", palette='Set2', edgecolor='black', ax=count_ax)
count_ax.set_title('Countplot of Gender Vs Heart Disease')
count_ax.set_ylabel('Count')

fig.tight_layout()
plt.show()

In [None]:
#### Conclusions:
#1. The occurrence of heart disease is more prevalent among males compared to females.
#2. The dataset contains significantly more data for males than females, almost three times as much.
#3. However, when considering the percentages, approximately 25% of females in the dataset have heart disease, while a higher percentage of around 63% of males have heart disease.
# These observations highlight the gender disparity in the prevalence of heart disease, with a higher proportion of males being affected.

In [None]:
## Analysis between Heart Disease and Cholesterol:
# Does Cholesterol play a role in heart disease?

# HeartDisease == 0 is the healthy group and == 1 the diseased group; the
# filters were previously swapped relative to their legend labels.
plt.figure(figsize=(6, 4))
sns.kdeplot(data=df[df['HeartDisease'] == 0], x='Cholesterol', fill=True, color='#b5e48c', label='No Heart Disease')
sns.kdeplot(data=df[df['HeartDisease'] == 1], x='Cholesterol', fill=True, color='#CC1011', label='Has Heart Disease')
plt.legend()
# The title previously read 'Age and Heart Disease' (copy-paste from the Age cell)
plt.title('Cholesterol and Heart Disease')
plt.tight_layout()
plt.show()

In [None]:
### Conclusions:
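#Reference ranges (NOTE: the categories listed below are triglyceride / hypertriglyceridemia thresholds rather than total-cholesterol thresholds - TODO confirm which reference ranges apply to the Cholesterol column)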
## Normal: less than 150 mg/dL.
## Mild hypertriglyceridemia: 150 to 499 mg/dL.
## Moderate hypertriglyceridemia: 500 to 886 mg/dL.
## Very high or severe hypertriglyceridemia: greater than 886 mg/dL.


# The analysis reveals that individuals with cholesterol levels below 150 mg/dL are generally considered healthy.
# There are still a significant number of healthy individuals above this range, the number of individuals prone to heart disease is also substantially higher.
# These findings suggest that higher cholesterol levels do indeed pose an increased risk of heart disease.
## These observations support the notion that elevated cholesterol levels contribute to a higher likelihood of developing heart disease.

In [None]:
## Analysis between Heart Disease and Chest Pain Type:
# What does Chest Pain Type tell us about Heart Disease?
# Contingency table: rows = HeartDisease, columns = ChestPainType
temp_df = pd.crosstab(index=df['HeartDisease'], columns=df['ChestPainType'])
temp_df

In [None]:
# Heatmap of the contingency table (left) and grouped count plot (right)
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(20, 7))
heat_ax, count_ax = ax

sns.heatmap(data=temp_df, annot=True, fmt='0g', cmap='mako_r',
            linewidths=0.01, linecolor='#cae9ff', ax=heat_ax)
heat_ax.set_title('Heatmap of Chest Pain Type Vs Heart Disease')
heat_ax.set_ylabel('Heart Disease')

sns.countplot(data=df, x="ChestPainType", hue="HeartDisease", palette='Set2', edgecolor='black', ax=count_ax)
count_ax.set_title('Countplot of Chest Pain Type Vs Heart Disease')
count_ax.set_ylabel('Count')

fig.tight_layout()
plt.show()

In [None]:
###### Conclusions:
#1. A significant proportion of people who have heart disease do not experience any chest pain symptoms. This type of chest pain is called Asymptomatic Chest Pain.
#2. The next largest group of people with heart disease experience Atypical Angina, which means they may have chest pain or discomfort that differs from the typical chest pain associated with heart disease.
# These findings suggest that heart disease can manifest without the presence of chest pain. It implies that not all cases of heart disease are accompanied by chest pains, and individuals may have heart disease without experiencing typical chest pain symptoms

In [None]:
## Analysis between Heart Disease and Exercise Angina:
# Is exercise-induced angina associated with heart disease?

# Exercise Angina, also known as exertional angina or angina pectoris, refers to chest pain or discomfort that occurs during physical activity or exercise. 
# It is a symptom typically associated with coronary artery disease, which is a condition where the arteries that supply blood to the heart become narrowed or blocked.

# Contingency table: rows = HeartDisease, columns = ExerciseAngina
temp_df = pd.crosstab(index=df['HeartDisease'], columns=df['ExerciseAngina'])

In [None]:
# Heatmap of the contingency table (left) and grouped count plot (right)
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(20, 7))
heat_ax, count_ax = ax

sns.heatmap(data=temp_df, annot=True, fmt='0g', cmap='mako_r',
            linewidths=0.01, linecolor='#cae9ff', ax=heat_ax)
heat_ax.set_title('Heatmap of Exercise Angina Vs Heart Disease')
heat_ax.set_ylabel('Heart Disease')

sns.countplot(data=df, x="ExerciseAngina", hue="HeartDisease", palette='Set2', edgecolor='black', ax=count_ax)
count_ax.set_title('Countplot of Exercise Angina Vs Heart Disease')
count_ax.set_ylabel('Count')

fig.tight_layout()
plt.show()

In [None]:
## Analysis between Heart Disease and Fasting Blood Sugar:
# How is Fasting Blood Sugar related to Heart Disease?

# What is Fasting Blood Sugar? - Fasting blood sugar, also known as fasting blood glucose, refers to the level of glucose (sugar) in the bloodstream after an overnight fast. 
#It is typically measured in milligrams per deciliter (mg/dL) or millimoles per liter (mmol/L).
#Fasting blood sugar is an important parameter used in diagnosing and monitoring diabetes.
#It provides information about the body's ability to regulate blood glucose levels in the absence of food intake.
#Normally, when a person has fasted for at least 8 hours, the fasting blood sugar level should fall within a specific range.

##Note
#In this dataset 0 means normal Fasting Blood Sugar level and 1 means Abnormal Blood Sugar level

# Contingency table: rows = HeartDisease, columns = FastingBS
temp_df = pd.crosstab(index=df['HeartDisease'], columns=df['FastingBS'])
temp_df

In [None]:
# Heatmap of the FastingBS x HeartDisease contingency table
fig, ax = plt.subplots(figsize=(6, 4))

sns.heatmap(data=temp_df, annot=True, fmt='0g', cmap='summer_r',
            linewidths=0.01, linecolor='#cae9ff', ax=ax)
ax.set_title('Heatmap of Fasting Blood Sugar Vs Heart Disease')
ax.set_ylabel('Heart Disease')
ax.set_xlabel('Fasting Blood Sugar')
fig.tight_layout()
plt.show()

In [None]:
#####Conclusions:
# Based on the heatmap analysis, it is evident that individuals with normal fasting blood sugar levels are also susceptible to heart disease. 
# In fact, they exhibit a higher likelihood of developing heart disease compared to those with abnormal fasting blood sugar levels.
# This suggests that while fasting blood sugar levels can be a factor in the development or presence of heart disease, it is not the sole determining factor

In [None]:
# List the column names before preprocessing
df.columns

In [None]:
###Data Preprocessing 

## Label encoding of categorical features
from sklearn.preprocessing import LabelEncoder
# Replace each categorical column with integer codes (codes follow the sorted
# order of the column's unique values), then show the resulting codes.
# NOTE(review): integer codes impose an ordering on nominal categories such
# as ChestPainType - confirm this is acceptable for the distance/linear models.
features = ['Sex', 'ChestPainType', 'RestingECG', 'ExerciseAngina', 'ST_Slope']
le = LabelEncoder()
for feature in features:
    df[feature] = le.fit_transform(df[feature])
    print(feature, df[feature].unique())


In [None]:
# Re-check the dtypes after label encoding
df.dtypes

In [None]:
## Standardization
# Standardize the continuous variables
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

continuous_cols = ['Age', 'RestingBP', 'Cholesterol', 'MaxHR', 'Oldpeak']

# Fit the scaler on the training rows only. Fitting it on the whole frame lets
# the test rows influence the mean/std used for training (data leakage).
# The row positions are drawn with the same test_size / random_state as the
# train/test split performed later in the notebook, which reproduces the same
# partition for the same number of rows - keep the two in sync.
train_rows, _ = train_test_split(np.arange(len(df)), test_size=0.3, random_state=42)

scaler = StandardScaler()
scaler.fit(df.iloc[train_rows][continuous_cols])
df[continuous_cols] = scaler.transform(df[continuous_cols])

df.head(3)


In [None]:
## Separate dataset into train_test dataset
from sklearn import model_selection
# Features are every column except the HeartDisease target
X = df.drop(columns='HeartDisease')
y = df['HeartDisease']

# Hold out 30% of the rows for testing; the seed fixes the partition
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, test_size=0.3, random_state=42
)

In [None]:
# Report row and column counts of each partition
n_train, n_train_features = X_train.shape
n_test, n_test_features = X_test.shape
print(f'training data has {n_train} observation with {n_train_features} features')
print(f'test data has {n_test} observation with {n_test_features} features')


In [None]:
#### Model Training & Evaluation
## Build models
# Models for the project
from sklearn. ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB

# For confusion matrix
from sklearn import metrics, model_selection

# Stochastic estimators are seeded so that Restart & Run All reproduces the
# same scores; the deterministic ones keep their defaults.

# Logistic Regression
classifier_logistic = LogisticRegression()

# K Nearest Neighbors
classifier_KNN = KNeighborsClassifier()

# Random Forest
classifier_RF = RandomForestClassifier(random_state=42)

# Support Vector Classification (probability=True enables predict_proba)
classifier_SVC = SVC(probability=True, random_state=42)

# GB classifier
classifier_GB = GradientBoostingClassifier(random_state=42)

# Gaussian Naive Bayes
classifier_NB = GaussianNB()


In [None]:
## Logistic Regression classifier and evaluation
classifier_logistic.fit(X_train, y_train)  # Train model
# NOTE: predictions are made on the training data itself, so the confusion
# matrix and report below are resubstitution metrics, not held-out performance.
y_predict = classifier_logistic.predict(X_train)

# 10-fold cross-validated accuracy, reported as mean and mean +/- 1.96 * std
# of the fold scores (in percent)
scores = model_selection.cross_val_score(classifier_logistic, X_train, y_train, cv=10)
acc_mean = scores.mean() * 100
acc_margin = scores.std() * 100 * 1.96
print(f'For Logistic Regression Classifier, the acc is {acc_mean:.2f} '
      f'({acc_mean - acc_margin:.2f}, {acc_mean + acc_margin:.2f}) %')

# Confusion matrix
cm = metrics.confusion_matrix(y_train, y_predict)
plt.matshow(cm)
plt.colorbar()
plt.ylabel('True label')
plt.xlabel('Predicted label')
plt.show()  # was `plt.show` without parentheses, which never called it

print(metrics.classification_report(y_train, y_predict))

In [None]:
## KNN Classifier
classifier_KNN.fit(X_train, y_train)  # Train model
# NOTE: predictions are made on the training data itself, so the confusion
# matrix and report below are resubstitution metrics, not held-out performance.
y_predict = classifier_KNN.predict(X_train)

# 10-fold cross-validated accuracy, reported as mean and mean +/- 1.96 * std
# of the fold scores (in percent). Formatting the bounds directly avoids the
# float artefacts of adding two rounded numbers.
scores = model_selection.cross_val_score(classifier_KNN, X_train, y_train, cv=10)
acc_mean = scores.mean() * 100
acc_margin = scores.std() * 100 * 1.96
print(f'For KNN, the acc is {acc_mean:.2f} '
      f'({acc_mean - acc_margin:.2f}, {acc_mean + acc_margin:.2f}) %')

# Confusion matrix
cm = metrics.confusion_matrix(y_train, y_predict)
plt.matshow(cm)
plt.colorbar()
plt.ylabel('True label')
plt.xlabel('Predicted label')
plt.show()

print(metrics.classification_report(y_train, y_predict))

In [None]:
## Random Forest
classifier_RF.fit(X_train, y_train)  # Train model
# NOTE: predictions are made on the training data itself, so the confusion
# matrix and report below are resubstitution metrics, not held-out performance.
y_predict = classifier_RF.predict(X_train)

# 10-fold cross-validated accuracy, reported as mean and mean +/- 1.96 * std
# of the fold scores (in percent). Formatting the bounds directly avoids the
# float artefacts of adding two rounded numbers.
scores = model_selection.cross_val_score(classifier_RF, X_train, y_train, cv=10)
acc_mean = scores.mean() * 100
acc_margin = scores.std() * 100 * 1.96
print(f'For RF, the acc is {acc_mean:.2f} '
      f'({acc_mean - acc_margin:.2f}, {acc_mean + acc_margin:.2f}) %')

# Confusion matrix
cm = metrics.confusion_matrix(y_train, y_predict)
plt.matshow(cm)
plt.colorbar()
plt.ylabel('True label')
plt.xlabel('Predicted label')
plt.show()

print(metrics.classification_report(y_train, y_predict))

In [None]:
## SVC
classifier_SVC.fit(X_train, y_train)  # Train model
# NOTE: predictions are made on the training data itself, so the confusion
# matrix and report below are resubstitution metrics, not held-out performance.
y_predict = classifier_SVC.predict(X_train)

# 10-fold cross-validated accuracy, reported as mean and mean +/- 1.96 * std
# of the fold scores (in percent). Formatting the bounds directly avoids the
# float artefacts of adding two rounded numbers.
scores = model_selection.cross_val_score(classifier_SVC, X_train, y_train, cv=10)
acc_mean = scores.mean() * 100
acc_margin = scores.std() * 100 * 1.96
print(f'For SVC, the acc is {acc_mean:.2f} '
      f'({acc_mean - acc_margin:.2f}, {acc_mean + acc_margin:.2f}) %')

# Confusion matrix
cm = metrics.confusion_matrix(y_train, y_predict)
plt.matshow(cm)
plt.colorbar()
plt.ylabel('True label')
plt.xlabel('Predicted label')
plt.show()

print(metrics.classification_report(y_train, y_predict))

In [None]:
## GB Classifier
classifier_GB.fit(X_train, y_train)  # Train model
# NOTE: predictions are made on the training data itself, so the confusion
# matrix and report below are resubstitution metrics, not held-out performance.
y_predict = classifier_GB.predict(X_train)

# 10-fold cross-validated accuracy, reported as mean and mean +/- 1.96 * std
# of the fold scores (in percent). Formatting the bounds directly avoids the
# float artefacts of adding two rounded numbers.
scores = model_selection.cross_val_score(classifier_GB, X_train, y_train, cv=10)
acc_mean = scores.mean() * 100
acc_margin = scores.std() * 100 * 1.96
print(f'For GB Classifier, the acc is {acc_mean:.2f} '
      f'({acc_mean - acc_margin:.2f}, {acc_mean + acc_margin:.2f}) %')

# Confusion matrix
cm = metrics.confusion_matrix(y_train, y_predict)
plt.matshow(cm)
plt.colorbar()
plt.ylabel('True label')
plt.xlabel('Predicted label')
plt.show()

print(metrics.classification_report(y_train, y_predict))

In [None]:
## Naive Bayes
classifier_NB.fit(X_train, y_train)  # Train model (sample_weight=None is the default)
# NOTE: predictions are made on the training data itself, so the confusion
# matrix and report below are resubstitution metrics, not held-out performance.
y_predict = classifier_NB.predict(X_train)

# 10-fold cross-validated accuracy, reported as mean and mean +/- 1.96 * std
# of the fold scores (in percent). Formatting the bounds directly avoids the
# float artefacts of adding two rounded numbers.
scores = model_selection.cross_val_score(classifier_NB, X_train, y_train, cv=10)
acc_mean = scores.mean() * 100
acc_margin = scores.std() * 100 * 1.96
print(f'For Naive Bayes Classifier, the acc is {acc_mean:.2f} '
      f'({acc_mean - acc_margin:.2f}, {acc_mean + acc_margin:.2f}) %')

# Confusion matrix
cm = metrics.confusion_matrix(y_train, y_predict)
plt.matshow(cm)
plt.colorbar()
plt.ylabel('True label')
plt.xlabel('Predicted label')
plt.show()

print(metrics.classification_report(y_train, y_predict))

In [None]:
## Optimize Hyperparameters
from sklearn.model_selection import GridSearchCV

# Helper for reporting the outcome of a fitted grid search
def print_grid_search_metrics(gs):
    """Print the best score and the winning parameters of a fitted search.

    Args:
        gs: Object exposing ``best_score_`` and ``best_params_`` (a mapping),
            e.g. a fitted ``GridSearchCV``.

    Prints the best score, a heading, then one ``name:value`` line per
    parameter in alphabetical order of the parameter name.
    """
    print("Best score: " + str(gs.best_score_))
    print("Best parameters set:")
    # Keys are unique, so sorting the (name, value) pairs orders by name only
    for name, value in sorted(gs.best_params_.items()):
        print(f"{name}:{value!s}")

In [None]:
# Model 1 - Logistic Regression
# Search the penalty type and the regularisation strength C (C = 1/lambda)
parameters = {
    'penalty': ['l2', 'l1'],
    'C': [0.036, 0.037, 0.038, 0.039, 0.040, 0.041, 0.042],
}
Grid_LR = GridSearchCV(estimator=LogisticRegression(solver='liblinear'),
                       param_grid=parameters, cv=10)
Grid_LR.fit(X_train, y_train)
# Report the best hyperparameter combination
print_grid_search_metrics(Grid_LR)

In [None]:
# Keep the refitted best estimator and score it on the held-out test set.
# (The earlier bare predict() call was dropped: its result was discarded.)
best_LR_model = Grid_LR.best_estimator_

print('The test acc of the "best" model for LR is', best_LR_model.score(X_test, y_test))

In [None]:
# Model 2 - KNN Model

# Choose k and more
# NOTE(review): leaf_size tunes the tree used for neighbour lookup and
# presumably does not change predictions - consider dropping it from the grid.
parameters = {
    'n_neighbors': [7, 8, 9, 10, 11, 12, 13, 14, 15],
    'weights': ['uniform', 'distance'],
    'leaf_size': [1, 2, 3, 4, 5, 6, 7],
}
Grid_KNN = GridSearchCV(KNeighborsClassifier(), parameters, cv=10)

# Time only the search itself
start = time.time()
Grid_KNN.fit(X_train, y_train)
end = time.time()

# the best hyperparameter combination
print_grid_search_metrics(Grid_KNN)

# Derive the number of candidates from the search results instead of
# hard-coding the grid size, so the figure stays right if the grid changes.
n_candidates = len(Grid_KNN.cv_results_['params'])
print(f'For KNN, it took {(end - start) / n_candidates} seconds per parameter attempt')

In [None]:
# Keep the refitted best estimator and score it on the held-out test set.
# (The earlier bare predict() call was dropped: its result was discarded.)
best_KNN_model = Grid_KNN.best_estimator_

print('The test acc of the "best" model for KNN is', best_KNN_model.score(X_test, y_test))

In [None]:
## Model 3 - RF
# Possible hyperparameter options for Random Forest: number of trees and depth
parameters = {
    'n_estimators': [65, 64, 63, 62, 61, 60],
    'max_depth': [8, 9, 10, 11],
}
# Seed the forest so the search picks the same winner on every run
Grid_RF = GridSearchCV(RandomForestClassifier(random_state=42), parameters, cv=10)

# Time only the search itself
start = time.time()
Grid_RF.fit(X_train, y_train)
end = time.time()

# the best hyperparameter combination
print_grid_search_metrics(Grid_RF)

# Derive the number of candidates from the search results instead of
# hard-coding the grid size.
n_candidates = len(Grid_RF.cv_results_['params'])
print(f'For Random Forest, it took {(end - start) / n_candidates} seconds per parameter attempt')

In [None]:
# Keep the refitted best estimator and score it on the held-out test set.
# (The earlier bare predict() call was dropped: its result was discarded.)
best_RF_model = Grid_RF.best_estimator_
print('The test acc of the "best" model for RF is', best_RF_model.score(X_test, y_test))


In [None]:
## Model 4 - SVC

# Possible hyperparameter options for SVC.
# `degree` was removed from the grid: it only applies to the polynomial kernel
# and is ignored by the default RBF kernel used here, so searching it tripled
# the work without changing any score.
parameters = {
    'C': [9, 10, 11, 12],
}
# random_state seeds the internal cross-validation behind probability=True
Grid_SVC = GridSearchCV(SVC(probability=True, random_state=42), parameters, cv=10)

# Time only the search itself
start = time.time()
Grid_SVC.fit(X_train, y_train)
end = time.time()

# the best hyperparameter combination
print_grid_search_metrics(Grid_SVC)

# Derive the number of candidates from the search results instead of
# hard-coding the grid size.
n_candidates = len(Grid_SVC.cv_results_['params'])
print(f'For SVC, it took {(end - start) / n_candidates} seconds per parameter attempt')

In [None]:
# Keep the refitted best estimator and score it on the held-out test set.
# (The earlier bare predict() call was dropped: its result was discarded.)
best_SVC_model = Grid_SVC.best_estimator_
print('The test acc of the "best" model for SVC is', best_SVC_model.score(X_test, y_test))


In [None]:
# Model 5 - GB Classifier
# Possible hyperparameter options for GB Classifier
parameters = {
    'learning_rate': [0.8, 0.9, 1.0],
    'n_estimators': [63, 64, 65],
    # subsample is a fraction of the training rows and must lie in (0, 1];
    # the previous value 1.05 made every fit with it fail.
    'subsample': [0.95, 1.0],
    # floats here are fractions of the training rows
    'min_samples_split': [0.725, 0.75, 0.775],
}
# Seed the booster so the search picks the same winner on every run
Grid_GB = GridSearchCV(GradientBoostingClassifier(random_state=42), parameters, cv=10)
Grid_GB.fit(X_train, y_train)
# the best hyperparameter combination
print_grid_search_metrics(Grid_GB)

In [None]:
# Keep the refitted best estimator and score it on the held-out test set.
# (The earlier bare predict() call was dropped: its result was discarded.)
best_GB_model = Grid_GB.best_estimator_
print('The test acc of the "best" model for GB classifier is', best_GB_model.score(X_test, y_test))


In [None]:
# Model 6 - Gaussian Naive Bayes
# Search the variance-smoothing term, the only hyperparameter tuned here
parameters = {'var_smoothing': [0.17, 0.18, 0.19]}
Grid_NB = GridSearchCV(estimator=GaussianNB(), param_grid=parameters, cv=10)
Grid_NB.fit(X_train, y_train)
# Report the best hyperparameter combination
print_grid_search_metrics(Grid_NB)


In [None]:
# Keep the refitted best estimator and score it on the held-out test set.
# (The earlier bare predict() call was dropped: its result was discarded.)
best_NB_model = Grid_NB.best_estimator_
print('The test acc of the "best" model for Gaussian Naive Bayes classifier is', best_NB_model.score(X_test, y_test))

In [None]:
## Model Evaluation - Confusion Matrix (Precision,Recall, Accuracy, f1-Score)

#Precision(PPV, positive predictive value): tp / (tp + fp); High Precision means low fp
#Recall(sensitivity, hit rate, true positive rate): tp / (tp + fn)
#Accuracy: (tp + tn) / (tp + tn + fp + fn)
#f1-Score: (2 * P * R) / (P + R)


In [None]:
# Model 1 - Logistic Regression
# Predict once on the held-out test set and reuse the result for both the
# confusion matrix and the classification report.
y_test_pred = best_LR_model.predict(X_test)

cm = metrics.confusion_matrix(y_test, y_test_pred)
plt.matshow(cm)
plt.title('Confusion matrix (test set) - Logistic Regression')
plt.colorbar()
plt.ylabel('True label')
plt.xlabel('Predicted label')
plt.show()
print(metrics.classification_report(y_test, y_test_pred))

In [None]:
# Model 2 - KNN Model
# Predict once on the held-out test set and reuse the result for both the
# confusion matrix and the classification report.
y_test_pred = best_KNN_model.predict(X_test)

cm = metrics.confusion_matrix(y_test, y_test_pred)
plt.matshow(cm)
plt.title('Confusion matrix (test set) - KNN')
plt.colorbar()
plt.ylabel('True label')
plt.xlabel('Predicted label')
plt.show()
print(metrics.classification_report(y_test, y_test_pred))

In [None]:
# Model 3 - RF
# Predict once on the held-out test set and reuse the result for both the
# confusion matrix and the classification report.
y_test_pred = best_RF_model.predict(X_test)

cm = metrics.confusion_matrix(y_test, y_test_pred)
plt.matshow(cm)
plt.title('Confusion matrix (test set) - Random Forest')
plt.colorbar()
plt.ylabel('True label')
plt.xlabel('Predicted label')
plt.show()
print(metrics.classification_report(y_test, y_test_pred))


In [None]:
# Model 4 - SVC
# Predict once on the held-out test set and reuse the result for both the
# confusion matrix and the classification report.
y_test_pred = best_SVC_model.predict(X_test)

cm = metrics.confusion_matrix(y_test, y_test_pred)
plt.matshow(cm)
plt.title('Confusion matrix (test set) - SVC')
plt.colorbar()
plt.ylabel('True label')
plt.xlabel('Predicted label')
plt.show()
print(metrics.classification_report(y_test, y_test_pred))

In [None]:
# Model 5 - GB Classifier
# Predict once on the held-out test set and reuse the result for both the
# confusion matrix and the classification report.
y_test_pred = best_GB_model.predict(X_test)

cm = metrics.confusion_matrix(y_test, y_test_pred)
plt.matshow(cm)
plt.title('Confusion matrix (test set) - GB Classifier')
plt.colorbar()
plt.ylabel('True label')
plt.xlabel('Predicted label')
plt.show()
print(metrics.classification_report(y_test, y_test_pred))

In [None]:
# Model 6 - Gaussian Naive Bayes
# Predict once on the held-out test set and reuse the result for both the
# confusion matrix and the classification report.
y_test_pred = best_NB_model.predict(X_test)

cm = metrics.confusion_matrix(y_test, y_test_pred)
plt.matshow(cm)
plt.title('Confusion matrix (test set) - Gaussian Naive Bayes')
plt.colorbar()
plt.ylabel('True label')
plt.xlabel('Predicted label')
plt.show()
print(metrics.classification_report(y_test, y_test_pred))


In [None]:
## Model Evaluation - ROC & AUC
from sklearn.metrics import roc_curve
from sklearn import metrics
import matplotlib.pyplot as plt

In [None]:
# Model 1 - Logistic Regression
# Probability of the positive class (second predict_proba column) drives the ROC curve
y_pred_lr = best_LR_model.predict_proba(X_test)[:, 1]
fpr_lr, tpr_lr, _ = roc_curve(y_test, y_pred_lr)

# ROC curve against the diagonal reference line
roc_ax = plt.figure(1).gca()
roc_ax.plot([0, 1], [0, 1], 'k--')
roc_ax.plot(fpr_lr, tpr_lr, label='LR')
roc_ax.set_xlabel('False positive rate')
roc_ax.set_ylabel('True positive rate')
roc_ax.set_title('ROC curve - LR model')
roc_ax.legend(loc='best')
plt.show()

# Area under the ROC curve
auc_lr = metrics.auc(fpr_lr, tpr_lr)
print('The AUC of LR model is', auc_lr)

In [None]:
## Model 2 - KNN
# Probability of the positive class (second predict_proba column) drives the ROC curve
y_pred_knn = best_KNN_model.predict_proba(X_test)[:, 1]
fpr_knn, tpr_knn, _ = roc_curve(y_test, y_pred_knn)

# ROC curve against the diagonal reference line
roc_ax = plt.figure(1).gca()
roc_ax.plot([0, 1], [0, 1], 'k--')
roc_ax.plot(fpr_knn, tpr_knn, label='KNN')
roc_ax.set_xlabel('False positive rate')
roc_ax.set_ylabel('True positive rate')
roc_ax.set_title('ROC curve - KNN model')
roc_ax.legend(loc='best')
plt.show()

# Area under the ROC curve
auc_knn = metrics.auc(fpr_knn, tpr_knn)
print('The AUC of KNN model is', auc_knn)


In [None]:
## Model 3 - Random Forest
# Probability of the positive class (second predict_proba column) drives the ROC curve
y_pred_rf = best_RF_model.predict_proba(X_test)[:, 1]
fpr_rf, tpr_rf, _ = roc_curve(y_test, y_pred_rf)

# ROC curve against the diagonal reference line
roc_ax = plt.figure(1).gca()
roc_ax.plot([0, 1], [0, 1], 'k--')
roc_ax.plot(fpr_rf, tpr_rf, label='RF')
roc_ax.set_xlabel('False positive rate')
roc_ax.set_ylabel('True positive rate')
roc_ax.set_title('ROC curve - RF model')
roc_ax.legend(loc='best')
plt.show()

# Area under the ROC curve
auc_rf = metrics.auc(fpr_rf, tpr_rf)
print('The AUC of RF model is', auc_rf)

In [None]:
## Model 4 - SVC
# Probability of the positive class (second predict_proba column) drives the ROC curve
y_pred_svc = best_SVC_model.predict_proba(X_test)[:, 1]
fpr_svc, tpr_svc, _ = roc_curve(y_test, y_pred_svc)

# ROC curve against the diagonal reference line
roc_ax = plt.figure(1).gca()
roc_ax.plot([0, 1], [0, 1], 'k--')
roc_ax.plot(fpr_svc, tpr_svc, label='SVC')
roc_ax.set_xlabel('False positive rate')
roc_ax.set_ylabel('True positive rate')
roc_ax.set_title('ROC curve - SVC model')
roc_ax.legend(loc='best')
plt.show()

# Area under the ROC curve
auc_svc = metrics.auc(fpr_svc, tpr_svc)
print('The AUC of SVC model is', auc_svc)

In [None]:
## Model 5 - GB Classifier
# Probability of the positive class (second predict_proba column) drives the ROC curve
y_pred_gb = best_GB_model.predict_proba(X_test)[:, 1]
fpr_gb, tpr_gb, _ = roc_curve(y_test, y_pred_gb)

# ROC curve against the diagonal reference line
roc_ax = plt.figure(1).gca()
roc_ax.plot([0, 1], [0, 1], 'k--')
roc_ax.plot(fpr_gb, tpr_gb, label='GB')
roc_ax.set_xlabel('False positive rate')
roc_ax.set_ylabel('True positive rate')
roc_ax.set_title('ROC curve - GB Classifier')
roc_ax.legend(loc='best')
plt.show()

# Area under the ROC curve
auc_gb = metrics.auc(fpr_gb, tpr_gb)
print('The AUC of GB Classifier is', auc_gb)

In [None]:
## Model 6 - Gaussian Naive Bayes Classifier
# Use predict_proba to get the probability results of Gaussian Naive Bayes.
# The results use their own *_nb names; they were previously written to the
# *_gb variables, which overwrote the Gradient Boosting ROC results.
y_pred_nb = best_NB_model.predict_proba(X_test)[:, 1]
fpr_nb, tpr_nb, _ = roc_curve(y_test, y_pred_nb)
# drawing ROC curve
plt.figure(1)
plt.plot([0, 1], [0, 1], 'k--')
plt.plot(fpr_nb, tpr_nb, label='NB')
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.title('ROC curve - NB Classifier')
plt.legend(loc='best')
plt.show()
# AUC
print('The AUC of NB Classifier is', metrics.auc(fpr_nb, tpr_nb))


In [None]:
#### Insight
#- RF, KNN and SVC excelled in predicting the occurrence of heart disease from the given features in this dataset, with proper feature preprocessing.
#- However, we need more data to verify the model predictions and to train the models in a way that avoids overfitting.