# Heart Failure Prediction:

## Importing Dependencies

In [7]:
import kaggle
import selenium

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import kaggle
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, plot_confusion_matrix

# Dataset identifier handed to kaggle.api.dataset_download_files() in fetch_dataset().
KAGGLE_DATASET = "andrewmvd/heart-failure-clinical-data"
# Name of the CSV file that load_dataset() reads from the dataset folder.
FILE_NAME = "heart_failure_clinical_records_dataset.csv"
# Current working directory; the dataset folder is created beneath it.
cwd = os.getcwd()
# Local folder the Kaggle files are downloaded and unzipped into.
DATASET_FOLDER_PATH = os.path.join(cwd, "dataset")

## Fetch and Load the Dataset

In [None]:
def fetch_dataset():
    """
    Downloads the dataset from Kaggle if it is not already present.

    The presence check looks for FILE_NAME inside DATASET_FOLDER_PATH, i.e. the
    same path the download is unzipped into, so an existing copy is reused
    instead of being downloaded again on every run.

    Side effects: may create DATASET_FOLDER_PATH, authenticates against the
    Kaggle API and writes the unzipped dataset files into that folder.
    """
    csv_path = os.path.join(DATASET_FOLDER_PATH, FILE_NAME)
    if os.path.isfile(csv_path):
        # Already downloaded: no authentication or network access needed.
        return

    os.makedirs(DATASET_FOLDER_PATH, exist_ok=True)
    kaggle.api.authenticate()
    kaggle.api.dataset_download_files(KAGGLE_DATASET, path=DATASET_FOLDER_PATH, unzip=True)

def load_dataset():
    """
    Reads FILE_NAME from DATASET_FOLDER_PATH and returns it as a pandas DataFrame.
    """
    return pd.read_csv(os.path.join(DATASET_FOLDER_PATH, FILE_NAME))

# Make sure the CSV is available locally, then load it into the
# module-level `dataset` DataFrame used by the rest of the notebook.
fetch_dataset()
dataset = load_dataset()


## Exploring the dataset
Description of the features:
- **age**: Age of the patient
- **anaemia**: Decrease of red blood cells or hemoglobin (boolean)
- **creatinine_phosphokinase**: Level of the CPK enzyme in the blood (mcg/L)
- **diabetes**: If the patient has diabetes (boolean)
- **ejection_fraction**: Percentage of blood leaving the heart at each contraction (percentage)
- **high_blood_pressure**: If the patient has hypertension (boolean)
- **platelets**: Platelets in the blood (kiloplatelets/mL)
- **serum_creatinine**: Level of serum creatinine in the blood (mg/dL)
- **serum_sodium**: Level of serum sodium in the blood (mEq/L)
- **sex**: Woman or man (binary)
- **smoking**: If the patient smokes (binary)
- **time**: Follow-up period (days)
- **DEATH_EVENT**: If the patient died during the follow-up period (binary)

In [None]:
# Preview the first rows of the dataset.
dataset.head()

In [None]:
# Summary statistics of the dataset columns.
dataset.describe()

In [None]:
### Checking for null values: number of missing entries per column
dataset.isnull().sum()

## Distribution of the dataset

In [None]:
# Plot one histogram per input feature (the target column is excluded).
# The number of grid rows is derived from the number of features, so the
# subplot grid always has room for every column instead of assuming 12.
N_PLOT_COLS = 3
feature_columns = dataset.drop(["DEATH_EVENT"], axis=1).columns
# Ceiling division: enough rows to hold all features, N_PLOT_COLS per row.
n_plot_rows = -(-len(feature_columns) // N_PLOT_COLS)

plt.figure(figsize=(20,30))

for i, column in enumerate(feature_columns, 1):
    plt.subplot(n_plot_rows, N_PLOT_COLS, i)
    sns.histplot(dataset[column])
plt.show()

## Linear Correlation of the different features:
As the heatmap below shows, **age** and **serum_creatinine** are positively correlated with DEATH_EVENT, while **time**, **ejection_fraction**, and **serum_sodium** are negatively (inversely) correlated with it.

In [None]:
# Pairwise correlation matrix of all columns (DataFrame.corr defaults to Pearson).
corr = dataset.corr()
# plt.subplots returns (figure, axes) in that order; draw the heatmap on the
# axes explicitly rather than relying on the implicit "current axes".
fig, ax = plt.subplots(figsize=(12,12))
sns.heatmap(corr, vmax=1, vmin=-1, annot=True, linewidths=.5, cmap="coolwarm", ax=ax)
plt.show()

## Writing a custom Transformer to select only the highly correlated features:

In [None]:
# Correlation with DEATH_EVENT, restricted to the columns whose absolute
# correlation is above 0.1
corr_f = corr.loc[corr['DEATH_EVENT'].abs() > 0.1, 'DEATH_EVENT']
print(corr_f)
# Column names of those correlated features
corr_i = corr_f.index.tolist()
print(corr_i)
# get thier colum index
#print([dataset.columns.get_loc(c) for c in corr_i])

In [None]:
class LinearlyCorrelatedFeatures(BaseEstimator, TransformerMixin):
    """
    A custom transformer that reduces X to the features that are linearly
    correlated with y. The minimum absolute correlation a feature needs in
    order to be kept is controlled by ``correlation_threshold``.

    X may be a DataFrame (column labels are preserved) or any array-like that
    ``pd.DataFrame`` accepts (columns are then labelled 0..n-1). y may be a
    Series, a single-column DataFrame or a 1-D array-like.

    A Pipeline runs ``self.fit(X, y, **fit_params).transform(X)``; therefore
    ``fit`` needs both X and y and returns ``self`` so the calls can be
    chained, while ``transform`` needs only X.

    Attributes:
        correlation_threshold: features are kept when
            ``abs(correlation with y) > correlation_threshold``.
        final_col: list of the column labels of X selected by ``fit``
            (empty until ``fit`` has been called).
    """

    def __init__(self, correlation_threshold=0.1):
        self.correlation_threshold = correlation_threshold
        self.final_col = []

    def merge_X_y(self, X, y):
        """
        Merges X and y column-wise into one DataFrame and preserves the labels.

        Rows are matched by position (as scikit-learn does), not by index
        label. The target ends up as the last column; if its label is already
        used by a column of X it is renamed so that it stays unambiguous.

        args:
            X: DataFrame or array-like with m rows
            y: Series, DataFrame or array-like with m rows
        raises:
            ValueError: if X and y do not have the same number of rows
        """
        features = pd.DataFrame(X)
        target = pd.DataFrame(y)
        if len(features) != len(target):
            raise ValueError(
                "X and y must have the same number of rows, got %d and %d"
                % (len(features), len(target))
            )

        # set_axis returns a new frame, so the caller's y is left untouched.
        target = target.set_axis(features.index, axis=0)

        target_labels = list(target.columns)
        if target_labels[-1] in features.columns:
            # E.g. an unnamed y becomes column 0, which clashes with the
            # integer labels of an array-like X.
            fallback = "target"
            while fallback in features.columns or fallback in target_labels[:-1]:
                fallback = "_" + fallback
            target_labels[-1] = fallback
            target.columns = target_labels

        return pd.concat([features, target], axis=1)

    def get_correlated_features(self, X, y):
        """
        Returns a list of the column labels whose absolute correlation with
        "y" is above the threshold. Includes the label of "y" as well whenever
        its self-correlation passes the threshold.

        args:
            X: DataFrame or array-like with m rows and n columns
            y: Series, DataFrame or array-like with m rows
        """
        merged = self.merge_X_y(X, y)
        # merge_X_y puts the target last.
        target_label = merged.columns[-1]

        target_corr = merged.corr()[target_label]
        # NaN correlations (e.g. constant columns) compare False and are dropped.
        is_correlated = target_corr.abs() > self.correlation_threshold

        return list(target_corr[is_correlated].index)

    def fit(self, X, y):
        """
        Selects the correlated features of X and saves their labels in
        "final_col". Returns self.

        raises:
            ValueError: if y is None, or X and y differ in length
        """
        if y is None:
            raise ValueError("LinearlyCorrelatedFeatures.fit requires y")

        correlated = self.get_correlated_features(X, y)
        # Keep feature labels only: the target is not a column of X.
        feature_labels = pd.DataFrame(X).columns
        self.final_col = [label for label in correlated if label in feature_labels]
        return self

    def transform(self, X):
        """
        Returns a new DataFrame built from X with only the selected features.
        Selected labels that are missing from X are ignored.
        """
        return pd.DataFrame(X).filter(items=self.final_col)


## Preprocessing:

In [None]:
# Separating X, y
output_col = "DEATH_EVENT"
X = dataset.drop([output_col], axis=1)
# Selecting a single column yields a Series that is already named output_col.
# A Series has no `columns`, so nothing needs to be (or can be) assigned there.
y = dataset[output_col]

# Include lin corr features alone
#lin_corr = LinearlyCorrelatedFeatures(correlation_threshold=0.1)
#X = lin_corr.fit_transform(X, y)

# Splitting: 80% train / 20% test with a fixed seed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=42, test_size=0.2)

# Standardize the training set using the training set means and standard deviations. 
# Standardize Test set using the training set means and standard deviations.
#std_scalar = StandardScaler()
#std_scalar.fit(X_train)
#print(std_scalar.mean_)
#X_train = std_scalar.transform(X_train)
#X_test = std_scalar.transform(X_test)

In [None]:
# Preprocessing pipeline: keep the features correlated with the target,
# then standardize them.
pipeline = Pipeline([
    ("corr_features", LinearlyCorrelatedFeatures(correlation_threshold=0.1)),
    ("std_scalar", StandardScaler()),
])

# Fit on the training split only, then apply the fitted pipeline to the test split.
# NOTE(review): the grid searches further down are fitted on X_train / X_test,
# not on X_train_p / X_test_p - confirm whether that is intended.
X_train_p = pipeline.fit_transform(X_train,y_train)
X_test_p = pipeline.transform(X_test)

In [None]:
# Display the preprocessed test set as a DataFrame.
pd.DataFrame(X_test_p)

# Model Training

In [None]:
def print_grid_scores(grid_search):
    """
    Prints the mean score and its corresponding hyperparameters for each
    candidate in GridSearchCV, followed by the best candidate.

    args:
        grid_search: fitted search object exposing ``cv_results_`` with the
            keys "mean_test_score" and "params".

    The best candidate is the one with the highest mean score; ties keep the
    first one. Scores that are zero or negative are handled (some scorers are
    negated losses), and NaN scores, which mark candidates whose fits failed,
    are printed but never chosen as best. If no candidate has a valid score
    the summary line shows "nan" and None.
    """
    grid_scores = grid_search.cv_results_
    best_mean_score = float("nan")
    best_param = None
    for mean_score, params in zip(grid_scores["mean_test_score"], grid_scores["params"]):
        print("%.4f" % mean_score, params)

        # NaN is the only value that is not equal to itself.
        if mean_score != mean_score:
            continue
        if best_param is None or mean_score > best_mean_score:
            best_mean_score = mean_score
            best_param = params

    print("Best Param:", "%.4f" % best_mean_score, best_param)

def print_test_scores(model, X_test=X_test, y_test=y_test):
    """
    Evaluates a fitted model on the test set.

    Predicts on X_test, prints accuracy, precision, recall and F1 (each as a
    percentage, compared against y_test) and shows the confusion matrix.
    """
    predictions = model.predict(X_test)

    # All metrics are computed first and printed afterwards.
    scores = [
        ('Accuracy Score : ', accuracy_score(y_test, predictions)*100),
        ('Precision Score : ', precision_score(y_test, predictions)*100),
        ('Recall Score : ', recall_score(y_test, predictions)*100),
        ('F1 Score : ', f1_score(y_test, predictions)*100),
    ]
    for label, value in scores:
        print(label, "{:.2f}%".format(value))

    plot_confusion_matrix(model, X_test, y_test)
    plt.show()


## 1. Logistic Regression

In [None]:
# Settings shared by every GridSearchCV run in this notebook.
RANDOM_STATE=42  # seed passed to the estimators that accept random_state
REFIT = True  # passed as refit=: the best candidate is refitted and exposed as best_estimator_
SCORING = "f1"  # metric used to rank the candidates
CV = 5  # passed as cv=
N_JOBS = 10  # passed as n_jobs=

# Grid search over the regularization strength C of an L2-penalized logistic regression.
log_reg = LogisticRegression(random_state=RANDOM_STATE)
log_reg_parameters = {
    "penalty": ["l2"],
    'C': [0.1, 0.5 ,1]
    }
log_reg_grid = GridSearchCV(log_reg, log_reg_parameters, refit=REFIT, scoring = SCORING, verbose=2, cv=CV, n_jobs=N_JOBS)
log_reg_grid.fit(X_train, y_train)
print_grid_scores(log_reg_grid)

In [None]:
# Since refit=True, the model is refitted using the best parameters and is
# available via best_estimator_; evaluate it on the held-out test set.
log_reg_best = log_reg_grid.best_estimator_
print_test_scores(log_reg_best)

## 2. Support Vector Classifier:

In [None]:
# Grid search over the regularization parameter C of a linear-kernel SVC.
svc = SVC(random_state=RANDOM_STATE)
svc_parameters = {
    'C': [0.1, 0.5, 1, 5, 10],
    'kernel': ['linear'],
}
svc_grid = GridSearchCV(svc, svc_parameters, refit=REFIT, scoring = SCORING, verbose=2, cv=CV, n_jobs=N_JOBS)
svc_grid.fit(X_train, y_train)
print_grid_scores(svc_grid)

In [None]:
# Evaluate the refitted best SVC on the test set.
svc_best = svc_grid.best_estimator_
print_test_scores(svc_best)

## 3. Decision Tree Classifier:

In [None]:
# Grid search over depth, feature subsampling and leaf count of a decision tree.
dt = DecisionTreeClassifier(random_state=RANDOM_STATE)
dt_parameters = {
    'criterion': ['gini'],
    'max_depth': [1,None],
    'max_features': [2,None],
    # max_leaf_nodes must be None or an integer greater than 1; a value of 1
    # is rejected by scikit-learn, so every candidate using it fails to fit.
    'max_leaf_nodes': [2,None]
}
dt_grid = GridSearchCV(dt, dt_parameters, refit=REFIT, scoring = SCORING, verbose=2, cv=CV, n_jobs=N_JOBS)
dt_grid.fit(X_train, y_train)
print_grid_scores(dt_grid)

In [None]:
# Evaluate the refitted best decision tree on the test set.
dt_best = dt_grid.best_estimator_
print_test_scores(dt_best)

## 4. Random Forest Classifier

In [None]:
# Grid search over tree depth and leaf count of a random forest.
rf = RandomForestClassifier(random_state=RANDOM_STATE)
rf_parameters = {
    'criterion': ['gini'],
    'max_depth': [2,5,None],
    # n_estimators must be an integer; None is not a valid value and makes
    # every candidate using it fail to fit.
    'n_estimators': [100],
    'max_leaf_nodes': [5,8, None]
}
rf_grid = GridSearchCV(rf, rf_parameters, refit=REFIT, scoring = SCORING, verbose=2, cv=CV, n_jobs=N_JOBS)
rf_grid.fit(X_train, y_train)
print_grid_scores(rf_grid)

In [None]:
# Evaluate the refitted best random forest on the test set.
rf_best = rf_grid.best_estimator_
print_test_scores(rf_best)

## 5. Gradient Boosting Classifier

In [None]:
# Grid search over loss, learning rate, number of stages and tree depth of
# a gradient boosting classifier.
gb_clf = GradientBoostingClassifier(random_state=RANDOM_STATE)
gb_clf_parameters = {
    # NOTE(review): newer scikit-learn releases renamed 'deviance' to
    # 'log_loss' - confirm against the installed version.
    'loss': ['deviance', 'exponential'],
    'learning_rate': [0.1, 0.5, 0.05],
    'n_estimators': [50,100,150],
    'max_depth': [1,3,5],

}
gb_clf_grid = GridSearchCV(gb_clf, gb_clf_parameters, refit=REFIT, scoring = SCORING, verbose=2, cv=CV, n_jobs=N_JOBS)
gb_clf_grid.fit(X_train, y_train)
print_grid_scores(gb_clf_grid)

In [None]:
# Evaluate the refitted best gradient boosting model on the test set.
print_test_scores(gb_clf_grid.best_estimator_)

## 6. K Nearest Neighbors

In [None]:
# Grid search over the number of neighbors of a k-nearest-neighbors classifier.
knn_clf = KNeighborsClassifier()
knn_clf_parameters = {
    'n_neighbors': [2,3,5,8,10],
}
knn_clf_grid = GridSearchCV(knn_clf, knn_clf_parameters, refit=REFIT, scoring = SCORING, verbose=2, cv=CV, n_jobs=N_JOBS)
knn_clf_grid.fit(X_train, y_train)
print_grid_scores(knn_clf_grid)

In [None]:
# Evaluate the refitted best k-nearest-neighbors model on the test set.
print_test_scores(knn_clf_grid.best_estimator_)

# Final Results:

## 1. Logistic Regression:
Accuracy Score :  78.33%

Precision Score :  92.86%

Recall Score :  52.00%

F1 Score :  66.67%

## 2. Support Vector Classifier:
Accuracy Score :  78.33%

Precision Score :  87.50%

Recall Score :  56.00%

F1 Score :  68.29%

## 3. Decision Tree Classifier:
Accuracy Score :  75.00%

Precision Score :  81.25%

Recall Score :  52.00%

F1 Score :  63.41%

## 4. Random Forest Classifier:
Accuracy Score :  75.00%

Precision Score :  81.25%

Recall Score :  52.00%

F1 Score :  63.41%

## 5. Gradient Boosting Classifier:
Accuracy Score :  76.67%

Precision Score :  92.31%

Recall Score :  48.00%

F1 Score :  63.16%

## 6. K Nearest Neighbors:
Accuracy Score :  71.67%

Precision Score :  75.00%

Recall Score :  48.00%

F1 Score :  58.54%