In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively visit /kaggle/input and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# importing essential libraries
import pandas as pd
import numpy as np
import sklearn
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import plot_confusion_matrix, plot_roc_curve, plot_precision_recall_curve
from sklearn.metrics import precision_score, recall_score
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Reading and Visualizing the Data


In [None]:
# Load the heart-failure clinical records CSV into a DataFrame.
DATA_PATH = '../input/heart-failure-clinical-data/heart_failure_clinical_records_dataset.csv'
dat = pd.read_csv(DATA_PATH)

In [None]:
# getting columns
dat.columns

In [None]:
dat

In [None]:
# describing data
dat.describe()

In [None]:
# identifying correlation
dat.corr()

In [None]:
# identifying skewness in data
dat.skew()

In [None]:
# checking for null values
dat.isnull().sum()

In [None]:
# checking for duplicate values
dat.duplicated().sum()

In [None]:
# Checking for class imbalance: one bar per DEATH_EVENT class.
# The Series is passed as the keyword `x`: recent seaborn releases accept only
# `data` positionally, so a positional Series is no longer read as the x variable.
g = sns.countplot(x=dat['DEATH_EVENT'])
g.set_xticklabels(['0','1'])
plt.show()

The plot above shows that class 0 is dominant. This class imbalance can harm our predictions, so we have to counter it.

### Plotting
Here we plot the distribution and the box plot of every feature, together with a scatter plot of each feature against DEATH_EVENT. These plots will help us identify and remove outliers.

In [None]:
def plot_numeric_features(feature, data=None):
    """Visualize one feature as a distribution, a scatter plot and a box plot.

    Draws three panels side by side: the distribution of the feature (histogram
    with a KDE overlay), the feature against ``DEATH_EVENT``, and a horizontal
    box plot. Also prints the feature's skewness coefficient.

    Parameters
    ----------
    feature : str
        Name of the column to plot.
    data : pandas.DataFrame, optional
        Frame containing ``feature`` and ``DEATH_EVENT``. Defaults to the
        notebook-level ``dat`` frame, as looked up at call time.

    Returns
    -------
    module
        The ``matplotlib.pyplot`` module, so callers can chain ``.show()``.
    """
    if data is None:
        data = dat
    values = data[feature]

    fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(15, 5), dpi=110)

    # Plot variables are passed by keyword: recent seaborn releases reject
    # positional x/y vectors. histplot(kde=True, stat="density") replaces the
    # deprecated distplot.
    sns.histplot(x=values, kde=True, stat="density", ax=ax1)
    sns.scatterplot(x=values, y=data["DEATH_EVENT"], ax=ax2)
    sns.boxplot(x=values, orient='h', ax=ax3, width=0.2)

    print(f"Skewness Coefficient of {feature} is {values.skew():.2f}")
    # Hide the y ticks on the distribution panel.
    ax1.set_yticks([])

    return plt

In [None]:
import warnings
# Install a blanket 'ignore' filter: every warning category is suppressed from
# this cell onwards (presumably to keep the plotting output below uncluttered).
# NOTE(review): this also hides deprecation and convergence warnings that may
# point at real problems — consider restricting it to specific categories.
warnings.filterwarnings('ignore')

In [None]:
plot_numeric_features("age").show()


In [None]:
plot_numeric_features("anaemia").show()

In [None]:
plot_numeric_features("creatinine_phosphokinase").show()

Clearly there are outliers in the plot above. (Outliers are extreme values that behave like noise in the data and can distort our predictions. In a box plot, any point lying beyond a whisker is treated as an outlier; here the points beyond the rightmost whisker are the outliers.)

Here, any point above 2000 is an outlier.

In [None]:
plot_numeric_features("diabetes").show()

In [None]:
plot_numeric_features("ejection_fraction").show()

Clearly there are outliers in the plot above. (Outliers are extreme values that behave like noise in the data and can distort our predictions. In a box plot, any point lying beyond a whisker is treated as an outlier; here the points beyond the rightmost whisker are the outliers.)

Here, any point above 65 is an outlier.

In [None]:
plot_numeric_features("high_blood_pressure").show()

In [None]:
plot_numeric_features("platelets").show()

Clearly there are outliers in the plot above. (Outliers are extreme values that behave like noise in the data and can distort our predictions. In a box plot, any point lying beyond a whisker is treated as an outlier; here points lie beyond the whiskers on both sides.)

Here, any point below 100000 or above 450000 is an outlier.

In [None]:
plot_numeric_features("serum_creatinine").show()

Clearly there are outliers in the plot above. (Outliers are extreme values that behave like noise in the data and can distort our predictions. In a box plot, any point lying beyond a whisker is treated as an outlier; here the points beyond the rightmost whisker are the outliers.)

Here, any point above 2 is an outlier.

In [None]:
plot_numeric_features("serum_sodium").show()

Clearly there are outliers in the plot above. (Outliers are extreme values that behave like noise in the data and can distort our predictions. In a box plot, any point lying beyond a whisker is treated as an outlier; here the points beyond the leftmost whisker are the outliers.)

Here, any point below 126 is an outlier.

In [None]:
plot_numeric_features("sex").show()

In [None]:
plot_numeric_features("smoking").show()

In [None]:
plot_numeric_features("time").show()

Here we remove the outliers identified above.

In [None]:
# Drop creatinine_phosphokinase outliers: keep only rows strictly below 2000.
dat = dat.loc[dat['creatinine_phosphokinase'].lt(2000)]

In [None]:
# Drop ejection_fraction outliers: keep only rows strictly below 65.
dat = dat.loc[dat['ejection_fraction'].lt(65)]

In [None]:
# Drop platelets outliers: keep only rows strictly between 100000 and 450000.
platelets = dat['platelets']
dat = dat.loc[platelets.gt(100000) & platelets.lt(450000)]

In [None]:
# Drop serum_creatinine outliers: keep only rows strictly below 2.
dat = dat.loc[dat['serum_creatinine'].lt(2)]

In [None]:
# Drop serum_sodium outliers: keep only rows strictly above 126.
dat = dat.loc[dat['serum_sodium'].gt(126)]

In [None]:
dat

Let's check the skewness again.

In [None]:
dat.skew()# skew can be seen reduced after removing outliers

In [None]:
# Checking class imbalance again after removing outliers.
# The Series is passed as the keyword `x`: recent seaborn releases accept only
# `data` positionally, so a positional Series is no longer read as the x variable.
g = sns.countplot(x=dat['DEATH_EVENT'])
g.set_xticklabels(['0','1'])
plt.show()

In [None]:
# The target is the DEATH_EVENT column; every remaining column is a feature.
y = dat['DEATH_EVENT']
x = dat.drop(columns=['DEATH_EVENT'])

As the data is still imbalanced, we need to do either oversampling or undersampling. In oversampling we increase the number of data points in the class that has fewer samples; in undersampling we decrease the number of data points in the class that has more samples.

For now we will oversample by generating synthetic data with SMOTE, which creates new minority-class points by interpolating between a sample and its k nearest neighbours.

In [None]:
from imblearn.over_sampling import SMOTE #importing important libraries for oversampling data
from collections import Counter

# A fixed seed makes the synthetic samples (and every metric derived from them)
# reproducible across runs; an unseeded SMOTE gives different data each time.
smote = SMOTE(random_state=0)

# fit predictor and target variable
x_smote, y_smote = smote.fit_resample(x, y)

# Class counts before and after oversampling.
print('Original dataset shape', Counter(y))
print('Resample dataset shape', Counter(y_smote))

In [None]:
# Plotting the class balance again, this time on the oversampled target.
# The Series is passed as the keyword `x`: recent seaborn releases accept only
# `data` positionally, so a positional Series is no longer read as the x variable.
g = sns.countplot(x=y_smote)
g.set_xticklabels(['0','1'])
plt.show()

In [None]:
from sklearn.model_selection import train_test_split #splitting data
from imblearn.over_sampling import SMOTE

# Split the ORIGINAL (not oversampled) data first. Oversampling before the
# split leaks information: synthetic points interpolated from rows that end up
# in the test set would be trained on, and the test set itself would contain
# synthetic samples, inflating every reported metric.
# stratify=y keeps the class ratio the same in both parts.
X_train_raw, X_test, y_train_raw, y_test = train_test_split(
    x, y, test_size=0.4, random_state=0, stratify=y
)

# Oversample the training part only; the test part stays real, untouched data.
X_train, y_train = SMOTE(random_state=0).fit_resample(X_train_raw, y_train_raw)
X_train.shape

In [None]:
from sklearn.preprocessing import MinMaxScaler #scaling all the features

# Fit the scaler on the training split only, then apply that same fitted
# transformation to both the training and the test split.
scaler = MinMaxScaler().fit(X_train)
X_train_scaled, X_test_scaled = (scaler.transform(part) for part in (X_train, X_test))
#print(X_train_scaled)


# Working on various models

In [None]:
# importing various libraries
from sklearn.linear_model import LogisticRegression 
from sklearn.ensemble import RandomForestClassifier 
from sklearn.ensemble import GradientBoostingClassifier 
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import AdaBoostClassifier
import xgboost as xgb
from catboost import CatBoostClassifier


In [None]:
# creating instances of classifiers
# Classifiers to compare, keyed by a display name.
# Every estimator that uses randomness gets a fixed seed so the comparison is
# reproducible from run to run. CatBoost is silenced (verbose=0) because by
# default it prints one log line per boosting iteration.
cla = {
    "LogisticRegression": LogisticRegression(),
    'Random_Forest': RandomForestClassifier(n_estimators=100, random_state=1),
    'Gradient_Boosting': GradientBoostingClassifier(max_depth=5, random_state=1),
    'K_NN': KNeighborsClassifier(n_neighbors=5, weights='distance'),
    "ADABOOST": AdaBoostClassifier(random_state=1),
    "xgboost": xgb.XGBClassifier(random_state=1, learning_rate=0.01),
    "CatBoost": CatBoostClassifier(random_seed=1, verbose=0),
}

In [None]:
#y_test

### Here the prime focus is on recall, because we cannot tolerate false negatives (i.e. we don't want a person at risk of heart failure to be predicted as healthy)

In [None]:
# Fit every classifier on the scaled training data and report its accuracy and
# recall on the scaled test data, labelled with the classifier's name so each
# pair of numbers can be attributed to a model.
from sklearn.metrics import accuracy_score, recall_score
for name, model in cla.items():
    model.fit(X_train_scaled, y_train)
    # Predict once and reuse the predictions for both metrics.
    y_test_pre = model.predict(X_test_scaled)
    print(name)
    print('accuracy:', accuracy_score(y_test, y_test_pre))
    print('recall:', recall_score(y_test, y_test_pre))
    print('-----------')
