### Importing required libraries

In [1]:
import numpy as np 
import pandas as pd 

### Loading the Dataset

In [2]:
heart_data = pd.read_csv("../input/heart-failure-records/heart_failure_clinical_records_dataset.csv")

In [3]:
heart_data.head()

Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time,DEATH_EVENT
0,75.0,0,582,0,20,1,265000.0,1.9,130,1,0,4,1
1,55.0,0,7861,0,38,0,263358.03,1.1,136,1,0,6,1
2,65.0,0,146,0,20,0,162000.0,1.3,129,1,1,7,1
3,50.0,1,111,0,20,0,210000.0,1.9,137,1,0,7,1
4,65.0,1,160,1,20,0,327000.0,2.7,116,0,0,8,1


In [4]:
heart_data.isnull().sum()

age                         0
anaemia                     0
creatinine_phosphokinase    0
diabetes                    0
ejection_fraction           0
high_blood_pressure         0
platelets                   0
serum_creatinine            0
serum_sodium                0
sex                         0
smoking                     0
time                        0
DEATH_EVENT                 0
dtype: int64

In [5]:
heart_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 299 entries, 0 to 298
Data columns (total 13 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   age                       299 non-null    float64
 1   anaemia                   299 non-null    int64  
 2   creatinine_phosphokinase  299 non-null    int64  
 3   diabetes                  299 non-null    int64  
 4   ejection_fraction         299 non-null    int64  
 5   high_blood_pressure       299 non-null    int64  
 6   platelets                 299 non-null    float64
 7   serum_creatinine          299 non-null    float64
 8   serum_sodium              299 non-null    int64  
 9   sex                       299 non-null    int64  
 10  smoking                   299 non-null    int64  
 11  time                      299 non-null    int64  
 12  DEATH_EVENT               299 non-null    int64  
dtypes: float64(3), int64(10)
memory usage: 30.5 KB


In [6]:
heart_data.shape

(299, 13)

### Taking the target data in y and training data in X

In [7]:
y = heart_data['DEATH_EVENT'].values
X = heart_data.copy()
X.drop(['DEATH_EVENT'],axis=1,inplace=True)

#### As the data is already noise free and does not contain any missing values, we hop towards Feature Scaling

### Feature Selection (method = Information Gain)

In [8]:
from sklearn.feature_selection import mutual_info_classif

mutual_info = mutual_info_classif(X,y)
mutual_data = pd.Series(mutual_info, index = X.columns)
mutual_data.sort_values(ascending = False)

time                        0.245235
serum_creatinine            0.071753
high_blood_pressure         0.052395
ejection_fraction           0.039677
creatinine_phosphokinase    0.038252
anaemia                     0.030487
serum_sodium                0.025787
age                         0.024342
sex                         0.020091
smoking                     0.004897
diabetes                    0.000000
platelets                   0.000000
dtype: float64

In [9]:
index = mutual_data.sort_values(ascending=False).index
index

Index(['time', 'serum_creatinine', 'high_blood_pressure', 'ejection_fraction',
       'creatinine_phosphokinase', 'anaemia', 'serum_sodium', 'age', 'sex',
       'smoking', 'diabetes', 'platelets'],
      dtype='object')

### Dropping the non required features

#### As per the score of information gain we are taking only top 6 columns and the rest are dropped

In [10]:
X.drop(index[6:],axis=1,inplace=True)

In [11]:
X

Unnamed: 0,anaemia,creatinine_phosphokinase,ejection_fraction,high_blood_pressure,serum_creatinine,time
0,0,582,20,1,1.9,4
1,0,7861,38,0,1.1,6
2,0,146,20,0,1.3,7
3,1,111,20,0,1.9,7
4,1,160,20,0,2.7,8
...,...,...,...,...,...,...
294,0,61,38,1,1.1,270
295,0,1820,38,0,1.2,271
296,0,2060,60,0,0.8,278
297,0,2413,38,0,1.4,280


In [12]:
y

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1,
       1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1,
       1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

### Splitting the data into train-test split

In [13]:
from sklearn.model_selection import train_test_split

Xtrain, Xtest, ytrain, ytest = train_test_split(X,y,test_size=0.2,random_state=68)

#### Training on different models to see the accuracy

### Logistic Regression

In [14]:
from sklearn.linear_model import LogisticRegression

logistic = LogisticRegression()
logistic.fit(Xtrain,ytrain)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression()

### K-Nearest Neighbour

In [15]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier()
knn.fit(Xtrain,ytrain)

KNeighborsClassifier()

### Naive Bayes

In [16]:
from sklearn.naive_bayes import GaussianNB

naive_bayes = GaussianNB()
naive_bayes.fit(Xtrain,ytrain)

GaussianNB()

### Decision Tree

In [17]:
from sklearn.tree import DecisionTreeClassifier

decision_tree = DecisionTreeClassifier()
decision_tree.fit(Xtrain,ytrain)

DecisionTreeClassifier()

### Random Forest

In [18]:
from sklearn.ensemble import RandomForestClassifier

random_forest = RandomForestClassifier()
random_forest.fit(Xtrain,ytrain)

RandomForestClassifier()

### Support Vector Machine

In [19]:
from sklearn.svm import SVC

svc = SVC()
svc.fit(Xtrain,ytrain)


SVC()

### Creating a list of all the objects of models

In [20]:
models = [logistic,knn,naive_bayes,decision_tree,random_forest,svc]

### Finally predicting the accuracy of all the trained models

In [21]:
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score

for i in models:
    y_pred = i.predict(Xtest)

    print("\n",str(i).center(55,"*"),"\n")
    print("Training Accuracy Score = ",accuracy_score(ytrain,i.predict(Xtrain)))
    print("Testing Accuracy Score = ",accuracy_score(ytest,y_pred))
    print("\nConfusion Metrix\n", confusion_matrix(ytest,y_pred))
    print()
    print("Classification Report".center(55,"-"),"\n\n",classification_report(ytest,y_pred),"\n\n")


 ******************LogisticRegression()***************** 

Training Accuracy Score =  0.8410041841004184
Testing Accuracy Score =  0.8666666666666667

Confusion Metrix
 [[41  4]
 [ 4 11]]

-----------------Classification Report----------------- 

               precision    recall  f1-score   support

           0       0.91      0.91      0.91        45
           1       0.73      0.73      0.73        15

    accuracy                           0.87        60
   macro avg       0.82      0.82      0.82        60
weighted avg       0.87      0.87      0.87        60
 



 *****************KNeighborsClassifier()**************** 

Training Accuracy Score =  0.8326359832635983
Testing Accuracy Score =  0.8333333333333334

Confusion Metrix
 [[41  4]
 [ 6  9]]

-----------------Classification Report----------------- 

               precision    recall  f1-score   support

           0       0.87      0.91      0.89        45
           1       0.69      0.60      0.64        15

    accu

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### Conclusion

#### As we can see the Random Forest algorithm is giving the best accuracy.