# Model Building

In [153]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [26]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier

In [99]:
from sklearn.model_selection import GridSearchCV, ShuffleSplit, cross_val_score
from sklearn.metrics import classification_report

In [2]:
# Balanced Dataset
df = pd.read_csv("heart_failure_clinical_records_dataset_2.csv")

In [19]:
# Unbalanced Datatset
# df_1 = pd.read_csv("heart_failure_clinical_records_dataset.csv")

In [49]:
x = df.drop('DEATH_EVENT',axis='columns')
y = df.DEATH_EVENT

In [50]:
# x1 = df.drop('DEATH_EVENT',axis='columns')
# y1 = df.DEATH_EVENT

In [88]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.3)

In [89]:
# x1_train,x1_test,y1_train,y1_test = train_test_split(x1,y1,test_size=0.3)

In [53]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 406 entries, 0 to 405
Data columns (total 13 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   age                       406 non-null    float64
 1   anaemia                   406 non-null    int64  
 2   creatinine_phosphokinase  406 non-null    int64  
 3   diabetes                  406 non-null    int64  
 4   ejection_fraction         406 non-null    int64  
 5   high_blood_pressure       406 non-null    int64  
 6   platelets                 406 non-null    float64
 7   serum_creatinine          406 non-null    float64
 8   serum_sodium              406 non-null    int64  
 9   sex                       406 non-null    int64  
 10  smoking                   406 non-null    int64  
 11  time                      406 non-null    int64  
 12  DEATH_EVENT               406 non-null    int64  
dtypes: float64(3), int64(10)
memory usage: 41.4 KB


In [90]:
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.fit_transform(x_test)

x1_train = scaler.fit_transform(x1_train)
x1_test = scaler.fit_transform(x1_test)


### Logistic Regression

In [356]:
lr = LogisticRegression()

model = lr.fit(x_train,y_train)
# model_1 = lr.fit(x1_train,y1_train)

y_pred = model.predict(x_test)
# y1_pred = model.predict(x1_test)

print("Balanced dataset score: ",accuracy_score(y_pred,y_test))
# print("Unbalanced dataset score: ",accuracy_score(y1_pred,y1_test),'\n')
print("Classification report of balanced dataset: \n",classification_report(y_test,y_pred),'\n')
# print("Classification report of unbalanced dataset: \n",classification_report(y1_test,y1_pred))

Balanced dataset score:  0.8114754098360656
Classification report of balanced dataset: 
               precision    recall  f1-score   support

           0       0.86      0.77      0.82        66
           1       0.76      0.86      0.81        56

    accuracy                           0.81       122
   macro avg       0.81      0.81      0.81       122
weighted avg       0.82      0.81      0.81       122
 



In [358]:
# Parameter Tuning
lr = LogisticRegression(random_state=42)
lr_tuned = lr.fit(x_train,y_train)
y_pred = lr_tuned.predict(x_test)
accuracy_score(y_test,y_pred)

0.8114754098360656

### Decision Tree Classifier

In [114]:
tree = DecisionTreeClassifier()
tree_model = tree.fit(x_train,y_train)
y_pred = tree_model.predict(x_test)
print("Acuuracy Score: ",accuracy_score(y_test,y_pred))
# print("Classification Report: \n", classification_report(y_pred,y_test))

Acuuracy Score:  0.7622950819672131


In [360]:
# Parameter Tuning
tree_params = {
    "max_depth":list(range(1,10)),
    "min_samples_split":list(range(2,50))
}
tree_search_model = GridSearchCV(tree,tree_params,cv=10,n_jobs=-1,verbose=2).fit(x_train,y_train)
tree_search_model.best_params_

Fitting 10 folds for each of 432 candidates, totalling 4320 fits


{'max_depth': 7, 'min_samples_split': 3}

In [362]:
tree_tuned = DecisionTreeClassifier(max_depth=7,min_samples_split=3).fit(x_train,y_train)
y_pred = tree_tuned.predict(x_test)
accuracy_score(y_test,y_pred)

0.7786885245901639

### Random Forest Classifier

In [123]:
rf = RandomForestClassifier(random_state=42)
rf_model = rf.fit(x_train,y_train)
y_pred = rf_model.predict(x_test)
print("Accuracy Score: ",accuracy_score(y_test,y_pred))

Accuracy Score:  0.8852459016393442


In [120]:
# Parameter tuning
rf_params = {
    'n_estimators':[10,500,100],
    'min_samples_split':[2,5,10],
    'max_depth':[2,5,8,10],
    'max_features':[2,5,8]
}

rf_search_model = GridSearchCV(rf,rf_params,cv=10,n_jobs=-1,verbose=2).fit(x_train,y_train)
rf_search_model.best_params_

Fitting 10 folds for each of 108 candidates, totalling 1080 fits


{'max_depth': 8,
 'max_features': 2,
 'min_samples_split': 5,
 'n_estimators': 500}

In [137]:

rf_tuned = RandomForestClassifier(max_depth=8,max_features=2,min_samples_split=5,n_estimators=500).fit(x_train,y_train)
y_pred = rf_tuned.predict(x_test)
accuracy_score(y_pred,y_test)

0.8852459016393442

### MLP Classifier   (ANN)

In [156]:

mlp = MLPClassifier(random_state=42).fit(x_train,y_train)
mlp_model = mlp.fit(x_train,y_train)
y_pred=mlp_model.predict(x_test)
accuracy_score(y_pred,y_test)

0.860655737704918

In [182]:
# Paramter tuning
mlp_params = {
    'hidden_layer_sizes':[(10,10,10),(100,100,100),(100,100)],
    'activation':['relu','logistic'],
    'solver':['lbfgs','adam','sgd'],
    'alpha':[0.1,0.01,0.02,0.005,0.0001,0.00001],
}

mlp_search_model = GridSearchCV(mlp,mlp_params,cv=10,n_jobs=-1,verbose=2).fit(x_train,y_train)
mlp_search_model.best_params_

Fitting 10 folds for each of 108 candidates, totalling 1080 fits


{'activation': 'logistic',
 'alpha': 0.01,
 'hidden_layer_sizes': (100, 100),
 'solver': 'lbfgs'}

In [373]:
mlp_tuned = MLPClassifier(activation='logistic',alpha=0.01,
                          hidden_layer_sizes=(100,100),solver='lbfgs',random_state=42).fit(x_train,y_train)
y_pred = mlp_tuned.predict(x_test)
accuracy_score(y_pred,y_test)

0.8114754098360656

### Gradient Boosting Classifer

In [161]:
gb = GradientBoostingClassifier()
gb_model = gb.fit(x_train,y_train)
y_pred = gb_model.predict(x_test)
accuracy_score(y_pred,y_test)

0.860655737704918

In [184]:
#Parameter Tuning
gb_params={
    'learning_rate':[0.01,0.1,1,2],
    'max_depth':[1,3,5,7,9],
    'n_estimators':[5,50,250,500,1000]
}

gb_search_model = GridSearchCV(gb,gb_params,cv=10,n_jobs=-1,verbose=2).fit(x_train,y_train)
gb_search_model.best_params_

Fitting 10 folds for each of 100 candidates, totalling 1000 fits


{'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 1000}

In [187]:
gb_tuned = GradientBoostingClassifier(learning_rate=0.1,max_depth=5,n_estimators=1000).fit(x_train,y_train)
y_pred = gb_tuned.predict(x_test)
accuracy_score(y_pred,y_test)

0.8688524590163934

###  Support Vector Machine (SVM)

In [267]:
svm = SVC(random_state=42)
svm_model = svm.fit(x_train,y_train)
y_pred = svm_model.predict(x_test)
accuracy_score(y_pred,y_test)

0.8114754098360656

In [271]:
# Paramter tuning

svm_params = {
    'C':[1,10,50,100,200,300,1000],
    'kernel':["rbf"],
    'gamma':[0.001,0.01,1]
}

svm_tuned_model = GridSearchCV(svm,svm_params,cv=10,n_jobs=-1,verbose=2).fit(x_train,y_train)
svm_tuned_model.best_params_

Fitting 10 folds for each of 21 candidates, totalling 210 fits


{'C': 200, 'gamma': 0.01, 'kernel': 'rbf'}

In [272]:
svm_tuned = SVC(C=200,gamma=0.01,kernel='rbf').fit(x_train,y_train)
y_pred = svm_tuned.predict(x_test)
accuracy_score(y_pred,y_test)

0.7950819672131147

### Gaussian Naive Bayes

In [305]:
nb = GaussianNB()
nb_model = nb.fit(x_train,y_train)
nb_model.predict(x_test)[0:10]
# y_pred = nb_model.predict(x_test)
# accuracy_score(x_train,y_train)

array([1, 0, 1, 0, 1, 1, 1, 1, 1, 1], dtype=int64)

In [306]:
nb_model.predict_proba(x_test)[0:10]

array([[0.07507459, 0.92492541],
       [0.98587276, 0.01412724],
       [0.23826152, 0.76173848],
       [0.79512938, 0.20487062],
       [0.01642732, 0.98357268],
       [0.49441102, 0.50558898],
       [0.00109159, 0.99890841],
       [0.41029895, 0.58970105],
       [0.03733628, 0.96266372],
       [0.0216331 , 0.9783669 ]])

In [308]:
y_pred = nb_model.predict(x_test)
accuracy_score(y_test,y_pred)

0.7950819672131147

In [325]:
# Parameter Tuning

nb_params = {
    'var_smoothing':np.logspace(0,-9,num=100)
}

nb_search_model = GridSearchCV(estimator = nb, param_grid=nb_params,cv=10,verbose=1,scoring='accuracy').fit(x_train,y_train)
nb_search_model.best_params_

Fitting 10 folds for each of 100 candidates, totalling 1000 fits


{'var_smoothing': 0.3511191734215131}

In [323]:
nb_tuned = GaussianNB(var_smoothing=1).fit(x_train,y_train)
y_pred = nb_tuned.predict(x_test)
accuracy_score(y_pred,y_test)

0.8278688524590164

### K Nearest Neighbors (KNN)

In [297]:
knn = KNeighborsClassifier()
knn_model = knn.fit(x_train,y_train)
y_pred = knn_model.predict(x_test)
accuracy_score(y_pred,y_test)

0.7131147540983607

In [298]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.75      0.70      0.72        66
           1       0.67      0.73      0.70        56

    accuracy                           0.71       122
   macro avg       0.71      0.71      0.71       122
weighted avg       0.72      0.71      0.71       122



In [348]:
knn_params = {
    'n_neighbors':np.arange(1,50)
}
knn_search_model = GridSearchCV(knn,knn_params,cv=10).fit(x_train,y_train)
knn_search_model.best_params_

{'n_neighbors': 11}

In [346]:
knn_tuned = KNeighborsClassifier(n_neighbors=11).fit(x_train,y_train)
y_pred = knn_tuned.predict(x_test)
accuracy_score(y_pred,y_test)

0.7377049180327869

In [376]:
models = [
    lr_tuned,tree_tuned,rf_tuned,mlp_model,gb_tuned,svm_model,
    nb_tuned,knn_tuned
]

results=[]
results = pd.DataFrame(columns=['Model','Accuracy_Score'])
for model in models:
    names=model.__class__.__name__
    y_pred = model.predict(x_test)
    accuracyScore = accuracy_score(y_test,y_pred)
    result = pd.DataFrame([[names,accuracyScore*100]],columns=['Model','Accuracy_Score'])
    results = results.append(result)

results = results.sort_values('Accuracy_Score').reset_index()


In [377]:
results

Unnamed: 0,index,Model,Accuracy_Score
0,0,KNeighborsClassifier,73.770492
1,0,DecisionTreeClassifier,77.868852
2,0,LogisticRegression,81.147541
3,0,SVC,81.147541
4,0,GaussianNB,82.786885
5,0,MLPClassifier,86.065574
6,0,GradientBoostingClassifier,86.885246
7,0,RandomForestClassifier,87.704918
