## ♥ Failure : Comparison of 6 classification models

![](https://patients.healthquest.org/wp-content/uploads/2018/05/congestive-heart-failure-feature2.jpg)

## 1. Import Libraries and Dataset

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import warnings
warnings.filterwarnings("ignore")

# Data Visualization
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
 
# Model Selection and utilities
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.metrics import confusion_matrix, classification_report, plot_confusion_matrix

# Model Libraries
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Read the heart-failure clinical records CSV and preview the first rows.
df = pd.read_csv(
    "../input/heart-failure-clinical-data/heart_failure_clinical_records_dataset.csv"
)
df.head()

In this dataset:
* It contains 299 rows (patient information).
* It contains 13 columns (12 features and DEATH_EVENT target variable).
* 10 columns are integer type (this count includes the DEATH_EVENT target).
* 3 columns are float type.

***FEATURES***

**age**: age of patient

**anaemia**: Decrease of red blood cells or hemoglobin

**creatinine_phosphokinase**: Level of the CPK enzyme in the blood (mcg/L)

**diabetes**: If the patient has diabetes

**ejection_fraction**: Percentage of blood leaving the heart at each contraction (percentage)

**high_blood_pressure**: If the patient has hypertension

**platelets**: Platelets in the blood

**serum_creatinine**: Level of serum creatinine in the blood (mg/dL)

**serum_sodium**: Level of serum sodium in the blood (mEq/L)

**sex**: Woman or man (binary)

**smoking**: If the patient smokes or not

**time**: Follow-up period (days)

**DEATH_EVENT**: If the patient deceased during the follow-up period

In [None]:
# Summary statistics, transposed so that each column of df becomes a row.
df.describe().transpose()

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
# Print pandas' concise summary of the DataFrame (columns and their dtypes).
df.info()

## 2. Data analysis and Visualization

In [None]:
# Names of the columns treated as categorical. DEATH_EVENT is listed here too,
# so it is excluded from the numerical set built below.
cat_columns = [
    "anaemia",
    "diabetes",
    "high_blood_pressure",
    "sex",
    "smoking",
    "DEATH_EVENT",
]

# Every other column name is treated as numerical; kept as a Series of names.
num_columns = pd.Series(df.columns).loc[lambda names: ~names.isin(cat_columns)]

Frequency distribution of Categorical Variables

In [None]:
# One count plot per categorical column, laid out on a 2x3 grid.
fig, axs = plt.subplots(nrows=2, ncols=3, figsize=(10, 6))
titles = df[cat_columns].columns.tolist()

for ax, title in zip(axs.ravel(), titles):
    sns.countplot(data=df, x=title, ax=ax, palette='muted')
    # The column name is shown as the panel title, so the x label is blanked.
    ax.set_xlabel('')
    ax.set_title(title)

plt.tight_layout()

Frequency distribution of Continuous Variables

In [None]:
# Overlay the distribution of every numerical column for DEATH_EVENT == 0 ('No')
# and DEATH_EVENT == 1 ('Yes') on a 3x3 grid of axes.
df_grouped = df.groupby(by='DEATH_EVENT')
fig, axs = plt.subplots(nrows=3, ncols=3, figsize=(15, 8))
titles = list(df[num_columns])
axes = axs.ravel()

# Look the two groups up once instead of on every loop iteration.
group_no = df_grouped.get_group(0)
group_yes = df_grouped.get_group(1)

for ax, title in zip(axes, titles):
    sns.distplot(group_no[title], bins=10, ax=ax, label='No')
    sns.distplot(group_yes[title], bins=10, ax=ax, label='Yes')
    ax.legend(title='DEATH_EVENT')

# Remove whichever grid cells were left unused rather than assuming that
# exactly the last two are empty.
for unused_ax in axes[len(titles):]:
    unused_ax.remove()

fig.tight_layout()

## 3. Modelling on raw dataset

In [None]:
# Target vector and feature matrix (every column except the last one).
y_raw = df['DEATH_EVENT'].to_numpy()
X_raw = df.iloc[:, :-1].to_numpy()

# Hold out 20% of the rows as a test set; the seed keeps the split repeatable.
X_train_r, X_test_r, y_train_r, y_test_r = train_test_split(
    X_raw,
    y_raw,
    test_size=0.2,
    random_state=1,
)

# Filled in by the modelling cells: model name -> dict of test-set metrics.
result_dict = dict()

Utility Functions

In [None]:
def fit_model_with_grid_search(model, X_t, y_t, parameters, scoring='f1', verbose=1):
  """Tune ``model`` with a cross-validated grid search and return the search.

  Parameters
  ----------
  model : estimator handed to ``GridSearchCV``.
  X_t, y_t : training features and labels passed to ``fit``.
  parameters : parameter grid forwarded to ``GridSearchCV``.
  scoring : value forwarded to ``GridSearchCV`` as ``scoring``.
  verbose : when truthy, print the best parameters and the best CV score.

  Returns
  -------
  The fitted ``GridSearchCV`` object.
  """
  search = GridSearchCV(model, parameters, scoring=scoring)
  search.fit(X_t, y_t)

  if verbose:
    # Label the score with the metric that was actually optimised; the
    # message used to say "F1" even when e.g. scoring='accuracy' was passed.
    if scoring == 'f1':
      metric_name = 'F1'
    elif isinstance(scoring, str):
      metric_name = scoring
    elif scoring is None:
      metric_name = 'default'
    else:
      metric_name = getattr(scoring, '__name__', 'custom')

    print(f'\nbest_params_: {search.best_params_}')
    print(f'Mean cross-validated {metric_name} score of the best_estimator: {search.best_score_:.4f}')

  return search

def print_metrics(cf, X_t, y_t):
  """Score a fitted classifier on ``(X_t, y_t)``, print and plot the results.

  Parameters
  ----------
  cf : fitted classifier exposing ``predict``.
  X_t : features to predict on.
  y_t : true labels for ``X_t``.

  Returns
  -------
  dict with keys ``'accuracy'``, ``'f1'``, ``'precision'`` and ``'recall'``;
  the last three are macro-averaged.
  """
  # Predict with the classifier that was passed in. Reading the notebook-level
  # `classifier` global here made the `cf` argument a silent no-op.
  y_pred = cf.predict(X_t)

  accuracy = accuracy_score(y_t, y_pred)
  f1 = f1_score(y_t, y_pred, average='macro')
  precision = precision_score(y_t, y_pred, average='macro')
  recall = recall_score(y_t, y_pred, average='macro')

  print(f'\nAccuracy (test set)\t| {accuracy:.4f}')
  print(f'F1 (test set)\t\t| {f1:.4f}')
  print(f'Precision (test set)\t| {precision:.4f}')
  print(f'Recall (test set)\t| {recall:.4f}\n')

  # Confusion matrix rendered as an annotated heatmap.
  cm = confusion_matrix(y_t, y_pred)
  plt.figure(figsize=(5,3))
  sns.heatmap(cm,annot=True, linewidths=.5)
  plt.show()

  return {
    'accuracy': accuracy,
    'f1': f1,
    'precision': precision,
    'recall': recall,
  }

### 1. Logistic Regression

In [None]:
print("*Logistic Regression*")

# Grid-search over C, selecting by F1, then score the winner on the raw test split.
parameters = {'C': [0.01, 0.1, 1]}
model_logistic_regression = LogisticRegression()
classifier = fit_model_with_grid_search(
    model_logistic_regression, X_train_r, y_train_r, parameters, scoring='f1'
)

logistic_metrics = print_metrics(classifier, X_test_r, y_test_r)
result_dict['Logistic Regression'] = logistic_metrics

### 2. KNN

In [None]:
print("*K Nearest Neighours*")

# Grid-search over neighbourhood size (2..20) and vote weighting, selecting by accuracy.
parameters = dict(
    n_neighbors=list(range(2, 21)),
    weights=['uniform', 'distance'],
)
model_knn = KNeighborsClassifier()
classifier = fit_model_with_grid_search(
    model_knn, X_train_r, y_train_r, parameters, scoring='accuracy'
)

knn_metrics = print_metrics(classifier, X_test_r, y_test_r)
result_dict['KNN'] = knn_metrics

In [None]:
# Test-set accuracy of KNN on the raw split for n_neighbors = 2..20.
neighbor_range = list(range(2, 21))
knn_raw_accuracies = []
for neighbors in neighbor_range:
  # Use a loop-local estimator so the tuned model held in `classifier`
  # from the grid-search cell is not overwritten by this sweep.
  knn = KNeighborsClassifier(n_neighbors=neighbors)
  knn.fit(X_train_r, y_train_r)
  knn_raw_accuracies.append(accuracy_score(y_test_r, knn.predict(X_test_r)))

plt.plot(neighbor_range, knn_raw_accuracies)
plt.xlabel('n_neighbors')
plt.ylabel('Accuracy (test set)')
plt.title('KNN on raw features')
plt.show()

### 3. SVM

In [None]:
print("*Support Vector Machine*")

# Grid-search over C, selecting by accuracy, then score on the raw test split.
parameters = {"C": [0.001, 0.01, 0.1, 1]}
model_svm = SVC()
classifier = fit_model_with_grid_search(
    model_svm, X_train_r, y_train_r, parameters, scoring='accuracy'
)

svm_metrics = print_metrics(classifier, X_test_r, y_test_r)
result_dict['SVM'] = svm_metrics

### 4. Decision Tree

In [None]:
print("*Decision Tree Classsifier*")

# Seed the tree so the grid-search winner and its scores are the same on every
# run (the seed matches the one used for the train/test split).
model_decision = DecisionTreeClassifier(random_state=1)
parameters = {
  "max_depth": [1, 2, 3, 5, 10, None],
  "max_leaf_nodes": list(range(2, 15)),
  "criterion": ["entropy"],
}
classifier = fit_model_with_grid_search(
    model_decision,
    X_train_r,
    y_train_r,
    parameters,
    scoring='f1',
)

result_dict['Decision Tree'] = print_metrics(classifier, X_test_r, y_test_r)

### 5. Random Forest

In [None]:
print("*Random Forest Classifier*")

# A random forest is stochastic; without a seed the selected n_estimators and
# the reported metrics change from run to run.
model_rand_forest = RandomForestClassifier(random_state=1)
parameters = {
    "n_estimators": list(range(10,21)),
}
classifier = fit_model_with_grid_search(
    model_rand_forest,
    X_train_r,
    y_train_r,
    parameters,
    scoring='f1',
)

result_dict['Random Forest'] = print_metrics(classifier, X_test_r, y_test_r)

### 6. Naive Bayes

In [None]:
print("*Gaussian NaiveBayes*")

# No hyper-parameters are tuned here: fit once on the raw training split.
classifier = GaussianNB().fit(X_train_r, y_train_r)

nb_metrics = print_metrics(classifier, X_test_r, y_test_r)
result_dict['NaiveBayes'] = nb_metrics

## 4. Feature Selection

### 1. Correlation matrix

In [None]:
# plt.subplots returns (figure, axes) in that order; the names used to be swapped.
fig, ax = plt.subplots(figsize=(12,12))

# Pairwise correlation of all columns; `corr` is reused by the next cell.
corr = df.corr()

# Draw on the axes created above instead of relying on the implicit current axes.
sns.heatmap(corr, vmin=-1, cmap='coolwarm', annot=True, ax=ax)
plt.xticks(rotation=30, ha='right')
plt.show()

The correlation matrix above shows how the features relate to one another. These relationships are useful when building a model: the more strongly a feature is correlated with the target, the more likely it is to help the model. Here we look at the correlation of DEATH_EVENT with the other features. Features whose absolute correlation with DEATH_EVENT is greater than 0.1 may be important risk factors for the death event.

In [None]:
# Correlations with DEATH_EVENT whose magnitude exceeds 0.1.
corr.loc[corr['DEATH_EVENT'].abs() > 0.1, 'DEATH_EVENT']

### 2. Extra Trees Classifier

In [None]:
# Feature Selection
from sklearn.ensemble import ExtraTreesClassifier

plt.rcParams['figure.figsize']=12,6
sns.set_style("darkgrid")

# All columns except the last are features; the last column is the target.
x1 = df.iloc[:, :-1]
y1 = df.iloc[:,-1]

# Extra-trees is randomised; seed it so the importance ranking is reproducible.
model = ExtraTreesClassifier(random_state=1)
model.fit(x1,y1)

# Label each importance with its column name and plot the 12 largest.
feat_importances = pd.Series(model.feature_importances_, index=x1.columns)
feat_importances.nlargest(12).plot(kind='barh', title='ExtraTreesClassifier feature importances')
plt.xlabel('Importance')
plt.show()

In [None]:
def plot_histogram(dataset, feature, color, title, labels):
  """Show a plotly histogram of ``feature`` coloured by ``color``.

  A box plot is drawn in the margin, every column of ``dataset`` is offered
  as hover data, and ``labels`` is forwarded to plotly as the ``labels``
  argument. The figure is displayed; nothing is returned.
  """
  histogram_options = {
    "x": feature,
    "color": color,
    "marginal": "box",
    "hover_data": dataset.columns,
    "title": title,
    "labels": labels,
    "width": 800,
    "template": "plotly_white",
  }
  px.histogram(dataset, **histogram_options).show()

In [None]:
# Age distribution, split by DEATH_EVENT.
plot_histogram(
    dataset=df,
    feature='age',
    color='DEATH_EVENT',
    title='AGE Vs DEATH_EVENT',
    labels={"age": "AGE"},
)

In [None]:
# Ejection-fraction distribution, split by DEATH_EVENT.
plot_histogram(
    dataset=df,
    feature='ejection_fraction',
    color='DEATH_EVENT',
    title='EJECTION FRACTION Vs DEATH_EVENT',
    labels={"ejection_fraction": "EJECTION FRACTION"},
)

In [None]:
# Serum-sodium distribution, split by DEATH_EVENT.
plot_histogram(
    dataset=df,
    feature='serum_sodium',
    color='DEATH_EVENT',
    title='SERUM SODIUM Vs DEATH_EVENT',
    labels={"serum_sodium": "SERUM SODIUM"},
)

In [None]:
# Serum-creatinine distribution, split by DEATH_EVENT.
plot_histogram(
    dataset=df,
    feature='serum_creatinine',
    color='DEATH_EVENT',
    title='SERUM CREATININE Vs DEATH_EVENT',
    labels={"serum_creatinine": "SERUM CREATININE"},
)

## 5. Model Training and Prediction on selected features

In [None]:
# Target vector, then the feature matrix restricted to the five selected columns.
y = df['DEATH_EVENT'].to_numpy()

X = df.loc[
    :,
    ['ejection_fraction', 'serum_creatinine', 'serum_sodium', 'time', 'age'],
].to_numpy()

In [None]:
# Same 80/20 split and seed as used for the raw-feature experiments.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

Data Scaling

In [None]:
# Learn the scaling statistics from the training split only, then apply the
# same transform to both splits.
sc = StandardScaler().fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

### 1. K Nearest Neighbours

In [None]:
# Test-set accuracy of KNN on the scaled, selected features for n_neighbors = 2..14.
# NOTE(review): choosing k from test-set accuracy leaks test information into
# model selection; a validation split or cross-validation would avoid that.
neighbor_range = list(range(2, 15))
knn_scaled_accuracies = []
for neighbors in neighbor_range:
  # Loop-local estimator: keeps this sweep from overwriting `classifier`.
  knn = KNeighborsClassifier(n_neighbors=neighbors)
  knn.fit(X_train, y_train)
  knn_scaled_accuracies.append(accuracy_score(y_test, knn.predict(X_test)))

plt.plot(neighbor_range, knn_scaled_accuracies)
plt.xlabel('n_neighbors')
plt.ylabel('Accuracy (test set)')
plt.title('KNN on scaled selected features')
plt.show()

In [None]:
print("*K Nearest Neighours (Transformed Data)*")

# n_neighbors is fixed at 11 here rather than tuned in this cell.
classifier = KNeighborsClassifier(n_neighbors=11)
classifier.fit(X_train, y_train)

# print_metrics computes the predictions itself, so the separate, unused
# `y_pred = classifier.predict(X_test)` call was dropped.
result_dict['KNN(Selected Features)'] = print_metrics(classifier, X_test, y_test)

### 2. SVM

In [None]:
# Test-set accuracy of SVC on the scaled, selected features for several C values.
# The grid is defined once so the loop and the plot cannot drift apart.
c_values = [0.001, 0.01, 0.05, 0.1, 0.2, 0.4, 0.5, 0.75, 0.8, 0.9, 1]
svm_scaled_accuracies = []
for c in c_values:
  # Loop-local estimator: keeps this sweep from overwriting `classifier`.
  svm = SVC(C = c)
  svm.fit(X_train, y_train)
  svm_scaled_accuracies.append(accuracy_score(y_test, svm.predict(X_test)))

plt.plot(c_values, svm_scaled_accuracies)
plt.xlabel('C')
plt.ylabel('Accuracy (test set)')
plt.title('SVM on scaled selected features')
plt.show()

In [None]:
print("*Support Vector machine (Transformed Data)*")

# C is fixed at 0.2 here rather than tuned in this cell.
classifier = SVC(C = 0.2)
classifier.fit(X_train, y_train)

# print_metrics computes the predictions itself, so the separate, unused
# `y_pred = classifier.predict(X_test)` call was dropped.
result_dict['SVM (Selected Features)'] = print_metrics(classifier, X_test, y_test)

## 6. Model performance comparison

In [None]:
# One row per model, one column per metric.
Results = pd.DataFrame(result_dict).transpose()
Results