In [4]:

from google.colab import files
uploaded = files.upload()


Saving heart_failure_clinical_records_dataset (1).csv to heart_failure_clinical_records_dataset (1) (1).csv


In [5]:
import pandas as pd


df = pd.read_csv("heart_failure_clinical_records_dataset (1).csv")


print("Shape of dataset:", df.shape)
print("\nColumn names:", df.columns.tolist())
df.head()


Shape of dataset: (299, 13)

Column names: ['age', 'anaemia', 'creatinine_phosphokinase', 'diabetes', 'ejection_fraction', 'high_blood_pressure', 'platelets', 'serum_creatinine', 'serum_sodium', 'sex', 'smoking', 'time', 'DEATH_EVENT']


Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time,DEATH_EVENT
0,75.0,0,582,0,20,1,265000.0,1.9,130,1,0,4,1
1,55.0,0,7861,0,38,0,263358.03,1.1,136,1,0,6,1
2,65.0,0,146,0,20,0,162000.0,1.3,129,1,1,7,1
3,50.0,1,111,0,20,0,210000.0,1.9,137,1,0,7,1
4,65.0,1,160,1,20,0,327000.0,2.7,116,0,0,8,1


In [6]:
X = df.drop("DEATH_EVENT", axis=1)
y = df["DEATH_EVENT"]
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)


In [7]:
from imblearn.combine import SMOTETomek
smt = SMOTETomek(random_state=42)
X_resampled, y_resampled = smt.fit_resample(X_train, y_train)


In [8]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

pipeline = Pipeline([
    ('select', SelectKBest(score_func=f_classif)),
    ('rf', RandomForestClassifier(random_state=42))
])

param_grid = {
    'select__k': ['all', 10, 12],
    'rf__n_estimators': [100, 200],
    'rf__max_depth': [None, 5, 10],
    'rf__min_samples_split': [2, 4],
    'rf__min_samples_leaf': [1, 2],
    'rf__class_weight': ['balanced']
}

grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_resampled, y_resampled)


In [9]:
y_proba = grid_search.predict_proba(X_test)[:,1]


In [10]:
import numpy as np
from sklearn.metrics import accuracy_score

best_threshold = 0.5
best_accuracy = 0

for threshold in np.arange(0.1, 0.9, 0.01):
    y_pred = (y_proba >= threshold).astype(int)
    acc = accuracy_score(y_test, y_pred)
    if acc > best_accuracy:
        best_accuracy = acc
        best_threshold = threshold


In [11]:
from sklearn.metrics import classification_report

final_preds = (y_proba >= best_threshold).astype(int)
print("Best CV Accuracy:", grid_search.best_score_)
print("Best Params:", grid_search.best_params_)
print("Best Threshold:", best_threshold)
print("Final Test Accuracy:", accuracy_score(y_test, final_preds))
print("Classification Report:\n", classification_report(y_test, final_preds))


Best CV Accuracy: 0.8754856614246067
Best Params: {'rf__class_weight': 'balanced', 'rf__max_depth': 5, 'rf__min_samples_leaf': 1, 'rf__min_samples_split': 4, 'rf__n_estimators': 100, 'select__k': 10}
Best Threshold: 0.6099999999999998
Final Test Accuracy: 0.85
Classification Report:
               precision    recall  f1-score   support

           0       0.85      0.95      0.90        41
           1       0.86      0.63      0.73        19

    accuracy                           0.85        60
   macro avg       0.85      0.79      0.81        60
weighted avg       0.85      0.85      0.84        60



In [13]:

joblib.dump(grid_search.best_estimator_, 'heart_failure_model.pkl')


['heart_failure_model.pkl']

In [14]:
import joblib
joblib.dump(grid_search.best_estimator_, 'heart_failure_model.pkl')


['heart_failure_model.pkl']

In [15]:
from google.colab import files
files.download('heart_failure_model.pkl')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>