# Heart Failure Prediction Project - Model Training (XGBoost)

This notebook covers the steps for loading, preprocessing, training, and evaluating an XGBoost model for Heart Failure Prediction. The goal is to achieve an accuracy over 80%.

In [1]:
## 1. Install Required Libraries
!pip install pandas numpy scikit-learn xgboost
print("Libraries installed successfully!")


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Libraries installed successfully!


In [2]:
## 2. Import Libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV
import pickle
import os

# Import XGBoost
import xgboost as xgb

print("All necessary libraries imported.")

All necessary libraries imported.


In [3]:
## 3. Load Dataset
# IMPORTANT: Upload 'heart_failure_clinical_records_dataset (1).csv' to your Colab session storage first.
file_path = 'heart_failure_clinical_records_dataset (1).csv'
try:
    df = pd.read_csv(file_path)
    print("Dataset loaded successfully!")
    print("Shape of the dataset:", df.shape)
    print("First 5 rows:")
    print(df.head())
except FileNotFoundError:
    print(f"Error: The file '{file_path}' was not found. Please ensure it's uploaded to Colab.")
    print("Please upload the dataset file to your Colab environment and try again.")
    # For demonstration, creating a dummy dataframe if file is not found
    data = {
        'age': np.random.randint(40, 90, 10),
        'anaemia': np.random.randint(0, 2, 10),
        'creatinine_phosphokinase': np.random.randint(50, 8000, 10),
        'diabetes': np.random.randint(0, 2, 10),
        'ejection_fraction': np.random.randint(10, 80, 10),
        'high_blood_pressure': np.random.randint(0, 2, 10),
        'platelets': np.random.randint(100000, 500000, 10),
        'serum_creatinine': np.random.rand(10) * 5 + 0.5,
        'serum_sodium': np.random.randint(120, 150, 10),
        'sex': np.random.randint(0, 2, 10),
        'smoking': np.random.randint(0, 2, 10),
        'time': np.random.randint(1, 300, 10),
        'DEATH_EVENT': np.random.randint(0, 2, 10)
    }
    df = pd.DataFrame(data)
    print("Using a small dummy dataset for demonstration due to FileNotFoundError.")
    print(df.head())


Dataset loaded successfully!
Shape of the dataset: (299, 13)
First 5 rows:
    age  anaemia  creatinine_phosphokinase  diabetes  ejection_fraction  \
0  75.0        0                       582         0                 20   
1  55.0        0                      7861         0                 38   
2  65.0        0                       146         0                 20   
3  50.0        1                       111         0                 20   
4  65.0        1                       160         1                 20   

   high_blood_pressure  platelets  serum_creatinine  serum_sodium  sex  \
0                    1  265000.00               1.9           130    1   
1                    0  263358.03               1.1           136    1   
2                    0  162000.00               1.3           129    1   
3                    0  210000.00               1.9           137    1   
4                    0  327000.00               2.7           116    0   

   smoking  time  DEATH_EVENT

In [4]:
## 4. Data Preprocessing

### 4.1. Check for Missing Values
print("Missing values per column:")
print(df.isnull().sum())

if df.isnull().sum().sum() == 0:
    print("\nNo missing values found. Dataset is clean.")
else:
    print("\nMissing values found. Consider imputation strategies if this is not a dummy dataset.")

### 4.2. Separate Features (X) and Target (y)
X = df.drop('DEATH_EVENT', axis=1)
y = df['DEATH_EVENT']

print("Features (X) shape:", X.shape)
print("Target (y) shape:", y.shape)

### 4.3. Split Data into Training and Testing Sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)

### 4.4. Feature Scaling (Standardization)
scaler = StandardScaler()

# Fit on training data and transform both training and test data
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Convert back to DataFrame for better readability (optional, but good for inspection)
X_train_scaled = pd.DataFrame(X_train_scaled, columns=X.columns)
X_test_scaled = pd.DataFrame(X_test_scaled, columns=X.columns)

print("X_train_scaled head:")
print(X_train_scaled.head())



Missing values per column:
age                         0
anaemia                     0
creatinine_phosphokinase    0
diabetes                    0
ejection_fraction           0
high_blood_pressure         0
platelets                   0
serum_creatinine            0
serum_sodium                0
sex                         0
smoking                     0
time                        0
DEATH_EVENT                 0
dtype: int64

No missing values found. Dataset is clean.
Features (X) shape: (299, 12)
Target (y) shape: (299,)
X_train shape: (239, 12)
X_test shape: (60, 12)
y_train shape: (239,)
y_test shape: (60,)
X_train_scaled head:
        age   anaemia  creatinine_phosphokinase  diabetes  ejection_fraction  \
0 -0.269050  1.110696                 -0.200735 -0.900337           0.176528   
1 -0.706883 -0.900337                 -0.534318  1.110696           1.847425   
2  1.219579 -0.900337                 -0.020580 -0.900337          -1.494369   
3  0.256348 -0.900337                 -0

In [5]:
## 5. Hyperparameter Tuning with GridSearchCV
# Define the XGBoost model with fixed parameters that won't be tuned
xgb_model = xgb.XGBClassifier(
    objective='binary:logistic', # For binary classification
    eval_metric='logloss',       # Evaluation metric
    use_label_encoder=False,     # Suppress warning for future versions
    random_state=42
)

# Define the parameter grid to search
param_grid = {
    'n_estimators': [50, 100, 200], # Number of boosting rounds
    'learning_rate': [0.01, 0.1, 0.2], # Step size shrinkage
    'max_depth': [3, 5, 7], # Maximum depth of a tree
    'subsample': [0.7, 0.8, 0.9], # Subsample ratio of the training instance
    'colsample_bytree': [0.7, 0.8, 0.9] # Subsample ratio of columns when constructing each tree
}

# Initialize GridSearchCV
grid_search = GridSearchCV(
    estimator=xgb_model,
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
    verbose=2 # Set to 1 or 2 for more detailed output during tuning
)

print("Starting GridSearchCV... This may take some time.")
grid_search.fit(X_train_scaled, y_train)

print("GridSearchCV complete.")
print(f"Best parameters found: {grid_search.best_params_}")
print(f"Best cross-validation accuracy: {grid_search.best_score_:.4f}")

# Get the best model from GridSearchCV
model = grid_search.best_estimator_
print("Best XGBoost model assigned for further evaluation and saving.")


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.


In [6]:
## 6. Model Evaluation (with Tuned Model)
y_pred = model.predict(X_test_scaled)

accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))


Accuracy: 0.8500

Classification Report:
              precision    recall  f1-score   support

           0       0.85      0.95      0.90        41
           1       0.86      0.63      0.73        19

    accuracy                           0.85        60
   macro avg       0.85      0.79      0.81        60
weighted avg       0.85      0.85      0.84        60


Confusion Matrix:
[[39  2]
 [ 7 12]]


In [7]:
## 7. Save the Trained Model, Scaler, and Feature Defaults
# Create a directory for saved models if it doesn't exist
if not os.path.exists('saved_models'):
    os.makedirs('saved_models')
    print("Created 'saved_models' directory.")
else:
    print("'saved_models' directory already exists.")

# Save the model
model_filename = 'trained_model.pkl'
with open(os.path.join('saved_models', model_filename), 'wb') as file:
    pickle.dump(model, file)
print(f"Model saved as {os.path.join('saved_models', model_filename)}")

# It's crucial to save the scaler as well, as new data for prediction must be scaled using the same scaler
scaler_filename = 'scaler.pkl'
with open(os.path.join('saved_models', scaler_filename), 'wb') as file:
    pickle.dump(scaler, file)
print(f"Scaler saved as {os.path.join('saved_models', scaler_filename)}")

# Calculate and save default values for features
default_feature_values = X_train.mean().to_dict() # Using mean as default for continuous features

# For binary features, it's often better to use the mode (most frequent value) or a sensible default (e.g., 0 for no disease)
binary_features = ['anaemia', 'diabetes', 'high_blood_pressure', 'sex', 'smoking']
for bf in binary_features:
    if bf in default_feature_values:
        # Set to mode (most frequent value) for binary features
        default_feature_values[bf] = int(X_train[bf].mode()[0]) # Cast to int for binary

default_values_filename = 'default_feature_values.pkl'
with open(os.path.join('saved_models', default_values_filename), 'wb') as file:
    pickle.dump(default_feature_values, file)
print(f"Default feature values saved as {os.path.join('saved_models', default_values_filename)}")

print("\nDefault values calculated:")
for k, v in default_feature_values.items():
    print(f"  {k}: {v:.2f}")


'saved_models' directory already exists.
Model saved as saved_models/trained_model.pkl
Scaler saved as saved_models/scaler.pkl
Default feature values saved as saved_models/default_feature_values.pkl

Default values calculated:
  age: 61.07
  anaemia: 0.00
  creatinine_phosphokinase: 602.79
  diabetes: 0.00
  ejection_fraction: 37.89
  high_blood_pressure: 0.00
  platelets: 263670.55
  serum_creatinine: 1.39
  serum_sodium: 136.53
  sex: 1.00
  smoking: 0.00
  time: 127.22


In [8]:
## 8. Download Saved Files (Optional, for Local Deployment)
# This cell is for downloading the saved model, scaler, and default values
# if you plan to deploy the Flask app locally.
# You will need to manually download the 'saved_models' folder.

#from google.colab import files

#print("To download the 'saved_models' folder:")
#print("1. Click the folder icon on the left sidebar.")
#print("2. Navigate to the 'saved_models' folder.")
#print("3. Right-click on the 'saved_models' folder and select 'Download'.")
#print("This will download a .zip file containing your model, scaler, and default values.")

# You can also download individual files if preferred:
# files.download('saved_models/trained_model.pkl')
# files.download('saved_models/scaler.pkl')
# files.download('saved_models/default_feature_values.pkl')


<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=4e097d3f-c241-40be-acfd-1a4a1c8dfa41' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>