# Heart Disease Prediction Model Training


In [31]:
# Import necessary libraries
import pandas as pd
import numpy as np
import warnings
import os

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler

import pickle

# Configure environment
warnings.filterwarnings('ignore')

print("Libraries imported successfully")

Libraries imported successfully


## Data Loading

In [32]:
# Read the heart disease records from a CSV path relative to the working directory
csv_path = 'Heart_Disease_Prediction.csv'
df = pd.read_csv(csv_path)

# The "features" count printed here is the full column count of the frame;
# the preprocessing cell later splits off the last column as the target.
print(f"Dataset loaded: {df.shape[0]} samples, {df.shape[1]} features")
print("Data loading complete")

Dataset loaded: 270 samples, 14 features
Data loading complete


In [33]:
# Data quality: count rows that exactly repeat an earlier row
# (keep='first' is the pandas default, spelled out for clarity)
duplicates = df.duplicated(keep='first').sum()
print(f"Duplicate rows found: {duplicates}")
print("Data quality check complete")

Duplicate rows found: 0
Data quality check complete


## Data Preprocessing


In [34]:
# Data preprocessing: split the frame into predictors and a binary target
print("Data Preprocessing...")

# Separate features and target: the last column is the label, the rest are predictors
X = df.iloc[:, :-1]  # Features
y = df.iloc[:, -1]   # Target

# Clean target values - remove leading/trailing whitespace
y_cleaned = y.str.strip()

# Fail loudly on missing or unexpected labels. Without this check the equality
# test below would silently encode anything that is not exactly 'Presence'
# (typos, different casing, NaN) as 0, i.e. as 'Absence'.
expected_labels = {'Absence', 'Presence'}
missing_labels = int(y_cleaned.isna().sum())
unexpected_labels = set(y_cleaned.dropna().unique()) - expected_labels
if missing_labels or unexpected_labels:
    raise ValueError(
        f"Target column contains {missing_labels} missing value(s) and "
        f"unexpected label(s): {sorted(unexpected_labels)}; "
        f"expected only {sorted(expected_labels)}"
    )

# Convert target to binary encoding: Presence -> 1, Absence -> 0
y_binary = (y_cleaned == 'Presence').astype(int)

print(f"Features: {X.shape}")
print(f"Target distribution - Absence: {(y_binary == 0).sum()}, Presence: {(y_binary == 1).sum()}")
print("Data preprocessing complete")

Data Preprocessing...
Features: (270, 13)
Target distribution - Absence: 150, Presence: 120
Data preprocessing complete


In [35]:
# Hold-out split followed by feature standardization
print("Splitting data and scaling features...")

# 80/20 split with a fixed seed, stratified on the label so both partitions
# keep the same class proportions
split_options = dict(test_size=0.2, random_state=42, stratify=y_binary)
X_train, X_test, y_train, y_test = train_test_split(X, y_binary, **split_options)

# Learn the scaling statistics from the training rows only, then apply the
# same transformation to both partitions
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print(f"Training set: {X_train.shape[0]} samples")
print(f"Test set: {X_test.shape[0]} samples")
print("Data split and scaling complete")

Splitting data and scaling features...
Training set: 216 samples
Test set: 54 samples
Data split and scaling complete


## ML Models Implementation


In [36]:
# Random Forest: fit on the unscaled training features, score on the test split
print("Training Random Forest Classifier...")

rf_params = {
    'n_estimators': 100,
    'random_state': 42,
    'max_depth': 10,
    'min_samples_split': 5,
    'min_samples_leaf': 2,
}
# fit() returns the estimator itself, so construction and training can be chained
rf_model = RandomForestClassifier(**rf_params).fit(X_train, y_train)

rf_predictions = rf_model.predict(X_test)
rf_accuracy = accuracy_score(y_test, rf_predictions)

print(f"Random Forest - Test Accuracy: {rf_accuracy:.4f} ({rf_accuracy*100:.2f}%)")

Training Random Forest Classifier...
Random Forest - Test Accuracy: 0.8333 (83.33%)


In [37]:
# Decision Tree: fit on the unscaled training features, score on the test split
print("Training Decision Tree Classifier...")

dt_params = {
    'random_state': 42,
    'max_depth': 8,
    'min_samples_split': 10,
    'min_samples_leaf': 5,
    'criterion': 'gini',
}
# fit() returns the estimator itself, so construction and training can be chained
dt_model = DecisionTreeClassifier(**dt_params).fit(X_train, y_train)

dt_predictions = dt_model.predict(X_test)
dt_accuracy = accuracy_score(y_test, dt_predictions)

print(f"Decision Tree - Test Accuracy: {dt_accuracy:.4f} ({dt_accuracy*100:.2f}%)")

Training Decision Tree Classifier...
Decision Tree - Test Accuracy: 0.7963 (79.63%)


In [38]:
# Logistic Regression: the only candidate trained and scored on the
# standardized features (X_train_scaled / X_test_scaled)
print("Training Logistic Regression...")

lr_params = {
    'random_state': 42,
    'max_iter': 1000,
    'C': 1.0,
    'solver': 'liblinear',
}
# fit() returns the estimator itself, so construction and training can be chained
lr_model = LogisticRegression(**lr_params).fit(X_train_scaled, y_train)

lr_predictions = lr_model.predict(X_test_scaled)
lr_accuracy = accuracy_score(y_test, lr_predictions)

print(f"Logistic Regression - Test Accuracy: {lr_accuracy:.4f} ({lr_accuracy*100:.2f}%)")



Training Logistic Regression...
Logistic Regression - Test Accuracy: 0.8519 (85.19%)


## Model Comparison and Evaluation

In [39]:
# Model Selection: pick the candidate with the highest test-set accuracy
print("Model Selection")
print("=" * 20)

# One row per candidate: display name, fitted estimator, test accuracy, and
# whether the estimator was trained on the standardized features
candidates = [
    ('Random Forest', rf_model, rf_accuracy, False),
    ('Decision Tree', dt_model, dt_accuracy, False),
    ('Logistic Regression', lr_model, lr_accuracy, True),
]
models_data = {
    label: {'model': estimator, 'accuracy': score, 'scaled_features': uses_scaled}
    for label, estimator, score, uses_scaled in candidates
}

# max() keeps the first entry on ties, so insertion order decides between equals.
# NOTE(review): the winner is chosen on the same test split that is reported as
# its accuracy, so that figure is likely optimistic — consider a validation split.
best_model_name, best_model_data = max(
    models_data.items(), key=lambda item: item[1]['accuracy']
)

print("Model Accuracies:")
for name, data in models_data.items():
    print(f"  {name}: {data['accuracy']:.4f}")

print(f"\nBest Model: {best_model_name}")
print(f"Best Accuracy: {best_model_data['accuracy']:.4f}")

Model Selection
Model Accuracies:
  Random Forest: 0.8333
  Decision Tree: 0.7963
  Logistic Regression: 0.8519

Best Model: Logistic Regression
Best Accuracy: 0.8519


## Model Deployment

In [40]:
# Persist the best performing model, the fitted scaler, and descriptive metadata
print("Model Deployment")
print("=" * 20)

# open(..., "wb") does not create missing parent directories, so make sure the
# output directory exists before writing (a fresh checkout may not have it).
output_dir = "Frontend"
os.makedirs(output_dir, exist_ok=True)

# The winning estimator chosen in the model-selection cell
best_model = best_model_data['model']

# Metadata describing what was saved and how inputs must be prepared
model_info = {
    'best_model_name': best_model_name,
    'best_model_accuracy': best_model_data['accuracy'],
    'features': list(X.columns),
    'target_encoding': {'Absence': 0, 'Presence': 1},
    'training_date': pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S'),
    # True when the saved model was trained on scaler-transformed features,
    # in which case the saved scaler must be applied before predicting
    'requires_scaling': best_model_data['scaled_features']
}

# File name -> object to pickle. The scaler is always written; whether it must
# be applied is recorded in model_info['requires_scaling'].
# NOTE: pickle files can execute code on load — only load these from a trusted location.
artifacts = {
    "heartdiseaseprediction.model": best_model,
    "scaler.model": scaler,
    "model_info.pkl": model_info,
}
for filename, artifact in artifacts.items():
    with open(os.path.join(output_dir, filename), "wb") as f:
        pickle.dump(artifact, f)

print("Model training complete!")
print(f"Best Model: {best_model_name} (Accuracy: {best_model_data['accuracy']:.4f})")
print("All models and metadata saved to Frontend/ directory")

Model Deployment
Model training complete!
Best Model: Logistic Regression (Accuracy: 0.8519)
All models and metadata saved to Frontend/ directory
