# Heart Disease Model Training
## Training with Best Accuracy Optimization

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import pickle
import warnings
warnings.filterwarnings('ignore')

print("‚úÖ Libraries imported successfully!")

‚úÖ Libraries imported successfully!


## 1. Load and Explore Dataset

In [2]:
# Load the dataset
print("Loading heart disease dataset...")
df = pd.read_csv('../heart.csv')

print(f"Dataset shape: {df.shape}")
print(f"\nColumns: {df.columns.tolist()}")
df.head()

Loading heart disease dataset...
Dataset shape: (303, 14)

Columns: ['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal', 'target']


Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [3]:
# Dataset information
print("Dataset Info:")
print(df.info())
print(f"\nMissing values:")
print(df.isnull().sum())
print(f"\nTarget distribution:")
print(df['target'].value_counts())

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       303 non-null    int64  
 1   sex       303 non-null    int64  
 2   cp        303 non-null    int64  
 3   trestbps  303 non-null    int64  
 4   chol      303 non-null    int64  
 5   fbs       303 non-null    int64  
 6   restecg   303 non-null    int64  
 7   thalach   303 non-null    int64  
 8   exang     303 non-null    int64  
 9   oldpeak   303 non-null    float64
 10  slope     303 non-null    int64  
 11  ca        303 non-null    int64  
 12  thal      303 non-null    int64  
 13  target    303 non-null    int64  
dtypes: float64(1), int64(13)
memory usage: 33.3 KB
None

Missing values:
age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
t

## 2. Prepare Data

In [4]:
# Separate features and target
X = df.drop('target', axis=1)
y = df['target']

print(f"Features shape: {X.shape}")
print(f"Features: {X.columns.tolist()}")

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Scale the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

print(f"\n‚úÖ Data prepared:")
print(f"  Training set: {X_train.shape}")
print(f"  Test set: {X_test.shape}")

Features shape: (303, 13)
Features: ['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal']

‚úÖ Data prepared:
  Training set: (242, 13)
  Test set: (61, 13)


## 3. Train and Evaluate Base Models

In [5]:
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000, random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42),
    "SVM (RBF Kernel)": SVC(kernel='rbf', probability=True, random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(random_state=42)
}

results = {}

for name, model in models.items():
    print(f"\n{'='*60}")
    print(f"Training {name}...")
    
    # Train the model
    model.fit(X_train_scaled, y_train)
    
    # Predictions
    y_pred = model.predict(X_test_scaled)
    
    # Accuracy
    accuracy = accuracy_score(y_test, y_pred)
    
    # Cross-validation score
    cv_scores = cross_val_score(model, X_train_scaled, y_train, cv=5)
    cv_mean = cv_scores.mean()
    cv_std = cv_scores.std()
    
    results[name] = {'accuracy': accuracy, 'cv_mean': cv_mean, 'cv_std': cv_std}
    
    print(f"\n{name} Results:")
    print(f"  Test Accuracy: {accuracy:.4f} ({accuracy*100:.2f}%)")
    print(f"  Cross-Val Accuracy: {cv_mean:.4f} (+/- {cv_std:.4f})")
    print(f"\nClassification Report:")
    print(classification_report(y_test, y_pred, target_names=['No Disease', 'Disease']))


Training Logistic Regression...

Logistic Regression Results:
  Test Accuracy: 0.8033 (80.33%)
  Cross-Val Accuracy: 0.8309 (+/- 0.0409)

Classification Report:
              precision    recall  f1-score   support

  No Disease       0.86      0.68      0.76        28
     Disease       0.77      0.91      0.83        33

    accuracy                           0.80        61
   macro avg       0.82      0.79      0.80        61
weighted avg       0.81      0.80      0.80        61


Training Random Forest...

Random Forest Results:
  Test Accuracy: 0.8361 (83.61%)
  Cross-Val Accuracy: 0.8309 (+/- 0.0469)

Classification Report:
              precision    recall  f1-score   support

  No Disease       0.95      0.68      0.79        28
     Disease       0.78      0.97      0.86        33

    accuracy                           0.84        61
   macro avg       0.87      0.82      0.83        61
weighted avg       0.86      0.84      0.83        61


Training SVM (RBF Kernel)...

SVM

In [6]:
# Summary of base models
results_df = pd.DataFrame(results).T
results_df = results_df.sort_values('accuracy', ascending=False)
print("\nüìä Base Model Comparison:")
print(results_df)


üìä Base Model Comparison:
                     accuracy   cv_mean    cv_std
Random Forest        0.836066  0.830867  0.046863
SVM (RBF Kernel)     0.819672  0.801956  0.038753
Gradient Boosting    0.819672  0.802041  0.064680
Logistic Regression  0.803279  0.830867  0.040931


## 4. Hyperparameter Tuning

In [7]:
# Random Forest Tuning
print("\nüîß Tuning Random Forest...")
rf_params = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, 30, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

rf_grid = GridSearchCV(
    RandomForestClassifier(random_state=42),
    rf_params,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
    verbose=2
)

rf_grid.fit(X_train_scaled, y_train)
rf_best = rf_grid.best_estimator_
rf_accuracy = accuracy_score(y_test, rf_best.predict(X_test_scaled))

print(f"\n‚úÖ Random Forest Best Params: {rf_grid.best_params_}")
print(f"‚úÖ Random Forest Tuned Accuracy: {rf_accuracy:.4f} ({rf_accuracy*100:.2f}%)")


üîß Tuning Random Forest...
Fitting 5 folds for each of 108 candidates, totalling 540 fits

‚úÖ Random Forest Best Params: {'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 300}
‚úÖ Random Forest Tuned Accuracy: 0.8197 (81.97%)


In [8]:
# SVM Tuning
print("\nüîß Tuning SVM...")
svm_params = {
    'C': [0.1, 1, 10, 100],
    'gamma': ['scale', 'auto', 0.001, 0.01, 0.1],
    'kernel': ['rbf', 'linear']
}

svm_grid = GridSearchCV(
    SVC(probability=True, random_state=42),
    svm_params,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
    verbose=2
)

svm_grid.fit(X_train_scaled, y_train)
svm_best = svm_grid.best_estimator_
svm_accuracy = accuracy_score(y_test, svm_best.predict(X_test_scaled))

print(f"\n‚úÖ SVM Best Params: {svm_grid.best_params_}")
print(f"‚úÖ SVM Tuned Accuracy: {svm_accuracy:.4f} ({svm_accuracy*100:.2f}%)")


üîß Tuning SVM...
Fitting 5 folds for each of 40 candidates, totalling 200 fits

‚úÖ SVM Best Params: {'C': 100, 'gamma': 0.001, 'kernel': 'rbf'}
‚úÖ SVM Tuned Accuracy: 0.8033 (80.33%)


In [9]:
# Gradient Boosting Tuning
print("\nüîß Tuning Gradient Boosting...")
gb_params = {
    'n_estimators': [100, 200],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}

gb_grid = GridSearchCV(
    GradientBoostingClassifier(random_state=42),
    gb_params,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
    verbose=2
)

gb_grid.fit(X_train_scaled, y_train)
gb_best = gb_grid.best_estimator_
gb_accuracy = accuracy_score(y_test, gb_best.predict(X_test_scaled))

print(f"\n‚úÖ Gradient Boosting Best Params: {gb_grid.best_params_}")
print(f"‚úÖ Gradient Boosting Tuned Accuracy: {gb_accuracy:.4f} ({gb_accuracy*100:.2f}%)")


üîß Tuning Gradient Boosting...
Fitting 5 folds for each of 72 candidates, totalling 360 fits

‚úÖ Gradient Boosting Best Params: {'learning_rate': 0.01, 'max_depth': 3, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 200}
‚úÖ Gradient Boosting Tuned Accuracy: 0.8361 (83.61%)


## 5. Select Best Model and Save

In [10]:
# Compare tuned models
tuned_models = {
    "Random Forest (Tuned)": (rf_best, rf_accuracy),
    "SVM (Tuned)": (svm_best, svm_accuracy),
    "Gradient Boosting (Tuned)": (gb_best, gb_accuracy)
}

final_best_name = max(tuned_models.items(), key=lambda x: x[1][1])[0]
final_best_model = tuned_models[final_best_name][0]
final_best_accuracy = tuned_models[final_best_name][1]

print("\n" + "="*60)
print("üèÜ FINAL BEST MODEL")
print("="*60)
print(f"\nBest Model: {final_best_name}")
print(f"Best Accuracy: {final_best_accuracy:.4f} ({final_best_accuracy*100:.2f}%)")

# Final evaluation
y_pred_final = final_best_model.predict(X_test_scaled)
print(f"\nüìã Final Classification Report:")
print(classification_report(y_test, y_pred_final, target_names=['No Disease', 'Disease']))
print(f"\nüìä Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_final))


üèÜ FINAL BEST MODEL

Best Model: Gradient Boosting (Tuned)
Best Accuracy: 0.8361 (83.61%)

üìã Final Classification Report:
              precision    recall  f1-score   support

  No Disease       0.95      0.68      0.79        28
     Disease       0.78      0.97      0.86        33

    accuracy                           0.84        61
   macro avg       0.87      0.82      0.83        61
weighted avg       0.86      0.84      0.83        61


üìä Confusion Matrix:
[[19  9]
 [ 1 32]]


In [11]:
# Save the best model and scaler
model_filename = 'heart_disease_model.sav'
scaler_filename = 'heart_scaler.sav'

pickle.dump(final_best_model, open(model_filename, 'wb'))
pickle.dump(scaler, open(scaler_filename, 'wb'))

print(f"\n‚úÖ Model saved to: {model_filename}")
print(f"‚úÖ Scaler saved to: {scaler_filename}")
print(f"\nModel Type: {type(final_best_model).__name__}")
print(f"Number of features: {X.shape[1]}")
print(f"Feature names: {X.columns.tolist()}")
print("\nüéâ Training complete!")


‚úÖ Model saved to: heart_disease_model.sav
‚úÖ Scaler saved to: heart_scaler.sav

Model Type: GradientBoostingClassifier
Number of features: 13
Feature names: ['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal']

üéâ Training complete!
