In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
import joblib
import seaborn as sns
import matplotlib.pyplot as plt

# Load and preprocess data
def load_and_preprocess_data(df, target_col='target', test_size=0.2,
                             random_state=42, stratify=False):
    """Split a labelled DataFrame into train/test sets and standardize the features.

    Parameters
    ----------
    df : pandas.DataFrame
        Input table containing the feature columns plus the label column.
        It is not modified.
    target_col : str, default 'target'
        Name of the label column to separate from the features.
    test_size : float or int, default 0.2
        Forwarded to ``train_test_split``.
    random_state : int or None, default 42
        Forwarded to ``train_test_split`` so the split is reproducible.
    stratify : bool, default False
        When True, the split is stratified on the label column so both
        partitions keep the same class proportions. The default keeps the
        original (non-stratified) split.

    Returns
    -------
    tuple
        ``(X_train_scaled, X_test_scaled, y_train, y_test, scaler)`` where the
        two ``X`` entries are the outputs of the fitted ``StandardScaler`` and
        ``scaler`` is that fitted scaler (needed to transform new samples the
        same way).

    Raises
    ------
    KeyError
        If ``target_col`` is not a column of ``df``.
    """
    # Fail early with a message that names the missing column and what is
    # actually available, instead of pandas' generic "not found in axis".
    if target_col not in df.columns:
        raise KeyError(
            f"Target column {target_col!r} not found; "
            f"available columns: {list(df.columns)}"
        )

    # `drop` returns a new frame, so the caller's DataFrame is left untouched
    # without needing an extra full copy up front.
    X = df.drop(columns=target_col)
    y = df[target_col]

    # Hold out part of the data for evaluation.
    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        test_size=test_size,
        random_state=random_state,
        stratify=y if stratify else None,
    )

    # Fit the scaler on the training partition only, then reuse those
    # statistics for the test partition to avoid leaking test information.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    return X_train_scaled, X_test_scaled, y_train, y_test, scaler

# Train model
def train_model(X_train, y_train):
    """Fit a random-forest classifier on the training data and return it.

    Parameters
    ----------
    X_train : array-like
        Training feature matrix.
    y_train : array-like
        Training labels.

    Returns
    -------
    RandomForestClassifier
        The fitted estimator.
    """
    # Fixed hyper-parameters; the seed makes the fitted forest reproducible.
    forest_params = {
        'n_estimators': 100,
        'max_depth': 10,
        'min_samples_split': 5,
        'min_samples_leaf': 2,
        'random_state': 42,
    }
    forest = RandomForestClassifier(**forest_params)

    forest.fit(X_train, y_train)
    return forest

# Evaluate model
def evaluate_model(model, X_test, y_test, feature_names=None):
    """Print a classification report and return ranked feature importances.

    Parameters
    ----------
    model : fitted estimator
        Must provide ``predict`` and ``feature_importances_``.
    X_test : array-like
        Held-out feature matrix passed to ``model.predict``.
    y_test : array-like
        True labels for ``X_test``.
    feature_names : sequence of str, optional
        Names for the feature columns, in training order. When omitted, the
        names are resolved in this order:

        1. a module/notebook-level ``feature_names`` variable, if one exists
           (the behaviour this function originally relied on);
        2. ``model.feature_names_in_``, if the model exposes it;
        3. generated placeholders ``feature_0``, ``feature_1``, ...

    Returns
    -------
    pandas.DataFrame
        Columns ``feature`` and ``importance``, sorted by descending importance.

    Raises
    ------
    ValueError
        If the number of feature names differs from the number of importances.
    """
    # Make predictions
    y_pred = model.predict(X_test)

    # Print classification report
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))

    importances = model.feature_importances_

    # Resolve feature names without requiring hidden notebook state: the
    # original implementation raised NameError unless a global had been
    # defined by a later cell.
    if feature_names is None:
        feature_names = globals().get('feature_names')
    if feature_names is None:
        feature_names = getattr(model, 'feature_names_in_', None)
    if feature_names is None:
        feature_names = [f'feature_{i}' for i in range(len(importances))]

    feature_names = list(feature_names)
    if len(feature_names) != len(importances):
        raise ValueError(
            f"Got {len(feature_names)} feature names for "
            f"{len(importances)} feature importances"
        )

    # Calculate feature importance
    feature_importance = pd.DataFrame({
        'feature': feature_names,
        'importance': importances
    }).sort_values('importance', ascending=False)

    print("\nFeature Importance:")
    print(feature_importance)

    return feature_importance



In [3]:
# Read the raw heart-disease table from its CSV file into a DataFrame
data = pd.read_csv(filepath_or_buffer='/content/Heart Disease dataset.csv')


In [4]:
# Split/scale the dataset and keep the fitted scaler for later export
(
    X_train_scaled,
    X_test_scaled,
    y_train,
    y_test,
    scaler,
) = load_and_preprocess_data(df=data)
# Column labels of the feature matrix (everything except the label column)
feature_names = data.drop(columns='target').columns



In [5]:
# Fit the classifier on the scaled training partition
model = train_model(X_train=X_train_scaled, y_train=y_train)



In [6]:
# Score the fitted model on the held-out partition and collect the importances
feature_importance = evaluate_model(
    model=model,
    X_test=X_test_scaled,
    y_test=y_test,
)



Classification Report:
              precision    recall  f1-score   support

           0       0.89      0.83      0.86        29
           1       0.85      0.91      0.88        32

    accuracy                           0.87        61
   macro avg       0.87      0.87      0.87        61
weighted avg       0.87      0.87      0.87        61


Feature Importance:
     feature  importance
9    oldpeak    0.140247
11        ca    0.128733
2         cp    0.120215
12      thal    0.110517
7    thalach    0.107038
8      exang    0.081898
0        age    0.079609
3   trestbps    0.063337
4       chol    0.062113
10     slope    0.047390
1        sex    0.038293
6    restecg    0.014442
5        fbs    0.006168


In [7]:
# Persist the fitted classifier and the fitted scaler side by side; both are
# needed to score new samples the same way they were scored here.
joblib.dump(value=model, filename='heart_disease_model.joblib')
joblib.dump(value=scaler, filename='scaler.joblib')


['scaler.joblib']

In [8]:
# Export the ranked feature/importance table to CSV, without the row index
feature_importance.to_csv(path_or_buf='feature_importance.csv', index=False)