In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler, LabelEncoder

# Read cleaned dataset
df = pd.read_excel("Cleaned_Dataset_v2.xlsx")

# Work on a copy so the frame exactly as loaded stays available in `df`.
df_processed = df.copy()

# Object-dtype columns may hold numbers stored as text. Convert a column only
# when every value in it parses as numeric; otherwise leave it unchanged.
for col in df_processed.columns:
    if df_processed[col].dtype == 'object':
        try:
            df_processed[col] = pd.to_numeric(df_processed[col])
        except (ValueError, TypeError):
            # These are the errors pd.to_numeric raises for values it cannot
            # parse: keep the column as-is. Anything else (e.g. an interrupt)
            # is deliberately allowed to propagate instead of being hidden.
            pass

# Partition the columns by dtype now that numeric-looking text has been converted.
categorical_cols = df_processed.select_dtypes(include=['object', 'category']).columns
numeric_cols = df_processed.select_dtypes(include=[np.number]).columns

# Integer-encode every categorical column in place and keep its fitted encoder,
# keyed by column name, so the codes can be mapped back to labels later.
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    # Values are cast to str before fitting.
    # NOTE(review): on pandas versions where astype(str) turns NaN into the
    # text "nan", missing values become a class of their own — confirm.
    as_text = df_processed[col].astype(str)
    df_processed[col] = le.fit_transform(as_text)
    label_encoders[col] = le

# Keep numeric columns only.
# NOTE(review): bool and datetime columns are neither encoded above nor matched
# by np.number, so any that exist are dropped here — confirm that is intended.
df_processed = df_processed.select_dtypes(include=[np.number])

# The `HadHeartAttack` column is the target; every other remaining column is a feature.
y = df_processed["HadHeartAttack"]
X = df_processed.drop(columns="HadHeartAttack")

# Hold out 30% of the rows for testing. Stratifying on y keeps the class
# proportions the same in both partitions; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

# Fit a 100-tree random forest on the training partition (seeded for repeatability).
# fit() returns the estimator itself, so construction and training can be chained.
rf_model = RandomForestClassifier(random_state=42, n_estimators=100).fit(X_train, y_train)

# Predicted class labels for the held-out rows.
y_pred_rf_model = rf_model.predict(X_test)

# Score the held-out predictions: overall accuracy, confusion matrix and the
# per-class precision/recall/F1 report (returned as text, the default format).
# NOTE(review): in the recorded run the positive class is rare and its recall is
# low, so the accuracy figure alone overstates model quality — read the report.
eval_args = (y_test, y_pred_rf_model)
acc = accuracy_score(*eval_args)
cm = confusion_matrix(*eval_args)
report = classification_report(*eval_args)

# One print call with a newline separator emits the same text as three calls.
print(
    f"✅ Accuracy: {acc}",
    f"\n📉 Confusion Matrix:\n {cm}",
    f"\n📋 Classification Report:\n {report}",
    sep="\n",
)

# Optional: rank the features by the fitted forest's importance scores.
feature_names = X.columns
feature_importance = rf_model.feature_importances_

# Pair each feature name with its score, then order from most to least important.
# The index is not reset, so row labels remain the original column positions in X.
importance_df = pd.DataFrame(
    {'Feature': feature_names, 'Importance': feature_importance}
)
importance_df = importance_df.sort_values('Importance', ascending=False)

print("\n🎯 Top 10 Most Important Features:")
print(importance_df.head(10))

✅ Accuracy: 0.94786811669256

📉 Confusion Matrix:
 [[70691   495]
 [ 3431   692]]

📋 Classification Report:
               precision    recall  f1-score   support

           0       0.95      0.99      0.97     71186
           1       0.58      0.17      0.26      4123

    accuracy                           0.95     75309
   macro avg       0.77      0.58      0.62     75309
weighted avg       0.93      0.95      0.93     75309


🎯 Top 10 Most Important Features:
               Feature  Importance
9            HadAngina    0.159174
31                 BMI    0.085016
30              Weight    0.074488
29              Height    0.058105
0                State    0.057494
28                 Age    0.048031
7           SleepHours    0.046291
3   PhysicalHealthDays    0.036806
27                Race    0.032782
2        GeneralHealth    0.029106
