In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder

# 1. Load dataset (update this path if needed)
csv_path = r"C:\Users\SANTOSH KUMAR SAHOO\Downloads\archive (5)\heart_disease_uci.csv"
df = pd.read_csv(csv_path)

# 2. Check columns
print("Columns in dataset:\n", df.columns.tolist())
print("\nDataset shape:", df.shape)
print(df.head())

# 3. Handle missing values
num_cols = df.select_dtypes(include=np.number).columns
cat_cols = df.select_dtypes(exclude=np.number).columns

df[num_cols] = df[num_cols].fillna(df[num_cols].median())

for col in cat_cols:
    df[col] = df[col].fillna(df[col].mode()[0])

# 4. Encode categorical columns
le = LabelEncoder()
for col in cat_cols:
    df[col] = le.fit_transform(df[col])

# 5. Features (X) and Target (y)
# If 'target' column exists, use it. Otherwise, fallback to last column.
if "target" in df.columns:
    target_col = "target"
else:
    target_col = df.columns[-1]

X = df.drop(target_col, axis=1)
y = df[target_col]

print(f"\nTarget column used: '{target_col}'")

# 6. Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# 7. Build Random Forest model
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)

# 8. Predictions
y_pred = rf_model.predict(X_test)

# 9. Evaluation
acc = accuracy_score(y_test, y_pred)
print("\nAccuracy:", round(acc, 3))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred, digits=3))

# 10. Feature Importance
feature_importance = pd.DataFrame({
    'Feature': X.columns,
    'Importance': rf_model.feature_importances_
}).sort_values(by='Importance', ascending=False)

print("\nFeature Importance:\n", feature_importance)



Columns in dataset:
 ['id', 'age', 'sex', 'dataset', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalch', 'exang', 'oldpeak', 'slope', 'ca', 'thal', 'num']

Dataset shape: (920, 16)
   id  age     sex    dataset               cp  trestbps   chol    fbs  \
0   1   63    Male  Cleveland   typical angina     145.0  233.0   True   
1   2   67    Male  Cleveland     asymptomatic     160.0  286.0  False   
2   3   67    Male  Cleveland     asymptomatic     120.0  229.0  False   
3   4   37    Male  Cleveland      non-anginal     130.0  250.0  False   
4   5   41  Female  Cleveland  atypical angina     130.0  204.0  False   

          restecg  thalch  exang  oldpeak        slope   ca  \
0  lv hypertrophy   150.0  False      2.3  downsloping  0.0   
1  lv hypertrophy   108.0   True      1.5         flat  3.0   
2  lv hypertrophy   129.0   True      2.6         flat  2.0   
3          normal   187.0  False      3.5  downsloping  0.0   
4  lv hypertrophy   172.0  False      1.4    upsloping  0

  df[col] = df[col].fillna(df[col].mode()[0])



Accuracy: 0.69

Confusion Matrix:
 [[73  6  2  1  0]
 [ 6 42  4  1  0]
 [ 4  4  6  8  0]
 [ 1  9  5  6  0]
 [ 0  2  3  1  0]]

Classification Report:
               precision    recall  f1-score   support

           0      0.869     0.890     0.880        82
           1      0.667     0.792     0.724        53
           2      0.300     0.273     0.286        22
           3      0.353     0.286     0.316        21
           4      0.000     0.000     0.000         6

    accuracy                          0.690       184
   macro avg      0.438     0.448     0.441       184
weighted avg      0.655     0.690     0.671       184


Feature Importance:
      Feature  Importance
0         id    0.226153
1        age    0.100891
9     thalch    0.098121
11   oldpeak    0.082519
6       chol    0.078567
5   trestbps    0.075194
4         cp    0.066623
3    dataset    0.056695
14      thal    0.045230
10     exang    0.039704
13        ca    0.036429
8    restecg    0.031003
12     slope

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
