In [13]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import SelectKBest, chi2

# Load dataset; the 'Unnamed: 0' column is dropped when present
# (errors='ignore' makes the drop a no-op when the column is missing).
dataset = pd.read_csv("new_heart.csv").drop(columns=['Unnamed: 0'], errors='ignore')

# One-hot encode categorical variables (drop_first=True drops the first level
# of each encoded variable).
df2 = pd.get_dummies(dataset, drop_first=True)

# Define independent and dependent variables
indep_X = df2.drop('target', axis=1)  # Features
dep_Y = df2['target']  # Target variable

# Split BEFORE feature selection. Fitting SelectKBest on the full dataset lets
# the held-out labels influence which features are chosen (data leakage), which
# makes the test-set metrics optimistic. The row partition is the same as
# before because it depends only on the sample count and random_state.
X_train_all, X_test_all, y_train, y_test = train_test_split(
    indep_X, dep_Y, test_size=0.1, random_state=0
)

# Feature selection using SelectKBest (Chi-Square), fitted on training rows only.
# NOTE: chi2 requires non-negative feature values.
selector = SelectKBest(score_func=chi2, k=6)
X_train = selector.fit_transform(X_train_all, y_train)
X_test = selector.transform(X_test_all)

# Kept so later cells that reference `kbest` keep working: the full feature
# matrix reduced to the columns chosen from the training split.
kbest = selector.transform(indep_X)

# Random Forest classifier tuned with an exhaustive grid search
def optimized_random_forest(X_train, y_train, X_test, y_test):
    """Tune a Random Forest with a 5-fold grid search and score it on the test data.

    Parameters
    ----------
    X_train, y_train
        Training features and labels passed to ``GridSearchCV.fit``.
    X_test, y_test
        Held-out features and labels used only for the final evaluation.

    Returns
    -------
    tuple
        ``(best_estimator, accuracy, classification_report, confusion_matrix)``
        where the metrics are computed from the best estimator's predictions
        on ``X_test``.

    Side effects
    ------------
    Prints the best hyperparameter combination found by the search.
    """
    # Candidate values explored for each hyperparameter.
    search_space = {
        'n_estimators': [50, 100, 200],     # number of trees
        'max_depth': [10, 20, None],        # maximum tree depth
        'min_samples_split': [2, 5, 10],    # min samples to split an internal node
        'min_samples_leaf': [1, 2, 4],      # min samples required at a leaf
    }

    tuner = GridSearchCV(
        RandomForestClassifier(criterion='entropy', random_state=0),
        search_space,
        cv=5,
        scoring='accuracy',
        n_jobs=-1,  # use all available cores
    )
    tuner.fit(X_train, y_train)

    # Evaluate the winning configuration on the held-out data.
    winner = tuner.best_estimator_
    predictions = winner.predict(X_test)
    test_accuracy = accuracy_score(y_test, predictions)
    test_confusion = confusion_matrix(y_test, predictions)
    test_report = classification_report(y_test, predictions)

    print(f"Best Parameters: {tuner.best_params_}")
    return winner, test_accuracy, test_report, test_confusion

# Tune and evaluate the Random Forest on the prepared split
final_rf, final_acc, final_report, final_cm = optimized_random_forest(
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
)

# Report each metric under its heading, in the same order as before
for _heading, _metric in (
    ("\nFinal Random Forest Accuracy:", final_acc),
    ("\nClassification Report:\n", final_report),
    ("\nConfusion Matrix:\n", final_cm),
):
    print(_heading, _metric)


Best Parameters: {'max_depth': 20, 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 50}

Final Random Forest Accuracy: 0.9032258064516129

Classification Report:
               precision    recall  f1-score   support

           0       0.89      0.94      0.91        17
           1       0.92      0.86      0.89        14

    accuracy                           0.90        31
   macro avg       0.91      0.90      0.90        31
weighted avg       0.90      0.90      0.90        31


Confusion Matrix:
 [[16  1]
 [ 2 12]]


In [14]:
# Boolean mask over the feature columns: True where SelectKBest kept the column
support_mask = selector.get_support()
selected_columns = indep_X.columns[support_mask]
print(f"Selected Columns: {selected_columns}")


Selected Columns: Index(['age', 'cp', 'thalach', 'exang', 'oldpeak', 'ca'], dtype='object')
