In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import SelectKBest, chi2

# Read the heart-disease table; a stray 'Unnamed: 0' index column is removed
# when present and silently skipped when absent.
dataset = (
    pd.read_csv("new_heart.csv")
    .drop(columns=['Unnamed: 0'], errors='ignore')
)

# Expand any categorical columns into indicator columns (first level dropped).
# NOTE(review): the frame displayed at the end of the notebook shows only
# numeric columns, in which case this step changes nothing -- confirm.
df2 = pd.get_dummies(dataset, drop_first=True)

# Separate the label column from the predictors.
dep_Y = df2['target']  # Target variable
indep_X = df2.drop(columns=['target'])  # Features

# Keep the 6 predictors with the highest chi-square score against the label.
# NOTE(review): the selector is fitted on every row, including the rows that
# are held out as the test split just below, so the test metrics may be
# optimistic. Fitting on the training rows only could change which columns
# are kept; the interactive prediction cell hard-codes the six column names,
# so re-check it if this is changed.
selector = SelectKBest(score_func=chi2, k=6).fit(indep_X, dep_Y)
kbest = selector.transform(indep_X)

# Hold out 20% of the rows for evaluation; fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    kbest,
    dep_Y,
    test_size=0.2,
    random_state=0,
)

# Define the Random Forest model with hyperparameter tuning
def optimized_random_forest(X_train, y_train, X_test, y_test):
    """Grid-search a Random Forest on the training split and score it on the test split.

    A 5-fold cross-validated grid search (scored by accuracy) is run over
    tree count, tree depth and the two minimum-sample thresholds. The best
    estimator found is then used to predict ``X_test``.

    Args:
        X_train: Feature matrix the grid search is fitted on.
        y_train: Labels matching ``X_train``.
        X_test: Feature matrix used only for the final predictions.
        y_test: Labels matching ``X_test``; compared against the predictions.

    Returns:
        A 4-tuple ``(best_estimator, accuracy, classification_report, confusion_matrix)``.

    Side effects:
        Prints the winning hyperparameter combination.
    """
    search_space = dict(
        n_estimators=[50, 100, 200],    # Number of trees
        max_depth=[10, 20, None],       # Maximum depth of trees
        min_samples_split=[2, 5, 10],   # Minimum samples required to split an internal node
        min_samples_leaf=[1, 2, 4],     # Minimum samples required at a leaf node
    )

    # Entropy-based forest with a fixed seed so repeated runs give the same trees.
    tuner = GridSearchCV(
        RandomForestClassifier(criterion='entropy', random_state=0),
        search_space,
        cv=5,
        scoring='accuracy',
        n_jobs=-1,
    )
    tuner.fit(X_train, y_train)

    winner = tuner.best_estimator_
    predicted = winner.predict(X_test)

    # Evaluate once, in the order the caller unpacks: accuracy, report, matrix.
    results = (
        winner,
        accuracy_score(y_test, predicted),
        classification_report(y_test, predicted),
        confusion_matrix(y_test, predicted),
    )

    print(f"Best Parameters: {tuner.best_params_}")
    return results

# Tune, fit and evaluate the Random Forest on the split prepared above
final_rf, final_acc, final_report, final_cm = optimized_random_forest(
    X_train, y_train, X_test, y_test
)

# Report each hold-out metric under its own heading
for _heading, _metric in (
    ("\nFinal Random Forest Accuracy:", final_acc),
    ("\nClassification Report:\n", final_report),
    ("\nConfusion Matrix:\n", final_cm),
):
    print(_heading, _metric)


Best Parameters: {'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 50}

Final Random Forest Accuracy: 0.8688524590163934

Classification Report:
               precision    recall  f1-score   support

           0       0.85      0.85      0.85        27
           1       0.88      0.88      0.88        34

    accuracy                           0.87        61
   macro avg       0.87      0.87      0.87        61
weighted avg       0.87      0.87      0.87        61


Confusion Matrix:
 [[23  4]
 [ 4 30]]


In [2]:
# Boolean mask over the predictor columns: True where the selector kept the feature
_kept_mask = selector.get_support()

# Map the mask back to column names so the chosen features are readable
selected_columns = indep_X.columns[_kept_mask]
print(f"Selected Columns: {selected_columns}")


Selected Columns: Index(['age', 'cp', 'thalach', 'exang', 'oldpeak', 'ca'], dtype='object')


In [3]:
import joblib

# Persist the tuned estimator to disk; the prediction cell reloads it from this same path
_model_file = 'random_forest_heart_model.pkl'
joblib.dump(final_rf, _model_file)
print("Model saved successfully!")


Model saved successfully!


In [9]:
import joblib
import numpy as np

# Reload the estimator that the save cell wrote to this path, so prediction
# does not depend on the training cell having run in this kernel session.
# joblib files are pickle-based: only load files produced by a trusted source.
loaded_model = joblib.load('random_forest_heart_model.pkl')

# Interactive prediction: validated prompts -> single-row feature array -> model verdict
def _ask_number(prompt, cast, ask, low=None, high=None):
    """Ask ``prompt`` via ``ask`` until the reply is a valid number.

    Args:
        prompt: Text shown to the user.
        cast: ``int`` or ``float``; used to parse the reply.
        ask: Callable taking the prompt and returning the reply string.
        low: Smallest accepted value, or ``None`` for no lower bound.
        high: Largest accepted value, or ``None`` for no upper bound.

    Returns:
        The parsed value, guaranteed finite and inside ``[low, high]``.
    """
    while True:
        reply = ask(prompt)
        try:
            value = cast(reply.strip())
        except ValueError:
            print(f"  '{reply}' is not a valid number, please try again.")
            continue
        # float() happily parses "nan"/"inf"; those must not reach the model.
        if isinstance(value, float) and not np.isfinite(value):
            print("  Please enter a finite number.")
            continue
        below = low is not None and value < low
        above = high is not None and value > high
        if below or above:
            print(f"  Please enter a value between {low} and {high}.")
            continue
        return value


def predict_heart_disease(model=None, input_fn=None):
    """Prompt for the six model features, run the classifier, and print the verdict.

    Values are requested in the order age, cp, thalach, exang, oldpeak, ca and
    handed to the model as one row in that same order.
    NOTE(review): this order has to match the columns the model was trained
    on -- re-check it against the "Selected Columns" output whenever the
    feature selection changes.

    Replies that are not numeric, are not finite, or fall outside the range
    stated in the prompt are rejected with a short message and asked again,
    instead of aborting the cell or being passed to the model unchecked.

    Args:
        model: Object exposing ``predict``; when omitted, the module-level
            ``loaded_model`` is used.
        input_fn: Callable used to read each reply; when omitted, the
            built-in ``input`` is looked up at call time.

    Returns:
        None. The verdict is printed.
    """
    clf = loaded_model if model is None else model
    ask = input if input_fn is None else input_fn

    print("\nEnter the required details for heart disease prediction:")

    age = _ask_number("Age: ", int, ask)
    cp = _ask_number("Chest Pain Type (0-3): ", int, ask, 0, 3)
    thalach = _ask_number("Maximum Heart Rate Achieved: ", int, ask)
    exang = _ask_number("Exercise Induced Angina (0 = No, 1 = Yes): ", int, ask, 0, 1)
    oldpeak = _ask_number("ST Depression Induced by Exercise: ", float, ask)
    ca = _ask_number("Number of Major Vessels (0-3): ", int, ask, 0, 3)

    # Single-row 2-D array, columns in the order listed in the docstring
    user_input = np.array([[age, cp, thalach, exang, oldpeak, ca]])

    # Make prediction
    prediction = clf.predict(user_input)

    # Display the result
    if prediction[0] == 1:
        print("\nPrediction: High risk of heart disease 😟")
    else:
        print("\nPrediction: Low risk of heart disease 🙂")
# Start one interactive prediction round; the cell blocks until all six
# prompts have been answered, then prints the verdict.
predict_heart_disease()



Enter the required details for heart disease prediction:


Age:  57
Chest Pain Type (0-3):  0
Maximum Heart Rate Achieved:  123
Exercise Induced Angina (0 = No, 1 = Yes):  1
ST Depression Induced by Exercise:  0.2
Number of Major Vessels (0-3):  0



Prediction: Low risk of heart disease 🙂


In [7]:
# Display the loaded frame; the notebook shows the first and last rows and
# elides the middle, which is enough to pick a known row for a sanity check.
dataset


Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
297,57,0,0,140,241,0,1,123,1,0.2,1,0,3,0
298,45,1,3,110,264,0,1,132,0,1.2,1,0,3,0
299,68,1,0,144,193,1,1,141,0,3.4,1,2,3,0
300,57,1,0,130,131,0,1,115,1,1.2,1,1,3,0
