In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, r2_score, f1_score, roc_auc_score
import numpy as np
from sklearn.model_selection import GridSearchCV
from IPython.display import clear_output
import matplotlib.pyplot as plt
from sklearn.tree import plot_tree

In [3]:
# Load encoded data
encoded_data = pd.read_csv('../data/cleaned data/kaggle data/encoded_kaggleheart.csv')
encoded_data

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,HeartDisease,Sex_Male,ChestPainType_Asymptomatic,ChestPainType_Atypical Angina,ChestPainType_Non-Anginal Pain,ChestPainType_Typical Angina,RestingECG_Left Ventricular Hypertrophy,RestingECG_Normal,RestingECG_ST-T Wave Abnormality,ExerciseAngina_Yes,ST_Slope_Downsloping,ST_Slope_Flat,ST_Slope_Upsloping
0,40,140,289,0,172,0.0,0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
1,49,160,180,0,156,1.0,1,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
2,37,130,283,0,98,0.0,0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
3,48,138,214,0,108,1.5,1,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
4,54,150,195,0,122,0.0,0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
913,45,110,264,0,132,1.2,1,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
914,68,144,193,1,141,3.4,1,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
915,57,130,131,0,115,1.2,1,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
916,57,130,236,0,174,0.0,1,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0


In [3]:
# Split into features and target
X = encoded_data.drop('HeartDisease', axis=1)
y = encoded_data['HeartDisease']

# Split the dataset into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [4]:
# Define a larger grid with ranges for n_estimators and max_depth
param_grid = {
    'min_samples_split': [2, 4, 6, 8, 10],
    'min_samples_leaf': [1, 2, 4, 6, 8],
    'bootstrap': [True, False],
    'n_estimators': np.arange(50, 301, 10),
    'max_depth': np.arange(5, 31, 5)
}

# Perform grid search with the larger grid
grid_search = GridSearchCV(estimator=RandomForestClassifier(random_state=42), 
                           param_grid=param_grid, 
                           cv=5, 
                           n_jobs=-1, 
                           verbose=2)
grid_search.fit(X_train, y_train)

# Print the best parameters found
best_params = grid_search.best_params_
print(f"Best parameters found from the expanded grid search: {best_params}")

Fitting 5 folds for each of 7800 candidates, totalling 39000 fits
Best parameters found from the expanded grid search: {'bootstrap': True, 'max_depth': 20, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 290}


In [5]:
# Retrieve the best parameters from grid search
best_params = grid_search.best_params_

# Train the final model with the best parameters
final_model = RandomForestClassifier(**best_params, random_state=42)
final_model.fit(X_train, y_train)
y_pred = final_model.predict(X_test)

# Calculate final metrics
final_accuracy = accuracy_score(y_test, y_pred)
final_precision = precision_score(y_test, y_pred)
final_recall = recall_score(y_test, y_pred)
final_r2_score = r2_score(y_test, y_pred)
final_f1_score = f1_score(y_test, y_pred)
final_auc_score = roc_auc_score(y_test, y_pred)

# Print the final metrics
print("Final Model Performance:")
print(f"Accuracy: {final_accuracy:.4f}")
print(f"Precision: {final_precision:.4f}")
print(f"Recall: {final_recall:.4f}")
print(f"F1 Score: {final_f1_score:.4f}")
print(f"AUC-ROC: {final_auc_score:.4f}")
print(f"R2 Score: {final_r2_score:.4f}")
print(f"Used Parameters: {best_params}")

Final Model Performance:
Accuracy: 0.8804
Precision: 0.8972
Recall: 0.8972
F1 Score: 0.8972
AUC-ROC: 0.8772
R2 Score: 0.5087
Used Parameters: {'bootstrap': True, 'max_depth': 20, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 290}
