## Running K-fold Cross-Validation 

This notebook runs stratified k-fold cross-validation, separately from the training notebook, to determine whether the Random Forest model is overfitting.

In [4]:
# Importing dependencies
from pathlib import Path

import joblib
import pandas as pd
from imblearn.over_sampling import RandomOverSampler
from imblearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, recall_score, classification_report
from sklearn.model_selection import StratifiedKFold, cross_validate
from sklearn.preprocessing import StandardScaler

In [2]:
# Load the previously saved Random Forest estimator from disk.
# joblib artifacts are pickle-based: only load files from a trusted source.
model_path = Path("rf_model.joblib")
rf_model = joblib.load(model_path)

In [6]:
# Read the cleaned dataset into a DataFrame and preview its first five rows.
file_path = Path('Resources/cleaned_data.csv')
df = pd.read_csv(filepath_or_buffer=file_path)
df.head(n=5)

Unnamed: 0,sex,age,education,smokingStatus,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,BMI,heartRate,glucose,CHDRisk,MAP
0,1.0,39,4,0.0,0,0,0,0,0,195,26.97,80,77,0,82.0
1,0.0,46,2,0.0,0,0,0,0,0,250,28.73,95,76,0,94.333333
2,1.0,48,1,1.0,20,0,0,0,0,245,25.34,75,70,0,95.833333
3,0.0,61,3,1.0,30,0,0,1,0,225,28.58,65,103,1,113.333333
4,0.0,46,3,1.0,23,0,0,0,0,285,23.1,85,85,0,99.333333


In [7]:
# Split the frame into features (X) and the CHDRisk target (y), then
# randomly oversample with a fixed seed so the result is reproducible.
# NOTE(review): the rows duplicated here can land on both sides of any
# later train/validation split of X_resampled; X and y are left untouched
# so resampling can instead be done per fold.
X, y = df.drop(columns='CHDRisk'), df['CHDRisk']
ros = RandomOverSampler(random_state=1)
X_resampled, y_resampled = ros.fit_resample(X, y)

In [8]:
# Standardize the oversampled features: learn the per-column scaling
# statistics from X_resampled, then apply them to that same data.
scaler = StandardScaler()
scaler.fit(X_resampled)
X_resampled_scaled = scaler.transform(X_resampled)

In [9]:
# Configure the cross-validation splitter: k stratified folds, shuffled
# with a fixed seed so the fold assignment is the same on every run.
k = 5
kf = StratifiedKFold(
    n_splits=k,
    random_state=1,
    shuffle=True,
)

In [None]:
# Cross-validate on the original (un-resampled, un-scaled) X / y.
# Oversampling and scaling are pipeline steps, so within each fold they are
# fitted on the training portion only. Validation folds therefore never
# contain duplicated training rows or scaling statistics computed from the
# full dataset, both of which would inflate the scores.
# cross_validate fits clones of the pipeline, so the loaded rf_model object
# itself is not refitted in place.
cv_pipeline = Pipeline(steps=[
    ('oversample', RandomOverSampler(random_state=1)),
    ('scale', StandardScaler()),
    ('model', rf_model),
])

scoring = ['accuracy', 'recall']
# return_train_score=True adds train_* entries next to test_*; the gap
# between them is the overfitting signal this notebook is looking for.
cv_results = cross_validate(
    cv_pipeline,
    X,
    y,
    cv=kf,
    scoring=scoring,
    return_train_score=True,
)

# One row per fold, plus a final row with the mean across folds.
cv_summary = pd.DataFrame(cv_results)
cv_summary.loc['mean'] = cv_summary.mean()
cv_summary
