# Heart Failure Prediction 

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [2]:
data = pd.read_csv('heart.csv')

In [3]:
# Categorical features
categorical_features = ['Sex', 'ChestPainType', 'RestingECG', 'ExerciseAngina', 'ST_Slope']

# Numerical features
numerical_features = ['Age', 'RestingBP', 'Cholesterol', 'FastingBS', 'MaxHR', 'Oldpeak']

# Target variable
target_variable = 'HeartDisease'

In [6]:
# Check for missing values
missing_values = data.isnull().sum()
if missing_values.any():
    # If there are missing values, you can handle them here using methods like imputation
    print("Missing values found in the dataset. You can handle them using appropriate techniques.")

data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 918 entries, 0 to 917
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Age             918 non-null    int64  
 1   Sex             918 non-null    object 
 2   ChestPainType   918 non-null    object 
 3   RestingBP       918 non-null    int64  
 4   Cholesterol     918 non-null    int64  
 5   FastingBS       918 non-null    int64  
 6   RestingECG      918 non-null    object 
 7   MaxHR           918 non-null    int64  
 8   ExerciseAngina  918 non-null    object 
 9   Oldpeak         918 non-null    float64
 10  ST_Slope        918 non-null    object 
 11  HeartDisease    918 non-null    int64  
dtypes: float64(1), int64(6), object(5)
memory usage: 86.2+ KB


In [7]:
print("Sample data from the dataset:")
print(data.head())

Sample data from the dataset:
   Age Sex ChestPainType  RestingBP  Cholesterol  FastingBS RestingECG  MaxHR  \
0   40   M           ATA        140          289          0     Normal    172   
1   49   F           NAP        160          180          0     Normal    156   
2   37   M           ATA        130          283          0         ST     98   
3   48   F           ASY        138          214          0     Normal    108   
4   54   M           NAP        150          195          0     Normal    122   

  ExerciseAngina  Oldpeak ST_Slope  HeartDisease  
0              N      0.0       Up             0  
1              N      1.0     Flat             1  
2              N      0.0       Up             0  
3              Y      1.5     Flat             1  
4              N      0.0       Up             0  


In [8]:
#Splitting Data into Training and Testing Sets
# Split the dataset into features (X) and target variable (y)
X = data.drop(target_variable, axis=1)
y = data[target_variable]

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [10]:
# Preprocessing pipeline for numerical features
numerical_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])

# Preprocessing pipeline for categorical features
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combine preprocessing pipelines
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ])

In [11]:
# Apply preprocessing to training and testing data
X_train_prep = preprocessor.fit_transform(X_train)
X_test_prep = preprocessor.transform(X_test)

In [70]:
data.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


In [12]:
# Display the shape of preprocessed training and testing sets
print("\nShape of preprocessed training set:", X_train_prep.shape)
print("Shape of preprocessed testing set:", X_test_prep.shape)


Shape of preprocessed training set: (734, 20)
Shape of preprocessed testing set: (184, 20)


**Now Implementing Individual Models
Random Forest, Decision Tree, SVM**

In [15]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

**___Random Forest___**

In [16]:
rf_classifier = RandomForestClassifier(random_state=42)

In [17]:
#hyperparameters grid for Random Forest
rf_param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

In [18]:
# Grid Search to find the best hyperparameters
rf_grid_search = GridSearchCV(rf_classifier, rf_param_grid, cv=5, scoring='accuracy')
rf_grid_search.fit(X_train_prep, y_train)

0,1,2
,estimator,RandomForestC...ndom_state=42)
,param_grid,"{'max_depth': [None, 10, ...], 'min_samples_leaf': [1, 2, ...], 'min_samples_split': [2, 5, ...], 'n_estimators': [100, 200, ...]}"
,scoring,'accuracy'
,n_jobs,
,refit,True
,cv,5
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,10
,min_samples_split,10
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [19]:
# best hyperparameters for Random Forest
print("Best hyperparameters for Random Forest:", rf_grid_search.best_params_)

Best hyperparameters for Random Forest: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 100}


In [47]:
from sklearn.metrics import accuracy_score

rf_classifier = RandomForestClassifier( **rf_grid_search.best_params_,random_state=42)
rf_classifier.fit(X_train_prep, y_train)
rf_train_acc = accuracy_score(y_train, rf_classifier.predict(X_train_prep))
rf_test_acc = accuracy_score(y_test, rf_classifier.predict(X_test_prep))
print("Random Forest - Train Accuracy:", rf_train_acc)
print("Random Forest - Test Accuracy:", rf_test_acc)

Random Forest - Train Accuracy: 0.9291553133514986
Random Forest - Test Accuracy: 0.8804347826086957


**___Decision Tree___**

In [21]:
dt_classifier = DecisionTreeClassifier(random_state=42)

In [22]:
#hyperparameters grid
dt_param_grid = {
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

In [23]:
# Perform Grid Search to find the best hyperparameters
dt_grid_search = GridSearchCV(dt_classifier, dt_param_grid, cv=5, scoring='accuracy')
dt_grid_search.fit(X_train_prep, y_train)

0,1,2
,estimator,DecisionTreeC...ndom_state=42)
,param_grid,"{'max_depth': [None, 10, ...], 'min_samples_leaf': [1, 2, ...], 'min_samples_split': [2, 5, ...]}"
,scoring,'accuracy'
,n_jobs,
,refit,True
,cv,5
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,criterion,'gini'
,splitter,'best'
,max_depth,10
,min_samples_split,10
,min_samples_leaf,2
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,42
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [24]:
# best hyperparameters for Decision Tree
print("Best hyperparameters for Decision Tree:", dt_grid_search.best_params_)

Best hyperparameters for Decision Tree: {'max_depth': 10, 'min_samples_leaf': 2, 'min_samples_split': 10}


In [43]:
dt_classifier = DecisionTreeClassifier(**dt_grid_search.best_params_,random_state=42)
dt_classifier.fit(X_train_prep, y_train)
dt_train_acc = accuracy_score(y_train, dt_classifier.predict(X_train_prep))
dt_test_acc = accuracy_score(y_test, dt_classifier.predict(X_test_prep))
print("Decision Tree - Train Accuracy:", dt_train_acc)
print("Decision Tree - Test Accuracy:", dt_test_acc)

Decision Tree - Train Accuracy: 0.9318801089918256
Decision Tree - Test Accuracy: 0.782608695652174


**___SVM (Support Vector Machine)___**

In [26]:
svm_classifier = SVC(random_state=42)

In [27]:
# hyperparameters grid for SVM
svm_param_grid = {
    'C': [0.1, 1, 10],
    'kernel': ['linear', 'rbf', 'poly'],
    'gamma': ['scale', 'auto']
}

In [28]:
# Grid Search to find the best hyperparameters
svm_grid_search = GridSearchCV(svm_classifier, svm_param_grid, cv=5, scoring='accuracy')
svm_grid_search.fit(X_train_prep, y_train)

0,1,2
,estimator,SVC(random_state=42)
,param_grid,"{'C': [0.1, 1, ...], 'gamma': ['scale', 'auto'], 'kernel': ['linear', 'rbf', ...]}"
,scoring,'accuracy'
,n_jobs,
,refit,True
,cv,5
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,C,1
,kernel,'rbf'
,degree,3
,gamma,'auto'
,coef0,0.0
,shrinking,True
,probability,False
,tol,0.001
,cache_size,200
,class_weight,


In [29]:
# best hyperparameters for SVM
print("Best hyperparameters for SVM:", svm_grid_search.best_params_)

Best hyperparameters for SVM: {'C': 1, 'gamma': 'auto', 'kernel': 'rbf'}


In [48]:
svm_classifier = SVC(**svm_grid_search.best_params_,random_state=42)
svm_classifier.fit(X_train_prep, y_train)
svm_train_acc = accuracy_score(y_train, svm_classifier.predict(X_train_prep))
svm_test_acc = accuracy_score(y_test, svm_classifier.predict(X_test_prep))
print("SVM - Train Accuracy:", svm_train_acc)
print("SVM - Test Accuracy:", svm_test_acc)

SVM - Train Accuracy: 0.8950953678474114
SVM - Test Accuracy: 0.8586956521739131


**___XGBoost (Extreme Gradient Boosting)___**

In [32]:
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
import sklearn
import xgboost

In [33]:
xgb_classifier = XGBClassifier(random_state=42)

In [34]:
xgb_param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 4, 5],
    'learning_rate': [0.1, 0.01, 0.001],
    'gamma': [0, 0.1, 0.2]
}

In [35]:
# Grid Search to find the best hyperparameters
xgb_grid_search = GridSearchCV(xgb_classifier, xgb_param_grid, cv=5, scoring='accuracy')
xgb_grid_search.fit(X_train_prep, y_train)

0,1,2
,estimator,"XGBClassifier...ree=None, ...)"
,param_grid,"{'gamma': [0, 0.1, ...], 'learning_rate': [0.1, 0.01, ...], 'max_depth': [3, 4, ...], 'n_estimators': [100, 200, ...]}"
,scoring,'accuracy'
,n_jobs,
,refit,True
,cv,5
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,objective,'binary:logistic'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,
,device,
,early_stopping_rounds,
,enable_categorical,False


In [36]:
# best hyperparameters for XGBoost
print("Best hyperparameters for XGBoost:", xgb_grid_search.best_params_)

Best hyperparameters for XGBoost: {'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 100}


In [37]:
y_train_pred_xgb = xgb_grid_search.predict(X_train_prep)
train_accuracy_xgb = accuracy_score(y_train, y_train_pred_xgb)
print("Train Accuracy (XGBoost):", train_accuracy_xgb)
y_test_pred_xgb = xgb_grid_search.predict(X_test_prep)
test_accuracy_xgb = accuracy_score(y_test, y_test_pred_xgb)
print("Test Accuracy (XGBoost):", test_accuracy_xgb)

Train Accuracy (XGBoost): 0.9332425068119891
Test Accuracy (XGBoost): 0.8804347826086957


**Bagging Ensemble (Random Forest And SVM)**

In [None]:
from sklearn.ensemble import BaggingClassifier, VotingClassifier, RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

# Ensure the correct initialization of classifiers
rf_classifier = RandomForestClassifier(**rf_grid_search.best_params_, random_state=42)
svm_classifier = SVC(**svm_grid_search.best_params_)

# Corrected BaggingClassifier (replace base_estimator with estimator)
bagging_rf = BaggingClassifier(estimator=rf_classifier, n_estimators=10, random_state=42)
bagging_svm = BaggingClassifier(estimator=svm_classifier, n_estimators=10, random_state=42)

# Voting Classifier (Combining the Bagging models)
bagging_classifier = VotingClassifier(estimators=[
    ('rf', bagging_rf),
    ('svm', bagging_svm)
], voting='hard')

# Training the Bagging ensemble
bagging_classifier.fit(X_train_prep, y_train)

# Predictions on training set
y_train_pred_bagging = bagging_classifier.predict(X_train_prep)
train_accuracy_bagging = accuracy_score(y_train, y_train_pred_bagging)
print("Bagging Ensemble - Train Accuracy:", train_accuracy_bagging)

# Predictions on testing set
y_test_pred_bagging = bagging_classifier.predict(X_test_prep)
test_accuracy_bagging = accuracy_score(y_test, y_test_pred_bagging)
print("Bagging Ensemble - Test Accuracy:", test_accuracy_bagging)


Bagging Ensemble - Train Accuracy: 0.8991825613079019
Bagging Ensemble - Test Accuracy: 0.8586956521739131





**Bagging Ensemble
(Random Forest, Decision Tree, SVM, and XGBoost)**

In [39]:
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier

In [40]:
# Initialize individual models
rf_classifier = RandomForestClassifier(**rf_grid_search.best_params_, random_state=42)
dt_classifier = DecisionTreeClassifier(**dt_grid_search.best_params_, random_state=42)
svm_classifier = SVC(**svm_grid_search.best_params_, random_state=42)
xgb_classifier = XGBClassifier(**xgb_grid_search.best_params_, random_state=42)

In [49]:
# Initialize Bagging ensemble for each individual model
bagging_rf = BaggingClassifier(estimator=rf_classifier, n_estimators=10, random_state=42)
bagging_dt = BaggingClassifier(estimator=dt_classifier, n_estimators=10, random_state=42)
bagging_svm = BaggingClassifier(estimator=svm_classifier, n_estimators=10, random_state=42)
bagging_xgb = BaggingClassifier(estimator=xgb_classifier, n_estimators=10, random_state=42)

In [50]:
# Combine Bagging ensembles of individual models
bagging_classifier = VotingClassifier(estimators=[
    ('rf', bagging_rf),
    ('dt', bagging_dt),
    ('svm', bagging_svm),
    ('xgb', bagging_xgb)
])

In [51]:
# Training the Bagging ensemble
bagging_classifier.fit(X_train_prep, y_train)

0,1,2
,estimators,"[('rf', ...), ('dt', ...), ...]"
,voting,'hard'
,weights,
,n_jobs,
,flatten_transform,True
,verbose,False

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,10
,min_samples_split,10
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True

0,1,2
,criterion,'gini'
,splitter,'best'
,max_depth,10
,min_samples_split,10
,min_samples_leaf,2
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,42
,max_leaf_nodes,
,min_impurity_decrease,0.0

0,1,2
,C,1
,kernel,'rbf'
,degree,3
,gamma,'auto'
,coef0,0.0
,shrinking,True
,probability,False
,tol,0.001
,cache_size,200
,class_weight,

0,1,2
,objective,'binary:logistic'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,
,device,
,early_stopping_rounds,
,enable_categorical,False


In [None]:
# Predictions on training set
y_train_pred_bagging = bagging_classifier.predict(X_train_prep)
train_accuracy_bagging = accuracy_score(y_train, y_train_pred_bagging)
print("Bagging Ensemble - Train Accuracy:", train_accuracy_bagging)

# Predictions on testing set
y_test_pred_bagging = bagging_classifier.predict(X_test_prep)
test_accuracy_bagging = accuracy_score(y_test, y_test_pred_bagging)
print("Bagging Ensemble - Test Accuracy:", test_accuracy_bagging)


# from google.colab import files
# import joblib
# joblib.dump(bagging_classifier, 'trained_model.pkl')
# files.download('trained_model.pkl')

Bagging Ensemble - Train Accuracy: 0.9168937329700273
Bagging Ensemble - Test Accuracy: 0.8858695652173914


**___User Input___**

In [None]:
from sklearn.preprocessing import LabelEncoder

import joblib

# Load the trained model
bagging_classifier = joblib.load('/content/drive/MyDrive/trained_model.pkl')

def get_user_input():
    print("Please enter the following information:")
    Age = int(input("Age (in years): "))
    Sex = input("Sex (M/F): ")
    ChestPainType = input("Chest Pain Type (ATP, NAP, ATA, ASY, TA): ")
    RestingBP = int(input("Resting Blood Pressure (mm Hg): "))
    Cholesterol = int(input("Cholesterol (mg/dL): "))
    FastingBS = int(input("Fasting Blood Sugar (0 for No, 1 for Yes): "))
    RestingECG = input("Resting ECG (Normal, ST, LVH): ")
    MaxHR = int(input("Max Heart Rate (bpm): "))
    ExerciseAngina = input("Exercise-Induced Angina (Y/N): ")
    Oldpeak = float(input("Oldpeak (ST depression induced by exercise relative to rest): "))
    ST_Slope = input("ST Slope (Up, Flat, Down): ")

    return Age, Sex, ChestPainType, RestingBP, Cholesterol, FastingBS, RestingECG, MaxHR, ExerciseAngina, Oldpeak, ST_Slope

def preprocess_input(Age, Sex, ChestPainType, RestingBP, Cholesterol, FastingBS, RestingECG, MaxHR, ExerciseAngina, Oldpeak, ST_Slope):
    # Encode categorical variables
    sex_encoder = LabelEncoder()
    chest_pain_encoder = LabelEncoder()
    resting_ecg_encoder = LabelEncoder()
    exercise_angina_encoder = LabelEncoder()
    st_slope_encoder = LabelEncoder()

    Sex = sex_encoder.fit_transform([Sex])[0]
    ChestPainType = chest_pain_encoder.fit_transform([ChestPainType])[0]
    RestingECG = resting_ecg_encoder.fit_transform([RestingECG])[0]
    ExerciseAngina = exercise_angina_encoder.fit_transform([ExerciseAngina])[0]
    ST_Slope = st_slope_encoder.fit_transform([ST_Slope])[0]

    return Age, Sex, ChestPainType, RestingBP, Cholesterol, FastingBS, RestingECG, MaxHR, ExerciseAngina, Oldpeak, ST_Slope

def predict_heart_disease(Age, Sex, ChestPainType, RestingBP, Cholesterol, FastingBS, RestingECG, MaxHR, ExerciseAngina, Oldpeak, ST_Slope):
    # Preprocess user input
    Age, Sex, ChestPainType, RestingBP, Cholesterol, FastingBS, RestingECG, MaxHR, ExerciseAngina, Oldpeak, ST_Slope = preprocess_input(Age, Sex, ChestPainType, RestingBP, Cholesterol, FastingBS, RestingECG, MaxHR, ExerciseAngina, Oldpeak, ST_Slope)

    # Make prediction
    prediction = bagging_classifier.predict([[Age, Sex, ChestPainType, RestingBP, Cholesterol, FastingBS, RestingECG, MaxHR, ExerciseAngina, Oldpeak, ST_Slope]])

    # Display prediction result to the user
    if prediction[0] == 1:
        return "Based on the provided information, the prediction is: Heart Disease Present"
    else:
        return "Based on the provided information, the prediction is: No Heart Disease"

# Get user input
Age, Sex, ChestPainType, RestingBP, Cholesterol, FastingBS, RestingECG, MaxHR, ExerciseAngina, Oldpeak, ST_Slope = get_user_input()

# Make prediction
result = predict_heart_disease(Age, Sex, ChestPainType, RestingBP, Cholesterol, FastingBS, RestingECG, MaxHR, ExerciseAngina, Oldpeak, ST_Slope)

# Display prediction result
print(result)


FileNotFoundError: [Errno 2] No such file or directory: '/content/drive/MyDrive/trained_model.pkl'

1**Evalution**

In [53]:
from sklearn.metrics import classification_report, roc_auc_score

In [54]:
# Train the models
rf_classifier.fit(X_train_prep, y_train)
dt_classifier.fit(X_train_prep, y_train)
svm_classifier.fit(X_train_prep, y_train)
xgb_classifier.fit(X_train_prep, y_train)

# Make predictions on testing set for individual models
y_test_pred_rf = rf_classifier.predict(X_test_prep)
y_test_pred_dt = dt_classifier.predict(X_test_prep)
y_test_pred_svm = svm_classifier.predict(X_test_prep)
y_test_pred_xgb = xgb_classifier.predict(X_test_prep)

# Predictions on testing set for Bagging ensemble
y_test_pred_bagging = bagging_classifier.predict(X_test_prep)
# Evaluate individual models
print("Random Forest:")
print(classification_report(y_test, y_test_pred_rf))
print("ROC-AUC:", roc_auc_score(y_test, y_test_pred_rf))
print()

print("Decision Tree:")
print(classification_report(y_test, y_test_pred_dt))
print("ROC-AUC:", roc_auc_score(y_test, y_test_pred_dt))
print()

print("SVM:")
print(classification_report(y_test, y_test_pred_svm))
print("ROC-AUC:", roc_auc_score(y_test, y_test_pred_svm))
print()

print("XGBoost:")
print(classification_report(y_test, y_test_pred_xgb))
print("ROC-AUC:", roc_auc_score(y_test, y_test_pred_xgb))
print()


Random Forest:
              precision    recall  f1-score   support

           0       0.86      0.86      0.86        77
           1       0.90      0.90      0.90       107

    accuracy                           0.88       184
   macro avg       0.88      0.88      0.88       184
weighted avg       0.88      0.88      0.88       184

ROC-AUC: 0.8771695594125501

Decision Tree:
              precision    recall  f1-score   support

           0       0.69      0.86      0.77        77
           1       0.88      0.73      0.80       107

    accuracy                           0.78       184
   macro avg       0.79      0.79      0.78       184
weighted avg       0.80      0.78      0.78       184

ROC-AUC: 0.7930574098798397

SVM:
              precision    recall  f1-score   support

           0       0.82      0.84      0.83        77
           1       0.89      0.87      0.88       107

    accuracy                           0.86       184
   macro avg       0.85      0.86  

**Fine-tuning hyperparameters**

In [56]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import BaggingClassifier

In [59]:
# Initialize Bagging ensemble with a single base estimator
bagging_classifier = BaggingClassifier(base_estimator=DecisionTreeClassifier(), random_state=42)

TypeError: BaggingClassifier.__init__() got an unexpected keyword argument 'base_estimator'

In [None]:
# Define hyperparameters grid for Bagging ensemble
bagging_param_grid = {
    'n_estimators': [10, 50, 100],  # Number of base estimators
    'max_samples': [0.5, 0.7, 1.0],  # Proportion of samples to draw from X to train each base estimator
    'max_features': [0.5, 0.7, 1.0]  # Proportion of features to draw from X to train each base estimator
}

In [None]:
# Initialize Bagging ensemble
bagging_classifier = BaggingClassifier(base_estimator=[rf_classifier, dt_classifier, svm_classifier, xgb_classifier],
                                       random_state=42)

In [None]:
# Perform Grid Search to find the best hyperparameters for Bagging ensemble
bagging_grid_search = GridSearchCV(bagging_classifier, bagging_param_grid, cv=5, scoring='accuracy')
bagging_grid_search.fit(X_train_prep, y_train)

In [None]:
# Print the best hyperparameters for Bagging ensemble
print("Best hyperparameters for Bagging Ensemble:", bagging_grid_search.best_params_)

In [None]:
# Evaluate the fine-tuned Bagging ensemble
bagging_best_estimator = bagging_grid_search.best_estimator_

# Predictions on testing set for fine-tuned Bagging ensemble
y_test_pred_bagging_tuned = bagging_best_estimator.predict(X_test_prep)

# Calculate accuracy
test_accuracy_bagging_tuned = accuracy_score(y_test, y_test_pred_bagging_tuned)
print("Fine-tuned Bagging Ensemble - Test Accuracy:", test_accuracy_bagging_tuned)

In [55]:
from sklearn.model_selection import cross_val_score

# Step 8: Validation and Cross-Validation

# Perform cross-validation on the fine-tuned Bagging ensemble
cv_scores = cross_val_score(bagging_best_estimator, X_train_prep, y_train, cv=5, scoring='accuracy')

# Print cross-validation scores
print("Cross-validation scores:", cv_scores)

# Calculate mean and standard deviation of cross-validation scores
print("Mean CV accuracy:", cv_scores.mean())
print("Standard deviation of CV accuracy:", cv_scores.std())

NameError: name 'bagging_best_estimator' is not defined