<a href="https://colab.research.google.com/github/RifatMuhtasim/Machine_Learning/blob/main/Miscellaneous_Topics/Ensemble_Learning_Bagging_With_Heart_Disease.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Ensemble Learning Bagging:** Ensemble learning is a machine learning technique that combines the predictions of multiple models to improve the accuracy and robustness of the final prediction. Ensemble learning is a general meta approach to machine learning that seeks better predictive performance by combining the predictions from two or more models.

In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

# Data Preprocessing

In [2]:
# Read dataset
df = pd.read_csv("https://raw.githubusercontent.com/codebasics/py/master/ML/19_Bagging/Exercise/heart.csv")
print("Shape:", df.shape)
df.head()

Shape: (918, 12)


Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


In [3]:
df.describe()

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,HeartDisease
count,918.0,918.0,918.0,918.0,918.0,918.0,918.0
mean,53.510893,132.396514,198.799564,0.233115,136.809368,0.887364,0.553377
std,9.432617,18.514154,109.384145,0.423046,25.460334,1.06657,0.497414
min,28.0,0.0,0.0,0.0,60.0,-2.6,0.0
25%,47.0,120.0,173.25,0.0,120.0,0.0,0.0
50%,54.0,130.0,223.0,0.0,138.0,0.6,1.0
75%,60.0,140.0,267.0,0.0,156.0,1.5,1.0
max,77.0,200.0,603.0,1.0,202.0,6.2,1.0


# Remove Outliers

In [4]:
num_columns = df.select_dtypes(include=['int64', 'float64'])
num_columns.head()

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,HeartDisease
0,40,140,289,0,172,0.0,0
1,49,160,180,0,156,1.0,1
2,37,130,283,0,98,0.0,0
3,48,138,214,0,108,1.5,1
4,54,150,195,0,122,0.0,0


In [5]:
for i in num_columns:
  print(f"For {i} value:")
  value = df[ df[i] > ( df[i].mean() + 3*df[i].std() ) ]

  if value.shape[0] > 0:
    display(value)
    df = df[ df[i] <= ( df[i].mean() + 3*df[i].std() ) ]
  else:
    print("No Outliers")

For Age value:
No Outliers
For RestingBP value:


Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
109,39,M,ATA,190,241,0,Normal,106,N,0.0,Up,0
241,54,M,ASY,200,198,0,Normal,142,Y,2.0,Flat,1
365,64,F,ASY,200,0,0,Normal,140,Y,1.0,Flat,1
399,61,M,NAP,200,0,1,ST,70,N,0.0,Flat,1
592,61,M,ASY,190,287,1,LVH,150,Y,2.0,Down,1
732,56,F,ASY,200,288,1,LVH,133,Y,4.0,Down,1
759,54,M,ATA,192,283,0,LVH,195,N,0.0,Up,1


For Cholesterol value:


Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
76,32,M,ASY,118,529,0,Normal,130,N,0.0,Flat,1
149,54,M,ASY,130,603,1,Normal,125,Y,1.0,Flat,1
616,67,F,NAP,115,564,0,LVH,160,N,1.6,Flat,0


For FastingBS value:
No Outliers
For MaxHR value:
No Outliers
For Oldpeak value:


Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
166,50,M,ASY,140,231,0,ST,140,Y,5.0,Flat,1
702,59,M,TA,178,270,0,LVH,145,N,4.2,Down,0
771,55,M,ASY,140,217,0,Normal,111,Y,5.6,Down,1
791,51,M,ASY,140,298,0,Normal,122,Y,4.2,Flat,1
850,62,F,ASY,160,164,0,LVH,145,N,6.2,Down,1
900,58,M,ASY,114,318,0,ST,140,N,4.4,Down,1


For HeartDisease value:
No Outliers


# Convert text columns to numbers

In [6]:
# Show the unique value for the ordinal data
print(df['RestingECG'].unique())
print(df['ExerciseAngina'].unique())
print(df['ST_Slope'].unique())

['Normal' 'ST' 'LVH']
['N' 'Y']
['Up' 'Flat' 'Down']


In [7]:
# Replace the Ordinal Data

df['RestingECG'].replace({
    "Normal": 1,
    "ST": 2,
    "LVH": 3
}, inplace=True)

df['ExerciseAngina'].replace({
    "N": 0,
    "Y": 1
}, inplace=True)

df['ST_Slope'].replace({
    "Down": 1,
    "Flat": 2,
    "Up": 3
}, inplace=True)

df.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,1,172,0,0.0,3,0
1,49,F,NAP,160,180,0,1,156,0,1.0,2,1
2,37,M,ATA,130,283,0,2,98,0,0.0,3,0
3,48,F,ASY,138,214,0,1,108,1,1.5,2,1
4,54,M,NAP,150,195,0,1,122,0,0.0,3,0


In [8]:
# Convert all text to number

df = pd.get_dummies(df, drop_first=True)
df.head()

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease,Sex_M,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA
0,40,140,289,0,1,172,0,0.0,3,0,1,1,0,0
1,49,160,180,0,1,156,0,1.0,2,1,0,0,1,0
2,37,130,283,0,2,98,0,0.0,3,0,1,1,0,0
3,48,138,214,0,1,108,1,1.5,2,1,0,0,0,0
4,54,150,195,0,1,122,0,0.0,3,0,1,0,1,0


# Apply Scaling

In [9]:
X = df.drop('HeartDisease', axis="columns")
y = df['HeartDisease']

In [10]:
# Standard Scalling

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_scale = scaler.fit_transform(X)

## Train Test Split

In [11]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_scale, y, test_size = 0.25)

# Train a model using standalone support vector machine and then using bagging

In [12]:
from sklearn.svm import SVC

svm_model = SVC()
svm_model.fit(X_train, y_train)
svm_model.score(X_test, y_test)

0.8451327433628318

In [13]:
from sklearn.ensemble import BaggingClassifier
from sklearn.model_selection import cross_val_score

bag_model = BaggingClassifier(estimator=SVC(), n_estimators=100, max_samples=0.8)
scores = cross_val_score(bag_model, X, y, cv=5)
scores.mean()

0.6884162062615101

# Train a model using standalone Decision Tree Classifier and then using bagging

In [14]:
from sklearn.tree import DecisionTreeClassifier

tree_model = DecisionTreeClassifier()
tree_model.fit(X_train, y_train)
tree_model.score(X_test, y_test)

0.7920353982300885

In [15]:
bag_model_for_tree = BaggingClassifier(estimator= DecisionTreeClassifier(), n_estimators=100, max_samples=0.8)
score_for_tree = cross_val_score(bag_model_for_tree, X, y, cv=5)
score_for_tree.mean()

0.7981399631675875

**Bootstrap Sampling:** Randomly select subsets (with replacement) from the training data. This process is known as bootstrap sampling.


**Base Model Training:**
Train a base model (e.g., decision tree, SVM, etc.) on each bootstrap sample independently.


**Predictions:** Make predictions using each base model.


**Aggregation:** Combine the predictions of all base models to make a final prediction. The most common aggregation method is averaging for regression problems and voting for classification problems.