Bagging (bootstrap aggregating) involves training a large number of predictors on random subsets of the training data, where the sampling is performed with replacement.

In [4]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import BaggingClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, plot_tree

In [2]:
# Load the Titanic passengers, discard rows that lack an age or an embarkation
# port, then one-hot encode the two categorical columns. drop_first=True omits
# the first level of each encoded column, which is why only sex_male,
# embarked_Q and embarked_S are selected below.
titanic = pd.get_dummies(
    sns.load_dataset('titanic').dropna(subset=['age', 'embarked']),
    columns=['sex', 'embarked'],
    drop_first=True,
)

# Feature matrix and target label used by the model.
feature_columns = [
    'pclass', 'age', 'sibsp', 'parch', 'fare',
    'sex_male', 'embarked_Q', 'embarked_S',
]
X = titanic[feature_columns]
y = titanic['survived']

In [8]:
# Tunable settings for this experiment, named once so the split and the
# ensemble share the same seed.
SEED = 42
TEST_SIZE = 0.2       # fraction of rows held out for the final accuracy check
N_ESTIMATORS = 500    # number of decision trees in the ensemble
MAX_SAMPLES = 100     # rows drawn (with replacement) to train each tree

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=SEED
)

# Bagging ensemble of decision trees.
# - n_jobs=-1 trains the trees in parallel on all available CPU cores.
# - oob_score=True scores each training row using only the trees whose
#   bootstrap sample did not contain it, giving a validation-style estimate
#   without touching the test set.
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(),
    n_estimators=N_ESTIMATORS,
    max_samples=MAX_SAMPLES,
    n_jobs=-1,
    oob_score=True,
    random_state=SEED,
)
bag_clf.fit(X_train, y_train)
print(f"OOB score: {bag_clf.oob_score_}")

# Compare the out-of-bag estimate with accuracy on the held-out test set.
y_pred = bag_clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

OOB score: 0.7996485061511424
Accuracy: 0.79
