In [1]:
%store -r new_def

In [2]:
X = new_def.drop(['smoking'], axis=1)
y = new_def['smoking']


In [3]:
X.head()


Unnamed: 0,waist_winsorized,Gtp_winsorized,triglyceride_winsorized,systolic_winsorized,hemoglobin_winsorized,height_winsorized
0,4.234107,2.564949,3.806662,4.663439,2.595255,5.081404
1,4.488636,4.488636,5.081404,4.912655,2.76001,5.081404
2,4.488636,3.367296,5.099866,4.859812,2.785011,5.170484
3,4.304065,2.564949,3.988984,4.85203,2.791165,5.170484
4,4.478473,3.555348,3.988984,4.875197,2.747271,5.141664


In [4]:
X.shape

(101923, 6)

In [5]:
y.head()

0    0
1    0
2    1
3    1
4    1
Name: smoking, dtype: int64

In [6]:
import numpy as np
from sklearn.tree import DecisionTreeClassifier


class BaggingClassifierMe:
    def __init__(self, n_estimators=100, max_depth=None):
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.models = [DecisionTreeClassifier(max_depth=self.max_depth) for _ in range(n_estimators)]

    def fit(self, X, y):
        for model in self.models:
            indices = np.random.choice(len(X), len(X), replace=True)
            X_subset, y_subset = X.iloc[indices], y.iloc[indices]
            model.fit(X_subset, y_subset)

    def predict(self, X):

        predictions = np.zeros((len(X), self.n_estimators))
        for i, model in enumerate(self.models):
            predictions[:, i] = model.predict(X)

        # Calculate the average prediction across all models
        avg_predictions = np.mean(predictions, axis=1)

        # Apply threshold to convert to binary predictions
        binary_predictions = (avg_predictions >= 0.5).astype(int)

        return binary_predictions


def score(self, X, y):
    predictions = self.predict(X)
    accuracy = np.mean(predictions == y)
    return accuracy


def get_params(self, deep=True):
    return {'n_estimators': self.n_estimators, 'max_depth': self.max_depth}


def set_params(self, **parameters):
    for parameter, value in parameters.items():
        setattr(self, parameter, value)
    return self


In [7]:
class AdaBoostClassifierMe:
    def __init__(self, n_estimators=100, max_depth=1):
        self.n_estimators = n_estimators
        self.models = []
        self.alphas = []
        self.max_depth = max_depth

    def fit(self, X, y):
        # Initialize weights
        weights = np.ones(len(X)) / len(X)

        for _ in range(self.n_estimators):
            # Create a new model
            model = DecisionTreeClassifier(max_depth=self.max_depth)

            # Fit the model with weighted samples
            model.fit(X, y, sample_weight=weights)

            # Predictions of the current model
            predictions = model.predict(X)

            # Calculate error and alpha
            error = np.sum(weights * (predictions != y))
            alpha = 0.5 * np.log((1 - error) / max(error, 1e-10))

            # Update weights
            weights *= np.exp(-alpha * y * predictions)
            weights /= np.sum(weights)

            # Save the model and its corresponding alpha
            self.models.append(model)
            self.alphas.append(alpha)

    def predict(self, X):
        # Initialize predictions
        predictions = np.zeros(len(X))

        for model, alpha in zip(self.models, self.alphas):
            # Accumulate weighted predictions
            predictions += alpha * model.predict(X)

        # Apply sign function to get final predictions
        return np.sign(predictions)

In [8]:
class RandomForestClassifierMe:
    def __init__(self, n_estimators=100, max_features=None, max_depth=None):
        self.n_estimators = n_estimators
        self.max_features = max_features
        self.max_depth = max_depth  
        self.models = [DecisionTreeClassifier(max_features=max_features, max_depth=max_depth) for _ in range(n_estimators)]
        
    def fit(self, X, y):
        for model in self.models:
            indices = np.random.choice(len(X), len(X), replace=True)
            X_subset, y_subset = X.iloc[indices], y.iloc[indices]
            model.fit(X_subset, y_subset)
    
    def predict(self, X, threshold=0.5):
        predictions = np.zeros((len(X), self.n_estimators))
        for i, model in enumerate(self.models):
             predictions[:, i] = model.predict(X)
    
        # Calculate the average prediction across all models
        avg_predictions = np.mean(predictions, axis=1)
    
        # Apply threshold to convert to binary predictions
        binary_predictions = (avg_predictions >= threshold).astype(int)
    
        return binary_predictions

In [9]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Instantiate and fit each model
bagging_model = BaggingClassifierMe(n_estimators=1000, max_depth=5)
bagging_model.fit(X_train, y_train)

boosting_model = AdaBoostClassifierMe(n_estimators=1000, max_depth=5)
boosting_model.fit(X_train, y_train)

random_forest_model = RandomForestClassifierMe(n_estimators=1000, max_depth=5)
random_forest_model.fit(X_train, y_train)

# Make predictions
bagging_predictions = bagging_model.predict(X_test)
boosting_predictions = boosting_model.predict(X_test)
random_forest_predictions = random_forest_model.predict(X_test)

# Evaluate the models
bagging_accuracy = accuracy_score(y_test, bagging_predictions)
boosting_accuracy = accuracy_score(y_test, boosting_predictions)
random_forest_accuracy = accuracy_score(y_test, random_forest_predictions)

print(f"Bagging Accuracy: {bagging_accuracy}")
print(f"Boosting Accuracy: {boosting_accuracy}")
print(f"Random Forest Accuracy: {random_forest_accuracy}")

Bagging Accuracy: 0.7535442727495708
Boosting Accuracy: 0.7492273730684327
Random Forest Accuracy: 0.7530537159676233


In [10]:
from sklearn.ensemble import BaggingClassifier, AdaBoostClassifier, RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Bagging
bagging_model = BaggingClassifier(base_estimator=DecisionTreeClassifier(), n_estimators=100, random_state=42)
bagging_model.fit(X_train, y_train)
bagging_predictions = bagging_model.predict(X_test)
bagging_accuracy = accuracy_score(y_test, bagging_predictions)
print(f"Bagging Accuracy: {bagging_accuracy}")

# Boosting (AdaBoost)
boosting_model = AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=1), n_estimators=100,
                                    random_state=42)
boosting_model.fit(X_train, y_train)
boosting_predictions = boosting_model.predict(X_test)
boosting_accuracy = accuracy_score(y_test, boosting_predictions)
print(f"Boosting Accuracy: {boosting_accuracy}")

# Random Forest
random_forest_model = RandomForestClassifier(n_estimators=100, max_depth=None, random_state=42)
random_forest_model.fit(X_train, y_train)
random_forest_predictions = random_forest_model.predict(X_test)
random_forest_accuracy = accuracy_score(y_test, random_forest_predictions)
print(f"Random Forest Accuracy: {random_forest_accuracy}")



Bagging Accuracy: 0.7429973019376993




Boosting Accuracy: 0.7554083885209713
Random Forest Accuracy: 0.7474123129752269


In [11]:
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier

# Define the parameter grid for the models
param_grid_bagging = {
    'base_estimator__max_depth': [None, 1, 3, 7],
    'n_estimators': [50, 100, 150],
    'max_samples': [1.0, 0.8, 0.6]  
    }

param_grid_boosting = {
    'n_estimators': [50, 100, 150],
    'base_estimator__max_depth': [1, 3, 7]
}

param_grid_random_forest = {
    'n_estimators': [50, 100, 150],
    'max_depth': [None, 1, 3, 7],
    'max_features': ['auto', 'sqrt', 'log2']
}

# Grid search for Bagging
grid_search_bagging = GridSearchCV(BaggingClassifier(base_estimator=DecisionTreeClassifier()), param_grid_bagging, cv=3, n_jobs=-1)
grid_search_bagging.fit(X_train, y_train)

# Grid search for Boosting
grid_search_boosting = GridSearchCV(AdaBoostClassifier(base_estimator=DecisionTreeClassifier()), param_grid_boosting, cv=3, n_jobs=-1)
grid_search_boosting.fit(X_train, y_train)

# Grid search for Random Forest
grid_search_random_forest = GridSearchCV(RandomForestClassifier(), param_grid_random_forest, cv=3, n_jobs=-1)
grid_search_random_forest.fit(X_train, y_train)

# Get the best parameters
best_params_bagging = grid_search_bagging.best_params_
best_params_boosting = grid_search_boosting.best_params_
best_params_random_forest = grid_search_random_forest.best_params_

print("Best Parameters for Bagging:", best_params_bagging)
print("Best Parameters for Boosting:", best_params_boosting)
print("Best Parameters for Random Forest:", best_params_random_forest)
print("Best accuracy for Bagging:", grid_search_bagging.best_score_)
print("Best accuracy for Boosting:", grid_search_boosting.best_score_)
print("Best accuracy for Random Forest:", grid_search_random_forest.best_score_)


36 fits failed out of a total of 108.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
27 fits failed with the following error:
Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/sklearn/base.py", line 1145, in wrapper
    estimator._validate_params()
  File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/sklearn/base.py", line 638, in _validate_params
    validate_parameter_constraints(
  File "/Library/Frameworks/Python.framework/

Best Parameters for Bagging: {'base_estimator__max_depth': 7, 'max_samples': 0.6, 'n_estimators': 50}
Best Parameters for Boosting: {'base_estimator__max_depth': 3, 'n_estimators': 50}
Best Parameters for Random Forest: {'max_depth': 7, 'max_features': 'sqrt', 'n_estimators': 50}
Best accuracy for Bagging: 0.7579533880450614
Best accuracy for Boosting: 0.7564694045958432
Best accuracy for Random Forest: 0.7573769720943511


In [12]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint, uniform
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier, AdaBoostClassifier, RandomForestClassifier

# Define the parameter distributions for the models
param_dist_bagging = {
    'base_estimator__max_depth': [None, 1, 3, 5],
    'n_estimators': randint(50, 150),
    'max_samples': uniform(0.6, 0.4)  # You can adjust this parameter as needed
}

param_dist_boosting = {
    'n_estimators': randint(50, 150),
    'base_estimator__max_depth': [1, 3, 5]
}

param_dist_random_forest = {
    'n_estimators': randint(50, 150),
    'max_depth': [None, 1, 3, 5],
    'max_features': ['auto', 'sqrt', 'log2']
}

# Randomized search for Bagging
random_search_bagging = RandomizedSearchCV(BaggingClassifier(base_estimator=DecisionTreeClassifier()),
                                           param_dist_bagging, n_iter=10, cv=3, random_state=42)
random_search_bagging.fit(X_train, y_train)

# Randomized search for Boosting
random_search_boosting = RandomizedSearchCV(AdaBoostClassifier(base_estimator=DecisionTreeClassifier()),
                                            param_dist_boosting, n_iter=10, cv=3, random_state=42)
random_search_boosting.fit(X_train, y_train)

# Randomized search for Random Forest
random_search_random_forest = RandomizedSearchCV(RandomForestClassifier(), param_dist_random_forest, n_iter=10, cv=3,
                                                 random_state=42)
random_search_random_forest.fit(X_train, y_train)

# Get the best parameters
best_params_random_search_bagging = random_search_bagging.best_params_
best_params_random_search_boosting = random_search_boosting.best_params_
best_params_random_search_random_forest = random_search_random_forest.best_params_

print("Best Parameters for Bagging (Randomized Search):", best_params_random_search_bagging)
print("Best Parameters for Boosting (Randomized Search):", best_params_random_search_boosting)
print("Best Parameters for Random Forest (Randomized Search):", best_params_random_search_random_forest)
print("best accuracy for bagging:", random_search_bagging.best_score_)
print("best accuracy for boosting:", random_search_boosting.best_score_)
print("best accuracy for random forest:", random_search_random_forest.best_score_)

12 fits failed out of a total of 30.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
12 fits failed with the following error:
Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/sklearn/base.py", line 1145, in wrapper
    estimator._validate_params()
  File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/sklearn/base.py", line 638, in _validate_params
    validate_parameter_constraints(
  File "/Library/Frameworks/Python.framework/V

Best Parameters for Bagging (Randomized Search): {'base_estimator__max_depth': 5, 'max_samples': 0.8832290311184181, 'n_estimators': 71}
Best Parameters for Boosting (Randomized Search): {'base_estimator__max_depth': 1, 'n_estimators': 149}
Best Parameters for Random Forest (Randomized Search): {'max_depth': 5, 'max_features': 'sqrt', 'n_estimators': 87}
best accuracy for bagging: 0.7500061579053711
best accuracy for boosting: 0.7555250877540999
best accuracy for random forest: 0.7535382533711249


In [13]:
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.ensemble import BaggingClassifier, AdaBoostClassifier, RandomForestClassifier
from bayes_opt import BayesianOptimization

# Load your dataset
X, y = make_classification(n_samples=1000, n_features=20, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the function to optimize
def optimize_bagging(n_estimators, max_samples):
    model = BaggingClassifier(n_estimators=int(n_estimators), max_samples=max_samples)
    model.fit(X_train, y_train)
    return model.score(X_test, y_test)

def optimize_adaboost(n_estimators, learning_rate):
    model = AdaBoostClassifier(n_estimators=int(n_estimators), learning_rate=learning_rate)
    model.fit(X_train, y_train)
    return model.score(X_test, y_test)

def optimize_rf(n_estimators, max_depth):
    model = RandomForestClassifier(n_estimators=int(n_estimators), max_depth=int(max_depth))
    model.fit(X_train, y_train)
    return model.score(X_test, y_test)

# Define the search space
pbounds_bagging = {'n_estimators': (10, 1000), 'max_samples': (0.1, 1.0)}
pbounds_adaboost = {'n_estimators': (10, 1000), 'learning_rate': (0.01, 1.0)}
pbounds_rf = {'n_estimators': (10, 1000), 'max_depth': (1, 100)}

# Perform optimization
optimizer_bagging = BayesianOptimization(f=optimize_bagging, pbounds=pbounds_bagging, random_state=42)
optimizer_adaboost = BayesianOptimization(f=optimize_adaboost, pbounds=pbounds_adaboost, random_state=42)
optimizer_rf = BayesianOptimization(f=optimize_rf, pbounds=pbounds_rf, random_state=42)

optimizer_bagging.maximize(init_points=10, n_iter=50)
optimizer_adaboost.maximize(init_points=10, n_iter=50)
optimizer_rf.maximize(init_points=10, n_iter=50)

# Get the best hyperparameters
best_params_bagging = optimizer_bagging.max
best_params_adaboost = optimizer_adaboost.max
best_params_rf = optimizer_rf.max

print("Best Bagging Classifier Parameters:", best_params_bagging)
print("Best AdaBoost Classifier Parameters:", best_params_adaboost)
print("Best Random Forest Classifier Parameters:", best_params_rf)
print("best accuracy for bagging:", optimizer_bagging.max['target'])
print("best accuracy for boosting:", optimizer_adaboost.max['target'])
print("best accuracy for random forest:", optimizer_rf.max['target'])


|   iter    |  target   | max_sa... | n_esti... |
-------------------------------------------------
| [0m1        [0m | [0m0.905    [0m | [0m0.4371   [0m | [0m951.2    [0m |
| [0m2        [0m | [0m0.905    [0m | [0m0.7588   [0m | [0m602.7    [0m |
| [0m3        [0m | [0m0.88     [0m | [0m0.2404   [0m | [0m164.4    [0m |
| [0m4        [0m | [0m0.865    [0m | [0m0.1523   [0m | [0m867.5    [0m |
| [0m5        [0m | [0m0.905    [0m | [0m0.641    [0m | [0m711.0    [0m |
| [0m6        [0m | [0m0.865    [0m | [0m0.1185   [0m | [0m970.2    [0m |
| [0m7        [0m | [0m0.905    [0m | [0m0.8492   [0m | [0m220.2    [0m |
| [0m8        [0m | [0m0.895    [0m | [0m0.2636   [0m | [0m191.6    [0m |
| [0m9        [0m | [0m0.895    [0m | [0m0.3738   [0m | [0m529.5    [0m |
| [0m10       [0m | [0m0.895    [0m | [0m0.4888   [0m | [0m298.3    [0m |
| [0m11       [0m | [0m0.89     [0m | [0m0.2472   [0m | [0m712.3    [0m 