In [1]:
%store -r new_def

In [2]:
X = new_def.drop(['smoking'], axis=1)
y = new_def['smoking']


In [3]:
X.head()


Unnamed: 0,waist_winsorized,triglyceride_winsorized,systolic_winsorized,hemoglobin_winsorized,bmi_winsorized,lipid_interaction_winsorized,gtp_hemoglobin_interaction_winsorized,age_GTP_winsorized,age_ALT_winsorized,age_hemoglobin_winsorized
0,-1.959592,-1.985089,-1.567791,-2.023424,-1.602205,0.816468,-1.673359,0.309747,1.759991,-0.264883
1,0.640314,0.729294,1.081192,0.039374,0.306234,0.080549,1.796783,0.520685,-0.962108,1.415253
2,0.640314,0.762765,0.532371,0.341097,0.012828,-0.518464,0.035813,-0.378217,0.356409,-0.391491
3,-1.227953,-1.539342,0.450977,0.414927,-1.582471,0.464821,-1.482936,0.157188,0.218803,-0.394433
4,0.539687,-1.539342,0.692852,-0.11547,-0.074573,2.094233,0.294813,0.750875,0.161467,0.97515


In [4]:
X.shape

(101923, 10)

In [5]:
y.head()

0    0
1    0
2    1
3    1
4    1
Name: smoking, dtype: int64

In [6]:
import numpy as np
from sklearn.tree import DecisionTreeClassifier


class BaggingClassifierMe:
    def __init__(self, n_estimators=100, max_depth=None):
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.models = [DecisionTreeClassifier(max_depth=self.max_depth) for _ in range(n_estimators)]

    def fit(self, X, y):
        for model in self.models:
            indices = np.random.choice(len(X), len(X), replace=True)
            X_subset, y_subset = X.iloc[indices], y.iloc[indices]
            model.fit(X_subset, y_subset)

    def predict(self, X):

        predictions = np.zeros((len(X), self.n_estimators))
        for i, model in enumerate(self.models):
            predictions[:, i] = model.predict(X)

        # Calculate the average prediction across all models
        avg_predictions = np.mean(predictions, axis=1)

        # Apply threshold to convert to binary predictions
        binary_predictions = (avg_predictions >= 0.5).astype(int)

        return binary_predictions

    def score(self, X, y):

        predictions = self.predict(X)
        accuracy = np.mean(predictions == y)
        return accuracy

    def get_params(self, deep=True):
        return {
            'n_estimators': self.n_estimators,
            'max_depth': self.max_depth,
        }

    def set_params(self, **params):
        if not params:
            return self

        for param, value in params.items():
          setattr(self, param, value)

        return self

In [7]:
class AdaBoostClassifierMe:
    def __init__(self, n_estimators=100, max_depth=1):
        self.n_estimators = n_estimators
        self.models = []
        self.alphas = []
        self.max_depth = max_depth

    def fit(self, X, y):
        # Initialize weights
        weights = np.ones(len(X)) / len(X)

        for _ in range(self.n_estimators):
            # Create a new model
            model = DecisionTreeClassifier(max_depth=self.max_depth)

            # Fit the model with weighted samples
            model.fit(X, y, sample_weight=weights)

            # Predictions of the current model
            predictions = model.predict(X)

            # Calculate error and alpha
            error = np.sum(weights * (predictions != y))
            alpha = 0.5 * np.log((1 - error) / max(error, 1e-10))

            # Update weights
            weights *= np.exp(-alpha * y * predictions)
            weights /= np.sum(weights)

            # Save the model and its corresponding alpha
            self.models.append(model)
            self.alphas.append(alpha)

    def predict(self, X):
        # Initialize predictions
        predictions = np.zeros(len(X))

        for model, alpha in zip(self.models, self.alphas):
            # Accumulate weighted predictions
            predictions += alpha * model.predict(X)

        # Apply sign function to get final predictions
        return np.sign(predictions)

    def score(self, X, y):
        predictions = self.predict(X)
        accuracy = np.mean(predictions == y)
        return accuracy

    def get_params(self, deep=True):
        return {
            'n_estimators': self.n_estimators,
            'max_depth': self.max_depth,
        }

    def set_params(self, **params):
        if not params:
            return self

        for param, value in params.items():
            setattr(self, param, value)

        return self

In [8]:
class RandomForestClassifierMe:
    def __init__(self, n_estimators=100, max_depth=None, min_samples_split=2, min_samples_leaf=1):
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.min_samples_leaf = min_samples_leaf
        self.models = []

    def fit(self, x, y):
        x = np.array(x)
        y = np.array(y)
        n_samples, n_features = x.shape
        for _ in range(self.n_estimators):
            # Randomly select a subset of features
            selected_features = np.random.choice(n_features, size=int(np.sqrt(n_features)), replace=False)
            x_subset = x[:, selected_features]

            # Create a decision tree with random features
            tree = DecisionTreeClassifier(
                max_depth=self.max_depth,
                min_samples_split=self.min_samples_split,
                min_samples_leaf=self.min_samples_leaf
            )
            tree.fit(x_subset, y)
            self.models.append((tree, selected_features))
        return self

    def predict(self, x):
        x = np.array(x)
        pred = np.zeros((x.shape[0], self.n_estimators))
        for i, (tree, selected_features) in enumerate(self.models):
            x_subset = x[:, selected_features]
            pred[:, i] = tree.predict(x_subset)

        # Use majority voting for the final prediction
        final_predictions = np.apply_along_axis(lambda x: np.bincount(x.astype(int)).argmax(), axis=1, arr=pred)
        return final_predictions

    def score(self, X, y):
        predictions = self.predict(X)
        accuracy = np.mean(predictions == y)
        return accuracy

    def get_params(self, deep=True):
        return {
            'n_estimators': self.n_estimators,
            'max_depth': self.max_depth,
            'min_samples_split': self.min_samples_split,
            'min_samples_leaf': self.min_samples_leaf
        }

    def set_params(self, **params):
        if not params:
            return self

        for param, value in params.items():
            setattr(self, param, value)

        return self


In [9]:
#split the data into test and train
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [10]:
#bagging model
from sklearn.metrics import accuracy_score
bagging_model = BaggingClassifierMe(n_estimators=1000, max_depth=5)
bagging_model.fit(X_train, y_train)
bagging_predictions = bagging_model.predict(X_test)
bagging_accuracy = accuracy_score(y_test, bagging_predictions)
print(f"Bagging Accuracy: {bagging_accuracy}")


Bagging Accuracy: 0.7432916360068678


In [11]:
#boosting
boosting_model = AdaBoostClassifierMe(n_estimators=1000, max_depth=5)
boosting_model.fit(X_train, y_train)
boosting_predictions = boosting_model.predict(X_test)
boosting_accuracy = accuracy_score(y_test, boosting_predictions)
print(f"Boosting Accuracy: {boosting_accuracy}")

Boosting Accuracy: 0.7413784645572725


In [12]:
#random forest model
random_forest_model = RandomForestClassifierMe(n_estimators=1000, max_depth=5, min_samples_split=5, min_samples_leaf=2)
random_forest_model.fit(X_train, y_train)
random_forest_predictions = random_forest_model.predict(X_test)
random_forest_accuracy = accuracy_score(y_test, random_forest_predictions)
print(f"Random Forest Accuracy: {random_forest_accuracy}")

Random Forest Accuracy: 0.7403482953151828


# use grid search for our classifier

In [13]:
from sklearn.model_selection import GridSearchCV
import multiprocessing
n_porcess = multiprocessing.cpu_count()-1

In [14]:
param_grid_bagging = {
    'n_estimators': [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000],
    'max_depth': [1, 4, 7]
}
param_grid_boosting = {
    'n_estimators': [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000],
    'max_depth': [1, 4, 7]
}
param_grid_random_forest = {
    'n_estimators': [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000],
    'max_depth': [1, 4, 7],
    'min_samples_split': [2, 4, 6],
    'min_samples_leaf': [1, 2, 3]
}

# bagging with grid search

In [15]:
bag_model = BaggingClassifierMe()
bag_grid = GridSearchCV(estimator=bag_model, param_grid=param_grid_bagging, cv=3, scoring='accuracy',n_jobs=n_porcess)
bag_grid.fit(X_train, y_train)
print("Best Parameters for Bagging:", bag_grid.best_params_)

Best Parameters for Bagging: {'max_depth': 7, 'n_estimators': 100}


In [23]:
print("Best Score for Bagging:", bag_grid.best_score_)

Best Score for Bagging: 0.7526797600511506


# ada boost with grid search

In [16]:
adaBoost_model = AdaBoostClassifierMe()
adaBoost_grid = GridSearchCV(estimator=adaBoost_model, param_grid=param_grid_boosting, cv=3, scoring='accuracy',n_jobs=n_porcess)
adaBoost_grid.fit(X_train, y_train)
print("Best Parameters for Boosting:", adaBoost_grid.best_params_)

Best Parameters for Boosting: {'max_depth': 7, 'n_estimators': 100}


In [24]:
print('best score for boosting:', adaBoost_grid.best_score_)

best score for boosting: 0.7399985378415356


# Random forest with grid search

In [17]:

randomForest_model = RandomForestClassifierMe()
randomForest_grid = GridSearchCV(estimator=randomForest_model, param_grid=param_grid_random_forest, cv=3,
                                 scoring='accuracy',n_jobs=n_porcess)
randomForest_grid.fit(X_train, y_train)
print("Best Parameters for Random Forest:", randomForest_grid.best_params_)



Best Parameters for Random Forest: {'max_depth': 7, 'min_samples_leaf': 2, 'min_samples_split': 6, 'n_estimators': 400}


In [25]:
print('best score for boosting:',randomForest_grid.best_score_) 

best score for boosting: 0.7440089654265041


# randomized search 

In [18]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint, uniform
param_dist_bagging = {
    'max_depth': [None, 1, 3, 5],
    'n_estimators': randint(50, 1000),
}

param_dist_boosting = {
    'n_estimators': randint(50, 1000),
    'max_depth': [1, 3, 5]
}

param_dist_random_forest = {
    'n_estimators': randint(50, 1000),
    'max_depth': [None, 1, 3, 5],
    'min_samples_split': randint(2, 10),
    'min_samples_leaf': randint(1, 10)
}

## bagging


In [19]:
random_search_bagging = RandomizedSearchCV(BaggingClassifierMe(),
                                           param_dist_bagging, n_iter=10, cv=3, random_state=42,n_jobs=n_porcess)
random_search_bagging.fit(X_train, y_train)
print("Best Parameters for Bagging:", random_search_bagging.best_params_)
print("Best Score for Bagging:", random_search_bagging.best_score_)
print("Test Score for Bagging:", random_search_bagging.score(X_test, y_test))

Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 810, in _score
    scores = scorer(estimator, X_test, y_test)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/sklearn/metrics/_scorer.py", line 527, in __call__
    return estimator.score(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/var/folders/t2/cpcjvzq116v7wy6ywq0x6c500000gn/T/ipykernel_61159/1170468713.py", line 33, in score
  File "/var/folders/t2/cpcjvzq116v7wy6ywq0x6c500000gn/T/ipykernel_61159/1170468713.py", line 21, in predict
IndexError: index 70 is out of bounds for axis 1 with size 70

Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 810, in _score
    scores = scorer(estimator, X

Best Parameters for Bagging: {'max_depth': 3, 'n_estimators': 121}
Best Score for Bagging: 0.7327626529388018
Test Score for Bagging: 0.7332842776551386


## adaboost

In [20]:
random_search_boosting = RandomizedSearchCV(AdaBoostClassifierMe(),
                                            param_dist_boosting, n_iter=10, cv=3, random_state=42,n_jobs=n_porcess)
random_search_boosting.fit(X_train, y_train)
print("Best Parameters for Boosting:", random_search_boosting.best_params_)
print("Best Score for Boosting:", random_search_boosting.best_score_)
print("Test Score for Boosting:", random_search_boosting.score(X_test, y_test))

Best Parameters for Boosting: {'max_depth': 5, 'n_estimators': 485}
Best Score for Boosting: 0.7367730385595879
Test Score for Boosting: 0.7413784645572725


## random forest

In [21]:
random_search_random_forest = RandomizedSearchCV(RandomForestClassifierMe(), param_dist_random_forest, n_iter=10, cv=3,
                                                 random_state=42,n_jobs=n_porcess)
random_search_random_forest.fit(X_train, y_train)
print("Best Parameters for Random Forest:", random_search_random_forest.best_params_)
print("Best Score for Random Forest:", random_search_random_forest.best_score_)
print("Test Score for Random Forest:", random_search_random_forest.score(X_test, y_test))


Best Parameters for Random Forest: {'max_depth': None, 'min_samples_leaf': 2, 'min_samples_split': 9, 'n_estimators': 541}
Best Score for Random Forest: 0.7438618069652475
Test Score for Random Forest: 0.7451557517782683


# bayasian method for our classifier

In [40]:
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from bayes_opt import BayesianOptimization


# Define the function to optimize
def optimize_bagging(n_estimators, max_depth):
    model = BaggingClassifierMe(n_estimators=int(n_estimators), max_depth=int(max_depth))
    model.fit(X_train, y_train)
    return model.score(X_test, y_test)


def optimize_adaboost(n_estimators, max_depth):
    model = AdaBoostClassifierMe(n_estimators=int(n_estimators), max_depth=int(max_depth))
    model.fit(X_train, y_train)
    return model.score(X_test, y_test)


def optimize_rf(n_estimators, max_depth, min_samples_leaf, min_samples_split):
    model = RandomForestClassifierMe(n_estimators=int(n_estimators), max_depth=int(max_depth),
                                     min_samples_leaf=min_samples_leaf, min_samples_split=min_samples_split)
    model.fit(X_train, y_train)
    return model.score(X_test, y_test)

# Define the search space
pbounds_bagging = {'n_estimators': (10, 1000), 'max_depth': (1, 100)}
pbounds_adaboost = {'n_estimators': (10, 1000), 'max_depth': (1, 100)}
pbounds_rf = {'n_estimators': (10, 1000), 'max_depth': (1, 100),
              'min_samples_split': (0.001,1.0),'min_samples_leaf': (0.001,1.0)}

## baging baysien optimizer

In [32]:
optimizer_bagging = BayesianOptimization(f=optimize_bagging, pbounds=pbounds_bagging, random_state=42)
optimizer_bagging.maximize(init_points=10, n_iter=10)
best_params_bagging = optimizer_bagging.max
print("Best Bagging Classifier Parameters:", best_params_bagging)
print("best accuracy for bagging:", optimizer_bagging.max['target'])

|   iter    |  target   | max_depth | n_esti... |
-------------------------------------------------
| [0m1        [0m | [0m0.7533   [0m | [0m38.08    [0m | [0m951.2    [0m |
| [0m2        [0m | [0m0.7525   [0m | [0m73.47    [0m | [0m602.7    [0m |
| [95m3        [0m | [95m0.757    [0m | [95m16.45    [0m | [95m164.4    [0m |
| [0m4        [0m | [0m0.7507   [0m | [0m6.75     [0m | [0m867.5    [0m |
| [0m5        [0m | [0m0.7532   [0m | [0m60.51    [0m | [0m711.0    [0m |
| [0m6        [0m | [0m0.7309   [0m | [0m3.038    [0m | [0m970.2    [0m |
| [0m7        [0m | [0m0.7524   [0m | [0m83.41    [0m | [0m220.2    [0m |
| [0m8        [0m | [0m0.7546   [0m | [0m19.0     [0m | [0m191.6    [0m |
| [0m9        [0m | [0m0.7519   [0m | [0m31.12    [0m | [0m529.5    [0m |
| [0m10       [0m | [0m0.7524   [0m | [0m43.76    [0m | [0m298.3    [0m |
| [0m11       [0m | [0m0.6907   [0m | [0m1.024    [0m | [0m176.8    

In [33]:
#adaboost
optimizer_adaboost = BayesianOptimization(f=optimize_adaboost, pbounds=pbounds_adaboost, random_state=42)
optimizer_adaboost.maximize(init_points=10, n_iter=10)
best_params_adaboost = optimizer_adaboost.max
print("Best AdaBoost Classifier Parameters:", best_params_adaboost)
print("best accuracy for boosting:", optimizer_adaboost.max['target'])


|   iter    |  target   | max_depth | n_esti... |
-------------------------------------------------
| [0m1        [0m | [0m0.6149   [0m | [0m38.08    [0m | [0m951.2    [0m |
| [95m2        [0m | [95m0.629    [0m | [95m73.47    [0m | [95m602.7    [0m |
| [0m3        [0m | [0m0.5902   [0m | [0m16.45    [0m | [0m164.4    [0m |
| [95m4        [0m | [95m0.7433   [0m | [95m6.75     [0m | [95m867.5    [0m |
| [0m5        [0m | [0m0.6102   [0m | [0m60.51    [0m | [0m711.0    [0m |
| [0m6        [0m | [0m0.7253   [0m | [0m3.038    [0m | [0m970.2    [0m |
| [0m7        [0m | [0m0.629    [0m | [0m83.41    [0m | [0m220.2    [0m |
| [0m8        [0m | [0m0.5966   [0m | [0m19.0     [0m | [0m191.6    [0m |
| [0m9        [0m | [0m0.5916   [0m | [0m31.12    [0m | [0m529.5    [0m |
| [0m10       [0m | [0m0.6405   [0m | [0m43.76    [0m | [0m298.3    [0m |
| [0m11       [0m | [0m0.5761   [0m | [0m19.16    [0m | [0m856.4 

In [41]:

#random forest
optimizer_rf = BayesianOptimization(f=optimize_rf, pbounds=pbounds_rf, random_state=42)
optimizer_rf.maximize(init_points=10, n_iter=10)
best_params_rf = optimizer_rf.max
print("Best Random Forest Classifier Parameters:", best_params_rf)
print("best accuracy for random forest:", optimizer_rf.max['target'])


|   iter    |  target   | max_depth | min_sa... | min_sa... | n_esti... |
-------------------------------------------------------------------------
| [0m1        [0m | [0m0.5633   [0m | [0m38.08    [0m | [0m0.9508   [0m | [0m0.7323   [0m | [0m602.7    [0m |
| [95m2        [0m | [95m0.7273   [0m | [95m16.45    [0m | [95m0.1568   [0m | [95m0.05903  [0m | [95m867.5    [0m |
| [0m3        [0m | [0m0.5633   [0m | [0m60.51    [0m | [0m0.7084   [0m | [0m0.02156  [0m | [0m970.2    [0m |
| [0m4        [0m | [0m0.72     [0m | [0m83.41    [0m | [0m0.2131   [0m | [0m0.1826   [0m | [0m191.6    [0m |
| [0m5        [0m | [0m0.5633   [0m | [0m31.12    [0m | [0m0.5252   [0m | [0m0.4325   [0m | [0m298.3    [0m |
| [0m6        [0m | [0m0.7212   [0m | [0m61.57    [0m | [0m0.1404   [0m | [0m0.2929   [0m | [0m372.7    [0m |
| [0m7        [0m | [0m0.5633   [0m | [0m46.15    [0m | [0m0.7854   [0m | [0m0.2005   [0m | [0m519.1   

# sklearn ensamble methode {training and make hyperparmeter}

In [42]:
from sklearn.ensemble import BaggingClassifier, AdaBoostClassifier, RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier

# Bagging
bagging_model = BaggingClassifier(base_estimator=DecisionTreeClassifier(), n_estimators=100, random_state=42)
bagging_model.fit(X_train, y_train)
bagging_predictions = bagging_model.predict(X_test)
bagging_accuracy = accuracy_score(y_test, bagging_predictions)
print(f"Bagging Accuracy: {bagging_accuracy}")

# Boosting (AdaBoost)
boosting_model = AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=1), n_estimators=100,
                                    random_state=42)
boosting_model.fit(X_train, y_train)
boosting_predictions = boosting_model.predict(X_test)
boosting_accuracy = accuracy_score(y_test, boosting_predictions)
print(f"Boosting Accuracy: {boosting_accuracy}")

# Random Forest
random_forest_model = RandomForestClassifier(n_estimators=100, max_depth=None, random_state=42)
random_forest_model.fit(X_train, y_train)
random_forest_predictions = random_forest_model.predict(X_test)
random_forest_accuracy = accuracy_score(y_test, random_forest_predictions)
print(f"Random Forest Accuracy: {random_forest_accuracy}")



Bagging Accuracy: 0.7501103752759382




Boosting Accuracy: 0.7565857247976453
Random Forest Accuracy: 0.7545253863134658


# Grid Search for sklearn ensamble methode

In [48]:
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
# Define the parameter grid for the models
param_grid_bagging = {
    'base_estimator__max_depth': [None, 1, 3, 7],
    'n_estimators': [50, 100, 150],
    'max_samples': [1.0, 0.8, 0.6]
}

param_grid_boosting = {
    'n_estimators': [50, 100, 150],
    'base_estimator__max_depth': [1, 3, 7]
}

param_grid_random_forest = {
    'n_estimators': [50, 100, 150],
    'max_depth': [None, 1, 3, 7],
    'max_features': ['auto', 'sqrt', 'log2']
}

# Grid search for Bagging
grid_search_bagging = GridSearchCV(BaggingClassifier(base_estimator=DecisionTreeClassifier()), param_grid_bagging, cv=3,
                                   n_jobs=n_porcess)
grid_search_bagging.fit(X_train, y_train)

# Grid search for Boosting
grid_search_boosting = GridSearchCV(AdaBoostClassifier(base_estimator=DecisionTreeClassifier()), param_grid_boosting,
                                    cv=3, n_jobs=n_porcess)
grid_search_boosting.fit(X_train, y_train)

# Grid search for Random Forest
grid_search_random_forest = GridSearchCV(RandomForestClassifier(), param_grid_random_forest, cv=3, n_jobs=n_porcess)
grid_search_random_forest.fit(X_train, y_train)

# Get the best parameters
best_params_bagging = grid_search_bagging.best_params_
best_params_boosting = grid_search_boosting.best_params_
best_params_random_forest = grid_search_random_forest.best_params_

print("Best Parameters for Bagging:", best_params_bagging)
print("Best Parameters for Boosting:", best_params_boosting)
print("Best Parameters for Random Forest:", best_params_random_forest)
print("Best accuracy for Bagging:", grid_search_bagging.best_score_)
print("Best accuracy for Boosting:", grid_search_boosting.best_score_)
print("Best accuracy for Random Forest:", grid_search_random_forest.best_score_)


36 fits failed out of a total of 108.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
23 fits failed with the following error:
Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/sklearn/base.py", line 1145, in wrapper
    estimator._validate_params()
  File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/sklearn/base.py", line 638, in _validate_params
    validate_parameter_constraints(
  File "/Library/Frameworks/Python.framework/

Best Parameters for Bagging: {'base_estimator__max_depth': None, 'max_samples': 0.6, 'n_estimators': 100}
Best Parameters for Boosting: {'base_estimator__max_depth': 3, 'n_estimators': 50}
Best Parameters for Random Forest: {'max_depth': None, 'max_features': 'sqrt', 'n_estimators': 150}
Best accuracy for Bagging: 0.755377882365155
Best accuracy for Boosting: 0.7583703820661941
Best accuracy for Random Forest: 0.7558807170547123


# Randomized Search for sklearn ensamble methode

In [49]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint, uniform
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier, AdaBoostClassifier, RandomForestClassifier

# Define the parameter distributions for the models
param_dist_bagging = {
    'base_estimator__max_depth': [None, 1, 3, 5],
    'n_estimators': randint(50, 150),
    'max_samples': uniform(0.6, 0.4)  
}

param_dist_boosting = {
    'n_estimators': randint(50, 150),
    'base_estimator__max_depth': [1, 3, 5]
}

param_dist_random_forest = {
    'n_estimators': randint(50, 150),
    'max_depth': [None, 1, 3, 5],
    'max_features': ['auto', 'sqrt', 'log2']
}

# Randomized search for Bagging
random_search_bagging = RandomizedSearchCV(BaggingClassifier(base_estimator=DecisionTreeClassifier()),
                                           param_dist_bagging, n_iter=10, cv=3, random_state=42,n_jobs=n_porcess)
random_search_bagging.fit(X_train, y_train)

# Randomized search for Boosting
random_search_boosting = RandomizedSearchCV(AdaBoostClassifier(base_estimator=DecisionTreeClassifier()),
                                            param_dist_boosting, n_iter=10, cv=3, random_state=42,n_jobs=n_porcess)
random_search_boosting.fit(X_train, y_train)

# Randomized search for Random Forest
random_search_random_forest = RandomizedSearchCV(RandomForestClassifier(), param_dist_random_forest, n_iter=10, cv=3,
                                                 random_state=42,n_jobs=n_porcess)
random_search_random_forest.fit(X_train, y_train)

# Get the best parameters
best_params_random_search_bagging = random_search_bagging.best_params_
best_params_random_search_boosting = random_search_boosting.best_params_
best_params_random_search_random_forest = random_search_random_forest.best_params_

print("Best Parameters for Bagging (Randomized Search):", best_params_random_search_bagging)
print("Best Parameters for Boosting (Randomized Search):", best_params_random_search_boosting)
print("Best Parameters for Random Forest (Randomized Search):", best_params_random_search_random_forest)
print("best accuracy for bagging:", random_search_bagging.best_score_)
print("best accuracy for boosting:", random_search_boosting.best_score_)
print("best accuracy for random forest:", random_search_random_forest.best_score_)

12 fits failed out of a total of 30.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
4 fits failed with the following error:
Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/sklearn/base.py", line 1145, in wrapper
    estimator._validate_params()
  File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/sklearn/base.py", line 638, in _validate_params
    validate_parameter_constraints(
  File "/Library/Frameworks/Python.framework/Ve

Best Parameters for Bagging (Randomized Search): {'base_estimator__max_depth': None, 'max_samples': 0.9879639408647978, 'n_estimators': 79}
Best Parameters for Boosting (Randomized Search): {'base_estimator__max_depth': 1, 'n_estimators': 149}
Best Parameters for Random Forest (Randomized Search): {'max_depth': None, 'max_features': 'sqrt', 'n_estimators': 137}
best accuracy for bagging: 0.7527901244976222
best accuracy for boosting: 0.7578430073543899
best accuracy for random forest: 0.7555127908949238


# Bayesian method for sklearn ensamble

In [50]:
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.ensemble import BaggingClassifier, AdaBoostClassifier, RandomForestClassifier
from bayes_opt import BayesianOptimization

# Define the function to optimize
def optimize_bagging(n_estimators, max_samples):
    model = BaggingClassifier(n_estimators=int(n_estimators), max_samples=max_samples,n_jobs=n_porcess)
    model.fit(X_train, y_train)
    return model.score(X_test, y_test)


def optimize_adaboost(n_estimators, learning_rate):
    model = AdaBoostClassifier(n_estimators=int(n_estimators), learning_rate=learning_rate)
    model.fit(X_train, y_train)
    return model.score(X_test, y_test)


def optimize_rf(n_estimators, max_depth):
    model = RandomForestClassifier(n_estimators=int(n_estimators), max_depth=int(max_depth),n_jobs=n_porcess)
    model.fit(X_train, y_train)
    return model.score(X_test, y_test)


# Define the search space
pbounds_bagging = {'n_estimators': (10, 1000), 'max_samples': (0.1, 1.0)}
pbounds_adaboost = {'n_estimators': (10, 1000), 'learning_rate': (0.01, 1.0)}
pbounds_rf = {'n_estimators': (10, 1000), 'max_depth': (1, 100)}

# Perform optimization
optimizer_bagging = BayesianOptimization(f=optimize_bagging, pbounds=pbounds_bagging, random_state=42)
optimizer_adaboost = BayesianOptimization(f=optimize_adaboost, pbounds=pbounds_adaboost, random_state=42)
optimizer_rf = BayesianOptimization(f=optimize_rf, pbounds=pbounds_rf, random_state=42)

optimizer_bagging.maximize(init_points=10, n_iter=15)
optimizer_adaboost.maximize(init_points=10, n_iter=15)
optimizer_rf.maximize(init_points=10, n_iter=15)

# Get the best hyperparameters
best_params_bagging = optimizer_bagging.max
best_params_adaboost = optimizer_adaboost.max
best_params_rf = optimizer_rf.max

print("Best Bagging Classifier Parameters:", best_params_bagging)
print("Best AdaBoost Classifier Parameters:", best_params_adaboost)
print("Best Random Forest Classifier Parameters:", best_params_rf)
print("best accuracy for bagging:", optimizer_bagging.max['target'])
print("best accuracy for boosting:", optimizer_adaboost.max['target'])
print("best accuracy for random forest:", optimizer_rf.max['target'])


|   iter    |  target   | max_sa... | n_esti... |
-------------------------------------------------
| [0m1        [0m | [0m0.7587   [0m | [0m0.4371   [0m | [0m951.2    [0m |
| [0m2        [0m | [0m0.7535   [0m | [0m0.7588   [0m | [0m602.7    [0m |
| [95m3        [0m | [95m0.7597   [0m | [95m0.2404   [0m | [95m164.4    [0m |
| [95m4        [0m | [95m0.7622   [0m | [95m0.1523   [0m | [95m867.5    [0m |
| [0m5        [0m | [0m0.7557   [0m | [0m0.641    [0m | [0m711.0    [0m |
| [0m6        [0m | [0m0.7619   [0m | [0m0.1185   [0m | [0m970.2    [0m |
| [0m7        [0m | [0m0.754    [0m | [0m0.8492   [0m | [0m220.2    [0m |
| [0m8        [0m | [0m0.7577   [0m | [0m0.2636   [0m | [0m191.6    [0m |
| [0m9        [0m | [0m0.759    [0m | [0m0.3738   [0m | [0m529.5    [0m |
| [0m10       [0m | [0m0.7542   [0m | [0m0.4888   [0m | [0m298.3    [0m |
| [0m11       [0m | [0m0.7549   [0m | [0m0.7136   [0m | [0m993.6 