Bagging:
- Bagging Regressor
- Random Forest
- Extra Trees (Extremely Randomized Trees)
- Isolation Forest

Boosting:
- Adaboost
- Gradient Boosting Machine
- Histogram GBM


#### BaggingRegressor

In [None]:
import sklearn.ensemble

# Bagging ensemble for regression with every constructor argument spelled out.
bagging_regressor = sklearn.ensemble.BaggingRegressor(
    # --- ensemble size ---
    n_estimators=10,            # how many base estimators are fitted
    # --- row sampling ---
    max_samples=1.0,            # float: fraction of rows per estimator; int: absolute row count
    bootstrap=True,             # True: rows are drawn with replacement
    # --- column sampling ---
    max_features=1.0,           # float: fraction of columns per estimator; int: absolute column count
    bootstrap_features=False,   # True: columns are drawn with replacement
    # --- fitting behaviour ---
    oob_score=False,            # True: estimate generalization from out-of-bag rows
    warm_start=False,           # True: keep already-fitted estimators and only add new ones
    # --- runtime ---
    n_jobs=None,                # parallel jobs (None behaves like 1)
    random_state=None,          # integer seed for reproducible sampling
    verbose=0,                  # 0 is silent; larger values log more
)


# Search space for BaggingRegressor.
#
# Only settings that change the fitted model are searched:
# - oob_score is left out: it only adds a diagnostic attribute, and
#   oob_score=True is rejected when bootstrap=False or warm_start=True, so
#   crossing those options makes part of the grid fail.
# - warm_start is left out: a grid search refits a fresh clone per candidate.
# - n_jobs and verbose are runtime switches, not hyperparameters.
# - random_state is pinned to one seed so candidates are compared on equal
#   footing instead of selecting a "lucky" seed.
param_grid = {
    "n_estimators": [10, 50, 100, 200],       # ensemble size
    "max_samples": [0.5, 0.6, 0.8, 1.0],      # fraction of rows per estimator
    "max_features": [0.5, 0.6, 0.8, 1.0],     # fraction of columns per estimator
    "bootstrap": [True, False],               # rows drawn with / without replacement
    "bootstrap_features": [True, False],      # columns drawn with / without replacement
    "random_state": [42],                     # fixed seed for reproducible comparison
}


#### Random Forest

In [None]:
import sklearn.ensemble

# Initialize a RandomForestRegressor with every parameter written out at its default value.
randomforest_model = sklearn.ensemble.RandomForestRegressor(
    n_estimators=100,                      # Number of trees in the forest; more trees cost more compute.
    criterion="squared_error",             # Function used to measure split quality.
    max_depth=None,                        # Maximum tree depth; None grows until leaves are pure or too small to split.
    min_samples_split=2,                   # Minimum samples required to split an internal node.
    min_samples_leaf=1,                    # Minimum samples required in a leaf node.
    min_weight_fraction_leaf=0.0,          # Minimum weighted fraction of the total sample weight required in a leaf.
    # Float 1.0 means "all features". An integer is an absolute count, so the
    # integer 1 would restrict every split to a single randomly chosen feature.
    max_features=1.0,
    max_leaf_nodes=None,                   # Maximum number of leaf nodes; None means unlimited.
    min_impurity_decrease=0.0,             # A split is made only if it decreases impurity by at least this much.
    bootstrap=True,                        # Whether each tree is trained on a bootstrap sample.
    oob_score=False,                       # Whether to score on out-of-bag samples (requires bootstrap=True).
    n_jobs=None,                           # Number of parallel jobs; -1 uses all processors.
    random_state=None,                     # Seed for reproducibility.
    verbose=0,                             # Verbosity when fitting and predicting.
    warm_start=False,                      # Reuse the previous fit and add more trees instead of refitting.
    ccp_alpha=0.0,                         # Complexity parameter for minimal cost-complexity pruning.
    max_samples=None,                      # Number or fraction of samples drawn per tree when bootstrap=True.
)

# Hyperparameter tuning dictionary for RandomForestRegressor.
#
# oob_score is intentionally not searched: it does not change the fitted
# forest, and oob_score=True is rejected when bootstrap=False, so crossing
# it with the 'bootstrap' options below would make part of the grid fail.
rf_hyperparam_grid = {
    'n_estimators': [50, 100, 200],                  # Number of trees to evaluate.
    'max_depth': [None, 10, 20, 30],                 # Different depths to test tree complexity.
    'min_samples_split': [2, 5, 10],                 # Minimum samples required to split a node.
    'min_samples_leaf': [1, 2, 4],                   # Minimum samples per leaf.
    'max_features': ['sqrt', 'log2', 0.5],           # Feature subsets considered at each split.
    'bootstrap': [True, False],                      # With and without bootstrap sampling.
    'random_state': [42],                            # Fixed seed for reproducible comparison.
}


#### Isolation Forest

In [None]:
from sklearn.ensemble import IsolationForest
from sklearn.model_selection import GridSearchCV

# Isolation Forest anomaly detector with each constructor argument written out.
isolationforest_model = IsolationForest(
    # --- ensemble size ---
    n_estimators=100,        # number of isolation trees
    # --- sampling per tree ---
    max_samples="auto",      # rows drawn per tree; "auto" caps the draw at 256 rows
    max_features=1.0,        # fraction of features drawn to train each tree
    bootstrap=False,         # False: rows sampled without replacement
    # --- decision threshold ---
    contamination="auto",    # expected share of outliers, or "auto" for the built-in threshold
    # --- fitting behaviour / runtime ---
    warm_start=False,        # True: keep existing trees and add more on the next fit
    n_jobs=None,             # parallel jobs (-1 uses all processors)
    random_state=None,       # integer seed for reproducibility
    verbose=0,               # verbosity of the tree-building output
)

# Search space for IsolationForest.
param_grid = {
    # Ensemble size: fewer trees are cheaper, more trees give steadier scores.
    "n_estimators": [50, 100, 150],
    # Rows drawn per tree: a fraction of the data, or "auto".
    "max_samples": [0.5, 0.75, "auto"],
    # Assumed share of anomalies: low values flag few points, high values
    # flag more points at the risk of marking normal ones.
    "contamination": [0.05, 0.1, 0.2],
    # Fraction of features drawn to train each tree.
    "max_features": [0.5, 0.75, 1.0],
    # Sample rows without (False) or with (True) replacement.
    "bootstrap": [False, True],
    # Single fixed seed so that runs are repeatable and comparable.
    "random_state": [42],
}

def _mean_normality_score(estimator, X, y=None):
    """Label-free scorer: mean ``score_samples`` value on the held-out fold.

    IsolationForest is fitted without labels, so a supervised metric such as
    'accuracy' cannot be computed when ``fit`` receives only ``X``. This
    callable follows the ``scorer(estimator, X, y)`` convention and ignores
    ``y``. Higher values mean the held-out points look more "normal".

    NOTE(review): this is only a heuristic ranking. ``score_samples`` does not
    depend on ``contamination``, so candidates differing only in that setting
    tie. If ground-truth anomaly labels exist, pass them to ``fit`` and use a
    supervised metric instead.
    """
    return float(estimator.score_samples(X).mean())


grid_search = GridSearchCV(
    estimator=IsolationForest(),
    param_grid=param_grid,
    scoring=_mean_normality_score,  # label-free; see the scorer's docstring
    cv=3,                           # 3-fold cross-validation
    n_jobs=-1,                      # Use all processors
    verbose=1,                      # Show progress
)

# Fit the grid search to the data (unsupervised: no labels are passed)
grid_search.fit(X)

# Best parameters and score
print("Best Parameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)


#### Extra Tree

In [None]:
import sklearn.tree
from sklearn.model_selection import GridSearchCV


# Single extremely randomized regression tree with every parameter written out.
extratree_model = sklearn.tree.ExtraTreeRegressor(
    criterion="squared_error",              # Split-quality function. Options: 'squared_error', 'friedman_mse', 'absolute_error', 'poisson'.
    splitter="random",                      # Split strategy at each node: 'random' for random splits, 'best' for the best split.
    max_depth=None,                         # Maximum depth; None expands until leaves are pure or smaller than min_samples_split.
    min_samples_split=2,                    # Minimum number of samples required to split a node.
    min_samples_leaf=1,                     # Minimum number of samples required in a leaf node.
    min_weight_fraction_leaf=0.0,           # Minimum weighted fraction of the total sample weight required in a leaf.
    # Float 1.0 means "consider all features". An integer is an absolute
    # count, so the integer 1 would limit every split to a single feature.
    max_features=1.0,
    random_state=None,                      # Controls randomness; set an integer for reproducibility.
    min_impurity_decrease=0.0,              # A node splits only if the impurity decrease is at least this value.
    max_leaf_nodes=None,                    # Limits the number of leaf nodes; None means unlimited.
    ccp_alpha=0.0,                          # Complexity parameter for minimal cost-complexity pruning.
)


# Search space for ExtraTreeRegressor.
param_grid = {
    # NOTE: "poisson" requires non-negative targets; drop it for data with negative y.
    "criterion": ["squared_error", "friedman_mse", "absolute_error", "poisson"],
    "splitter": ["random", "best"],
    "max_depth": [None, 10, 20, 50, 100],
    "min_samples_split": [2, 5, 10, 20],
    "min_samples_leaf": [1, 2, 5, 10],
    # "auto" is not listed: recent scikit-learn releases reject it. 1.0 is not
    # listed either because it selects all features, exactly like None.
    "max_features": [None, "sqrt", "log2", 0.5],
    "min_impurity_decrease": [0.0, 0.01, 0.1],
    "max_leaf_nodes": [None, 10, 50, 100],
    "ccp_alpha": [0.0, 0.01, 0.1, 1.0],
}

# Exhaustive cross-validated search over param_grid for the extra-tree regressor.
grid_search = GridSearchCV(
    extratree_model,                        # estimator that is cloned and refitted per candidate
    param_grid,                             # candidate hyperparameter values
    scoring='neg_mean_squared_error',       # negated MSE, so larger is better
    cv=5,                                   # five cross-validation folds
    n_jobs=-1,                              # run on all available processors
    verbose=1,                              # print progress messages
)

#### Adaboost

In [None]:
import sklearn.ensemble

# AdaBoost regressor with each constructor argument written out.
adaboost_model = sklearn.ensemble.AdaBoostRegressor(
    # Upper bound on the number of boosting rounds (weak learners).
    n_estimators=50,
    # Weight applied to each weak learner; smaller values learn more slowly.
    learning_rate=1.0,
    # How per-sample errors update the weights: 'linear', 'square' or 'exponential'.
    loss="linear",
    # Integer seed for deterministic results; None leaves it unseeded.
    random_state=None,
)

# Define the hyperparameter grid for AdaBoostRegressor.
#
# random_state is pinned to a single seed rather than searched: the seed is
# not a hyperparameter, and searching it multiplies the number of fits while
# merely selecting whichever seed happened to score best.
param_grid = {
    "n_estimators": [50, 100, 200, 500],            # number of boosting rounds
    "learning_rate": [0.01, 0.1, 1.0, 10.0],        # weight applied to each weak learner
    "loss": ["linear", "square", "exponential"],    # weight-update loss
    "random_state": [42],                           # fixed seed for reproducible comparison
}

# Exhaustive cross-validated search over param_grid for the AdaBoost regressor.
grid_search = GridSearchCV(
    adaboost_model,                         # estimator that is cloned and refitted per candidate
    param_grid,                             # candidate hyperparameter values
    cv=5,                                   # five cross-validation folds
    scoring="neg_mean_squared_error",       # negated MSE, so larger is better
    n_jobs=-1,                              # run on all available processors
    verbose=1,                              # print progress messages
)

#### GBM

In [None]:
import sklearn.ensemble

# Gradient boosting regressor with each constructor argument written out.
gbm_model = sklearn.ensemble.GradientBoostingRegressor(
    # --- objective ---
    loss="squared_error",             # 'squared_error', 'absolute_error', 'huber' or 'quantile'
    alpha=0.9,                        # quantile used by the 'huber' and 'quantile' losses
    init=None,                        # estimator (or 'zero') providing the initial predictions
    # --- boosting schedule ---
    n_estimators=100,                 # number of boosting stages
    learning_rate=0.1,                # shrinkage per tree; trades off against n_estimators
    subsample=1.0,                    # fraction of rows used to fit each tree
    # --- shape of each tree ---
    criterion="friedman_mse",         # split-quality measure: 'friedman_mse' or 'squared_error'
    max_depth=3,                      # depth limit of each tree
    max_leaf_nodes=None,              # leaf-count limit; None means unlimited
    max_features=None,                # features examined per split; None means all
    min_samples_split=2,              # samples needed to split a node
    min_samples_leaf=1,               # samples needed in a leaf
    min_weight_fraction_leaf=0.0,     # weighted sample fraction needed in a leaf
    min_impurity_decrease=0.0,        # minimum impurity gain for a split
    ccp_alpha=0.0,                    # minimal cost-complexity pruning strength
    # --- early stopping ---
    n_iter_no_change=None,            # None disables early stopping
    validation_fraction=0.1,          # share of training data held out for early stopping
    tol=0.0001,                       # tolerance of the stopping criterion
    # --- fitting behaviour / runtime ---
    warm_start=False,                 # True: add stages to the previous fit
    random_state=None,                # integer seed for reproducibility
    verbose=0,                        # 0 is silent; larger values log more
)
# Search space for GradientBoostingRegressor.
param_grid = {
    "loss": ["squared_error", "absolute_error", "huber", "quantile"],
    "learning_rate": [0.01, 0.1, 0.2, 0.5],
    "n_estimators": [50, 100, 200],
    "max_depth": [3, 5, 10, None],
    "subsample": [0.6, 0.8, 1.0],
    # "auto" is not listed: recent scikit-learn releases reject it, and None
    # already covers "use all features".
    "max_features": [None, "sqrt", "log2"],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 5],
    "min_impurity_decrease": [0.0, 0.01, 0.1],
    "ccp_alpha": [0.0, 0.01, 0.1],
}


#### Hist GBM

In [None]:
import sklearn.ensemble

# Histogram-based gradient boosting regressor with each argument written out.
# NOTE(review): despite its name this variable holds scikit-learn's
# HistGradientBoostingRegressor, not an XGBoost model; the name is kept so
# that existing references keep working.
xgboost_model = sklearn.ensemble.HistGradientBoostingRegressor(
    # --- objective ---
    loss="squared_error",           # e.g. "squared_error", "absolute_error", "poisson", "quantile"
    quantile=None,                  # target quantile; only used when loss="quantile"
    # --- boosting schedule ---
    max_iter=100,                   # number of boosting iterations
    learning_rate=0.1,              # shrinkage per tree; lower values need more iterations
    # --- shape of each tree ---
    max_leaf_nodes=31,              # leaf-count limit per tree; None means unlimited
    max_depth=None,                 # depth limit per tree; None means unlimited
    min_samples_leaf=20,            # samples needed in a leaf
    l2_regularization=0.0,          # L2 penalty strength
    # --- feature handling ---
    max_bins=255,                   # bins used to quantize each feature
    categorical_features=None,      # which features are categorical; None treats all as numerical
    monotonic_cst=None,             # per-feature monotonic constraints
    interaction_cst=None,           # constraints on which features may interact
    # --- early stopping ---
    early_stopping="auto",          # stop when the validation score stops improving
    scoring="loss",                 # metric monitored for early stopping
    validation_fraction=0.1,        # share of training data held out for early stopping
    n_iter_no_change=10,            # iterations without improvement before stopping
    tol=1e-7,                       # tolerance of the stopping condition
    # --- fitting behaviour / runtime ---
    warm_start=False,               # True: continue from the previous fit
    random_state=None,              # integer seed for reproducibility
    verbose=0,                      # 0 is silent
)

# Search space for HistGradientBoostingRegressor.
#
# Deliberately not searched:
# - loss="quantile" / "quantile": that loss needs a non-None quantile and
#   optimizes a different target than the other losses, so it belongs in its
#   own dedicated grid rather than being crossed with them.
# - categorical_features, monotonic_cst, interaction_cst: these describe the
#   dataset (which columns are categorical / constrained); they are not
#   tunable hyperparameters, and the dict form of monotonic_cst is keyed by
#   feature name.
# - warm_start and verbose: runtime switches that do not change a fresh fit.
# random_state is pinned to one seed so candidates are compared fairly.
param_grid = {
    # NOTE: "poisson" requires non-negative targets; drop it for data with negative y.
    "loss": ["squared_error", "absolute_error", "poisson"],  # Loss function options
    "learning_rate": [0.01, 0.05, 0.1, 0.2],  # Learning rate options
    "max_iter": [50, 100, 200],  # Number of boosting iterations
    "max_leaf_nodes": [31, 50, 100, None],  # Max leaf nodes per tree, None means unlimited
    "max_depth": [None, 5, 10, 20],  # Max depth of each tree
    "min_samples_leaf": [5, 10, 20, 50],  # Minimum samples per leaf node
    "l2_regularization": [0.0, 0.1, 0.5, 1.0],  # Regularization strength
    "max_bins": [128, 255],  # Bins for quantization; values above 255 are rejected
    "early_stopping": [False, "auto"],  # Enable early stopping (listed once)
    "scoring": ["loss", "neg_mean_squared_error", "r2"],  # Metric monitored for early stopping
    "validation_fraction": [0.05, 0.1, 0.2],  # Fraction of training data used for validation
    "n_iter_no_change": [5, 10, 20],  # Iterations with no improvement before stopping
    "tol": [1e-7, 1e-6, 1e-5],  # Tolerance for stopping condition
    "random_state": [42],  # Fixed seed for reproducible comparison
}
