# Ensemble
- Simple ensemble
- Bagging
- Boosting

In [None]:
from google.colab import files

uploaded = files.upload()

for fn in uploaded.keys():
  print('User uploaded file "{name}" with length {length} bytes'.format(
      name=fn, length=len(uploaded[fn])))

Saving 2000sample_creditcard_fraud.csv to 2000sample_creditcard_fraud.csv
User uploaded file "2000sample_creditcard_fraud.csv" with length 1313929 bytes


## Credit Card Fraud Dataset
- https://www.kaggle.com/datasets/mlg-ulb/creditcardfraud
- variables are anonymized

In [None]:
import pandas as pd

# Load the dataset
file_path = '2000sample_creditcard_fraud.csv'
data = pd.read_csv(file_path)

# Display the first few rows of the dataset
data.head()


Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,82450.0,1.314539,0.590643,-0.666593,0.716564,0.301978,-1.125467,0.388881,-0.28839,-0.132137,...,-0.170307,-0.429655,-0.141341,-0.200195,0.639491,0.399476,-0.034321,0.031692,0.76,0
1,50554.0,-0.798672,1.185093,0.904547,0.694584,0.219041,-0.319295,0.495236,0.139269,-0.760214,...,0.202287,0.578699,-0.092245,0.013723,-0.246466,-0.380057,-0.39603,-0.112901,4.18,0
2,55125.0,-0.391128,-0.24554,1.122074,-1.308725,-0.639891,0.008678,-0.701304,-0.027315,-2.628854,...,-0.133485,0.117403,-0.191748,-0.488642,-0.309774,0.0081,0.163716,0.239582,15.0,0
3,116572.0,-0.060302,1.065093,-0.987421,-0.029567,0.176376,-1.348539,0.775644,0.134843,-0.149734,...,0.355576,0.90757,-0.018454,-0.126269,-0.339923,-0.150285,-0.023634,0.04233,57.0,0
4,90434.0,1.848433,0.373364,0.269272,3.866438,0.088062,0.970447,-0.721945,0.235983,0.683491,...,0.103563,0.620954,0.197077,0.692392,-0.20653,-0.021328,-0.019823,-0.042682,0.0,0


## Simple Ensemble (aggregating multiple model output)
- Let's simply try aggregating (by taking the mean) results from `Logit`, `decision tree` models.

In [None]:
# Use all data
data_selected = data

# Splitting the data into features (X) and target (y)
X = data_selected.drop('Class', axis=1)
y = data_selected['Class']

X.head(), y.head()


(       Time        V1        V2        V3        V4        V5        V6  \
 0   82450.0  1.314539  0.590643 -0.666593  0.716564  0.301978 -1.125467   
 1   50554.0 -0.798672  1.185093  0.904547  0.694584  0.219041 -0.319295   
 2   55125.0 -0.391128 -0.245540  1.122074 -1.308725 -0.639891  0.008678   
 3  116572.0 -0.060302  1.065093 -0.987421 -0.029567  0.176376 -1.348539   
 4   90434.0  1.848433  0.373364  0.269272  3.866438  0.088062  0.970447   
 
          V7        V8        V9  ...       V20       V21       V22       V23  \
 0  0.388881 -0.288390 -0.132137  ... -0.058040 -0.170307 -0.429655 -0.141341   
 1  0.495236  0.139269 -0.760214  ... -0.081298  0.202287  0.578699 -0.092245   
 2 -0.701304 -0.027315 -2.628854  ...  0.065716 -0.133485  0.117403 -0.191748   
 3  0.775644  0.134843 -0.149734  ... -0.169706  0.355576  0.907570 -0.018454   
 4 -0.721945  0.235983  0.683491  ... -0.282777  0.103563  0.620954  0.197077   
 
         V24       V25       V26       V27       V28  

In [None]:
from sklearn.model_selection import train_test_split

# Performing the train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((1744, 30), (748, 30), (1744,), (748,))

### Parameter tuned decision tree

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import numpy as np

# Custom scorer focusing on F1 score of class 1
f1_class_1_scorer = make_scorer(f1_score, pos_label=1)

# Setting up GridSearchCV with DecisionTreeClassifier
param_grid = {'max_depth': np.arange(1, 10)}  # Default: None
dt_classifier = DecisionTreeClassifier(random_state=42)

grid_search = GridSearchCV(estimator=dt_classifier,
                           param_grid=param_grid,
                           scoring=f1_class_1_scorer,
                           cv=5,
                           verbose=1,
                           refit=True)

# Performing GridSearchCV
grid_search.fit(X_train, y_train)

# Best parameters and best score
best_params = grid_search.best_params_
best_score = grid_search.best_score_

best_params, best_score


Fitting 5 folds for each of 9 candidates, totalling 45 fits


({'max_depth': 4}, 0.9118508158508158)

In [None]:
import pandas as pd

# Extracting results from GridSearchCV
cv_results = grid_search.cv_results_

# Creating a DataFrame to hold the results
results_df = pd.DataFrame(cv_results)

# Selecting only relevant columns
results_df = results_df[['param_max_depth', 'mean_test_score']]

# Renaming the columns for clarity
results_df.columns = ['Max Depth', 'Mean F1 Score for Class 1']

results_df


Unnamed: 0,Max Depth,Mean F1 Score for Class 1
0,1,0.875187
1,2,0.904864
2,3,0.906871
3,4,0.911851
4,5,0.906792
5,6,0.907065
6,7,0.901012
7,8,0.898319
8,9,0.892348


### Parameter tuned logit

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
import numpy as np

# Setting up the Logistic Regression model
logit_model = LogisticRegression(max_iter=1000, random_state=42)

# Defining the parameter grid for 'C'
param_grid_logit = {'C': [0.001, 0.01, 1, 10, 100]}  # Default: 1.0

# Setting up GridSearchCV
grid_search_logit = GridSearchCV(estimator=logit_model,
                                 param_grid=param_grid_logit,
                                 scoring=f1_class_1_scorer,
                                 cv=3,
                                 verbose=1)

# Fit GridSearchCV on your data
# Replace X_train and y_train with your training data
grid_search_logit.fit(X_train, y_train)

# Best parameters and best score
best_params_logit = grid_search_logit.best_params_
best_score_logit = grid_search_logit.best_score_

print("Best Parameters:", best_params_logit)
print("Best F1 Score:", best_score_logit)


Fitting 3 folds for each of 5 candidates, totalling 15 fits
Best Parameters: {'C': 100}
Best F1 Score: 0.9009479007933233


## Bagging (DecisionTree, DecisionTree, Logit)

In [None]:
from sklearn.ensemble import VotingClassifier
from sklearn.tree import DecisionTreeClassifier

# Creating individual models with specified parameters
decision_tree_clf = DecisionTreeClassifier(max_depth=4, random_state=42)
decision_tree_clf2 = DecisionTreeClassifier(max_depth=3, random_state=42)
logit_clf = LogisticRegression(C=100, max_iter=1000, random_state=42)


# Combining the models in a voting ensemble
# We use 'soft' voting to predict the class label based on the argmax of the sums of the predicted probabilities
ensemble_clf = VotingClassifier(
    estimators=[('dt', decision_tree_clf), ('logit', logit_clf), ('dt2', decision_tree_clf2)],
    voting='hard'
)

# Training the ensemble model
ensemble_clf.fit(X_train, y_train)

# Predicting and evaluating on the test set
y_pred_ensemble = ensemble_clf.predict(X_test)
ensemble_accuracy = accuracy_score(y_test, y_pred_ensemble)
ensemble_precision = precision_score(y_test, y_pred_ensemble)
ensemble_recall = recall_score(y_test, y_pred_ensemble)
ensemble_f1_score = f1_score(y_test, y_pred_ensemble, pos_label=1)

ensemble_accuracy, ensemble_precision, ensemble_recall, ensemble_f1_score



(0.9572192513368984,
 0.9912280701754386,
 0.7847222222222222,
 0.8759689922480621)

In [None]:
from sklearn.ensemble import VotingClassifier
from sklearn.tree import DecisionTreeClassifier

# Creating individual models with specified parameters
decision_tree_clf = DecisionTreeClassifier(max_depth=4, random_state=42)
decision_tree_clf2 = DecisionTreeClassifier(max_depth=3, random_state=42)
logit_clf = LogisticRegression(C=100, max_iter=1000, random_state=42)


# Combining the models in a voting ensemble
# We use 'soft' voting to predict the class label based on the argmax of the sums of the predicted probabilities
ensemble_clf = VotingClassifier(
    estimators=[('dt', decision_tree_clf), ('logit', logit_clf), ('dt2', decision_tree_clf2)],
    voting='soft'
)

# Training the ensemble model
ensemble_clf.fit(X_train, y_train)

# Predicting and evaluating on the test set
y_pred_ensemble = ensemble_clf.predict(X_test)
ensemble_accuracy = accuracy_score(y_test, y_pred_ensemble)
ensemble_precision = precision_score(y_test, y_pred_ensemble)
ensemble_recall = recall_score(y_test, y_pred_ensemble)
ensemble_f1_score = f1_score(y_test, y_pred_ensemble, pos_label=1)

ensemble_accuracy, ensemble_precision, ensemble_recall, ensemble_f1_score



(0.9545454545454546,
 0.9661016949152542,
 0.7916666666666666,
 0.8702290076335878)

In [None]:
from sklearn.ensemble import VotingClassifier
from sklearn.tree import DecisionTreeClassifier

# Creating individual models with specified parameters
decision_tree_clf = DecisionTreeClassifier(max_depth=4, random_state=42)
decision_tree_clf2 = DecisionTreeClassifier(max_depth=4, random_state=42)
logit_clf = LogisticRegression(C=100, max_iter=1000, random_state=42)


# Combining the models in a voting ensemble
# We use 'soft' voting to predict the class label based on the argmax of the sums of the predicted probabilities
ensemble_clf = VotingClassifier(
    estimators=[('dt', decision_tree_clf), ('dt2', decision_tree_clf2)],
    voting='soft'
)

# Training the ensemble model
ensemble_clf.fit(X_train, y_train)

# Predicting and evaluating on the test set
y_pred_ensemble = ensemble_clf.predict(X_test)
ensemble_accuracy = accuracy_score(y_test, y_pred_ensemble)
ensemble_precision = precision_score(y_test, y_pred_ensemble)
ensemble_recall = recall_score(y_test, y_pred_ensemble)
ensemble_f1_score = f1_score(y_test, y_pred_ensemble, pos_label=1)

ensemble_accuracy, ensemble_precision, ensemble_recall, ensemble_f1_score



(0.9585561497326203, 0.937984496124031, 0.8402777777777778, 0.8864468864468864)

- ensemble could increase performance when selecting the best performing models
- but the ensemble performance follows that of baseline models

## Gradient Boosting

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, average_precision_score

# Creating a custom scorer for AUC-PR
aucpr_scorer = make_scorer(average_precision_score, needs_proba=True)

# Defining the parameter grid
param_grid_gb = {
    'n_estimators': [50, 100, 150]  # default=100 - larger values are likely to lead to overfitting
    'max_depth': [1, 2, 3, 4, 5],  # default=3
    'learning_rate': [0.1, 0.2],  # default=0.1, [0.01 - 0.2]
}

# Setting up the Gradient Boosting Classifier
gb_clf = GradientBoostingClassifier(random_state=42)

# Setting up GridSearchCV with AUC-PR as the scoring metric
grid_search_gb = GridSearchCV(estimator=gb_clf,
                              param_grid=param_grid_gb,
                              scoring=f1_class_1_scorer,
                              cv=5,
                              verbose=1)

# Fit GridSearchCV on your data
# Replace X_train and y_train with your training data
grid_search_gb.fit(X_train, y_train)

# Best parameters and best score
best_params_gb = grid_search_gb.best_params_
best_score_gb = grid_search_gb.best_score_

print("Best Parameters:", best_params_gb)
print("Best F1 Score:", best_score_gb)


Fitting 5 folds for each of 30 candidates, totalling 150 fits
Best Parameters: {'learning_rate': 0.2, 'max_depth': 5, 'n_estimators': 150}
Best AUC-PR Score: 0.9194758217029427


In [None]:
# Training the gb model
gb_clf = GradientBoostingClassifier(random_state=42, learning_rate=0.2, max_depth=5, n_estimators=150)
gb_clf.fit(X_train, y_train)

# Predicting and evaluating on the test set
y_pred_gb = gb_clf.predict(X_test)
gb_accuracy = accuracy_score(y_test, y_pred_gb)
gb_precision = precision_score(y_test, y_pred_gb)
gb_recall = recall_score(y_test, y_pred_gb)
gb_f1_score = f1_score(y_test, y_pred_gb, pos_label=1)

gb_accuracy, gb_precision, gb_recall, gb_f1_score


(0.9679144385026738,
 0.9615384615384616,
 0.8680555555555556,
 0.9124087591240877)

## Random Forest


1. **Number of Trees (`n_estimators`):**
   - Determines the number of trees in the forest.
   - Generally, more trees increase the model's performance and robustness but also increase computational time and complexity.

2. **Maximum Depth of Trees (`max_depth`):**
   - Defines the maximum depth of each tree.
   - Deeper trees can model more complex patterns but may lead to overfitting.

3. **Minimum Samples for Split (`min_samples_split`):**
   - The minimum number of samples required to split an internal node.
   - Higher values prevent creating nodes that only fit to noise in the data.

4. **Minimum Samples for Leaf (`min_samples_leaf`):**
   - The minimum number of samples required to be at a leaf node.
   - Useful to control overfitting by smoothing the model, especially for regression.

5. **Maximum Features (`max_features`):**
   - The number of features to consider when looking for the best split.
   - Choices include 'auto', 'sqrt', 'log2', or a fraction of the total features.


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, precision_score, recall_score, f1_score, accuracy_score, roc_auc_score

# Custom scorers for evaluation
scorers = {
    'precision': make_scorer(precision_score, pos_label=1),
    'recall': make_scorer(recall_score, pos_label=1),
    'f1': make_scorer(f1_score, pos_label=1),
    'accuracy': make_scorer(accuracy_score),
    'auc': make_scorer(roc_auc_score)
}

# Random Forest Classifier
rf_classifier = RandomForestClassifier(random_state=42)

# Hyperparameters to tune
param_grid = {
    'n_estimators': [50, 100, 200],         # Number of trees in the forest
    'max_depth': [None, 10, 20, 30],        # Maximum depth of the tree
    'min_samples_split': [2, 5, 10],        # Minimum number of samples required to split an internal node
    'min_samples_leaf': [1, 2, 4],          # Minimum number of samples required to be at a leaf node
    'max_features': ['auto', 'sqrt'],       # The number of features to consider when looking for the best split
    'class_weight': [None, 'balanced']      # Weights associated with classes in the form {class_label: weight}
}

# Grid Search for hyperparameter tuning with custom scorers
grid_search_rf = GridSearchCV(estimator=rf_classifier,
                              param_grid=param_grid,
                              scoring=scorers,
                              refit='f1',  # Refitting on the basis of F1 score, you can choose any other metric
                              cv=5,
                              verbose=2,
                              n_jobs=-1)

# Fitting the model
grid_search_rf.fit(X_train, y_train)

# Extract the best parameters
best_rf_params = grid_search_rf.best_params_
print("Best Parameters:", best_rf_params)

# Predicting and evaluating on the test set
y_pred_rf = grid_search_rf.predict(X_test)
rf_accuracy = accuracy_score(y_test, y_pred_rf)
rf_precision = precision_score(y_test, y_pred_rf)
rf_recall = recall_score(y_test, y_pred_rf)
rf_f1_score = f1_score(y_test, y_pred_rf, pos_label=1)

rf_accuracy, rf_precision, rf_recall, rf_f1_score


Fitting 5 folds for each of 432 candidates, totalling 2160 fits


  warn(


Best Parameters: {'class_weight': 'balanced', 'max_depth': 10, 'max_features': 'auto', 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 100}


(0.9652406417112299,
 0.9538461538461539,
 0.8611111111111112,
 0.9051094890510949)

## XGBoost

1. Learning Rate (`learning_rate`): This controls the step size at each iteration while moving towards a minimum of a loss function. Smaller values make the optimization process more robust at the cost of requiring more boosting rounds.

2. Number of Trees (`n_estimators`): Represents the number of boosting rounds or trees to build. Too many trees can lead to overfitting.

3. Tree Complexity (`max_depth`, `min_child_weight`, etc.): Controls the depth of the trees. Deeper trees can model more complex patterns but also can lead to overfitting.

4. Regularization (`lambda`, `alpha`): These parameters can help to prevent overfitting by adding a regularization penalty to the loss function.

5. Handling Imbalanced Data (`scale_pos_weight`): Important for classification problems where classes are imbalanced.

6. Subsampling (`subsample`, `colsample_bytree`): These parameters control the sampling of the dataset that is done at each boosting round, which can help in preventing overfitting.

### Steps for XGBoost tuning (Iterative refinement)
1. Start tuning with high-impact parameters
- `n_estimators`
- `max_depth`
- `learning_rate`

2. then refine the model by adjusting the other parameters
- `min_child_weight`
- `min_samples_split`
- `min_samples_leaf`
- `max_features`
- `scale_pos_weight`
- `alpha`

In [None]:
!pip install xgboost



In [None]:
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier

# Custom scorers
scorers = {
    'precision': make_scorer(precision_score, pos_label=1),
    'recall': make_scorer(recall_score, pos_label=1),
    'f1': make_scorer(f1_score, pos_label=1),
    'accuracy': make_scorer(accuracy_score),
    'auc': make_scorer(roc_auc_score)
}

# XGBoost Classifier
xgb_classifier = XGBClassifier(random_state=42)

# Hyperparameters to tune
param_grid = {
    'n_estimators': [50, 100, 200], # Number of boosting rounds, default=100
    'max_depth': [3, 5, 7], # default=6
    'learning_rate': [0.01, 0.1, 0.2] # default=0.3
}

# Grid Search with multiple scorers
grid_search = GridSearchCV(estimator=xgb_classifier,
                           param_grid=param_grid,
                           scoring=scorers,
                           refit='f1',  # Can change to any key in scorers dictionary
                           cv=5,
                           verbose=2,
                           n_jobs=-1)

# Fitting the model
grid_search.fit(X_train, y_train)

# Accessing the results
grid_search_results = pd.DataFrame(grid_search.cv_results_)
grid_search_results

Fitting 5 folds for each of 27 candidates, totalling 135 fits


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_learning_rate,param_max_depth,param_n_estimators,params,split0_test_precision,split1_test_precision,...,std_test_accuracy,rank_test_accuracy,split0_test_auc,split1_test_auc,split2_test_auc,split3_test_auc,split4_test_auc,mean_test_auc,std_test_auc,rank_test_auc
0,0.317398,0.089471,0.044281,0.020264,0.01,3,50,"{'learning_rate': 0.01, 'max_depth': 3, 'n_est...",1.0,1.0,...,0.00825,27,0.92029,0.864286,0.9,0.891065,0.875019,0.890132,0.019515,27
1,0.592543,0.089277,0.043468,0.015272,0.01,3,100,"{'learning_rate': 0.01, 'max_depth': 3, 'n_est...",1.0,1.0,...,0.010234,24,0.934783,0.935714,0.921429,0.898208,0.875019,0.913031,0.023336,24
2,0.961698,0.117588,0.032066,0.011464,0.01,3,200,"{'learning_rate': 0.01, 'max_depth': 3, 'n_est...",1.0,1.0,...,0.008626,23,0.934783,0.935714,0.921429,0.917844,0.882266,0.918407,0.019408,23
3,0.412309,0.105575,0.031115,0.014103,0.01,5,50,"{'learning_rate': 0.01, 'max_depth': 5, 'n_est...",1.0,1.0,...,0.00853,26,0.927536,0.871429,0.9,0.898208,0.875019,0.894438,0.020234,26
4,1.416868,0.300368,0.044277,0.009698,0.01,5,100,"{'learning_rate': 0.01, 'max_depth': 5, 'n_est...",1.0,1.0,...,0.008474,22,0.934783,0.935714,0.921429,0.919636,0.88772,0.919856,0.017375,21
5,1.674876,0.59312,0.027249,0.011764,0.01,5,200,"{'learning_rate': 0.01, 'max_depth': 5, 'n_est...",0.983607,1.0,...,0.005765,18,0.932997,0.942857,0.921429,0.919636,0.909459,0.925276,0.011537,19
6,0.399834,0.027738,0.01982,0.000752,0.01,7,50,"{'learning_rate': 0.01, 'max_depth': 7, 'n_est...",1.0,1.0,...,0.009515,25,0.934783,0.871429,0.9,0.898208,0.875019,0.895888,0.022666,25
7,0.818281,0.037025,0.020553,0.000444,0.01,7,100,"{'learning_rate': 0.01, 'max_depth': 7, 'n_est...",1.0,1.0,...,0.007646,20,0.934783,0.935714,0.921429,0.912494,0.889512,0.918786,0.017006,22
8,1.628541,0.062709,0.022647,0.000826,0.01,7,200,"{'learning_rate': 0.01, 'max_depth': 7, 'n_est...",0.983607,1.0,...,0.007858,20,0.932997,0.942857,0.921429,0.917844,0.894966,0.922019,0.016152,20
9,0.163131,0.006421,0.019444,0.000981,0.1,3,50,"{'learning_rate': 0.1, 'max_depth': 3, 'n_esti...",0.983607,1.0,...,0.007047,19,0.932997,0.95,0.926779,0.923195,0.909459,0.928486,0.013237,18


In [None]:
grid_search_results.sort_values(by='rank_test_f1')[['mean_test_f1','std_test_f1','params']][:10]

Unnamed: 0,mean_test_f1,std_test_f1,params
26,0.924167,0.015521,"{'learning_rate': 0.2, 'max_depth': 7, 'n_esti..."
23,0.924096,0.016428,"{'learning_rate': 0.2, 'max_depth': 5, 'n_esti..."
13,0.92265,0.01641,"{'learning_rate': 0.1, 'max_depth': 5, 'n_esti..."
22,0.922543,0.012449,"{'learning_rate': 0.2, 'max_depth': 5, 'n_esti..."
14,0.921013,0.018426,"{'learning_rate': 0.1, 'max_depth': 5, 'n_esti..."
21,0.920902,0.013527,"{'learning_rate': 0.2, 'max_depth': 5, 'n_esti..."
19,0.920797,0.017603,"{'learning_rate': 0.2, 'max_depth': 3, 'n_esti..."
12,0.92062,0.013114,"{'learning_rate': 0.1, 'max_depth': 5, 'n_esti..."
25,0.92062,0.013114,"{'learning_rate': 0.2, 'max_depth': 7, 'n_esti..."
15,0.920601,0.016281,"{'learning_rate': 0.1, 'max_depth': 7, 'n_esti..."


- Fix {'learning_rate': 0.2, 'max_depth': 7, 'n_estimators': 200}

In [None]:
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier

# Fixed parameters
fixed_params = {
    'n_estimators': 200,
    'max_depth': 7,
    'learning_rate': 0.2
}

# Hyperparameters to tune
param_grid = {
    'min_child_weight': [1, 5, 10], # default=1
    'min_samples_split': [2, 5, 10], # default=2
    'min_samples_leaf': [1, 2, 4], # default=1
    'max_features': ['auto', 'sqrt'], # default='auto'
    'scale_pos_weight': [1, 10, 25], # default=1
    'alpha': [0, 0.1, 0.5, 1] # default=0
}

# Updating XGBoost Classifier with fixed parameters
xgb_classifier_fixed = XGBClassifier(**fixed_params, random_state=42)

# Grid Search for hyperparameter tuning
grid_search = GridSearchCV(estimator=xgb_classifier_fixed,
                           param_grid=param_grid,
                           scoring='f1',  # focusing on F1 score for imbalanced classification
                           cv=5,
                           verbose=2,
                           n_jobs=-1)

# Fitting the model
grid_search.fit(X_train, y_train)

# Extract the best parameters
best_xgb_params = grid_search.best_params_
print("Best Parameters:", best_xgb_params)


Fitting 5 folds for each of 648 candidates, totalling 3240 fits


Parameters: { "max_features", "min_samples_leaf", "min_samples_split" } are not used.



Best Parameters: {'alpha': 0.1, 'max_features': 'auto', 'min_child_weight': 1, 'min_samples_leaf': 1, 'min_samples_split': 2, 'scale_pos_weight': 10}


In [None]:
pd.DataFrame(grid_search.cv_results_).sort_values(by='rank_test_score')[['mean_test_score','std_test_score','params']][:10]

Unnamed: 0,mean_test_score,std_test_score,params
169,0.929356,0.014732,"{'alpha': 0.1, 'max_features': 'auto', 'min_ch..."
268,0.929356,0.014732,"{'alpha': 0.1, 'max_features': 'sqrt', 'min_ch..."
265,0.929356,0.014732,"{'alpha': 0.1, 'max_features': 'sqrt', 'min_ch..."
262,0.929356,0.014732,"{'alpha': 0.1, 'max_features': 'sqrt', 'min_ch..."
259,0.929356,0.014732,"{'alpha': 0.1, 'max_features': 'sqrt', 'min_ch..."
256,0.929356,0.014732,"{'alpha': 0.1, 'max_features': 'sqrt', 'min_ch..."
250,0.929356,0.014732,"{'alpha': 0.1, 'max_features': 'sqrt', 'min_ch..."
247,0.929356,0.014732,"{'alpha': 0.1, 'max_features': 'sqrt', 'min_ch..."
244,0.929356,0.014732,"{'alpha': 0.1, 'max_features': 'sqrt', 'min_ch..."
253,0.929356,0.014732,"{'alpha': 0.1, 'max_features': 'sqrt', 'min_ch..."


In [None]:
pd.DataFrame(grid_search.cv_results_).params[169]

{'alpha': 0.1,
 'max_features': 'auto',
 'min_child_weight': 1,
 'min_samples_leaf': 1,
 'min_samples_split': 10,
 'scale_pos_weight': 10}

In [None]:
pd.DataFrame(grid_search_rf.cv_results_).sort_values(by='rank_test_f1')[['mean_test_f1','std_test_f1','params']][:10]

Unnamed: 0,mean_test_f1,std_test_f1,params
322,0.925137,0.014962,"{'class_weight': 'balanced', 'max_depth': 10, ..."
295,0.925137,0.014962,"{'class_weight': 'balanced', 'max_depth': 10, ..."
400,0.924829,0.015467,"{'class_weight': 'balanced', 'max_depth': 30, ..."
397,0.924829,0.015467,"{'class_weight': 'balanced', 'max_depth': 30, ..."
370,0.924829,0.015467,"{'class_weight': 'balanced', 'max_depth': 20, ..."
373,0.924829,0.015467,"{'class_weight': 'balanced', 'max_depth': 20, ..."
238,0.924829,0.015467,"{'class_weight': 'balanced', 'max_depth': None..."
319,0.924829,0.015467,"{'class_weight': 'balanced', 'max_depth': 10, ..."
424,0.924829,0.015467,"{'class_weight': 'balanced', 'max_depth': 30, ..."
316,0.924829,0.015467,"{'class_weight': 'balanced', 'max_depth': 10, ..."
