In [1]:
import os
import numpy as np
import pandas as pd
import time
from pathlib import Path
import sys




from sklearn import model_selection
from sklearn.model_selection import GroupKFold
from sklearn import metrics
from sklearn.metrics import classification_report, accuracy_score, roc_auc_score

#Models

import xgboost as xgb
from sklearn.svm import SVC
from sklearn import tree
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
#Feature engineerring
from sklearn.preprocessing import FunctionTransformer, StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.compose import ColumnTransformer

#Splitting the data
from sklearn.model_selection import train_test_split, GridSearchCV, cross_validate, cross_val_score

# Hyperparameter optimization

import optuna


import warnings
warnings.filterwarnings("ignore")


In [2]:
#Special packages
!pip install feature_engine
from feature_engine.encoding import WoEEncoder

!git clone https://github.com/analokmaus/kuma_utils.git
sys.path.append("kuma_utils/")
from kuma_utils.preprocessing.imputer import LGBMImputer

Collecting feature_engine
  Downloading feature_engine-1.4.0-py2.py3-none-any.whl (276 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m276.4/276.4 kB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: feature_engine
Successfully installed feature_engine-1.4.0
[0mCloning into 'kuma_utils'...
remote: Enumerating objects: 915, done.[K
remote: Counting objects: 100% (120/120), done.[K
remote: Compressing objects: 100% (24/24), done.[K
remote: Total 915 (delta 104), reused 102 (delta 96), pack-reused 795[K
Receiving objects: 100% (915/915), 679.99 KiB | 2.83 MiB/s, done.
Resolving deltas: 100% (592/592), done.


In [3]:
# The config file class

class Config:
    path = "../input/tabular-playground-series-aug-2022"
    target = "failure"

In [4]:
# def read_data(path):
#     data_dir = Path(path)

#     train_import = pd.read_csv(data_dir / "train.csv")
#     test_import = pd.read_csv(data_dir / "test.csv")
#     submission_df = pd.read_csv(data_dir / "sample_submission.csv")

#     print(f"train data: Rows={train_import.shape[0]}, Columns={train_import.shape[1]}")
#     print(f"test data : Rows={test_import.shape[0]}, Columns={test_import.shape[1]}")
#     return train_import, test_import, submission_df

In [5]:
data_dir = Path(Config.path)

train_import = pd.read_csv(data_dir / "train.csv", index_col="id")
test_import = pd.read_csv(data_dir / "test.csv", index_col="id")
submission_df = pd.read_csv(data_dir / "sample_submission.csv")

In [6]:
train_import.head()

Unnamed: 0_level_0,product_code,loading,attribute_0,attribute_1,attribute_2,attribute_3,measurement_0,measurement_1,measurement_2,measurement_3,...,measurement_9,measurement_10,measurement_11,measurement_12,measurement_13,measurement_14,measurement_15,measurement_16,measurement_17,failure
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,A,80.1,material_7,material_8,9,5,7,8,4,18.04,...,10.672,15.859,17.594,15.193,15.029,,13.034,14.684,764.1,0
1,A,84.89,material_7,material_8,9,5,14,3,3,18.213,...,12.448,17.947,17.915,11.755,14.732,15.425,14.395,15.631,682.057,0
2,A,82.43,material_7,material_8,9,5,12,1,5,18.057,...,12.715,15.607,,13.798,16.711,18.631,14.094,17.946,663.376,0
3,A,101.07,material_7,material_8,9,5,13,2,6,17.295,...,12.471,16.346,18.377,10.02,15.25,15.562,16.154,17.172,826.282,0
4,A,188.06,material_7,material_8,9,5,9,2,8,19.346,...,10.337,17.082,19.932,12.428,16.182,12.76,13.153,16.412,579.885,0


## Initial data cleaning and re-casting of values

In [7]:
# This step is not needed for this competition

## Feature engineering pipeline

In [8]:
# This will be a custom step for each competition

In [9]:
def preprocessing(df_train = train_import, df_test = test_import):

 #Inputs: The total training and test data set
     
    # Missing indicator
    for df in [df_train, df_test]:
        #Add a 1 whereever there was a missing value
        df["m_3_missing"] = df["measurement_3"].isnull().astype(int)
        df["m_5_missing"] = df["measurement_5"].isnull().astype(int)

        
    # Missing value imputation
    imptr = LGBMImputer(n_iter=50) # Setup the LGBM imputer
    def impute_nans(df_train, df_test):
        nan_features = [col for col in df_train.columns if df_train[col].isnull().any()] # Selects the columns with nan values
        for pc in df_train["product_code"].unique(): # Loops through the unique values for the product code
            #Selects the rows for each unique feature (similar to groupby) and fit_transform the nan values
            df_train.loc[df_train["product_code"]==pc, nan_features] = imptr.fit_transform(df_train.loc[df_train["product_code"]==pc, nan_features])
        
        #Apply the same process to the test set
        nan_features = [col for col in df_test.columns if df_test[col].isnull().any()]
        for pc in df_test["product_code"].unique():
            df_test.loc[df_test["product_code"]==pc, nan_features] = imptr.fit_transform(df_test.loc[df_test["product_code"]==pc, nan_features])

        return df_train, df_test
    
    df_train, df_test = impute_nans(df_train, df_test)
    
    # Area
    for df in [df_train, df_test]: # Loop through the train and test data set and create a new feature by calculation the area
        df["attribute_2*3"] = df["attribute_2"] * df["attribute_3"]
    
    
    # Aggregations
    # Automated way to create additional features by using a list comprehension and f-strings - Output is a list
    meas_gr1_cols = [f"measurement_{i:d}" for i in list(range(3, 5)) + list(range(9, 17))]
    meas_gr2_cols = [f"measurement_{i:d}" for i in list(range(5, 9))]
    # loop through the train and test data set and  
    for df in [df_train, df_test]:
        df["meas_gr1_avg"] = np.mean(df[meas_gr1_cols], axis=1) # Calculate the mean of those columns
        df["meas_gr1_std"] = np.std(df[meas_gr1_cols], axis=1) # # Calculate the std, dev of those columns
        df["meas_gr2_avg"] = np.mean(df[meas_gr2_cols], axis=1) # Calculate the mean for the second list of features
    
    # Create a new feature by looping through the train and test set and dividing measure_17 by the average of meas_2_avg
    for df in [df_train, df_test]:
        df["meas17/meas_gr2_avg"] = df["measurement_17"] / df["meas_gr2_avg"]
        
        #Applies the Weight of Evidence encoder to the categorical feature "attribute_0"
        # WoE encoding
    woe_encoder = WoEEncoder(variables=["attribute_0"])
    df_train["attribute_0"] = woe_encoder.fit_transform(df_train["attribute_0"].to_frame(), df_train["failure"])
    df_test["attribute_0"] = woe_encoder.transform(df_test["attribute_0"].to_frame())
    
    
    features = ["attribute_0", "measurement_0", "measurement_1", "measurement_2", "m_3_missing", "m_5_missing",
               "meas_gr1_avg", "meas_gr1_std", "attribute_2*3", "loading", "measurement_17", "meas17/meas_gr2_avg"] \
#                 + list(ohe_att_0_att_23.columns) + ["gb_attribute_2*3_mean_loading"] + list(ohe_att_23.columns)
    
#     del df_all, ohe_att_23, ohe_att_0_att_23
#     gc.collect()
    
    return train_import, test_import, features

In [10]:
# Take-aways: Use list comprehension (including if statements) to select columns for transformation, Use a for loop to apply the transformation to the train and test set

In [11]:
df_train_proc, df_test_proc, features = preprocessing(train_import, test_import)

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

In [12]:
# Define the target and the feature matrix
df_train_X = df_train_proc.drop(Config.target, axis = 1)

df_train_y = df_train_proc["failure"]




In [13]:
df_train_X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 26570 entries, 0 to 26569
Data columns (total 31 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   product_code         26570 non-null  object 
 1   loading              26570 non-null  float64
 2   attribute_0          26570 non-null  float64
 3   attribute_1          26570 non-null  object 
 4   attribute_2          26570 non-null  int64  
 5   attribute_3          26570 non-null  int64  
 6   measurement_0        26570 non-null  int64  
 7   measurement_1        26570 non-null  int64  
 8   measurement_2        26570 non-null  int64  
 9   measurement_3        26570 non-null  float64
 10  measurement_4        26570 non-null  float64
 11  measurement_5        26570 non-null  float64
 12  measurement_6        26570 non-null  float64
 13  measurement_7        26570 non-null  float64
 14  measurement_8        26570 non-null  float64
 15  measurement_9        26570 non-null 

In [14]:
# Define the cross-validation approach

# Initialise lists that store the results from the different folds

def run(trial):

    
    xgb_params = { "eval_metric": "auc",  # auc, rmse, mae
        "objective": "binary:logistic",
        "n_estimators": trial.suggest_int("n_estimators", 1000, 5000, 100),
        "learning_rate": trial.suggest_loguniform("learning_rate", 1e-2, 0.25),
        "subsample": trial.suggest_float("subsample", 0.1, 1, step=0.01),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.05, 1, step=0.01),
        "max_depth": trial.suggest_int("max_depth", 1, 20),  # 10
        "gamma": trial.suggest_float("gamma", 0, 100, step=0.1),
        "booster": "gbtree",
        "tree_method": trial.suggest_categorical("tree_method", ["gpu_hist"]), # hist, gpu_hist
        #"predictor": "gpu_predictor",
        "reg_lambda": trial.suggest_loguniform("reg_lambda", 1e-8, 100),
        "reg_alpha": trial.suggest_loguniform("reg_alpha", 1e-8, 100),
        "random_state": trial.suggest_categorical("random_state",[42]),
        "n_jobs": trial.suggest_categorical("n_jobs", [4]),
        "min_child_weight": trial.suggest_loguniform("min_child_weight", 1e-1, 1e3),
           }

    
    
    
    auc_list = []
    importance_list = []


    # initialise the cross-validation "class"

    KFOLD = GroupKFold(n_splits=2)


    for train_index, val_index in KFOLD.split(X = df_train_X, y = df_train_y, groups = df_train_X["product_code"]):
        #Define the train and validation data set
        X_train = df_train_X.iloc[train_index]
        X_val = df_train_X.iloc[val_index]
        y_train = df_train_y.iloc[train_index]
        y_val = df_train_y.iloc[val_index]

        # Quick fix: Remove the product code variable
        X_train = X_train.drop(["product_code", "attribute_1"], axis = 1).copy()
        X_val = X_val.drop(["product_code", "attribute_1"], axis = 1).copy()
        # Instaniate the model

        model = xgb.XGBClassifier(**xgb_params)
        #model = LogisticRegression(max_iter = 200, C=0.05, penalty='l1', solver='liblinear')

        #Fit the model
        model.fit(X_train, y_train)

        # Make predictions on the validation set
        val_pred_results = model.predict_proba(X_val)[:,1]
        # Score the prediction by using a scoring function. The true value are the first argument, the predictions the second argument
        score = roc_auc_score( y_val,val_pred_results)
        print(f"The ROC is {score:.4f}")
        # Append the ROC result from each fold to a list
        auc_list.append(score)
        # Append the results from the importance score to a list
        #importance_list.append(model.coef_.ravel())

        print(f"The average ROC is {np.mean(auc_list)}")
        
        return np.mean(auc_list)

In [15]:
study = optuna.create_study(direction="maximize")
study.optimize(run, n_trials=50)

[32m[I 2022-08-20 16:46:07,265][0m A new study created in memory with name: no-name-21a390e8-1416-4e6c-a4ae-f2bf0019cc7c[0m
[32m[I 2022-08-20 16:46:11,167][0m Trial 0 finished with value: 0.5 and parameters: {'n_estimators': 3100, 'learning_rate': 0.02729500881280114, 'subsample': 0.27, 'colsample_bytree': 0.42, 'max_depth': 10, 'gamma': 68.7, 'tree_method': 'gpu_hist', 'reg_lambda': 4.529440944443491e-06, 'reg_alpha': 0.0004676813295945299, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 632.4685199030143}. Best is trial 0 with value: 0.5.[0m


The ROC is 0.5000
The average ROC is 0.5


[32m[I 2022-08-20 16:46:12,087][0m Trial 1 finished with value: 0.5849926782536605 and parameters: {'n_estimators': 1500, 'learning_rate': 0.011692381796203091, 'subsample': 0.86, 'colsample_bytree': 0.67, 'max_depth': 7, 'gamma': 23.6, 'tree_method': 'gpu_hist', 'reg_lambda': 0.00019514274659555297, 'reg_alpha': 5.287665408903801e-08, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 1.773694778307274}. Best is trial 1 with value: 0.5849926782536605.[0m


The ROC is 0.5850
The average ROC is 0.5849926782536605


[32m[I 2022-08-20 16:46:14,474][0m Trial 2 finished with value: 0.5842186256394609 and parameters: {'n_estimators': 3600, 'learning_rate': 0.08745268163286028, 'subsample': 0.98, 'colsample_bytree': 0.27, 'max_depth': 8, 'gamma': 14.700000000000001, 'tree_method': 'gpu_hist', 'reg_lambda': 8.211271424350196e-05, 'reg_alpha': 1.459205736228473e-07, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 2.23779523084096}. Best is trial 1 with value: 0.5849926782536605.[0m


The ROC is 0.5842
The average ROC is 0.5842186256394609


[32m[I 2022-08-20 16:46:18,624][0m Trial 3 finished with value: 0.5697858527199862 and parameters: {'n_estimators': 5000, 'learning_rate': 0.06886861648683226, 'subsample': 0.29000000000000004, 'colsample_bytree': 0.4, 'max_depth': 15, 'gamma': 86.10000000000001, 'tree_method': 'gpu_hist', 'reg_lambda': 0.24438873520869117, 'reg_alpha': 8.555043594767746, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 0.11572842380801525}. Best is trial 1 with value: 0.5849926782536605.[0m


The ROC is 0.5698
The average ROC is 0.5697858527199862


[32m[I 2022-08-20 16:46:19,849][0m Trial 4 finished with value: 0.5859662346551955 and parameters: {'n_estimators': 2000, 'learning_rate': 0.04469797248627319, 'subsample': 0.6799999999999999, 'colsample_bytree': 0.5900000000000001, 'max_depth': 5, 'gamma': 21.900000000000002, 'tree_method': 'gpu_hist', 'reg_lambda': 6.353775943902091e-06, 'reg_alpha': 1.2197255120210858e-08, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 10.548761983057128}. Best is trial 4 with value: 0.5859662346551955.[0m


The ROC is 0.5860
The average ROC is 0.5859662346551955


[32m[I 2022-08-20 16:46:22,462][0m Trial 5 finished with value: 0.5 and parameters: {'n_estimators': 4600, 'learning_rate': 0.12674235501040393, 'subsample': 0.14, 'colsample_bytree': 0.49, 'max_depth': 4, 'gamma': 29.1, 'tree_method': 'gpu_hist', 'reg_lambda': 2.281590941066338e-06, 'reg_alpha': 0.2510063440928101, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 607.2873754279207}. Best is trial 4 with value: 0.5859662346551955.[0m


The ROC is 0.5000
The average ROC is 0.5


[32m[I 2022-08-20 16:46:23,469][0m Trial 6 finished with value: 0.5847744811797496 and parameters: {'n_estimators': 1300, 'learning_rate': 0.16393666171857751, 'subsample': 0.31, 'colsample_bytree': 0.64, 'max_depth': 14, 'gamma': 23.6, 'tree_method': 'gpu_hist', 'reg_lambda': 0.0005196361411046652, 'reg_alpha': 4.6984428611422195e-06, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 0.3075661433724609}. Best is trial 4 with value: 0.5859662346551955.[0m


The ROC is 0.5848
The average ROC is 0.5847744811797496


[32m[I 2022-08-20 16:46:28,225][0m Trial 7 finished with value: 0.5836037545152442 and parameters: {'n_estimators': 1000, 'learning_rate': 0.015904806867297203, 'subsample': 0.8, 'colsample_bytree': 0.66, 'max_depth': 19, 'gamma': 35.300000000000004, 'tree_method': 'gpu_hist', 'reg_lambda': 2.9660469935931137e-07, 'reg_alpha': 9.008060278901013e-07, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 0.39952848882671793}. Best is trial 4 with value: 0.5859662346551955.[0m


The ROC is 0.5836
The average ROC is 0.5836037545152442


[32m[I 2022-08-20 16:46:30,708][0m Trial 8 finished with value: 0.5829585726659487 and parameters: {'n_estimators': 4500, 'learning_rate': 0.19977576104423286, 'subsample': 0.31, 'colsample_bytree': 0.8800000000000001, 'max_depth': 5, 'gamma': 33.300000000000004, 'tree_method': 'gpu_hist', 'reg_lambda': 0.05658166338363368, 'reg_alpha': 0.02475735014573451, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 16.221555880724896}. Best is trial 4 with value: 0.5859662346551955.[0m


The ROC is 0.5830
The average ROC is 0.5829585726659487


[32m[I 2022-08-20 16:46:33,583][0m Trial 9 finished with value: 0.5838756979245183 and parameters: {'n_estimators': 4900, 'learning_rate': 0.01699096369300014, 'subsample': 0.38, 'colsample_bytree': 0.5700000000000001, 'max_depth': 8, 'gamma': 37.2, 'tree_method': 'gpu_hist', 'reg_lambda': 0.285266355780781, 'reg_alpha': 0.01637776019198998, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 32.3885490904579}. Best is trial 4 with value: 0.5859662346551955.[0m


The ROC is 0.5839
The average ROC is 0.5838756979245183


[32m[I 2022-08-20 16:46:34,837][0m Trial 10 finished with value: 0.5829296921654805 and parameters: {'n_estimators': 2200, 'learning_rate': 0.03619553503571434, 'subsample': 0.64, 'colsample_bytree': 0.05, 'max_depth': 2, 'gamma': 57.7, 'tree_method': 'gpu_hist', 'reg_lambda': 1.6472304718926524e-08, 'reg_alpha': 8.600481093129707e-05, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 79.15747901281718}. Best is trial 4 with value: 0.5859662346551955.[0m


The ROC is 0.5829
The average ROC is 0.5829296921654805


[32m[I 2022-08-20 16:46:35,963][0m Trial 11 finished with value: 0.5866953982899532 and parameters: {'n_estimators': 1900, 'learning_rate': 0.010056027116517085, 'subsample': 0.72, 'colsample_bytree': 0.92, 'max_depth': 1, 'gamma': 2.9000000000000004, 'tree_method': 'gpu_hist', 'reg_lambda': 73.83999054467714, 'reg_alpha': 1.7608520098201057e-08, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 2.889298878898304}. Best is trial 11 with value: 0.5866953982899532.[0m


The ROC is 0.5867
The average ROC is 0.5866953982899532


[32m[I 2022-08-20 16:46:37,073][0m Trial 12 finished with value: 0.576483870579226 and parameters: {'n_estimators': 2000, 'learning_rate': 0.04468502148416436, 'subsample': 0.6, 'colsample_bytree': 1.0, 'max_depth': 1, 'gamma': 0.4, 'tree_method': 'gpu_hist', 'reg_lambda': 51.50927170019273, 'reg_alpha': 1.6561769817640327e-08, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 3.5050224803039427}. Best is trial 11 with value: 0.5866953982899532.[0m


The ROC is 0.5765
The average ROC is 0.576483870579226


[32m[I 2022-08-20 16:46:38,465][0m Trial 13 finished with value: 0.5845323968456773 and parameters: {'n_estimators': 2200, 'learning_rate': 0.02638517546017921, 'subsample': 0.74, 'colsample_bytree': 0.8500000000000001, 'max_depth': 4, 'gamma': 6.5, 'tree_method': 'gpu_hist', 'reg_lambda': 19.811994252917103, 'reg_alpha': 1.006586147387355e-05, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 6.671100072236775}. Best is trial 11 with value: 0.5866953982899532.[0m


The ROC is 0.5845
The average ROC is 0.5845323968456773


[32m[I 2022-08-20 16:46:39,955][0m Trial 14 finished with value: 0.589091802353999 and parameters: {'n_estimators': 2700, 'learning_rate': 0.07202357961833306, 'subsample': 0.48, 'colsample_bytree': 0.81, 'max_depth': 1, 'gamma': 12.8, 'tree_method': 'gpu_hist', 'reg_lambda': 0.01019858374146423, 'reg_alpha': 1.724175431897858e-08, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 111.88396767915158}. Best is trial 14 with value: 0.589091802353999.[0m


The ROC is 0.5891
The average ROC is 0.589091802353999


[32m[I 2022-08-20 16:46:41,552][0m Trial 15 finished with value: 0.5837970857542604 and parameters: {'n_estimators': 2900, 'learning_rate': 0.0774339899602183, 'subsample': 0.49, 'colsample_bytree': 0.8, 'max_depth': 2, 'gamma': 51.300000000000004, 'tree_method': 'gpu_hist', 'reg_lambda': 0.009706931252562117, 'reg_alpha': 8.210155065647576e-07, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 120.67292863617078}. Best is trial 14 with value: 0.589091802353999.[0m


The ROC is 0.5838
The average ROC is 0.5837970857542604


[32m[I 2022-08-20 16:46:43,539][0m Trial 16 finished with value: 0.5817791893788007 and parameters: {'n_estimators': 2800, 'learning_rate': 0.11246924894283314, 'subsample': 0.44000000000000006, 'colsample_bytree': 0.9700000000000001, 'max_depth': 13, 'gamma': 10.0, 'tree_method': 'gpu_hist', 'reg_lambda': 3.788986493137477, 'reg_alpha': 3.322779688912687e-05, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 179.97121775857732}. Best is trial 14 with value: 0.589091802353999.[0m


The ROC is 0.5818
The average ROC is 0.5817791893788007


[32m[I 2022-08-20 16:46:45,543][0m Trial 17 finished with value: 0.5772620131339784 and parameters: {'n_estimators': 3700, 'learning_rate': 0.010036378890009532, 'subsample': 0.56, 'colsample_bytree': 0.76, 'max_depth': 1, 'gamma': 95.5, 'tree_method': 'gpu_hist', 'reg_lambda': 0.004018017692848727, 'reg_alpha': 3.722980395161766e-07, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 28.945882437260344}. Best is trial 14 with value: 0.589091802353999.[0m


The ROC is 0.5773
The average ROC is 0.5772620131339784


[32m[I 2022-08-20 16:46:58,175][0m Trial 18 finished with value: 0.5771556495879447 and parameters: {'n_estimators': 2600, 'learning_rate': 0.02486445716221551, 'subsample': 0.93, 'colsample_bytree': 0.92, 'max_depth': 19, 'gamma': 43.900000000000006, 'tree_method': 'gpu_hist', 'reg_lambda': 1.6172291503419214, 'reg_alpha': 40.90762799136778, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 1.094541107784321}. Best is trial 14 with value: 0.589091802353999.[0m


The ROC is 0.5772
The average ROC is 0.5771556495879447


[32m[I 2022-08-20 16:47:04,666][0m Trial 19 finished with value: 0.5321252535512226 and parameters: {'n_estimators': 3400, 'learning_rate': 0.07265094245266003, 'subsample': 0.49, 'colsample_bytree': 0.7400000000000001, 'max_depth': 11, 'gamma': 1.0, 'tree_method': 'gpu_hist', 'reg_lambda': 70.8301740741036, 'reg_alpha': 0.0010111027761762813, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 64.7080439499952}. Best is trial 14 with value: 0.589091802353999.[0m


The ROC is 0.5321
The average ROC is 0.5321252535512226


[32m[I 2022-08-20 16:47:05,679][0m Trial 20 finished with value: 0.5865381237661176 and parameters: {'n_estimators': 1700, 'learning_rate': 0.051110368498270185, 'subsample': 0.69, 'colsample_bytree': 0.8400000000000001, 'max_depth': 6, 'gamma': 16.3, 'tree_method': 'gpu_hist', 'reg_lambda': 0.016866501580511752, 'reg_alpha': 2.101796909925124e-06, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 300.3322553751792}. Best is trial 14 with value: 0.589091802353999.[0m


The ROC is 0.5865
The average ROC is 0.5865381237661176


[32m[I 2022-08-20 16:47:06,664][0m Trial 21 finished with value: 0.585547981777433 and parameters: {'n_estimators': 1700, 'learning_rate': 0.058890849372027536, 'subsample': 0.72, 'colsample_bytree': 0.8400000000000001, 'max_depth': 3, 'gamma': 15.3, 'tree_method': 'gpu_hist', 'reg_lambda': 0.03130444219985347, 'reg_alpha': 9.383940639291447e-08, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 361.12564117156575}. Best is trial 14 with value: 0.589091802353999.[0m


The ROC is 0.5855
The average ROC is 0.585547981777433


[32m[I 2022-08-20 16:47:08,059][0m Trial 22 finished with value: 0.5859780026436573 and parameters: {'n_estimators': 2500, 'learning_rate': 0.054590424367595196, 'subsample': 0.8, 'colsample_bytree': 0.7400000000000001, 'max_depth': 6, 'gamma': 14.600000000000001, 'tree_method': 'gpu_hist', 'reg_lambda': 0.002181502344655315, 'reg_alpha': 1.038899522386389e-08, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 252.14718048483914}. Best is trial 14 with value: 0.589091802353999.[0m


The ROC is 0.5860
The average ROC is 0.5859780026436573


[32m[I 2022-08-20 16:47:08,737][0m Trial 23 finished with value: 0.5763327936953693 and parameters: {'n_estimators': 1100, 'learning_rate': 0.034894866753858, 'subsample': 0.6, 'colsample_bytree': 0.92, 'max_depth': 3, 'gamma': 6.1000000000000005, 'tree_method': 'gpu_hist', 'reg_lambda': 2.2067352937996985, 'reg_alpha': 3.1591969321275395e-06, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 975.7972100395128}. Best is trial 14 with value: 0.589091802353999.[0m


The ROC is 0.5763
The average ROC is 0.5763327936953693


[32m[I 2022-08-20 16:47:09,718][0m Trial 24 finished with value: 0.5845020610289386 and parameters: {'n_estimators': 1700, 'learning_rate': 0.11005083488111853, 'subsample': 0.84, 'colsample_bytree': 0.9400000000000001, 'max_depth': 1, 'gamma': 19.5, 'tree_method': 'gpu_hist', 'reg_lambda': 6.234133712830785e-05, 'reg_alpha': 1.8362787261328054e-07, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 41.506980091301415}. Best is trial 14 with value: 0.589091802353999.[0m


The ROC is 0.5845
The average ROC is 0.5845020610289386


[32m[I 2022-08-20 16:47:11,454][0m Trial 25 finished with value: 0.581997787919269 and parameters: {'n_estimators': 2400, 'learning_rate': 0.018882310207066034, 'subsample': 0.53, 'colsample_bytree': 0.8200000000000001, 'max_depth': 10, 'gamma': 7.9, 'tree_method': 'gpu_hist', 'reg_lambda': 0.25432019039680975, 'reg_alpha': 1.5352389695391118e-06, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 6.661773031288935}. Best is trial 14 with value: 0.589091802353999.[0m


The ROC is 0.5820
The average ROC is 0.581997787919269


[32m[I 2022-08-20 16:47:12,425][0m Trial 26 finished with value: 0.5818233005167879 and parameters: {'n_estimators': 1700, 'learning_rate': 0.24311514838109283, 'subsample': 0.69, 'colsample_bytree': 0.7100000000000001, 'max_depth': 6, 'gamma': 68.3, 'tree_method': 'gpu_hist', 'reg_lambda': 7.13798722241903, 'reg_alpha': 6.281973818630339e-08, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 106.58065689763818}. Best is trial 14 with value: 0.589091802353999.[0m


The ROC is 0.5818
The average ROC is 0.5818233005167879


[32m[I 2022-08-20 16:47:14,108][0m Trial 27 finished with value: 0.5837024149217089 and parameters: {'n_estimators': 3200, 'learning_rate': 0.03569910032234025, 'subsample': 0.78, 'colsample_bytree': 1.0, 'max_depth': 3, 'gamma': 43.300000000000004, 'tree_method': 'gpu_hist', 'reg_lambda': 0.027665945701964772, 'reg_alpha': 2.554175378569576e-05, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 16.592917623894373}. Best is trial 14 with value: 0.589091802353999.[0m


The ROC is 0.5837
The average ROC is 0.5837024149217089


[32m[I 2022-08-20 16:47:15,809][0m Trial 28 finished with value: 0.5658880139148308 and parameters: {'n_estimators': 2000, 'learning_rate': 0.09558204896085046, 'subsample': 0.42000000000000004, 'colsample_bytree': 0.8800000000000001, 'max_depth': 7, 'gamma': 0.5, 'tree_method': 'gpu_hist', 'reg_lambda': 0.00047020917768232303, 'reg_alpha': 4.603948424123679e-08, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 383.0614909439559}. Best is trial 14 with value: 0.589091802353999.[0m


The ROC is 0.5659
The average ROC is 0.5658880139148308


[32m[I 2022-08-20 16:47:16,679][0m Trial 29 finished with value: 0.5837817547501022 and parameters: {'n_estimators': 1300, 'learning_rate': 0.021067833906801125, 'subsample': 0.13, 'colsample_bytree': 0.4, 'max_depth': 10, 'gamma': 29.6, 'tree_method': 'gpu_hist', 'reg_lambda': 1.215425995483853e-05, 'reg_alpha': 0.0020954861934568245, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 0.9495462424808992}. Best is trial 14 with value: 0.589091802353999.[0m


The ROC is 0.5838
The average ROC is 0.5837817547501022


[32m[I 2022-08-20 16:47:18,488][0m Trial 30 finished with value: 0.5803660270969853 and parameters: {'n_estimators': 3100, 'learning_rate': 0.14391787811152276, 'subsample': 0.65, 'colsample_bytree': 0.5, 'max_depth': 4, 'gamma': 74.10000000000001, 'tree_method': 'gpu_hist', 'reg_lambda': 0.007404139113575026, 'reg_alpha': 3.7724741029519595e-07, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 188.914555319584}. Best is trial 14 with value: 0.589091802353999.[0m


The ROC is 0.5804
The average ROC is 0.5803660270969853


[32m[I 2022-08-20 16:47:19,931][0m Trial 31 finished with value: 0.5866577858918629 and parameters: {'n_estimators': 2500, 'learning_rate': 0.0595573415270696, 'subsample': 0.86, 'colsample_bytree': 0.76, 'max_depth': 9, 'gamma': 15.5, 'tree_method': 'gpu_hist', 'reg_lambda': 0.002495406364406242, 'reg_alpha': 1.1653202910443702e-08, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 394.096021000348}. Best is trial 14 with value: 0.589091802353999.[0m


The ROC is 0.5867
The average ROC is 0.5866577858918629


[32m[I 2022-08-20 16:47:21,579][0m Trial 32 finished with value: 0.5863721424363199 and parameters: {'n_estimators': 2700, 'learning_rate': 0.060957628223466266, 'subsample': 0.88, 'colsample_bytree': 0.78, 'max_depth': 12, 'gamma': 14.3, 'tree_method': 'gpu_hist', 'reg_lambda': 0.001450388608605151, 'reg_alpha': 0.0001892458230217842, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 667.9685703101823}. Best is trial 14 with value: 0.589091802353999.[0m


The ROC is 0.5864
The average ROC is 0.5863721424363199


[32m[I 2022-08-20 16:47:22,870][0m Trial 33 finished with value: 0.5825830759765422 and parameters: {'n_estimators': 2300, 'learning_rate': 0.0467953354606732, 'subsample': 1.0, 'colsample_bytree': 0.6900000000000001, 'max_depth': 9, 'gamma': 27.1, 'tree_method': 'gpu_hist', 'reg_lambda': 0.00015955497621810658, 'reg_alpha': 3.056277450250837e-08, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 392.8596171610889}. Best is trial 14 with value: 0.589091802353999.[0m


The ROC is 0.5826
The average ROC is 0.5825830759765422


[32m[I 2022-08-20 16:47:23,793][0m Trial 34 finished with value: 0.5869018273754023 and parameters: {'n_estimators': 1500, 'learning_rate': 0.09305604712582927, 'subsample': 0.91, 'colsample_bytree': 0.89, 'max_depth': 8, 'gamma': 9.8, 'tree_method': 'gpu_hist', 'reg_lambda': 0.07505784421844025, 'reg_alpha': 2.443298769803554e-07, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 2.657012841792942}. Best is trial 14 with value: 0.589091802353999.[0m


The ROC is 0.5869
The average ROC is 0.5869018273754023


[32m[I 2022-08-20 16:47:28,258][0m Trial 35 finished with value: 0.5877513055190612 and parameters: {'n_estimators': 3900, 'learning_rate': 0.08699980262227443, 'subsample': 0.91, 'colsample_bytree': 0.3, 'max_depth': 16, 'gamma': 10.0, 'tree_method': 'gpu_hist', 'reg_lambda': 0.6787285876831695, 'reg_alpha': 1.6030178554244015e-07, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 2.8472989609496215}. Best is trial 14 with value: 0.589091802353999.[0m


The ROC is 0.5878
The average ROC is 0.5877513055190612


[32m[I 2022-08-20 16:47:32,809][0m Trial 36 finished with value: 0.5877740134712103 and parameters: {'n_estimators': 4100, 'learning_rate': 0.08998324782343521, 'subsample': 0.96, 'colsample_bytree': 0.29, 'max_depth': 16, 'gamma': 9.1, 'tree_method': 'gpu_hist', 'reg_lambda': 0.7416325124479994, 'reg_alpha': 1.596775200253793e-07, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 2.8228238568178674}. Best is trial 14 with value: 0.589091802353999.[0m


The ROC is 0.5878
The average ROC is 0.5877740134712103


[32m[I 2022-08-20 16:47:37,407][0m Trial 37 finished with value: 0.5874545213665521 and parameters: {'n_estimators': 4000, 'learning_rate': 0.08592301501792782, 'subsample': 0.94, 'colsample_bytree': 0.31, 'max_depth': 16, 'gamma': 10.100000000000001, 'tree_method': 'gpu_hist', 'reg_lambda': 0.6828294963053123, 'reg_alpha': 4.479464262929966e-07, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 1.2752798743411549}. Best is trial 14 with value: 0.589091802353999.[0m


The ROC is 0.5875
The average ROC is 0.5874545213665521


[32m[I 2022-08-20 16:47:43,790][0m Trial 38 finished with value: 0.5831956637597142 and parameters: {'n_estimators': 4000, 'learning_rate': 0.08245342703885924, 'subsample': 0.96, 'colsample_bytree': 0.28, 'max_depth': 17, 'gamma': 23.8, 'tree_method': 'gpu_hist', 'reg_lambda': 0.7828904787926969, 'reg_alpha': 9.095326426386504e-06, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 1.379489466050484}. Best is trial 14 with value: 0.589091802353999.[0m


The ROC is 0.5832
The average ROC is 0.5831956637597142


[32m[I 2022-08-20 16:47:48,653][0m Trial 39 finished with value: 0.5881453198534244 and parameters: {'n_estimators': 4100, 'learning_rate': 0.16045577736737998, 'subsample': 0.23, 'colsample_bytree': 0.31, 'max_depth': 16, 'gamma': 20.400000000000002, 'tree_method': 'gpu_hist', 'reg_lambda': 0.5384274789933815, 'reg_alpha': 0.7995407743968908, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 0.47836304846712857}. Best is trial 14 with value: 0.589091802353999.[0m


The ROC is 0.5881
The average ROC is 0.5881453198534244


[32m[I 2022-08-20 16:47:55,486][0m Trial 40 finished with value: 0.5821411365718971 and parameters: {'n_estimators': 4300, 'learning_rate': 0.14998048203495187, 'subsample': 0.2, 'colsample_bytree': 0.15000000000000002, 'max_depth': 17, 'gamma': 22.6, 'tree_method': 'gpu_hist', 'reg_lambda': 11.512023583802609, 'reg_alpha': 3.7896674421019085, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 0.5676522550091337}. Best is trial 14 with value: 0.589091802353999.[0m


The ROC is 0.5821
The average ROC is 0.5821411365718971


[32m[I 2022-08-20 16:48:00,037][0m Trial 41 finished with value: 0.5863914881060515 and parameters: {'n_estimators': 4000, 'learning_rate': 0.1112825426729048, 'subsample': 0.95, 'colsample_bytree': 0.32, 'max_depth': 16, 'gamma': 11.3, 'tree_method': 'gpu_hist', 'reg_lambda': 0.6058072973711851, 'reg_alpha': 0.3523252110243227, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 0.11844895650498603}. Best is trial 14 with value: 0.589091802353999.[0m


The ROC is 0.5864
The average ROC is 0.5863914881060515


[32m[I 2022-08-20 16:48:03,316][0m Trial 42 finished with value: 0.5866394438885247 and parameters: {'n_estimators': 3900, 'learning_rate': 0.18455003926200084, 'subsample': 0.23, 'colsample_bytree': 0.33999999999999997, 'max_depth': 15, 'gamma': 18.8, 'tree_method': 'gpu_hist', 'reg_lambda': 0.1252082136692619, 'reg_alpha': 0.00910007798593504, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 0.2629171858139085}. Best is trial 14 with value: 0.589091802353999.[0m


The ROC is 0.5866
The average ROC is 0.5866394438885247


[32m[I 2022-08-20 16:48:14,585][0m Trial 43 finished with value: 0.5822197236504952 and parameters: {'n_estimators': 4300, 'learning_rate': 0.13185108580645116, 'subsample': 0.33, 'colsample_bytree': 0.2, 'max_depth': 18, 'gamma': 26.3, 'tree_method': 'gpu_hist', 'reg_lambda': 0.7853438109581076, 'reg_alpha': 5.104232632813383e-07, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 5.408354860617551}. Best is trial 14 with value: 0.589091802353999.[0m


The ROC is 0.5822
The average ROC is 0.5822197236504952


[32m[I 2022-08-20 16:49:03,992][0m Trial 44 finished with value: 0.5829695628129556 and parameters: {'n_estimators': 4800, 'learning_rate': 0.09503378097627185, 'subsample': 0.89, 'colsample_bytree': 0.46, 'max_depth': 20, 'gamma': 32.4, 'tree_method': 'gpu_hist', 'reg_lambda': 0.12186472260854235, 'reg_alpha': 0.39409570911345987, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 0.6620296023109208}. Best is trial 14 with value: 0.589091802353999.[0m


The ROC is 0.5830
The average ROC is 0.5829695628129556


[32m[I 2022-08-20 16:49:06,676][0m Trial 45 finished with value: 0.5888286912089864 and parameters: {'n_estimators': 3500, 'learning_rate': 0.06468699030404264, 'subsample': 0.99, 'colsample_bytree': 0.19, 'max_depth': 14, 'gamma': 5.4, 'tree_method': 'gpu_hist', 'reg_lambda': 3.971455308758279, 'reg_alpha': 1.0923901722642057e-07, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 1.5120525938147076}. Best is trial 14 with value: 0.589091802353999.[0m


The ROC is 0.5888
The average ROC is 0.5888286912089864


[32m[I 2022-08-20 16:49:09,106][0m Trial 46 finished with value: 0.586503296542269 and parameters: {'n_estimators': 3600, 'learning_rate': 0.17485177601954408, 'subsample': 1.0, 'colsample_bytree': 0.21000000000000002, 'max_depth': 14, 'gamma': 4.9, 'tree_method': 'gpu_hist', 'reg_lambda': 22.175626901377328, 'reg_alpha': 0.0762708339217669, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 2.1329074557282}. Best is trial 14 with value: 0.589091802353999.[0m


The ROC is 0.5865
The average ROC is 0.586503296542269


[32m[I 2022-08-20 16:49:12,177][0m Trial 47 finished with value: 0.5756255852629657 and parameters: {'n_estimators': 4200, 'learning_rate': 0.06966443024632339, 'subsample': 0.18, 'colsample_bytree': 0.13, 'max_depth': 14, 'gamma': 40.2, 'tree_method': 'gpu_hist', 'reg_lambda': 4.183799242135671, 'reg_alpha': 9.665318015172426e-08, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 4.615773692445503}. Best is trial 14 with value: 0.589091802353999.[0m


The ROC is 0.5756
The average ROC is 0.5756255852629657


[32m[I 2022-08-20 16:49:16,491][0m Trial 48 finished with value: 0.581250156822874 and parameters: {'n_estimators': 4600, 'learning_rate': 0.21673270605082062, 'subsample': 0.26, 'colsample_bytree': 0.24, 'max_depth': 15, 'gamma': 19.200000000000003, 'tree_method': 'gpu_hist', 'reg_lambda': 0.2592442660493794, 'reg_alpha': 6.839729271623993, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 12.329955173559611}. Best is trial 14 with value: 0.589091802353999.[0m


The ROC is 0.5813
The average ROC is 0.581250156822874


[32m[I 2022-08-20 16:49:18,915][0m Trial 49 finished with value: 0.5813604346678416 and parameters: {'n_estimators': 3400, 'learning_rate': 0.04074267771222069, 'subsample': 0.83, 'colsample_bytree': 0.6200000000000001, 'max_depth': 13, 'gamma': 6.1000000000000005, 'tree_method': 'gpu_hist', 'reg_lambda': 1.8064842200164095, 'reg_alpha': 3.3433718428559095e-08, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 0.2907644014914249}. Best is trial 14 with value: 0.589091802353999.[0m


The ROC is 0.5814
The average ROC is 0.5813604346678416


In [16]:
from joblib import dump, load

In [17]:
dump(study, "study.pkl")

['study.pkl']

In [18]:
dump(study.best_trial.params, "xgb_params.pkl")

['xgb_params.pkl']

## Ideas

In [19]:
# Change fit_transform in the feature engineering process to transform for the test set
# Understand why he is doing the aggregations
# Understand when the to_frame method is required when using an encoder