In [1]:
import os
import zipfile
from pathlib import Path

import kaggle
import numpy as np
import optuna
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import FunctionTransformer, PowerTransformer
from xgboost import XGBClassifier

# Lift pandas' display limits so frames are shown with every column and untruncated cells.
for display_option in ('display.max_columns', 'display.max_colwidth'):
    pd.set_option(display_option, None)

# Resolve where the competition files and the extra dataset directory live.
# The same relative locations are used on Kaggle and locally; when running outside
# Kaggle, anything that is missing is fetched through the Kaggle API first.
iskaggle = os.environ.get('KAGGLE_KERNEL_RUN_TYPE', '')
path = Path('../input/playground-series-s3e23')
orig_path = Path('../input/software-defect-prediction')

# The datasets endpoint needs an "owner/slug" reference.
# NOTE(review): the owner handle is not visible anywhere in this notebook — confirm it.
ORIG_DATASET_OWNER = 'semustafacevik'

if not iskaggle:
    if not path.exists():
        # The API expects the bare competition slug (not a filesystem path) and saves
        # "<slug>.zip" into the directory passed as `path=`.
        path.parent.mkdir(parents=True, exist_ok=True)
        kaggle.api.competition_download_cli(path.name, path=str(path.parent))
        # Context manager so the archive handle is closed after extraction.
        with zipfile.ZipFile(path.parent / f'{path.name}.zip') as archive:
            archive.extractall(path)
    if not orig_path.exists():
        # This is a dataset rather than a competition, so it goes through the datasets
        # endpoint; `unzip=True` unpacks the download into `orig_path`.
        orig_path.mkdir(parents=True, exist_ok=True)
        kaggle.api.dataset_download_cli(
            f'{ORIG_DATASET_OWNER}/{orig_path.name}',
            path=str(orig_path),
            unzip=True,
        )

# Experiment configuration shared by the cells below.
SEED, SPLITS, VERSION = 42, 5, 4  # RNG seed, number of CV folds, submission version
N_TRIALS = 30                     # number of Optuna trials to run
OUTPUT = f'xg_boost_clf_submisson_v{VERSION}.csv'  # submission file name

# Seed NumPy's global random generator.
np.random.seed(SEED)

  from .autonotebook import tqdm as notebook_tqdm




In [2]:
# Competition files are indexed by their 'id' column; the jm1 file is read with the default index.
train_csv, test_csv = path / 'train.csv', path / 'test.csv'
orig_csv = orig_path / 'jm1.csv'

train = pd.read_csv(train_csv, index_col='id')
test = pd.read_csv(test_csv, index_col='id')
orig_train = pd.read_csv(orig_csv)

In [3]:
# Columns read as object dtype use '?' as a placeholder; turn it into NaN and
# convert each such column to float64.
object_columns = orig_train.columns[orig_train.dtypes == 'O']
for column in object_columns:
    without_placeholder = orig_train[column].replace({'?': np.nan})
    orig_train[column] = without_placeholder.astype('float64')

In [4]:
# Stack the competition rows on top of the jm1 rows, then split the
# 'defects' target column out of the feature frame.
combined_frames = [train, orig_train]
X = pd.concat(combined_frames)
y = X.pop('defects')

In [5]:
# Columns removed before modelling; every other column is passed through unchanged.
dropped_columns = [
    'iv(g)', 't', 'b', 'n',
    'lOCode', 'v', 'branchCount',
    'e', 'i', 'lOComment',
]
column_dropper = ColumnTransformer(
    [('drop', 'drop', dropped_columns)],
    remainder='passthrough',
)

# Preprocessing that precedes the classifier:
# drop columns -> impute (SimpleImputer defaults) -> log1p -> power transform.
partial_pipeline = make_pipeline(
    column_dropper,
    SimpleImputer(),
    FunctionTransformer(func=np.log1p, validate=False),
    PowerTransformer(),
)

In [6]:
def objective(trial):
    """Score one Optuna trial as the mean cross-validated ROC AUC of an XGBoost pipeline.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the XGBoost hyper-parameters.

    Returns
    -------
    float
        Mean ``roc_auc`` across the ``SPLITS`` cross-validation folds on ``X`` / ``y``.
    """
    params = {
        # Fixed settings: not sampled, so they do not appear in the trial's params.
        'objective': 'binary:logistic',
        'tree_method': 'gpu_hist',  # train on the GPU to speed up each fit
        # Single-choice categoricals: constant, but recorded with the trial's params.
        'n_jobs': trial.suggest_categorical('n_jobs', [-1]),
        'random_state': trial.suggest_categorical('random_state', [SEED]),
        # Regularisation strengths, sampled on a log scale.
        'lambda': trial.suggest_float('lambda', 1e-3, 10.0, log=True),
        'alpha': trial.suggest_float('alpha', 1e-3, 10.0, log=True),
        'colsample_bytree': trial.suggest_categorical('colsample_bytree', [0.3,0.4,0.5,0.6,0.7,0.8,0.9, 1.0]),
        'subsample': trial.suggest_categorical('subsample', [0.4,0.5,0.6,0.7,0.8,1.0]),
        'learning_rate': trial.suggest_categorical('learning_rate', [0.008,0.01,0.012,0.014,0.016,0.018, 0.02]),
        # `step` is passed by keyword: giving it positionally is deprecated in Optuna.
        'n_estimators': trial.suggest_int('n_estimators', 100, 600, step=100),
        'max_depth': trial.suggest_categorical('max_depth', [5,7,9,11,13,15,17]),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 300),
    }

    # Preprocessing sits inside the pipeline, so it is re-fitted within each CV fold.
    pipeline = make_pipeline(partial_pipeline, XGBClassifier(**params))
    scores = cross_val_score(pipeline, X, y, cv=SPLITS, scoring='roc_auc', n_jobs=-1)
    return scores.mean()

# Seed the sampler explicitly: np.random.seed() does not reach Optuna's sampler, so
# without this the sequence of trials differs from run to run. TPE is the sampler
# create_study uses by default for a single objective, so only the seeding changes.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=SEED),
)
study.optimize(objective, n_trials=N_TRIALS, show_progress_bar=True)

[I 2023-10-16 13:09:34,144] A new study created in memory with name: no-name-062b82a3-978f-4666-9222-95ca8d40ee9a
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
Best trial: 0. Best value: 0.785645:   3%|█████▍                                                                                                                                                              | 1/30 [00:15<07:24, 15.31s/it]

[I 2023-10-16 13:09:49,460] Trial 0 finished with value: 0.7856445715720518 and parameters: {'n_jobs': -1, 'random_state': 42, 'lambda': 0.0073988065515872455, 'alpha': 0.002483275813687532, 'colsample_bytree': 0.5, 'subsample': 0.4, 'learning_rate': 0.02, 'n_estimators': 400, 'max_depth': 11, 'min_child_weight': 86}. Best is trial 0 with value: 0.7856445715720518.


  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
Best trial: 0. Best value: 0.785645:   7%|██████████▉                                                                                                                                                         | 2/30 [00:28<06:40, 14.32s/it]

[I 2023-10-16 13:10:03,087] Trial 1 finished with value: 0.781429113270524 and parameters: {'n_jobs': -1, 'random_state': 42, 'lambda': 5.435766209871165, 'alpha': 0.006899644864867249, 'colsample_bytree': 0.3, 'subsample': 0.6, 'learning_rate': 0.008, 'n_estimators': 400, 'max_depth': 15, 'min_child_weight': 88}. Best is trial 0 with value: 0.7856445715720518.


  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
Best trial: 2. Best value: 0.786122:  10%|████████████████▍                                                                                                                                                   | 3/30 [00:35<04:49, 10.70s/it]

[I 2023-10-16 13:10:09,486] Trial 2 finished with value: 0.7861217186218017 and parameters: {'n_jobs': -1, 'random_state': 42, 'lambda': 0.5076643359572762, 'alpha': 0.003170047051704783, 'colsample_bytree': 0.9, 'subsample': 0.4, 'learning_rate': 0.02, 'n_estimators': 300, 'max_depth': 5, 'min_child_weight': 76}. Best is trial 2 with value: 0.7861217186218017.


  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
Best trial: 2. Best value: 0.786122:  13%|█████████████████████▊                                                                                                                                              | 4/30 [00:43<04:17,  9.90s/it]

[I 2023-10-16 13:10:18,144] Trial 3 finished with value: 0.7858847126338369 and parameters: {'n_jobs': -1, 'random_state': 42, 'lambda': 0.009037225483863997, 'alpha': 0.003950893642411191, 'colsample_bytree': 0.7, 'subsample': 0.4, 'learning_rate': 0.014, 'n_estimators': 400, 'max_depth': 5, 'min_child_weight': 42}. Best is trial 2 with value: 0.7861217186218017.


  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
Best trial: 2. Best value: 0.786122:  17%|███████████████████████████▎                                                                                                                                        | 5/30 [00:48<03:20,  8.01s/it]

[I 2023-10-16 13:10:22,809] Trial 4 finished with value: 0.7821948014874596 and parameters: {'n_jobs': -1, 'random_state': 42, 'lambda': 1.0125573004972601, 'alpha': 8.4088749603494, 'colsample_bytree': 0.7, 'subsample': 0.6, 'learning_rate': 0.014, 'n_estimators': 100, 'max_depth': 9, 'min_child_weight': 255}. Best is trial 2 with value: 0.7861217186218017.


  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
Best trial: 2. Best value: 0.786122:  20%|████████████████████████████████▊                                                                                                                                   | 6/30 [01:05<04:27, 11.15s/it]

[I 2023-10-16 13:10:40,062] Trial 5 finished with value: 0.7854085932742629 and parameters: {'n_jobs': -1, 'random_state': 42, 'lambda': 3.203462065753975, 'alpha': 1.4370488544190383, 'colsample_bytree': 0.9, 'subsample': 0.4, 'learning_rate': 0.012, 'n_estimators': 600, 'max_depth': 13, 'min_child_weight': 184}. Best is trial 2 with value: 0.7861217186218017.


  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
Best trial: 2. Best value: 0.786122:  23%|██████████████████████████████████████▎                                                                                                                             | 7/30 [01:14<03:55, 10.23s/it]

[I 2023-10-16 13:10:48,406] Trial 6 finished with value: 0.7845009840008503 and parameters: {'n_jobs': -1, 'random_state': 42, 'lambda': 0.008795761124625774, 'alpha': 0.9225552644824201, 'colsample_bytree': 0.5, 'subsample': 0.5, 'learning_rate': 0.018, 'n_estimators': 300, 'max_depth': 9, 'min_child_weight': 224}. Best is trial 2 with value: 0.7861217186218017.


  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
Best trial: 2. Best value: 0.786122:  27%|███████████████████████████████████████████▋                                                                                                                        | 8/30 [01:30<04:26, 12.13s/it]

[I 2023-10-16 13:11:04,583] Trial 7 finished with value: 0.785406257071472 and parameters: {'n_jobs': -1, 'random_state': 42, 'lambda': 0.31616084879818246, 'alpha': 0.2042434578293148, 'colsample_bytree': 0.6, 'subsample': 1.0, 'learning_rate': 0.01, 'n_estimators': 500, 'max_depth': 9, 'min_child_weight': 143}. Best is trial 2 with value: 0.7861217186218017.


  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
Best trial: 2. Best value: 0.786122:  30%|█████████████████████████████████████████████████▏                                                                                                                  | 9/30 [01:45<04:36, 13.16s/it]

[I 2023-10-16 13:11:20,012] Trial 8 finished with value: 0.7846872565444606 and parameters: {'n_jobs': -1, 'random_state': 42, 'lambda': 0.006515853985205676, 'alpha': 1.4405159714898865, 'colsample_bytree': 1.0, 'subsample': 1.0, 'learning_rate': 0.008, 'n_estimators': 300, 'max_depth': 17, 'min_child_weight': 61}. Best is trial 2 with value: 0.7861217186218017.


  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
Best trial: 2. Best value: 0.786122:  33%|██████████████████████████████████████████████████████▎                                                                                                            | 10/30 [01:50<03:33, 10.68s/it]

[I 2023-10-16 13:11:25,130] Trial 9 finished with value: 0.7793743835004856 and parameters: {'n_jobs': -1, 'random_state': 42, 'lambda': 0.1969299073598506, 'alpha': 0.6698315726758519, 'colsample_bytree': 0.3, 'subsample': 0.5, 'learning_rate': 0.016, 'n_estimators': 200, 'max_depth': 5, 'min_child_weight': 285}. Best is trial 2 with value: 0.7861217186218017.


  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
Best trial: 2. Best value: 0.786122:  37%|███████████████████████████████████████████████████████████▊                                                                                                       | 11/30 [01:54<02:43,  8.62s/it]

[I 2023-10-16 13:11:29,075] Trial 10 finished with value: 0.7853859254825837 and parameters: {'n_jobs': -1, 'random_state': 42, 'lambda': 0.03594643402790054, 'alpha': 0.01957782559066053, 'colsample_bytree': 0.9, 'subsample': 0.8, 'learning_rate': 0.02, 'n_estimators': 100, 'max_depth': 7, 'min_child_weight': 2}. Best is trial 2 with value: 0.7861217186218017.


  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
Best trial: 2. Best value: 0.786122:  40%|█████████████████████████████████████████████████████████████████▏                                                                                                 | 12/30 [02:05<02:43,  9.09s/it]

[I 2023-10-16 13:11:39,238] Trial 11 finished with value: 0.7859170037032406 and parameters: {'n_jobs': -1, 'random_state': 42, 'lambda': 0.0010847272935706324, 'alpha': 0.0013061838985911078, 'colsample_bytree': 0.8, 'subsample': 0.4, 'learning_rate': 0.014, 'n_estimators': 500, 'max_depth': 5, 'min_child_weight': 1}. Best is trial 2 with value: 0.7861217186218017.


  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
Best trial: 2. Best value: 0.786122:  43%|██████████████████████████████████████████████████████████████████████▋                                                                                            | 13/30 [02:22<03:19, 11.72s/it]

[I 2023-10-16 13:11:57,027] Trial 12 finished with value: 0.7859126379774427 and parameters: {'n_jobs': -1, 'random_state': 42, 'lambda': 0.0010586135134783344, 'alpha': 0.0014122939950142095, 'colsample_bytree': 0.8, 'subsample': 0.7, 'learning_rate': 0.02, 'n_estimators': 600, 'max_depth': 5, 'min_child_weight': 3}. Best is trial 2 with value: 0.7861217186218017.


  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
Best trial: 2. Best value: 0.786122:  47%|████████████████████████████████████████████████████████████████████████████                                                                                       | 14/30 [02:34<03:07, 11.73s/it]

[I 2023-10-16 13:12:08,759] Trial 13 finished with value: 0.7856883254644162 and parameters: {'n_jobs': -1, 'random_state': 42, 'lambda': 0.07895796741137118, 'alpha': 0.018485638816005116, 'colsample_bytree': 0.8, 'subsample': 0.4, 'learning_rate': 0.014, 'n_estimators': 500, 'max_depth': 5, 'min_child_weight': 139}. Best is trial 2 with value: 0.7861217186218017.


  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
Best trial: 2. Best value: 0.786122:  50%|█████████████████████████████████████████████████████████████████████████████████▌                                                                                 | 15/30 [02:45<02:51, 11.44s/it]

[I 2023-10-16 13:12:19,543] Trial 14 finished with value: 0.7850342301523201 and parameters: {'n_jobs': -1, 'random_state': 42, 'lambda': 0.8748874204844022, 'alpha': 0.0010423979989540146, 'colsample_bytree': 0.4, 'subsample': 0.4, 'learning_rate': 0.012, 'n_estimators': 500, 'max_depth': 5, 'min_child_weight': 40}. Best is trial 2 with value: 0.7861217186218017.


  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
Best trial: 2. Best value: 0.786122:  53%|██████████████████████████████████████████████████████████████████████████████████████▉                                                                            | 16/30 [02:55<02:33, 10.94s/it]

[I 2023-10-16 13:12:29,333] Trial 15 finished with value: 0.7851160137667711 and parameters: {'n_jobs': -1, 'random_state': 42, 'lambda': 0.05469028662666221, 'alpha': 0.009602516288706753, 'colsample_bytree': 0.9, 'subsample': 0.8, 'learning_rate': 0.01, 'n_estimators': 200, 'max_depth': 11, 'min_child_weight': 107}. Best is trial 2 with value: 0.7861217186218017.


  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
Best trial: 2. Best value: 0.786122:  57%|████████████████████████████████████████████████████████████████████████████████████████████▎                                                                      | 17/30 [03:16<03:01, 13.96s/it]

[I 2023-10-16 13:12:50,321] Trial 16 finished with value: 0.7841511135631242 and parameters: {'n_jobs': -1, 'random_state': 42, 'lambda': 0.33545902531492783, 'alpha': 0.0010278350077265307, 'colsample_bytree': 0.8, 'subsample': 0.7, 'learning_rate': 0.016, 'n_estimators': 300, 'max_depth': 17, 'min_child_weight': 31}. Best is trial 2 with value: 0.7861217186218017.


  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
Best trial: 2. Best value: 0.786122:  60%|█████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                 | 18/30 [03:22<02:18, 11.52s/it]

[I 2023-10-16 13:12:56,146] Trial 17 finished with value: 0.7831717795554116 and parameters: {'n_jobs': -1, 'random_state': 42, 'lambda': 0.0013960600610228415, 'alpha': 0.03778728408560453, 'colsample_bytree': 0.4, 'subsample': 0.4, 'learning_rate': 0.018, 'n_estimators': 200, 'max_depth': 7, 'min_child_weight': 114}. Best is trial 2 with value: 0.7861217186218017.


  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
Best trial: 2. Best value: 0.786122:  63%|███████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                           | 19/30 [03:36<02:16, 12.41s/it]

[I 2023-10-16 13:13:10,637] Trial 18 finished with value: 0.7855630657003504 and parameters: {'n_jobs': -1, 'random_state': 42, 'lambda': 0.027061818749038513, 'alpha': 0.004487336951290934, 'colsample_bytree': 1.0, 'subsample': 0.4, 'learning_rate': 0.014, 'n_estimators': 500, 'max_depth': 15, 'min_child_weight': 175}. Best is trial 2 with value: 0.7861217186218017.


  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
Best trial: 2. Best value: 0.786122:  67%|████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                      | 20/30 [03:58<02:32, 15.20s/it]

[I 2023-10-16 13:13:32,349] Trial 19 finished with value: 0.7854562125378676 and parameters: {'n_jobs': -1, 'random_state': 42, 'lambda': 9.630053091897405, 'alpha': 0.0021382287784449706, 'colsample_bytree': 0.6, 'subsample': 0.4, 'learning_rate': 0.02, 'n_estimators': 600, 'max_depth': 13, 'min_child_weight': 71}. Best is trial 2 with value: 0.7861217186218017.


  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
Best trial: 2. Best value: 0.786122:  70%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                 | 21/30 [04:07<01:59, 13.29s/it]

[I 2023-10-16 13:13:41,164] Trial 20 finished with value: 0.7860795871964933 and parameters: {'n_jobs': -1, 'random_state': 42, 'lambda': 0.12850919745868197, 'alpha': 0.003716106855893246, 'colsample_bytree': 0.8, 'subsample': 0.7, 'learning_rate': 0.02, 'n_estimators': 400, 'max_depth': 5, 'min_child_weight': 19}. Best is trial 2 with value: 0.7861217186218017.


  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
Best trial: 2. Best value: 0.786122:  73%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                           | 22/30 [04:19<01:44, 13.02s/it]

[I 2023-10-16 13:13:53,563] Trial 21 finished with value: 0.7861128841412505 and parameters: {'n_jobs': -1, 'random_state': 42, 'lambda': 0.14300900719501847, 'alpha': 0.003430062192411272, 'colsample_bytree': 0.8, 'subsample': 0.7, 'learning_rate': 0.02, 'n_estimators': 400, 'max_depth': 5, 'min_child_weight': 15}. Best is trial 2 with value: 0.7861217186218017.


  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
Best trial: 2. Best value: 0.786122:  77%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                      | 23/30 [04:36<01:39, 14.24s/it]

[I 2023-10-16 13:14:10,631] Trial 22 finished with value: 0.7860705678282832 and parameters: {'n_jobs': -1, 'random_state': 42, 'lambda': 0.1491847438304203, 'alpha': 0.007882123147455612, 'colsample_bytree': 0.8, 'subsample': 0.7, 'learning_rate': 0.02, 'n_estimators': 400, 'max_depth': 5, 'min_child_weight': 31}. Best is trial 2 with value: 0.7861217186218017.


  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
Best trial: 2. Best value: 0.786122:  80%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                | 24/30 [04:44<01:14, 12.44s/it]

[I 2023-10-16 13:14:18,888] Trial 23 finished with value: 0.7860506834757052 and parameters: {'n_jobs': -1, 'random_state': 42, 'lambda': 0.737305567808482, 'alpha': 0.00365804171989873, 'colsample_bytree': 0.9, 'subsample': 0.7, 'learning_rate': 0.02, 'n_estimators': 300, 'max_depth': 5, 'min_child_weight': 63}. Best is trial 2 with value: 0.7861217186218017.


  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
Best trial: 2. Best value: 0.786122:  83%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                           | 25/30 [04:54<00:58, 11.73s/it]

[I 2023-10-16 13:14:28,972] Trial 24 finished with value: 0.7860384857443572 and parameters: {'n_jobs': -1, 'random_state': 42, 'lambda': 0.10132178583156241, 'alpha': 0.026139145075615534, 'colsample_bytree': 0.8, 'subsample': 0.7, 'learning_rate': 0.02, 'n_estimators': 400, 'max_depth': 5, 'min_child_weight': 22}. Best is trial 2 with value: 0.7861217186218017.


  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
Best trial: 2. Best value: 0.786122:  87%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                     | 26/30 [05:01<00:41, 10.32s/it]

[I 2023-10-16 13:14:36,000] Trial 25 finished with value: 0.7860128639634048 and parameters: {'n_jobs': -1, 'random_state': 42, 'lambda': 0.4266667751852681, 'alpha': 0.011158213483791432, 'colsample_bytree': 0.8, 'subsample': 0.7, 'learning_rate': 0.02, 'n_estimators': 300, 'max_depth': 5, 'min_child_weight': 52}. Best is trial 2 with value: 0.7861217186218017.


  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
Best trial: 2. Best value: 0.786122:  90%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                | 27/30 [05:07<00:26,  9.00s/it]

[I 2023-10-16 13:14:41,915] Trial 26 finished with value: 0.7857747683479717 and parameters: {'n_jobs': -1, 'random_state': 42, 'lambda': 0.162267298934516, 'alpha': 0.05018949927987163, 'colsample_bytree': 0.9, 'subsample': 0.7, 'learning_rate': 0.02, 'n_estimators': 200, 'max_depth': 5, 'min_child_weight': 106}. Best is trial 2 with value: 0.7861217186218017.


  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
Best trial: 2. Best value: 0.786122:  93%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏          | 28/30 [05:29<00:25, 12.92s/it]

[I 2023-10-16 13:15:03,967] Trial 27 finished with value: 0.7857265278136186 and parameters: {'n_jobs': -1, 'random_state': 42, 'lambda': 1.759955842083293, 'alpha': 0.0026714207580845403, 'colsample_bytree': 0.8, 'subsample': 0.7, 'learning_rate': 0.02, 'n_estimators': 400, 'max_depth': 7, 'min_child_weight': 18}. Best is trial 2 with value: 0.7861217186218017.


  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
Best trial: 2. Best value: 0.786122:  97%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌     | 29/30 [05:44<00:13, 13.51s/it]

[I 2023-10-16 13:15:18,868] Trial 28 finished with value: 0.785421659970495 and parameters: {'n_jobs': -1, 'random_state': 42, 'lambda': 0.5001251460120861, 'alpha': 0.006489011104608592, 'colsample_bytree': 0.5, 'subsample': 0.8, 'learning_rate': 0.02, 'n_estimators': 300, 'max_depth': 11, 'min_child_weight': 81}. Best is trial 2 with value: 0.7861217186218017.


  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
Best trial: 2. Best value: 0.786122: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 30/30 [06:03<00:00, 12.11s/it]

[I 2023-10-16 13:15:37,444] Trial 29 finished with value: 0.7850167051503686 and parameters: {'n_jobs': -1, 'random_state': 42, 'lambda': 0.2696290431278074, 'alpha': 0.0024868653583331652, 'colsample_bytree': 0.4, 'subsample': 1.0, 'learning_rate': 0.02, 'n_estimators': 400, 'max_depth': 15, 'min_child_weight': 91}. Best is trial 2 with value: 0.7861217186218017.





In [7]:
# Display the best trial's sampled hyper-parameters together with its objective value.
study.best_params, study.best_value

({'n_jobs': -1,
  'random_state': 42,
  'lambda': 0.5076643359572762,
  'alpha': 0.003170047051704783,
  'colsample_bytree': 0.9,
  'subsample': 0.4,
  'learning_rate': 0.02,
  'n_estimators': 300,
  'max_depth': 5,
  'min_child_weight': 76},
 0.7861217186218017)

In [8]:
# study.best_params only contains values drawn through trial.suggest_*; the settings
# hard-coded in the objective ('objective', 'tree_method') are not part of it. Re-apply
# them here so the final model is trained with the same configuration that was tuned.
# best_params is unpacked last so any key it does provide takes precedence.
final_params = {
    'objective': 'binary:logistic',
    'tree_method': 'gpu_hist',
    **study.best_params,
}
pipeline = make_pipeline(partial_pipeline, XGBClassifier(**final_params))
pipeline.fit(X, y)

  if is_sparse(data):


In [9]:
# Predict the positive-class probability for every test row and write it,
# keyed by the frame's index, to the submission file.
submission = test.copy()
positive_class_proba = pipeline.predict_proba(submission)[:, 1]
submission['defects'] = positive_class_proba
submission['defects'].to_csv(OUTPUT)