In [1]:
import optuna
import os
from pathlib import Path
import kaggle
from xgboost import XGBClassifier
import pandas as pd
import numpy as np
from sklearn.preprocessing import FunctionTransformer, StandardScaler
from sklearn.model_selection import RandomizedSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score

pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)

iskaggle = os.environ.get('KAGGLE_KERNEL_RUN_TYPE', '')
if iskaggle:
    path = Path('../input/playground-series-s3e23')
    orig_path = Path('../input/software-defect-prediction')
else:
    path = Path('../input/playground-series-s3e23')
    orig_path = Path('../input/software-defect-prediction')
    if not path.exists():
        kaggle.api.competition_download_cli(str(path))
        zipfile.ZipFile(f'{path}.zip').extractall(path)
    if not orig_path.exists():
        kaggle.api.competition_download_cli(str(path))
        zipfile.ZipFile(f'{path}.zip').extractall(path)

SEED = 42
SPLITS = 5
VERSION = 2
OUTPUT = f'xg_boost_clf_submisson_v{VERSION}.csv'
N_TRIALS = 30

np.random.seed(SEED)

  from .autonotebook import tqdm as notebook_tqdm




In [2]:
train = pd.read_csv(path/'train.csv', index_col = 'id')
test = pd.read_csv(path/'test.csv', index_col = 'id')
orig_train = pd.read_csv(orig_path/'jm1.csv')

In [3]:
for object_features in list(orig_train.loc[:, orig_train.dtypes == 'O']):
    orig_train[object_features] = orig_train[object_features].replace({'?' : np.nan}).astype('float64')

In [4]:
X = pd.concat([train, orig_train])
y = X.pop('defects')

In [5]:
partial_pipeline = make_pipeline(SimpleImputer(), FunctionTransformer(func=np.log1p, validate=False), StandardScaler())

In [6]:
def objective(trial):
    params = {
        'objective': 'binary:logistic',
        'tree_method':'gpu_hist',  # this parameter means using the GPU when training our model to speedup the training process
        'n_jobs': trial.suggest_categorical('n_jobs', [-1]),
        'random_state': trial.suggest_categorical('random_state', [SEED]),
        'lambda': trial.suggest_float('lambda', 1e-3, 10.0, log=True),
        'alpha': trial.suggest_float('alpha', 1e-3, 10.0, log=True),
        'colsample_bytree': trial.suggest_categorical('colsample_bytree', [0.3,0.4,0.5,0.6,0.7,0.8,0.9, 1.0]),
        'subsample': trial.suggest_categorical('subsample', [0.4,0.5,0.6,0.7,0.8,1.0]),
        'learning_rate': trial.suggest_categorical('learning_rate', [0.008,0.01,0.012,0.014,0.016,0.018, 0.02]),
        'n_estimators': trial.suggest_int('n_estimators', 100, 600, 100),
        'max_depth': trial.suggest_categorical('max_depth', [5,7,9,11,13,15,17]),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 300),
    }

    pipeline = make_pipeline(partial_pipeline, XGBClassifier(**params))
    scores = cross_val_score(pipeline, X, y, cv=SPLITS, scoring='roc_auc', n_jobs=-1)
    return scores.mean()

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=N_TRIALS, show_progress_bar=True)

[I 2023-10-16 12:42:18,557] A new study created in memory with name: no-name-f4c71de1-5ad0-48ff-9478-13b133c77dda
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
Best trial: 0. Best value: 0.785572:   3%|█████▍                                                                                                                                                              | 1/30 [00:11<05:36, 11.61s/it]

[I 2023-10-16 12:42:30,165] Trial 0 finished with value: 0.7855720906130914 and parameters: {'n_jobs': -1, 'random_state': 42, 'lambda': 0.05213714216109755, 'alpha': 0.06595742332292188, 'colsample_bytree': 0.8, 'subsample': 0.6, 'learning_rate': 0.012, 'n_estimators': 200, 'max_depth': 9, 'min_child_weight': 67}. Best is trial 0 with value: 0.7855720906130914.


  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):


[I 2023-10-16 12:42:35,758] Trial 1 finished with value: 0.7844863144146226 and parameters: {'n_jobs': -1, 'random_state': 42, 'lambda': 0.003934450513623756, 'alpha': 0.0031407595118928284, 'colsample_bytree': 0.9, 'subsample': 0.4, 'learning_rate': 0.018, 'n_estimators': 200, 'max_depth': 5, 'min_child_weight': 292}. Best is trial 0 with value: 0.7855720906130914.


  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):


[I 2023-10-16 12:42:47,876] Trial 2 finished with value: 0.7855652407402001 and parameters: {'n_jobs': -1, 'random_state': 42, 'lambda': 0.255475616290268, 'alpha': 0.06124854698368998, 'colsample_bytree': 0.5, 'subsample': 0.4, 'learning_rate': 0.018, 'n_estimators': 400, 'max_depth': 9, 'min_child_weight': 123}. Best is trial 0 with value: 0.7855720906130914.


  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):


[I 2023-10-16 12:43:03,723] Trial 3 finished with value: 0.7857577277242112 and parameters: {'n_jobs': -1, 'random_state': 42, 'lambda': 9.67428032152217, 'alpha': 0.03153967582691633, 'colsample_bytree': 1.0, 'subsample': 1.0, 'learning_rate': 0.018, 'n_estimators': 400, 'max_depth': 15, 'min_child_weight': 156}. Best is trial 3 with value: 0.7857577277242112.


  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):


[I 2023-10-16 12:43:18,477] Trial 4 finished with value: 0.7859677377331735 and parameters: {'n_jobs': -1, 'random_state': 42, 'lambda': 0.009271351158230225, 'alpha': 0.009901585995600069, 'colsample_bytree': 0.7, 'subsample': 0.5, 'learning_rate': 0.02, 'n_estimators': 400, 'max_depth': 13, 'min_child_weight': 146}. Best is trial 4 with value: 0.7859677377331735.


  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):


[I 2023-10-16 12:43:27,822] Trial 5 finished with value: 0.7856861490757179 and parameters: {'n_jobs': -1, 'random_state': 42, 'lambda': 1.8078742810286728, 'alpha': 1.2375100915496653, 'colsample_bytree': 0.7, 'subsample': 0.7, 'learning_rate': 0.016, 'n_estimators': 200, 'max_depth': 9, 'min_child_weight': 20}. Best is trial 4 with value: 0.7859677377331735.


  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):


[I 2023-10-16 12:43:43,260] Trial 6 finished with value: 0.7859703024837056 and parameters: {'n_jobs': -1, 'random_state': 42, 'lambda': 0.16761455878497458, 'alpha': 2.249350452400159, 'colsample_bytree': 0.7, 'subsample': 0.7, 'learning_rate': 0.018, 'n_estimators': 500, 'max_depth': 17, 'min_child_weight': 229}. Best is trial 6 with value: 0.7859703024837056.


  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):


[I 2023-10-16 12:44:02,243] Trial 7 finished with value: 0.7860907075351513 and parameters: {'n_jobs': -1, 'random_state': 42, 'lambda': 0.02457217171423184, 'alpha': 0.0632339970343539, 'colsample_bytree': 0.7, 'subsample': 0.8, 'learning_rate': 0.01, 'n_estimators': 600, 'max_depth': 7, 'min_child_weight': 179}. Best is trial 7 with value: 0.7860907075351513.


  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):


[I 2023-10-16 12:44:42,318] Trial 8 finished with value: 0.7855114462742937 and parameters: {'n_jobs': -1, 'random_state': 42, 'lambda': 1.02216095228167, 'alpha': 0.1417411832466855, 'colsample_bytree': 0.8, 'subsample': 0.6, 'learning_rate': 0.018, 'n_estimators': 500, 'max_depth': 17, 'min_child_weight': 86}. Best is trial 7 with value: 0.7860907075351513.


  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):


[I 2023-10-16 12:45:10,113] Trial 9 finished with value: 0.7844951350464365 and parameters: {'n_jobs': -1, 'random_state': 42, 'lambda': 0.024906188191544578, 'alpha': 1.182887827909842, 'colsample_bytree': 0.5, 'subsample': 0.8, 'learning_rate': 0.008, 'n_estimators': 400, 'max_depth': 17, 'min_child_weight': 163}. Best is trial 7 with value: 0.7860907075351513.


  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):


[I 2023-10-16 12:45:26,662] Trial 10 finished with value: 0.7841684497103067 and parameters: {'n_jobs': -1, 'random_state': 42, 'lambda': 0.0014175901822772287, 'alpha': 0.0010637411835362908, 'colsample_bytree': 0.3, 'subsample': 0.8, 'learning_rate': 0.01, 'n_estimators': 600, 'max_depth': 7, 'min_child_weight': 230}. Best is trial 7 with value: 0.7860907075351513.


  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):


[I 2023-10-16 12:45:41,822] Trial 11 finished with value: 0.7856243168632503 and parameters: {'n_jobs': -1, 'random_state': 42, 'lambda': 0.13031696389394712, 'alpha': 7.858388016839567, 'colsample_bytree': 0.7, 'subsample': 0.7, 'learning_rate': 0.01, 'n_estimators': 600, 'max_depth': 7, 'min_child_weight': 220}. Best is trial 7 with value: 0.7860907075351513.


  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):


[I 2023-10-16 12:46:01,104] Trial 12 finished with value: 0.7858349801897317 and parameters: {'n_jobs': -1, 'random_state': 42, 'lambda': 0.02249639468782535, 'alpha': 0.33431964285517257, 'colsample_bytree': 0.6, 'subsample': 0.7, 'learning_rate': 0.014, 'n_estimators': 600, 'max_depth': 11, 'min_child_weight': 219}. Best is trial 7 with value: 0.7860907075351513.


  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):


[I 2023-10-16 12:46:13,717] Trial 13 finished with value: 0.7843574090452642 and parameters: {'n_jobs': -1, 'random_state': 42, 'lambda': 0.1294910866437292, 'alpha': 0.015255951834416423, 'colsample_bytree': 0.4, 'subsample': 0.8, 'learning_rate': 0.01, 'n_estimators': 500, 'max_depth': 7, 'min_child_weight': 287}. Best is trial 7 with value: 0.7860907075351513.


  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):


[I 2023-10-16 12:46:29,324] Trial 14 finished with value: 0.7850914276923114 and parameters: {'n_jobs': -1, 'random_state': 42, 'lambda': 0.010093695565552922, 'alpha': 0.29248514604299997, 'colsample_bytree': 0.7, 'subsample': 0.5, 'learning_rate': 0.008, 'n_estimators': 500, 'max_depth': 17, 'min_child_weight': 199}. Best is trial 7 with value: 0.7860907075351513.


  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):


[I 2023-10-16 12:46:32,132] Trial 15 finished with value: 0.7833899404552167 and parameters: {'n_jobs': -1, 'random_state': 42, 'lambda': 0.37485633697610987, 'alpha': 8.844618573300371, 'colsample_bytree': 0.7, 'subsample': 1.0, 'learning_rate': 0.02, 'n_estimators': 100, 'max_depth': 11, 'min_child_weight': 259}. Best is trial 7 with value: 0.7860907075351513.


  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):


[I 2023-10-16 12:46:42,367] Trial 16 finished with value: 0.7841182063213263 and parameters: {'n_jobs': -1, 'random_state': 42, 'lambda': 0.04995139471310907, 'alpha': 1.5261628714632844, 'colsample_bytree': 0.3, 'subsample': 0.8, 'learning_rate': 0.012, 'n_estimators': 500, 'max_depth': 5, 'min_child_weight': 191}. Best is trial 7 with value: 0.7860907075351513.


  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):


[I 2023-10-16 12:47:01,457] Trial 17 finished with value: 0.7858668693167901 and parameters: {'n_jobs': -1, 'random_state': 42, 'lambda': 0.061247033964534146, 'alpha': 0.19489968895050627, 'colsample_bytree': 0.6, 'subsample': 0.7, 'learning_rate': 0.016, 'n_estimators': 600, 'max_depth': 13, 'min_child_weight': 255}. Best is trial 7 with value: 0.7860907075351513.


  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):


[I 2023-10-16 12:47:14,575] Trial 18 finished with value: 0.7857709591706442 and parameters: {'n_jobs': -1, 'random_state': 42, 'lambda': 0.001058147550488858, 'alpha': 0.5974791154914781, 'colsample_bytree': 1.0, 'subsample': 0.7, 'learning_rate': 0.014, 'n_estimators': 300, 'max_depth': 15, 'min_child_weight': 113}. Best is trial 7 with value: 0.7860907075351513.


  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):


[I 2023-10-16 12:47:32,160] Trial 19 finished with value: 0.7860041453019879 and parameters: {'n_jobs': -1, 'random_state': 42, 'lambda': 0.2445410931644113, 'alpha': 3.113786583073261, 'colsample_bytree': 0.9, 'subsample': 0.8, 'learning_rate': 0.01, 'n_estimators': 500, 'max_depth': 17, 'min_child_weight': 183}. Best is trial 7 with value: 0.7860907075351513.


  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):


[I 2023-10-16 12:47:41,525] Trial 20 finished with value: 0.7851953157899582 and parameters: {'n_jobs': -1, 'random_state': 42, 'lambda': 0.38288024314391556, 'alpha': 0.09087262544158545, 'colsample_bytree': 0.9, 'subsample': 0.8, 'learning_rate': 0.01, 'n_estimators': 300, 'max_depth': 7, 'min_child_weight': 189}. Best is trial 7 with value: 0.7860907075351513.


  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):


[I 2023-10-16 12:47:58,474] Trial 21 finished with value: 0.7860543135427603 and parameters: {'n_jobs': -1, 'random_state': 42, 'lambda': 0.14658067187290522, 'alpha': 3.1753223812349387, 'colsample_bytree': 0.9, 'subsample': 0.8, 'learning_rate': 0.01, 'n_estimators': 500, 'max_depth': 17, 'min_child_weight': 175}. Best is trial 7 with value: 0.7860907075351513.


  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):


[I 2023-10-16 12:48:19,290] Trial 22 finished with value: 0.7861869236556351 and parameters: {'n_jobs': -1, 'random_state': 42, 'lambda': 0.5511528527586397, 'alpha': 3.7033440158558593, 'colsample_bytree': 0.9, 'subsample': 0.8, 'learning_rate': 0.01, 'n_estimators': 600, 'max_depth': 17, 'min_child_weight': 171}. Best is trial 22 with value: 0.7861869236556351.


  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):


[I 2023-10-16 12:48:42,197] Trial 23 finished with value: 0.786212198929492 and parameters: {'n_jobs': -1, 'random_state': 42, 'lambda': 0.7129071304242561, 'alpha': 4.550425780353752, 'colsample_bytree': 0.9, 'subsample': 0.8, 'learning_rate': 0.01, 'n_estimators': 600, 'max_depth': 17, 'min_child_weight': 127}. Best is trial 23 with value: 0.786212198929492.


  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):


[I 2023-10-16 12:49:07,764] Trial 24 finished with value: 0.7859648094205678 and parameters: {'n_jobs': -1, 'random_state': 42, 'lambda': 0.8180601146101104, 'alpha': 0.4441814726101145, 'colsample_bytree': 0.9, 'subsample': 0.8, 'learning_rate': 0.01, 'n_estimators': 600, 'max_depth': 17, 'min_child_weight': 131}. Best is trial 23 with value: 0.786212198929492.


  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
Best trial: 23. Best value: 0.786212:  87%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                     | 26/30 [07:05<01:17, 19.28s/it]

[I 2023-10-16 12:49:24,263] Trial 25 finished with value: 0.7855430562115623 and parameters: {'n_jobs': -1, 'random_state': 42, 'lambda': 2.743432006547108, 'alpha': 0.6575461473254753, 'colsample_bytree': 0.4, 'subsample': 0.8, 'learning_rate': 0.01, 'n_estimators': 600, 'max_depth': 7, 'min_child_weight': 96}. Best is trial 23 with value: 0.786212198929492.


  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):


[I 2023-10-16 12:49:55,255] Trial 26 finished with value: 0.7856913566412649 and parameters: {'n_jobs': -1, 'random_state': 42, 'lambda': 0.4982059001164402, 'alpha': 4.580298506996315, 'colsample_bytree': 0.9, 'subsample': 0.8, 'learning_rate': 0.01, 'n_estimators': 600, 'max_depth': 17, 'min_child_weight': 64}. Best is trial 23 with value: 0.786212198929492.


  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):


[I 2023-10-16 12:50:15,600] Trial 27 finished with value: 0.7863017151419991 and parameters: {'n_jobs': -1, 'random_state': 42, 'lambda': 0.694965168321276, 'alpha': 5.715473584501093, 'colsample_bytree': 0.9, 'subsample': 0.8, 'learning_rate': 0.01, 'n_estimators': 600, 'max_depth': 11, 'min_child_weight': 142}. Best is trial 27 with value: 0.7863017151419991.


  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):


[I 2023-10-16 12:50:37,061] Trial 28 finished with value: 0.7858613722773914 and parameters: {'n_jobs': -1, 'random_state': 42, 'lambda': 0.7415974932786953, 'alpha': 5.510419245031485, 'colsample_bytree': 0.9, 'subsample': 1.0, 'learning_rate': 0.01, 'n_estimators': 600, 'max_depth': 11, 'min_child_weight': 139}. Best is trial 27 with value: 0.7863017151419991.


  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
Best trial: 27. Best value: 0.786302: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 30/30 [08:23<00:00, 16.78s/it]

[I 2023-10-16 12:50:41,834] Trial 29 finished with value: 0.7835906451463608 and parameters: {'n_jobs': -1, 'random_state': 42, 'lambda': 1.7459852323408172, 'alpha': 6.010406502170702, 'colsample_bytree': 0.8, 'subsample': 0.6, 'learning_rate': 0.012, 'n_estimators': 100, 'max_depth': 11, 'min_child_weight': 34}. Best is trial 27 with value: 0.7863017151419991.





In [7]:
study.best_params, study.best_value

({'n_jobs': -1,
  'random_state': 42,
  'lambda': 0.694965168321276,
  'alpha': 5.715473584501093,
  'colsample_bytree': 0.9,
  'subsample': 0.8,
  'learning_rate': 0.01,
  'n_estimators': 600,
  'max_depth': 11,
  'min_child_weight': 142},
 0.7863017151419991)

In [8]:
pipeline = make_pipeline(partial_pipeline, XGBClassifier(**study.best_params))
pipeline.fit(X, y)

  if is_sparse(data):


In [9]:
submission = test.copy()
submission['defects'] = pipeline.predict_proba(submission)[:, 1]
submission.defects.to_csv(OUTPUT)