In [1]:
import optuna
import os
from pathlib import Path
import kaggle
import pandas as pd
import numpy as np
from sklearn.preprocessing import FunctionTransformer, PowerTransformer, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import HistGradientBoostingClassifier


pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)

iskaggle = os.environ.get('KAGGLE_KERNEL_RUN_TYPE', '')
if iskaggle:
    path = Path('../input/playground-series-s3e23')
    orig_path = Path('../input/software-defect-prediction')
else:
    path = Path('../input/playground-series-s3e23')
    orig_path = Path('../input/software-defect-prediction')
    if not path.exists():
        kaggle.api.competition_download_cli(str(path))
        zipfile.ZipFile(f'{path}.zip').extractall(path)
    if not orig_path.exists():
        kaggle.api.competition_download_cli(str(path))
        zipfile.ZipFile(f'{path}.zip').extractall(path)

SEED = 42
SPLITS = 5
VERSION = 2
OUTPUT = f'hist_grad_boost_clf_submisson_v{VERSION}.csv'
N_TRIALS = 30

np.random.seed(SEED)

  from .autonotebook import tqdm as notebook_tqdm




In [2]:
train = pd.read_csv(path/'train.csv', index_col = 'id')
test = pd.read_csv(path/'test.csv', index_col = 'id')
orig_train = pd.read_csv(orig_path/'jm1.csv')

In [3]:
for object_features in list(orig_train.loc[:, orig_train.dtypes == 'O']):
    orig_train[object_features] = orig_train[object_features].replace({'?' : np.nan}).astype('float64')

In [4]:
X = pd.concat([train, orig_train])
y = X.pop('defects')

In [5]:
partial_pipeline = make_pipeline(SimpleImputer(), FunctionTransformer(func=np.log1p, validate=False), PowerTransformer())

In [6]:
def objective(trial):
    params = {
        'random_state': trial.suggest_categorical('random_state', [SEED]),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1, log=True),
        'l2_regularization': trial.suggest_int('l2_regularization', 0, 100),
        'max_iter': trial.suggest_int('max_iter', 10, 10_000),
        'max_leaf_nodes': trial.suggest_int('max_leaf_nodes', 2, 500),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 2, 300),
        'max_bins': trial.suggest_int('max_bins', 32, 255),
        'class_weight': trial.suggest_categorical('class_weight', [None, 'balanced'])
    }
    pipeline = make_pipeline(partial_pipeline, HistGradientBoostingClassifier(**params))
    scores = cross_val_score(pipeline, X, y, cv=SPLITS, scoring='roc_auc', n_jobs=-1)
    return scores.mean()

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=N_TRIALS, show_progress_bar=True)

[I 2023-10-16 14:02:22,834] A new study created in memory with name: no-name-461c84c4-b511-42bf-b2a2-f2815ad108c6
Best trial: 0. Best value: 0.783936:   3%|█████▍                                                                                                                                                              | 1/30 [00:24<11:53, 24.59s/it]

[I 2023-10-16 14:02:47,420] Trial 0 finished with value: 0.7839357596386731 and parameters: {'random_state': 42, 'learning_rate': 0.0162891722738249, 'l2_regularization': 72, 'max_iter': 5659, 'max_leaf_nodes': 333, 'min_samples_leaf': 16, 'max_bins': 147, 'class_weight': 'balanced'}. Best is trial 0 with value: 0.7839357596386731.


Best trial: 1. Best value: 0.784494:   7%|██████████▉                                                                                                                                                         | 2/30 [00:44<10:13, 21.91s/it]

[I 2023-10-16 14:03:07,455] Trial 1 finished with value: 0.784493803378133 and parameters: {'random_state': 42, 'learning_rate': 0.022172329769311373, 'l2_regularization': 63, 'max_iter': 4526, 'max_leaf_nodes': 486, 'min_samples_leaf': 129, 'max_bins': 206, 'class_weight': None}. Best is trial 1 with value: 0.784493803378133.


Best trial: 1. Best value: 0.784494:  10%|████████████████▍                                                                                                                                                   | 3/30 [01:09<10:32, 23.43s/it]

[I 2023-10-16 14:03:32,685] Trial 2 finished with value: 0.783169257764708 and parameters: {'random_state': 42, 'learning_rate': 0.0157357956330885, 'l2_regularization': 31, 'max_iter': 4160, 'max_leaf_nodes': 391, 'min_samples_leaf': 127, 'max_bins': 185, 'class_weight': 'balanced'}. Best is trial 1 with value: 0.784493803378133.


Best trial: 1. Best value: 0.784494:  13%|█████████████████████▊                                                                                                                                              | 4/30 [01:21<08:12, 18.93s/it]

[I 2023-10-16 14:03:44,720] Trial 3 finished with value: 0.7839365908773964 and parameters: {'random_state': 42, 'learning_rate': 0.027627200381734544, 'l2_regularization': 18, 'max_iter': 7555, 'max_leaf_nodes': 224, 'min_samples_leaf': 222, 'max_bins': 89, 'class_weight': None}. Best is trial 1 with value: 0.784493803378133.


Best trial: 4. Best value: 0.78489:  17%|███████████████████████████▌                                                                                                                                         | 5/30 [01:36<07:15, 17.42s/it]

[I 2023-10-16 14:03:59,464] Trial 4 finished with value: 0.7848897040412556 and parameters: {'random_state': 42, 'learning_rate': 0.021559616756989697, 'l2_regularization': 69, 'max_iter': 963, 'max_leaf_nodes': 192, 'min_samples_leaf': 275, 'max_bins': 172, 'class_weight': None}. Best is trial 4 with value: 0.7848897040412556.


Best trial: 5. Best value: 0.785261:  20%|████████████████████████████████▊                                                                                                                                   | 6/30 [01:50<06:29, 16.23s/it]

[I 2023-10-16 14:04:13,385] Trial 5 finished with value: 0.7852607508995011 and parameters: {'random_state': 42, 'learning_rate': 0.014681469842378201, 'l2_regularization': 7, 'max_iter': 1834, 'max_leaf_nodes': 53, 'min_samples_leaf': 233, 'max_bins': 173, 'class_weight': None}. Best is trial 5 with value: 0.7852607508995011.


Best trial: 5. Best value: 0.785261:  23%|██████████████████████████████████████▎                                                                                                                             | 7/30 [02:06<06:09, 16.08s/it]

[I 2023-10-16 14:04:29,157] Trial 6 finished with value: 0.7837187351740568 and parameters: {'random_state': 42, 'learning_rate': 0.02293657364519678, 'l2_regularization': 51, 'max_iter': 5350, 'max_leaf_nodes': 276, 'min_samples_leaf': 184, 'max_bins': 118, 'class_weight': 'balanced'}. Best is trial 5 with value: 0.7852607508995011.


Best trial: 5. Best value: 0.785261:  27%|███████████████████████████████████████████▋                                                                                                                        | 8/30 [02:22<05:52, 16.04s/it]

[I 2023-10-16 14:04:45,119] Trial 7 finished with value: 0.782314586818775 and parameters: {'random_state': 42, 'learning_rate': 0.022616082195518434, 'l2_regularization': 31, 'max_iter': 5791, 'max_leaf_nodes': 417, 'min_samples_leaf': 67, 'max_bins': 62, 'class_weight': 'balanced'}. Best is trial 5 with value: 0.7852607508995011.


Best trial: 5. Best value: 0.785261:  30%|█████████████████████████████████████████████████▏                                                                                                                  | 9/30 [02:30<04:47, 13.70s/it]

[I 2023-10-16 14:04:53,683] Trial 8 finished with value: 0.7809133295066281 and parameters: {'random_state': 42, 'learning_rate': 0.05520214582722401, 'l2_regularization': 2, 'max_iter': 6975, 'max_leaf_nodes': 366, 'min_samples_leaf': 201, 'max_bins': 195, 'class_weight': 'balanced'}. Best is trial 5 with value: 0.7852607508995011.


Best trial: 5. Best value: 0.785261:  33%|██████████████████████████████████████████████████████▎                                                                                                            | 10/30 [02:38<03:57, 11.89s/it]

[I 2023-10-16 14:05:01,499] Trial 9 finished with value: 0.7849234721708109 and parameters: {'random_state': 42, 'learning_rate': 0.06519806259478182, 'l2_regularization': 97, 'max_iter': 4739, 'max_leaf_nodes': 202, 'min_samples_leaf': 251, 'max_bins': 204, 'class_weight': None}. Best is trial 5 with value: 0.7852607508995011.


Best trial: 5. Best value: 0.785261:  37%|███████████████████████████████████████████████████████████▊                                                                                                       | 11/30 [02:43<03:06,  9.79s/it]

[I 2023-10-16 14:05:06,539] Trial 10 finished with value: 0.7817604374735146 and parameters: {'random_state': 42, 'learning_rate': 0.01054884176630845, 'l2_regularization': 1, 'max_iter': 130, 'max_leaf_nodes': 7, 'min_samples_leaf': 290, 'max_bins': 232, 'class_weight': None}. Best is trial 5 with value: 0.7852607508995011.


Best trial: 5. Best value: 0.785261:  40%|█████████████████████████████████████████████████████████████████▏                                                                                                 | 12/30 [02:49<02:37,  8.72s/it]

[I 2023-10-16 14:05:12,816] Trial 11 finished with value: 0.7849964183306113 and parameters: {'random_state': 42, 'learning_rate': 0.09753619824344716, 'l2_regularization': 100, 'max_iter': 2604, 'max_leaf_nodes': 109, 'min_samples_leaf': 242, 'max_bins': 254, 'class_weight': None}. Best is trial 5 with value: 0.7852607508995011.


Best trial: 12. Best value: 0.785432:  43%|██████████████████████████████████████████████████████████████████████▏                                                                                           | 13/30 [02:55<02:11,  7.76s/it]

[I 2023-10-16 14:05:18,364] Trial 12 finished with value: 0.7854322682658073 and parameters: {'random_state': 42, 'learning_rate': 0.08995841281954081, 'l2_regularization': 99, 'max_iter': 2389, 'max_leaf_nodes': 65, 'min_samples_leaf': 234, 'max_bins': 248, 'class_weight': None}. Best is trial 12 with value: 0.7854322682658073.


Best trial: 13. Best value: 0.785761:  47%|███████████████████████████████████████████████████████████████████████████▌                                                                                      | 14/30 [03:03<02:07,  7.96s/it]

[I 2023-10-16 14:05:26,783] Trial 13 finished with value: 0.7857610426557388 and parameters: {'random_state': 42, 'learning_rate': 0.040108059766439975, 'l2_regularization': 88, 'max_iter': 9815, 'max_leaf_nodes': 45, 'min_samples_leaf': 175, 'max_bins': 149, 'class_weight': None}. Best is trial 13 with value: 0.7857610426557388.


Best trial: 13. Best value: 0.785761:  50%|█████████████████████████████████████████████████████████████████████████████████                                                                                 | 15/30 [03:12<02:04,  8.29s/it]

[I 2023-10-16 14:05:35,830] Trial 14 finished with value: 0.7848090483240877 and parameters: {'random_state': 42, 'learning_rate': 0.041280282384122874, 'l2_regularization': 84, 'max_iter': 8756, 'max_leaf_nodes': 123, 'min_samples_leaf': 172, 'max_bins': 127, 'class_weight': None}. Best is trial 13 with value: 0.7857610426557388.


Best trial: 13. Best value: 0.785761:  53%|██████████████████████████████████████████████████████████████████████████████████████▍                                                                           | 16/30 [03:22<01:59,  8.55s/it]

[I 2023-10-16 14:05:44,981] Trial 15 finished with value: 0.7847085928607795 and parameters: {'random_state': 42, 'learning_rate': 0.0394687900922766, 'l2_regularization': 85, 'max_iter': 9469, 'max_leaf_nodes': 125, 'min_samples_leaf': 96, 'max_bins': 32, 'class_weight': None}. Best is trial 13 with value: 0.7857610426557388.


Best trial: 16. Best value: 0.785932:  57%|███████████████████████████████████████████████████████████████████████████████████████████▊                                                                      | 17/30 [03:27<01:37,  7.52s/it]

[I 2023-10-16 14:05:50,125] Trial 16 finished with value: 0.7859319416745161 and parameters: {'random_state': 42, 'learning_rate': 0.09654101776909114, 'l2_regularization': 86, 'max_iter': 2557, 'max_leaf_nodes': 24, 'min_samples_leaf': 153, 'max_bins': 248, 'class_weight': None}. Best is trial 16 with value: 0.7859319416745161.


Best trial: 16. Best value: 0.785932:  60%|█████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                | 18/30 [03:33<01:24,  7.01s/it]

[I 2023-10-16 14:05:55,946] Trial 17 finished with value: 0.7857672347363003 and parameters: {'random_state': 42, 'learning_rate': 0.06905101263531387, 'l2_regularization': 81, 'max_iter': 3201, 'max_leaf_nodes': 37, 'min_samples_leaf': 142, 'max_bins': 94, 'class_weight': None}. Best is trial 16 with value: 0.7859319416745161.


Best trial: 18. Best value: 0.786057:  63%|██████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                           | 19/30 [03:39<01:16,  6.93s/it]

[I 2023-10-16 14:06:02,693] Trial 18 finished with value: 0.7860567384187178 and parameters: {'random_state': 42, 'learning_rate': 0.07683471231734847, 'l2_regularization': 56, 'max_iter': 3536, 'max_leaf_nodes': 10, 'min_samples_leaf': 67, 'max_bins': 99, 'class_weight': None}. Best is trial 18 with value: 0.7860567384187178.


Best trial: 18. Best value: 0.786057:  67%|████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                      | 20/30 [03:46<01:07,  6.70s/it]

[I 2023-10-16 14:06:08,867] Trial 19 finished with value: 0.7843388613886046 and parameters: {'random_state': 42, 'learning_rate': 0.09977947014378354, 'l2_regularization': 51, 'max_iter': 3239, 'max_leaf_nodes': 161, 'min_samples_leaf': 29, 'max_bins': 81, 'class_weight': None}. Best is trial 18 with value: 0.7860567384187178.


Best trial: 18. Best value: 0.786057:  70%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                | 21/30 [03:53<01:01,  6.88s/it]

[I 2023-10-16 14:06:16,154] Trial 20 finished with value: 0.7841549454610615 and parameters: {'random_state': 42, 'learning_rate': 0.07955765346790535, 'l2_regularization': 60, 'max_iter': 1181, 'max_leaf_nodes': 264, 'min_samples_leaf': 70, 'max_bins': 120, 'class_weight': None}. Best is trial 18 with value: 0.7860567384187178.


Best trial: 18. Best value: 0.786057:  73%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                           | 22/30 [03:58<00:51,  6.49s/it]

[I 2023-10-16 14:06:21,727] Trial 21 finished with value: 0.7859699092682494 and parameters: {'random_state': 42, 'learning_rate': 0.06588012600350415, 'l2_regularization': 78, 'max_iter': 3581, 'max_leaf_nodes': 11, 'min_samples_leaf': 138, 'max_bins': 92, 'class_weight': None}. Best is trial 18 with value: 0.7860567384187178.


Best trial: 18. Best value: 0.786057:  77%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                     | 23/30 [04:05<00:45,  6.44s/it]

[I 2023-10-16 14:06:28,049] Trial 22 finished with value: 0.7859160234748312 and parameters: {'random_state': 42, 'learning_rate': 0.05741396517406322, 'l2_regularization': 74, 'max_iter': 3371, 'max_leaf_nodes': 7, 'min_samples_leaf': 102, 'max_bins': 65, 'class_weight': None}. Best is trial 18 with value: 0.7860567384187178.


Best trial: 18. Best value: 0.786057:  80%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                | 24/30 [04:10<00:37,  6.21s/it]

[I 2023-10-16 14:06:33,724] Trial 23 finished with value: 0.7849715281802224 and parameters: {'random_state': 42, 'learning_rate': 0.07969797534747658, 'l2_regularization': 41, 'max_iter': 3829, 'max_leaf_nodes': 89, 'min_samples_leaf': 54, 'max_bins': 112, 'class_weight': None}. Best is trial 18 with value: 0.7860567384187178.


Best trial: 24. Best value: 0.786168:  83%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                           | 25/30 [04:16<00:30,  6.08s/it]

[I 2023-10-16 14:06:39,489] Trial 24 finished with value: 0.7861680221392143 and parameters: {'random_state': 42, 'learning_rate': 0.07765831939737257, 'l2_regularization': 60, 'max_iter': 2456, 'max_leaf_nodes': 8, 'min_samples_leaf': 98, 'max_bins': 45, 'class_weight': None}. Best is trial 24 with value: 0.7861680221392143.


Best trial: 24. Best value: 0.786168:  87%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                     | 26/30 [04:23<00:24,  6.18s/it]

[I 2023-10-16 14:06:45,913] Trial 25 finished with value: 0.7849606958178199 and parameters: {'random_state': 42, 'learning_rate': 0.051271556646063615, 'l2_regularization': 61, 'max_iter': 1617, 'max_leaf_nodes': 80, 'min_samples_leaf': 96, 'max_bins': 48, 'class_weight': None}. Best is trial 24 with value: 0.7861680221392143.


Best trial: 24. Best value: 0.786168:  90%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                | 27/30 [04:28<00:18,  6.09s/it]

[I 2023-10-16 14:06:51,776] Trial 26 finished with value: 0.784211227089501 and parameters: {'random_state': 42, 'learning_rate': 0.06644067716176481, 'l2_regularization': 44, 'max_iter': 97, 'max_leaf_nodes': 158, 'min_samples_leaf': 42, 'max_bins': 32, 'class_weight': 'balanced'}. Best is trial 24 with value: 0.7861680221392143.


Best trial: 24. Best value: 0.786168:  93%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏          | 28/30 [04:34<00:11,  5.82s/it]

[I 2023-10-16 14:06:56,976] Trial 27 finished with value: 0.7859397829690407 and parameters: {'random_state': 42, 'learning_rate': 0.07816505625944993, 'l2_regularization': 66, 'max_iter': 3718, 'max_leaf_nodes': 10, 'min_samples_leaf': 81, 'max_bins': 100, 'class_weight': None}. Best is trial 24 with value: 0.7861680221392143.


Best trial: 24. Best value: 0.786168:  97%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌     | 29/30 [04:41<00:06,  6.41s/it]

[I 2023-10-16 14:07:04,750] Trial 28 finished with value: 0.7852070919804187 and parameters: {'random_state': 42, 'learning_rate': 0.047327583575388604, 'l2_regularization': 77, 'max_iter': 6470, 'max_leaf_nodes': 82, 'min_samples_leaf': 118, 'max_bins': 72, 'class_weight': None}. Best is trial 24 with value: 0.7861680221392143.


Best trial: 24. Best value: 0.786168: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 30/30 [04:50<00:00,  9.70s/it]

[I 2023-10-16 14:07:13,725] Trial 29 finished with value: 0.7834244287610251 and parameters: {'random_state': 42, 'learning_rate': 0.06221232924690536, 'l2_regularization': 56, 'max_iter': 5724, 'max_leaf_nodes': 295, 'min_samples_leaf': 13, 'max_bins': 146, 'class_weight': 'balanced'}. Best is trial 24 with value: 0.7861680221392143.





In [7]:
study.best_params, study.best_value

({'random_state': 42,
  'learning_rate': 0.07765831939737257,
  'l2_regularization': 60,
  'max_iter': 2456,
  'max_leaf_nodes': 8,
  'min_samples_leaf': 98,
  'max_bins': 45,
  'class_weight': None},
 0.7861680221392143)

In [8]:
pipeline = make_pipeline(partial_pipeline, HistGradientBoostingClassifier(**study.best_params))
pipeline.fit(X, y)

In [9]:
submission = test.copy()
submission['defects'] = pipeline.predict_proba(submission)[:, 1]
submission.defects.to_csv(OUTPUT)