In [1]:
import optuna
import os
from pathlib import Path
import kaggle
import pandas as pd
import numpy as np
from sklearn.preprocessing import FunctionTransformer, PowerTransformer, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import HistGradientBoostingClassifier


pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)

iskaggle = os.environ.get('KAGGLE_KERNEL_RUN_TYPE', '')
if iskaggle:
    path = Path('../input/playground-series-s3e23')
    orig_path = Path('../input/software-defect-prediction')
else:
    path = Path('../input/playground-series-s3e23')
    orig_path = Path('../input/software-defect-prediction')
    if not path.exists():
        kaggle.api.competition_download_cli(str(path))
        zipfile.ZipFile(f'{path}.zip').extractall(path)
    if not orig_path.exists():
        kaggle.api.competition_download_cli(str(path))
        zipfile.ZipFile(f'{path}.zip').extractall(path)

SEED = 42
SPLITS = 5
VERSION = 1
OUTPUT = f'hist_grad_boost_clf_submisson_v{VERSION}.csv'
N_TRIALS = 30

np.random.seed(SEED)

  from .autonotebook import tqdm as notebook_tqdm




In [2]:
train = pd.read_csv(path/'train.csv', index_col = 'id')
test = pd.read_csv(path/'test.csv', index_col = 'id')
orig_train = pd.read_csv(orig_path/'jm1.csv')

In [3]:
for object_features in list(orig_train.loc[:, orig_train.dtypes == 'O']):
    orig_train[object_features] = orig_train[object_features].replace({'?' : np.nan}).astype('float64')

In [4]:
X = pd.concat([train, orig_train])
y = X.pop('defects')

In [5]:
partial_pipeline = make_pipeline(SimpleImputer(), FunctionTransformer(func=np.log1p, validate=False), StandardScaler())

In [6]:
def objective(trial):
    params = {
        'random_state': trial.suggest_categorical('random_state', [SEED]),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1, log=True),
        'l2_regularization': trial.suggest_int('l2_regularization', 0, 100),
        'max_iter': trial.suggest_int('max_iter', 10, 10_000),
        'max_leaf_nodes': trial.suggest_int('max_leaf_nodes', 2, 500),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 2, 300),
        'max_bins': trial.suggest_int('max_bins', 32, 255),
        'class_weight': trial.suggest_categorical('class_weight', [None, 'balanced'])
    }
    pipeline = make_pipeline(partial_pipeline, HistGradientBoostingClassifier(**params))
    scores = cross_val_score(pipeline, X, y, cv=SPLITS, scoring='roc_auc', n_jobs=-1)
    return scores.mean()

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=N_TRIALS, show_progress_bar=True)

[I 2023-10-16 13:53:39,513] A new study created in memory with name: no-name-49a93b39-ed13-481d-aeea-7db2db0da55a
Best trial: 0. Best value: 0.78462:   3%|█████▌                                                                                                                                                               | 1/30 [00:13<06:37, 13.69s/it]

[I 2023-10-16 13:53:53,208] Trial 0 finished with value: 0.7846201158377332 and parameters: {'random_state': 42, 'learning_rate': 0.024118501272207507, 'l2_regularization': 94, 'max_iter': 5889, 'max_leaf_nodes': 268, 'min_samples_leaf': 60, 'max_bins': 102, 'class_weight': None}. Best is trial 0 with value: 0.7846201158377332.


Best trial: 1. Best value: 0.785043:   7%|██████████▉                                                                                                                                                         | 2/30 [00:40<09:57, 21.34s/it]

[I 2023-10-16 13:54:19,907] Trial 1 finished with value: 0.7850426672143341 and parameters: {'random_state': 42, 'learning_rate': 0.010039788539813715, 'l2_regularization': 98, 'max_iter': 5033, 'max_leaf_nodes': 280, 'min_samples_leaf': 211, 'max_bins': 229, 'class_weight': None}. Best is trial 1 with value: 0.7850426672143341.


Best trial: 1. Best value: 0.785043:  10%|████████████████▍                                                                                                                                                   | 3/30 [00:59<09:10, 20.39s/it]

[I 2023-10-16 13:54:39,162] Trial 2 finished with value: 0.7841464771966695 and parameters: {'random_state': 42, 'learning_rate': 0.01169107325333102, 'l2_regularization': 29, 'max_iter': 8900, 'max_leaf_nodes': 177, 'min_samples_leaf': 131, 'max_bins': 153, 'class_weight': None}. Best is trial 1 with value: 0.7850426672143341.


Best trial: 1. Best value: 0.785043:  13%|█████████████████████▊                                                                                                                                              | 4/30 [01:18<08:31, 19.67s/it]

[I 2023-10-16 13:54:57,721] Trial 3 finished with value: 0.7839597570701263 and parameters: {'random_state': 42, 'learning_rate': 0.01487192640822732, 'l2_regularization': 31, 'max_iter': 3262, 'max_leaf_nodes': 183, 'min_samples_leaf': 40, 'max_bins': 212, 'class_weight': None}. Best is trial 1 with value: 0.7850426672143341.


Best trial: 1. Best value: 0.785043:  17%|███████████████████████████▎                                                                                                                                        | 5/30 [01:28<06:51, 16.46s/it]

[I 2023-10-16 13:55:08,491] Trial 4 finished with value: 0.7849195147019742 and parameters: {'random_state': 42, 'learning_rate': 0.02826035378046412, 'l2_regularization': 98, 'max_iter': 1696, 'max_leaf_nodes': 475, 'min_samples_leaf': 221, 'max_bins': 230, 'class_weight': None}. Best is trial 1 with value: 0.7850426672143341.


Best trial: 1. Best value: 0.785043:  20%|████████████████████████████████▊                                                                                                                                   | 6/30 [01:45<06:38, 16.59s/it]

[I 2023-10-16 13:55:25,333] Trial 5 finished with value: 0.7843511312439594 and parameters: {'random_state': 42, 'learning_rate': 0.023447144830017353, 'l2_regularization': 89, 'max_iter': 9569, 'max_leaf_nodes': 277, 'min_samples_leaf': 32, 'max_bins': 49, 'class_weight': None}. Best is trial 1 with value: 0.7850426672143341.


Best trial: 1. Best value: 0.785043:  23%|██████████████████████████████████████▎                                                                                                                             | 7/30 [01:50<04:48, 12.56s/it]

[I 2023-10-16 13:55:29,589] Trial 6 finished with value: 0.7849327396095676 and parameters: {'random_state': 42, 'learning_rate': 0.06785305957390464, 'l2_regularization': 80, 'max_iter': 9275, 'max_leaf_nodes': 104, 'min_samples_leaf': 36, 'max_bins': 92, 'class_weight': None}. Best is trial 1 with value: 0.7850426672143341.


Best trial: 1. Best value: 0.785043:  27%|███████████████████████████████████████████▋                                                                                                                        | 8/30 [01:58<04:04, 11.13s/it]

[I 2023-10-16 13:55:37,660] Trial 7 finished with value: 0.7846277436022686 and parameters: {'random_state': 42, 'learning_rate': 0.03427637483437081, 'l2_regularization': 88, 'max_iter': 504, 'max_leaf_nodes': 128, 'min_samples_leaf': 102, 'max_bins': 94, 'class_weight': 'balanced'}. Best is trial 1 with value: 0.7850426672143341.


Best trial: 1. Best value: 0.785043:  30%|█████████████████████████████████████████████████▏                                                                                                                  | 9/30 [02:06<03:35, 10.27s/it]

[I 2023-10-16 13:55:46,035] Trial 8 finished with value: 0.7842429655841642 and parameters: {'random_state': 42, 'learning_rate': 0.05069776288100092, 'l2_regularization': 83, 'max_iter': 131, 'max_leaf_nodes': 404, 'min_samples_leaf': 83, 'max_bins': 38, 'class_weight': None}. Best is trial 1 with value: 0.7850426672143341.


Best trial: 1. Best value: 0.785043:  33%|██████████████████████████████████████████████████████▎                                                                                                            | 10/30 [02:27<04:32, 13.61s/it]

[I 2023-10-16 13:56:07,130] Trial 9 finished with value: 0.7834165842897232 and parameters: {'random_state': 42, 'learning_rate': 0.01139361030791574, 'l2_regularization': 19, 'max_iter': 7801, 'max_leaf_nodes': 303, 'min_samples_leaf': 189, 'max_bins': 39, 'class_weight': None}. Best is trial 1 with value: 0.7850426672143341.


Best trial: 10. Best value: 0.785754:  37%|███████████████████████████████████████████████████████████▍                                                                                                      | 11/30 [02:37<03:58, 12.54s/it]

[I 2023-10-16 13:56:17,241] Trial 10 finished with value: 0.7857539412589267 and parameters: {'random_state': 42, 'learning_rate': 0.015945081720476505, 'l2_regularization': 56, 'max_iter': 5507, 'max_leaf_nodes': 30, 'min_samples_leaf': 291, 'max_bins': 185, 'class_weight': 'balanced'}. Best is trial 10 with value: 0.7857539412589267.


Best trial: 11. Best value: 0.786026:  40%|████████████████████████████████████████████████████████████████▊                                                                                                 | 12/30 [02:51<03:53, 12.98s/it]

[I 2023-10-16 13:56:31,240] Trial 11 finished with value: 0.7860260421824963 and parameters: {'random_state': 42, 'learning_rate': 0.010212498830869023, 'l2_regularization': 53, 'max_iter': 5167, 'max_leaf_nodes': 24, 'min_samples_leaf': 300, 'max_bins': 186, 'class_weight': 'balanced'}. Best is trial 11 with value: 0.7860260421824963.


Best trial: 11. Best value: 0.786026:  43%|██████████████████████████████████████████████████████████████████████▏                                                                                           | 13/30 [03:00<03:20, 11.79s/it]

[I 2023-10-16 13:56:40,289] Trial 12 finished with value: 0.7858985009020272 and parameters: {'random_state': 42, 'learning_rate': 0.016315523000296293, 'l2_regularization': 60, 'max_iter': 6559, 'max_leaf_nodes': 17, 'min_samples_leaf': 300, 'max_bins': 178, 'class_weight': 'balanced'}. Best is trial 11 with value: 0.7860260421824963.


Best trial: 11. Best value: 0.786026:  47%|███████████████████████████████████████████████████████████████████████████▌                                                                                      | 14/30 [03:09<02:55, 10.94s/it]

[I 2023-10-16 13:56:49,256] Trial 13 finished with value: 0.7859576233798067 and parameters: {'random_state': 42, 'learning_rate': 0.016226098983898513, 'l2_regularization': 59, 'max_iter': 6997, 'max_leaf_nodes': 14, 'min_samples_leaf': 296, 'max_bins': 170, 'class_weight': 'balanced'}. Best is trial 11 with value: 0.7860260421824963.


Best trial: 11. Best value: 0.786026:  50%|█████████████████████████████████████████████████████████████████████████████████                                                                                 | 15/30 [03:20<02:44, 10.94s/it]

[I 2023-10-16 13:57:00,213] Trial 14 finished with value: 0.7851472867127389 and parameters: {'random_state': 42, 'learning_rate': 0.01859763404171977, 'l2_regularization': 68, 'max_iter': 3415, 'max_leaf_nodes': 75, 'min_samples_leaf': 256, 'max_bins': 134, 'class_weight': 'balanced'}. Best is trial 11 with value: 0.7860260421824963.


Best trial: 11. Best value: 0.786026:  53%|██████████████████████████████████████████████████████████████████████████████████████▍                                                                           | 16/30 [03:32<02:37, 11.28s/it]

[I 2023-10-16 13:57:12,263] Trial 15 finished with value: 0.7857349197878502 and parameters: {'random_state': 42, 'learning_rate': 0.013358542014737084, 'l2_regularization': 43, 'max_iter': 7810, 'max_leaf_nodes': 8, 'min_samples_leaf': 256, 'max_bins': 192, 'class_weight': 'balanced'}. Best is trial 11 with value: 0.7860260421824963.


Best trial: 11. Best value: 0.786026:  57%|███████████████████████████████████████████████████████████████████████████████████████████▊                                                                      | 17/30 [03:56<03:16, 15.08s/it]

[I 2023-10-16 13:57:36,191] Trial 16 finished with value: 0.7840282801406168 and parameters: {'random_state': 42, 'learning_rate': 0.010609397992015407, 'l2_regularization': 43, 'max_iter': 4012, 'max_leaf_nodes': 184, 'min_samples_leaf': 164, 'max_bins': 252, 'class_weight': 'balanced'}. Best is trial 11 with value: 0.7860260421824963.


Best trial: 11. Best value: 0.786026:  60%|█████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                | 18/30 [04:06<02:40, 13.38s/it]

[I 2023-10-16 13:57:45,624] Trial 17 finished with value: 0.7849862868875457 and parameters: {'random_state': 42, 'learning_rate': 0.017922790641526413, 'l2_regularization': 6, 'max_iter': 7010, 'max_leaf_nodes': 64, 'min_samples_leaf': 260, 'max_bins': 147, 'class_weight': 'balanced'}. Best is trial 11 with value: 0.7860260421824963.


Best trial: 11. Best value: 0.786026:  63%|██████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                           | 19/30 [04:32<03:09, 17.20s/it]

[I 2023-10-16 13:58:11,709] Trial 18 finished with value: 0.7836950988128379 and parameters: {'random_state': 42, 'learning_rate': 0.013113240722474974, 'l2_regularization': 71, 'max_iter': 4347, 'max_leaf_nodes': 347, 'min_samples_leaf': 2, 'max_bins': 122, 'class_weight': 'balanced'}. Best is trial 11 with value: 0.7860260421824963.


Best trial: 11. Best value: 0.786026:  67%|████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                      | 20/30 [04:45<02:40, 16.05s/it]

[I 2023-10-16 13:58:25,096] Trial 19 finished with value: 0.7845067996497883 and parameters: {'random_state': 42, 'learning_rate': 0.020773298772317775, 'l2_regularization': 42, 'max_iter': 2421, 'max_leaf_nodes': 138, 'min_samples_leaf': 280, 'max_bins': 173, 'class_weight': 'balanced'}. Best is trial 11 with value: 0.7860260421824963.


Best trial: 11. Best value: 0.786026:  70%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                | 21/30 [05:00<02:21, 15.68s/it]

[I 2023-10-16 13:58:39,907] Trial 20 finished with value: 0.7851950254318009 and parameters: {'random_state': 42, 'learning_rate': 0.014044388809565085, 'l2_regularization': 66, 'max_iter': 7914, 'max_leaf_nodes': 65, 'min_samples_leaf': 227, 'max_bins': 205, 'class_weight': 'balanced'}. Best is trial 11 with value: 0.7860260421824963.


Best trial: 11. Best value: 0.786026:  73%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                           | 22/30 [05:35<02:53, 21.63s/it]

[I 2023-10-16 13:59:15,409] Trial 21 finished with value: 0.7840562496284471 and parameters: {'random_state': 42, 'learning_rate': 0.016253236295270285, 'l2_regularization': 58, 'max_iter': 6186, 'max_leaf_nodes': 2, 'min_samples_leaf': 300, 'max_bins': 167, 'class_weight': 'balanced'}. Best is trial 11 with value: 0.7860260421824963.


Best trial: 11. Best value: 0.786026:  77%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                     | 23/30 [05:50<02:16, 19.47s/it]

[I 2023-10-16 13:59:29,834] Trial 22 finished with value: 0.7855991263630739 and parameters: {'random_state': 42, 'learning_rate': 0.013240801676577506, 'l2_regularization': 53, 'max_iter': 6941, 'max_leaf_nodes': 46, 'min_samples_leaf': 273, 'max_bins': 166, 'class_weight': 'balanced'}. Best is trial 11 with value: 0.7860260421824963.


Best trial: 11. Best value: 0.786026:  80%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                | 24/30 [06:11<02:00, 20.01s/it]

[I 2023-10-16 13:59:51,103] Trial 23 finished with value: 0.7849492617917776 and parameters: {'random_state': 42, 'learning_rate': 0.010070115687816749, 'l2_regularization': 64, 'max_iter': 6364, 'max_leaf_nodes': 101, 'min_samples_leaf': 235, 'max_bins': 194, 'class_weight': 'balanced'}. Best is trial 11 with value: 0.7860260421824963.


Best trial: 11. Best value: 0.786026:  83%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                           | 25/30 [06:21<01:25, 17.12s/it]

[I 2023-10-16 14:00:01,494] Trial 24 finished with value: 0.7860065357704299 and parameters: {'random_state': 42, 'learning_rate': 0.01873138499382575, 'l2_regularization': 74, 'max_iter': 4718, 'max_leaf_nodes': 7, 'min_samples_leaf': 296, 'max_bins': 120, 'class_weight': 'balanced'}. Best is trial 11 with value: 0.7860260421824963.


Best trial: 11. Best value: 0.786026:  87%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                     | 26/30 [06:36<01:05, 16.34s/it]

[I 2023-10-16 14:00:16,011] Trial 25 finished with value: 0.7842586712261678 and parameters: {'random_state': 42, 'learning_rate': 0.019702633025966133, 'l2_regularization': 76, 'max_iter': 4628, 'max_leaf_nodes': 212, 'min_samples_leaf': 180, 'max_bins': 115, 'class_weight': 'balanced'}. Best is trial 11 with value: 0.7860260421824963.


Best trial: 11. Best value: 0.786026:  90%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                | 27/30 [06:56<00:51, 17.29s/it]

[I 2023-10-16 14:00:35,511] Trial 26 finished with value: 0.7849238467870311 and parameters: {'random_state': 42, 'learning_rate': 0.012779437391769458, 'l2_regularization': 48, 'max_iter': 5124, 'max_leaf_nodes': 99, 'min_samples_leaf': 244, 'max_bins': 67, 'class_weight': 'balanced'}. Best is trial 11 with value: 0.7860260421824963.


Best trial: 11. Best value: 0.786026:  93%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏          | 28/30 [07:07<00:31, 15.65s/it]

[I 2023-10-16 14:00:47,337] Trial 27 finished with value: 0.7856418820411478 and parameters: {'random_state': 42, 'learning_rate': 0.015220913868352361, 'l2_regularization': 74, 'max_iter': 3761, 'max_leaf_nodes': 43, 'min_samples_leaf': 203, 'max_bins': 136, 'class_weight': 'balanced'}. Best is trial 11 with value: 0.7860260421824963.


Best trial: 11. Best value: 0.786026:  97%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌     | 29/30 [07:11<00:11, 11.94s/it]

[I 2023-10-16 14:00:50,607] Trial 28 finished with value: 0.7835347804760723 and parameters: {'random_state': 42, 'learning_rate': 0.0997258224203673, 'l2_regularization': 33, 'max_iter': 2907, 'max_leaf_nodes': 139, 'min_samples_leaf': 275, 'max_bins': 150, 'class_weight': 'balanced'}. Best is trial 11 with value: 0.7860260421824963.


Best trial: 11. Best value: 0.786026: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 30/30 [07:22<00:00, 14.75s/it]

[I 2023-10-16 14:01:02,030] Trial 29 finished with value: 0.7845509830320572 and parameters: {'random_state': 42, 'learning_rate': 0.02211221496420053, 'l2_regularization': 64, 'max_iter': 8524, 'max_leaf_nodes': 226, 'min_samples_leaf': 270, 'max_bins': 125, 'class_weight': 'balanced'}. Best is trial 11 with value: 0.7860260421824963.





In [7]:
study.best_params, study.best_value

({'random_state': 42,
  'learning_rate': 0.010212498830869023,
  'l2_regularization': 53,
  'max_iter': 5167,
  'max_leaf_nodes': 24,
  'min_samples_leaf': 300,
  'max_bins': 186,
  'class_weight': 'balanced'},
 0.7860260421824963)

In [8]:
pipeline = make_pipeline(partial_pipeline, HistGradientBoostingClassifier(**study.best_params))
pipeline.fit(X, y)

In [9]:
submission = test.copy()
submission['defects'] = pipeline.predict_proba(submission)[:, 1]
submission.defects.to_csv(OUTPUT)