In [1]:
import optuna
import os
from pathlib import Path
import kaggle
import pandas as pd
import numpy as np
from sklearn.preprocessing import FunctionTransformer, PowerTransformer, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import HistGradientBoostingClassifier


pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)

iskaggle = os.environ.get('KAGGLE_KERNEL_RUN_TYPE', '')
if iskaggle:
    path = Path('../input/playground-series-s3e23')
    orig_path = Path('../input/software-defect-prediction')
else:
    path = Path('../input/playground-series-s3e23')
    orig_path = Path('../input/software-defect-prediction')
    if not path.exists():
        kaggle.api.competition_download_cli(str(path))
        zipfile.ZipFile(f'{path}.zip').extractall(path)
    if not orig_path.exists():
        kaggle.api.competition_download_cli(str(path))
        zipfile.ZipFile(f'{path}.zip').extractall(path)

SEED = 42
SPLITS = 5
VERSION = 3
OUTPUT = f'hist_grad_boost_clf_submisson_v{VERSION}.csv'
N_TRIALS = 30

np.random.seed(SEED)

  from .autonotebook import tqdm as notebook_tqdm




In [2]:
train = pd.read_csv(path/'train.csv', index_col = 'id')
test = pd.read_csv(path/'test.csv', index_col = 'id')
orig_train = pd.read_csv(orig_path/'jm1.csv')

In [3]:
for object_features in list(orig_train.loc[:, orig_train.dtypes == 'O']):
    orig_train[object_features] = orig_train[object_features].replace({'?' : np.nan}).astype('float64')

In [4]:
X = pd.concat([train, orig_train])
y = X.pop('defects')

In [5]:
partial_pipeline = make_pipeline(ColumnTransformer([('drop', 'drop',
                                  ['iv(g)', 't', 'b', 'n',
                                   'lOCode', 'v', 'branchCount',
                                   'e', 'i', 'lOComment'])],
                                remainder='passthrough'), SimpleImputer(), FunctionTransformer(func=np.log1p, validate=False), PowerTransformer())

In [6]:
def objective(trial):
    params = {
        'random_state': trial.suggest_categorical('random_state', [SEED]),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1, log=True),
        'l2_regularization': trial.suggest_int('l2_regularization', 0, 100),
        'max_iter': trial.suggest_int('max_iter', 10, 10_000),
        'max_leaf_nodes': trial.suggest_int('max_leaf_nodes', 2, 500),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 2, 300),
        'max_bins': trial.suggest_int('max_bins', 32, 255),
        'class_weight': trial.suggest_categorical('class_weight', [None, 'balanced'])
    }
    pipeline = make_pipeline(partial_pipeline, HistGradientBoostingClassifier(**params))
    scores = cross_val_score(pipeline, X, y, cv=SPLITS, scoring='roc_auc', n_jobs=-1)
    return scores.mean()

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=N_TRIALS, show_progress_bar=True)

[I 2023-10-16 14:08:38,850] A new study created in memory with name: no-name-ac063050-4fee-4d61-8104-c94df3e0a139
Best trial: 0. Best value: 0.783083:   3%|█████▍                                                                                                                                                              | 1/30 [00:14<07:13, 14.96s/it]

[I 2023-10-16 14:08:53,811] Trial 0 finished with value: 0.7830826929961583 and parameters: {'random_state': 42, 'learning_rate': 0.016147244964703066, 'l2_regularization': 31, 'max_iter': 579, 'max_leaf_nodes': 265, 'min_samples_leaf': 33, 'max_bins': 113, 'class_weight': None}. Best is trial 0 with value: 0.7830826929961583.


Best trial: 0. Best value: 0.783083:   7%|██████████▉                                                                                                                                                         | 2/30 [00:20<04:27,  9.57s/it]

[I 2023-10-16 14:08:59,600] Trial 1 finished with value: 0.7822329003357007 and parameters: {'random_state': 42, 'learning_rate': 0.04281011805863016, 'l2_regularization': 4, 'max_iter': 6086, 'max_leaf_nodes': 480, 'min_samples_leaf': 258, 'max_bins': 41, 'class_weight': 'balanced'}. Best is trial 0 with value: 0.7830826929961583.


Best trial: 2. Best value: 0.783769:  10%|████████████████▍                                                                                                                                                   | 3/30 [00:25<03:14,  7.19s/it]

[I 2023-10-16 14:09:03,973] Trial 2 finished with value: 0.7837687555006392 and parameters: {'random_state': 42, 'learning_rate': 0.08197306963613438, 'l2_regularization': 76, 'max_iter': 4263, 'max_leaf_nodes': 166, 'min_samples_leaf': 107, 'max_bins': 36, 'class_weight': None}. Best is trial 2 with value: 0.7837687555006392.


Best trial: 3. Best value: 0.784208:  13%|█████████████████████▊                                                                                                                                              | 4/30 [00:35<03:36,  8.34s/it]

[I 2023-10-16 14:09:14,057] Trial 3 finished with value: 0.7842075849261605 and parameters: {'random_state': 42, 'learning_rate': 0.02025925609005974, 'l2_regularization': 59, 'max_iter': 317, 'max_leaf_nodes': 444, 'min_samples_leaf': 215, 'max_bins': 194, 'class_weight': 'balanced'}. Best is trial 3 with value: 0.7842075849261605.


Best trial: 3. Best value: 0.784208:  17%|███████████████████████████▎                                                                                                                                        | 5/30 [00:51<04:43, 11.34s/it]

[I 2023-10-16 14:09:30,724] Trial 4 finished with value: 0.7837589547486015 and parameters: {'random_state': 42, 'learning_rate': 0.0146572553146341, 'l2_regularization': 60, 'max_iter': 2008, 'max_leaf_nodes': 342, 'min_samples_leaf': 125, 'max_bins': 129, 'class_weight': 'balanced'}. Best is trial 3 with value: 0.7842075849261605.


Best trial: 5. Best value: 0.784939:  20%|████████████████████████████████▊                                                                                                                                   | 6/30 [00:58<03:57,  9.90s/it]

[I 2023-10-16 14:09:37,820] Trial 5 finished with value: 0.7849387035165919 and parameters: {'random_state': 42, 'learning_rate': 0.030199814616538284, 'l2_regularization': 70, 'max_iter': 8054, 'max_leaf_nodes': 79, 'min_samples_leaf': 283, 'max_bins': 222, 'class_weight': 'balanced'}. Best is trial 5 with value: 0.7849387035165919.


Best trial: 6. Best value: 0.785178:  23%|██████████████████████████████████████▎                                                                                                                             | 7/30 [01:03<03:10,  8.27s/it]

[I 2023-10-16 14:09:42,733] Trial 6 finished with value: 0.7851780332053174 and parameters: {'random_state': 42, 'learning_rate': 0.04344774328263869, 'l2_regularization': 49, 'max_iter': 5363, 'max_leaf_nodes': 58, 'min_samples_leaf': 100, 'max_bins': 124, 'class_weight': 'balanced'}. Best is trial 6 with value: 0.7851780332053174.


Best trial: 6. Best value: 0.785178:  27%|███████████████████████████████████████████▋                                                                                                                        | 8/30 [01:12<03:06,  8.49s/it]

[I 2023-10-16 14:09:51,705] Trial 7 finished with value: 0.782910782034221 and parameters: {'random_state': 42, 'learning_rate': 0.02467850238690697, 'l2_regularization': 9, 'max_iter': 228, 'max_leaf_nodes': 168, 'min_samples_leaf': 213, 'max_bins': 209, 'class_weight': 'balanced'}. Best is trial 6 with value: 0.7851780332053174.


Best trial: 6. Best value: 0.785178:  30%|█████████████████████████████████████████████████▏                                                                                                                  | 9/30 [01:23<03:10,  9.05s/it]

[I 2023-10-16 14:10:01,989] Trial 8 finished with value: 0.7848138599206038 and parameters: {'random_state': 42, 'learning_rate': 0.02173527560411739, 'l2_regularization': 89, 'max_iter': 4286, 'max_leaf_nodes': 86, 'min_samples_leaf': 85, 'max_bins': 176, 'class_weight': None}. Best is trial 6 with value: 0.7851780332053174.


Best trial: 6. Best value: 0.785178:  33%|██████████████████████████████████████████████████████▎                                                                                                            | 10/30 [01:27<02:29,  7.45s/it]

[I 2023-10-16 14:10:05,863] Trial 9 finished with value: 0.7836215898932698 and parameters: {'random_state': 42, 'learning_rate': 0.0986714215270935, 'l2_regularization': 83, 'max_iter': 7570, 'max_leaf_nodes': 260, 'min_samples_leaf': 151, 'max_bins': 191, 'class_weight': 'balanced'}. Best is trial 6 with value: 0.7851780332053174.


Best trial: 10. Best value: 0.786146:  37%|███████████████████████████████████████████████████████████▍                                                                                                      | 11/30 [01:34<02:21,  7.45s/it]

[I 2023-10-16 14:10:13,316] Trial 10 finished with value: 0.7861463226378816 and parameters: {'random_state': 42, 'learning_rate': 0.04285446422853461, 'l2_regularization': 37, 'max_iter': 9726, 'max_leaf_nodes': 5, 'min_samples_leaf': 6, 'max_bins': 87, 'class_weight': None}. Best is trial 10 with value: 0.7861463226378816.


Best trial: 10. Best value: 0.786146:  40%|████████████████████████████████████████████████████████████████▊                                                                                                 | 12/30 [01:40<02:03,  6.88s/it]

[I 2023-10-16 14:10:18,896] Trial 11 finished with value: 0.7853721070385135 and parameters: {'random_state': 42, 'learning_rate': 0.04483191564322592, 'l2_regularization': 37, 'max_iter': 9601, 'max_leaf_nodes': 35, 'min_samples_leaf': 2, 'max_bins': 97, 'class_weight': None}. Best is trial 10 with value: 0.7861463226378816.


Best trial: 12. Best value: 0.786169:  43%|██████████████████████████████████████████████████████████████████████▏                                                                                           | 13/30 [01:46<01:53,  6.66s/it]

[I 2023-10-16 14:10:25,056] Trial 12 finished with value: 0.7861688418968245 and parameters: {'random_state': 42, 'learning_rate': 0.04914413927409843, 'l2_regularization': 33, 'max_iter': 9946, 'max_leaf_nodes': 9, 'min_samples_leaf': 13, 'max_bins': 80, 'class_weight': None}. Best is trial 12 with value: 0.7861688418968245.


Best trial: 12. Best value: 0.786169:  47%|███████████████████████████████████████████████████████████████████████████▌                                                                                      | 14/30 [01:52<01:45,  6.61s/it]

[I 2023-10-16 14:10:31,541] Trial 13 finished with value: 0.7859998440078529 and parameters: {'random_state': 42, 'learning_rate': 0.06866880782978221, 'l2_regularization': 22, 'max_iter': 9885, 'max_leaf_nodes': 5, 'min_samples_leaf': 52, 'max_bins': 81, 'class_weight': None}. Best is trial 12 with value: 0.7861688418968245.


Best trial: 12. Best value: 0.786169:  50%|█████████████████████████████████████████████████████████████████████████████████                                                                                 | 15/30 [02:08<02:20,  9.34s/it]

[I 2023-10-16 14:10:47,216] Trial 14 finished with value: 0.7841554270573171 and parameters: {'random_state': 42, 'learning_rate': 0.011354646333246846, 'l2_regularization': 41, 'max_iter': 8297, 'max_leaf_nodes': 154, 'min_samples_leaf': 6, 'max_bins': 70, 'class_weight': None}. Best is trial 12 with value: 0.7861688418968245.


Best trial: 12. Best value: 0.786169:  53%|██████████████████████████████████████████████████████████████████████████████████████▍                                                                           | 16/30 [02:13<01:51,  7.93s/it]

[I 2023-10-16 14:10:51,861] Trial 15 finished with value: 0.784158211495075 and parameters: {'random_state': 42, 'learning_rate': 0.0546101253893469, 'l2_regularization': 20, 'max_iter': 6741, 'max_leaf_nodes': 124, 'min_samples_leaf': 56, 'max_bins': 251, 'class_weight': None}. Best is trial 12 with value: 0.7861688418968245.


Best trial: 12. Best value: 0.786169:  57%|███████████████████████████████████████████████████████████████████████████████████████████▊                                                                      | 17/30 [02:19<01:36,  7.45s/it]

[I 2023-10-16 14:10:58,210] Trial 16 finished with value: 0.7857861974665259 and parameters: {'random_state': 42, 'learning_rate': 0.032519161331951325, 'l2_regularization': 26, 'max_iter': 8994, 'max_leaf_nodes': 13, 'min_samples_leaf': 71, 'max_bins': 153, 'class_weight': None}. Best is trial 12 with value: 0.7861688418968245.


Best trial: 12. Best value: 0.786169:  60%|█████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                | 18/30 [02:24<01:22,  6.88s/it]

[I 2023-10-16 14:11:03,753] Trial 17 finished with value: 0.7842931292969741 and parameters: {'random_state': 42, 'learning_rate': 0.06497031312293734, 'l2_regularization': 99, 'max_iter': 7011, 'max_leaf_nodes': 215, 'min_samples_leaf': 34, 'max_bins': 66, 'class_weight': None}. Best is trial 12 with value: 0.7861688418968245.


Best trial: 12. Best value: 0.786169:  63%|██████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                           | 19/30 [02:32<01:18,  7.12s/it]

[I 2023-10-16 14:11:11,430] Trial 18 finished with value: 0.7839475055767389 and parameters: {'random_state': 42, 'learning_rate': 0.03269066632968256, 'l2_regularization': 47, 'max_iter': 8856, 'max_leaf_nodes': 396, 'min_samples_leaf': 153, 'max_bins': 152, 'class_weight': None}. Best is trial 12 with value: 0.7861688418968245.


Best trial: 12. Best value: 0.786169:  67%|████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                      | 20/30 [02:37<01:04,  6.42s/it]

[I 2023-10-16 14:11:16,220] Trial 19 finished with value: 0.7843543913554087 and parameters: {'random_state': 42, 'learning_rate': 0.05157087580732268, 'l2_regularization': 14, 'max_iter': 9944, 'max_leaf_nodes': 107, 'min_samples_leaf': 26, 'max_bins': 96, 'class_weight': None}. Best is trial 12 with value: 0.7861688418968245.


Best trial: 12. Best value: 0.786169:  70%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                | 21/30 [02:46<01:04,  7.15s/it]

[I 2023-10-16 14:11:25,065] Trial 20 finished with value: 0.7843215502350339 and parameters: {'random_state': 42, 'learning_rate': 0.03623564632336194, 'l2_regularization': 60, 'max_iter': 8913, 'max_leaf_nodes': 316, 'min_samples_leaf': 185, 'max_bins': 57, 'class_weight': None}. Best is trial 12 with value: 0.7861688418968245.


Best trial: 12. Best value: 0.786169:  73%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                           | 22/30 [02:51<00:52,  6.54s/it]

[I 2023-10-16 14:11:30,203] Trial 21 finished with value: 0.7860666165473155 and parameters: {'random_state': 42, 'learning_rate': 0.06756232594901251, 'l2_regularization': 22, 'max_iter': 9674, 'max_leaf_nodes': 6, 'min_samples_leaf': 50, 'max_bins': 89, 'class_weight': None}. Best is trial 12 with value: 0.7861688418968245.


Best trial: 12. Best value: 0.786169:  77%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                     | 23/30 [02:56<00:43,  6.21s/it]

[I 2023-10-16 14:11:35,639] Trial 22 finished with value: 0.7849399038320666 and parameters: {'random_state': 42, 'learning_rate': 0.05945296448930161, 'l2_regularization': 33, 'max_iter': 8057, 'max_leaf_nodes': 57, 'min_samples_leaf': 2, 'max_bins': 88, 'class_weight': None}. Best is trial 12 with value: 0.7861688418968245.


Best trial: 12. Best value: 0.786169:  80%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                | 24/30 [03:00<00:32,  5.48s/it]

[I 2023-10-16 14:11:39,404] Trial 23 finished with value: 0.7857603926263624 and parameters: {'random_state': 42, 'learning_rate': 0.07514843924623711, 'l2_regularization': 14, 'max_iter': 9025, 'max_leaf_nodes': 14, 'min_samples_leaf': 54, 'max_bins': 112, 'class_weight': None}. Best is trial 12 with value: 0.7861688418968245.


Best trial: 12. Best value: 0.786169:  83%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                           | 25/30 [03:06<00:27,  5.51s/it]

[I 2023-10-16 14:11:44,975] Trial 24 finished with value: 0.7840497958590563 and parameters: {'random_state': 42, 'learning_rate': 0.055725539157752274, 'l2_regularization': 42, 'max_iter': 2715, 'max_leaf_nodes': 129, 'min_samples_leaf': 34, 'max_bins': 55, 'class_weight': None}. Best is trial 12 with value: 0.7861688418968245.


Best trial: 12. Best value: 0.786169:  87%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                     | 26/30 [03:10<00:20,  5.06s/it]

[I 2023-10-16 14:11:48,993] Trial 25 finished with value: 0.7850342898071353 and parameters: {'random_state': 42, 'learning_rate': 0.04831117836964243, 'l2_regularization': 1, 'max_iter': 7244, 'max_leaf_nodes': 51, 'min_samples_leaf': 78, 'max_bins': 136, 'class_weight': None}. Best is trial 12 with value: 0.7861688418968245.


Best trial: 12. Best value: 0.786169:  90%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                | 27/30 [03:14<00:14,  4.84s/it]

[I 2023-10-16 14:11:53,323] Trial 26 finished with value: 0.7847093999052448 and parameters: {'random_state': 42, 'learning_rate': 0.06310852467779486, 'l2_regularization': 27, 'max_iter': 9969, 'max_leaf_nodes': 85, 'min_samples_leaf': 21, 'max_bins': 80, 'class_weight': None}. Best is trial 12 with value: 0.7861688418968245.


Best trial: 12. Best value: 0.786169:  93%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏          | 28/30 [03:22<00:11,  5.85s/it]

[I 2023-10-16 14:12:01,517] Trial 27 finished with value: 0.7837531806688487 and parameters: {'random_state': 42, 'learning_rate': 0.03869978488539974, 'l2_regularization': 54, 'max_iter': 6393, 'max_leaf_nodes': 215, 'min_samples_leaf': 47, 'max_bins': 104, 'class_weight': None}. Best is trial 12 with value: 0.7861688418968245.


Best trial: 12. Best value: 0.786169:  97%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌     | 29/30 [03:27<00:05,  5.45s/it]

[I 2023-10-16 14:12:06,035] Trial 28 finished with value: 0.7850467591210757 and parameters: {'random_state': 42, 'learning_rate': 0.049921907572925125, 'l2_regularization': 19, 'max_iter': 8406, 'max_leaf_nodes': 40, 'min_samples_leaf': 117, 'max_bins': 48, 'class_weight': None}. Best is trial 12 with value: 0.7861688418968245.


Best trial: 12. Best value: 0.786169: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 30/30 [03:31<00:00,  7.06s/it]

[I 2023-10-16 14:12:10,569] Trial 29 finished with value: 0.783053790414532 and parameters: {'random_state': 42, 'learning_rate': 0.08287264443906442, 'l2_regularization': 33, 'max_iter': 9237, 'max_leaf_nodes': 197, 'min_samples_leaf': 22, 'max_bins': 116, 'class_weight': None}. Best is trial 12 with value: 0.7861688418968245.





In [7]:
study.best_params, study.best_value

({'random_state': 42,
  'learning_rate': 0.04914413927409843,
  'l2_regularization': 33,
  'max_iter': 9946,
  'max_leaf_nodes': 9,
  'min_samples_leaf': 13,
  'max_bins': 80,
  'class_weight': None},
 0.7861688418968245)

In [8]:
pipeline = make_pipeline(partial_pipeline, HistGradientBoostingClassifier(**study.best_params))
pipeline.fit(X, y)

In [9]:
submission = test.copy()
submission['defects'] = pipeline.predict_proba(submission)[:, 1]
submission.defects.to_csv(OUTPUT)