In [1]:
import optuna
import os
from pathlib import Path
import kaggle
from catboost import CatBoostClassifier
import pandas as pd
import numpy as np
from sklearn.preprocessing import FunctionTransformer, PowerTransformer
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score

pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)

iskaggle = os.environ.get('KAGGLE_KERNEL_RUN_TYPE', '')
if iskaggle:
    path = Path('../input/playground-series-s3e23')
    orig_path = Path('../input/software-defect-prediction')
else:
    path = Path('../input/playground-series-s3e23')
    orig_path = Path('../input/software-defect-prediction')
    if not path.exists():
        kaggle.api.competition_download_cli(str(path))
        zipfile.ZipFile(f'{path}.zip').extractall(path)
    if not orig_path.exists():
        kaggle.api.competition_download_cli(str(path))
        zipfile.ZipFile(f'{path}.zip').extractall(path)

SEED = 42
SPLITS = 5
VERSION = 4
OUTPUT = f'cat_boost_clf_submisson_v{VERSION}.csv'

np.random.seed(SEED)

  from .autonotebook import tqdm as notebook_tqdm




In [2]:
train = pd.read_csv(path/'train.csv', index_col = 'id')
test = pd.read_csv(path/'test.csv', index_col = 'id')
orig_train = pd.read_csv(orig_path/'jm1.csv')

In [3]:
for object_features in list(orig_train.loc[:, orig_train.dtypes == 'O']):
    orig_train[object_features] = orig_train[object_features].replace({'?' : np.nan}).astype('float64')

In [4]:
X = pd.concat([train, orig_train])
y = X.pop('defects')

In [5]:
partial_pipeline = make_pipeline(SimpleImputer(), FunctionTransformer(func=np.log1p, validate=False), PowerTransformer())

In [6]:
def objective(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators',50,1000, log=True),
        'learning_rate': trial.suggest_float('learning_rate', 1e-4, 0.7,log=True),
        'depth': trial.suggest_int('depth',1, 12, log=True),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg',1e-8, 100,log=True),
        'model_size_reg': trial.suggest_float('model_size_reg',1e-8,100,log=True),
        'random_strength': trial.suggest_float('random_strength',1e-8,10,log=True),
        'subsample': trial.suggest_float("subsample", 0.1, 1),
        'verbose': trial.suggest_categorical('verbose', [False])
    }
    pipeline = make_pipeline(partial_pipeline, CatBoostClassifier(**params))
    scores = cross_val_score(pipeline, X, y, cv=SPLITS, scoring='roc_auc', n_jobs=-1)
    return scores.mean()

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=30, show_progress_bar=True)

[I 2023-10-15 23:17:09,272] A new study created in memory with name: no-name-91ff6aea-0804-4529-b247-fbbb561e4f06
Best trial: 0. Best value: 0.782675:   3%|█████▍                                                                                                                                                              | 1/30 [00:09<04:23,  9.08s/it]

[I 2023-10-15 23:17:18,350] Trial 0 finished with value: 0.7826748781858139 and parameters: {'n_estimators': 111, 'learning_rate': 0.5136924487289304, 'depth': 1, 'l2_leaf_reg': 1.0257324702859163e-05, 'model_size_reg': 10.513849125410047, 'random_strength': 0.02160733546109541, 'subsample': 0.5976251147682442, 'verbose': False}. Best is trial 0 with value: 0.7826748781858139.


Best trial: 1. Best value: 0.785069:   7%|██████████▉                                                                                                                                                         | 2/30 [00:57<14:57, 32.07s/it]

[I 2023-10-15 23:18:06,515] Trial 1 finished with value: 0.7850685132464286 and parameters: {'n_estimators': 501, 'learning_rate': 0.2164311228403368, 'depth': 4, 'l2_leaf_reg': 38.20788660408033, 'model_size_reg': 0.04553418773588748, 'random_strength': 1.6928907527691608e-06, 'subsample': 0.5226838280148083, 'verbose': False}. Best is trial 1 with value: 0.7850685132464286.


Best trial: 1. Best value: 0.785069:  10%|████████████████▍                                                                                                                                                   | 3/30 [01:44<17:37, 39.17s/it]

[I 2023-10-15 23:18:54,134] Trial 2 finished with value: 0.7747643749249293 and parameters: {'n_estimators': 130, 'learning_rate': 0.006883442237938239, 'depth': 11, 'l2_leaf_reg': 0.007149463588596692, 'model_size_reg': 4.548767063195552e-06, 'random_strength': 5.594151448674494, 'subsample': 0.21377285547296232, 'verbose': False}. Best is trial 1 with value: 0.7850685132464286.


Best trial: 1. Best value: 0.785069:  13%|█████████████████████▊                                                                                                                                              | 4/30 [01:59<12:45, 29.44s/it]

[I 2023-10-15 23:19:08,665] Trial 3 finished with value: 0.7831038899084162 and parameters: {'n_estimators': 120, 'learning_rate': 0.0067201011818123465, 'depth': 6, 'l2_leaf_reg': 9.253727631163342, 'model_size_reg': 2.908076620675221e-05, 'random_strength': 0.0018481761045782928, 'subsample': 0.44417620610461583, 'verbose': False}. Best is trial 1 with value: 0.7850685132464286.


Best trial: 1. Best value: 0.785069:  17%|███████████████████████████▎                                                                                                                                        | 5/30 [02:24<11:35, 27.83s/it]

[I 2023-10-15 23:19:33,641] Trial 4 finished with value: 0.7827267932426992 and parameters: {'n_estimators': 110, 'learning_rate': 0.0002634900968114012, 'depth': 9, 'l2_leaf_reg': 0.010003902461352021, 'model_size_reg': 7.856845207184245e-08, 'random_strength': 3.891296245071441e-06, 'subsample': 0.297526192410912, 'verbose': False}. Best is trial 1 with value: 0.7850685132464286.


Best trial: 1. Best value: 0.785069:  20%|████████████████████████████████▊                                                                                                                                   | 6/30 [02:44<10:03, 25.16s/it]

[I 2023-10-15 23:19:53,614] Trial 5 finished with value: 0.7806116863943513 and parameters: {'n_estimators': 208, 'learning_rate': 0.0015544406568206925, 'depth': 4, 'l2_leaf_reg': 33.80548554470423, 'model_size_reg': 1.4767479968644122e-06, 'random_strength': 1.2407544214653031e-08, 'subsample': 0.728251022455768, 'verbose': False}. Best is trial 1 with value: 0.7850685132464286.


Best trial: 1. Best value: 0.785069:  23%|██████████████████████████████████████▎                                                                                                                             | 7/30 [02:54<07:44, 20.18s/it]

[I 2023-10-15 23:20:03,528] Trial 6 finished with value: 0.7785339051260929 and parameters: {'n_estimators': 89, 'learning_rate': 0.005779351011576047, 'depth': 3, 'l2_leaf_reg': 1.461415826894188e-07, 'model_size_reg': 6.434088514163684, 'random_strength': 0.4476019445902602, 'subsample': 0.9496622683218652, 'verbose': False}. Best is trial 1 with value: 0.7850685132464286.


Best trial: 1. Best value: 0.785069:  27%|███████████████████████████████████████████▋                                                                                                                        | 8/30 [03:27<08:53, 24.24s/it]

[I 2023-10-15 23:20:36,475] Trial 7 finished with value: 0.7839409915912264 and parameters: {'n_estimators': 537, 'learning_rate': 0.017395564385807534, 'depth': 2, 'l2_leaf_reg': 0.4040596079724189, 'model_size_reg': 0.0006291222674180485, 'random_strength': 4.320116138468488e-05, 'subsample': 0.36579050396843493, 'verbose': False}. Best is trial 1 with value: 0.7850685132464286.


Best trial: 1. Best value: 0.785069:  30%|█████████████████████████████████████████████████▏                                                                                                                  | 9/30 [04:06<10:04, 28.80s/it]

[I 2023-10-15 23:21:15,298] Trial 8 finished with value: 0.7752797979748183 and parameters: {'n_estimators': 185, 'learning_rate': 0.008841045052328954, 'depth': 9, 'l2_leaf_reg': 61.42375209931475, 'model_size_reg': 1.2364239242188677e-05, 'random_strength': 6.194710472751407, 'subsample': 0.5291607179180555, 'verbose': False}. Best is trial 1 with value: 0.7850685132464286.


Best trial: 1. Best value: 0.785069:  33%|██████████████████████████████████████████████████████▎                                                                                                            | 10/30 [04:15<07:34, 22.71s/it]

[I 2023-10-15 23:21:24,382] Trial 9 finished with value: 0.783789683863916 and parameters: {'n_estimators': 87, 'learning_rate': 0.02606839041439923, 'depth': 4, 'l2_leaf_reg': 1.9542914533386442e-05, 'model_size_reg': 40.66589634680576, 'random_strength': 0.00010880160389598785, 'subsample': 0.31498267590453954, 'verbose': False}. Best is trial 1 with value: 0.7850685132464286.


Best trial: 1. Best value: 0.785069:  37%|███████████████████████████████████████████████████████████▊                                                                                                       | 11/30 [05:08<10:10, 32.12s/it]

[I 2023-10-15 23:22:17,818] Trial 10 finished with value: 0.7834813771911867 and parameters: {'n_estimators': 985, 'learning_rate': 0.6826715453750103, 'depth': 2, 'l2_leaf_reg': 0.4773839550541704, 'model_size_reg': 0.0604751801453532, 'random_strength': 6.175239342051588e-07, 'subsample': 0.10804569396604419, 'verbose': False}. Best is trial 1 with value: 0.7850685132464286.


Best trial: 11. Best value: 0.785103:  40%|████████████████████████████████████████████████████████████████▊                                                                                                 | 12/30 [05:40<09:37, 32.06s/it]

[I 2023-10-15 23:22:49,757] Trial 11 finished with value: 0.7851034545774858 and parameters: {'n_estimators': 477, 'learning_rate': 0.09612831569643877, 'depth': 2, 'l2_leaf_reg': 0.4855778203408849, 'model_size_reg': 0.0030434344048517995, 'random_strength': 2.659716443822017e-05, 'subsample': 0.4239681871985115, 'verbose': False}. Best is trial 11 with value: 0.7851034545774858.


Best trial: 11. Best value: 0.785103:  43%|██████████████████████████████████████████████████████████████████████▏                                                                                           | 13/30 [06:09<08:47, 31.00s/it]

[I 2023-10-15 23:23:18,317] Trial 12 finished with value: 0.783521821483428 and parameters: {'n_estimators': 442, 'learning_rate': 0.1307230584840618, 'depth': 1, 'l2_leaf_reg': 0.6552270934021713, 'model_size_reg': 0.009795013825310754, 'random_strength': 3.9464415788327095e-06, 'subsample': 0.5954079768723766, 'verbose': False}. Best is trial 11 with value: 0.7851034545774858.


Best trial: 11. Best value: 0.785103:  47%|███████████████████████████████████████████████████████████████████████████▌                                                                                      | 14/30 [06:14<06:11, 23.21s/it]

[I 2023-10-15 23:23:23,516] Trial 13 finished with value: 0.7829234600801459 and parameters: {'n_estimators': 52, 'learning_rate': 0.09676534185709669, 'depth': 2, 'l2_leaf_reg': 97.58286145316616, 'model_size_reg': 0.15569414401052525, 'random_strength': 0.001588197972833378, 'subsample': 0.4250842971637058, 'verbose': False}. Best is trial 11 with value: 0.7851034545774858.


Best trial: 14. Best value: 0.78549:  50%|█████████████████████████████████████████████████████████████████████████████████▌                                                                                 | 15/30 [06:47<06:35, 26.38s/it]

[I 2023-10-15 23:23:57,246] Trial 14 finished with value: 0.7854901098183135 and parameters: {'n_estimators': 374, 'learning_rate': 0.16208623915916606, 'depth': 3, 'l2_leaf_reg': 1.3992794058497722, 'model_size_reg': 0.0007910632286603792, 'random_strength': 1.0435351247286603e-07, 'subsample': 0.7287747224556846, 'verbose': False}. Best is trial 14 with value: 0.7854901098183135.


Best trial: 14. Best value: 0.78549:  53%|██████████████████████████████████████████████████████████████████████████████████████▉                                                                            | 16/30 [07:08<05:46, 24.73s/it]

[I 2023-10-15 23:24:18,160] Trial 15 finished with value: 0.7820498820350361 and parameters: {'n_estimators': 307, 'learning_rate': 0.060485122316579536, 'depth': 1, 'l2_leaf_reg': 0.036797860324877696, 'model_size_reg': 0.000689770286658965, 'random_strength': 3.145810225337308e-08, 'subsample': 0.7283498043153898, 'verbose': False}. Best is trial 14 with value: 0.7854901098183135.


Best trial: 14. Best value: 0.78549:  57%|████████████████████████████████████████████████████████████████████████████████████████████▎                                                                      | 17/30 [07:35<05:29, 25.38s/it]

[I 2023-10-15 23:24:45,027] Trial 16 finished with value: 0.7848081225124777 and parameters: {'n_estimators': 318, 'learning_rate': 0.04870675201162072, 'depth': 2, 'l2_leaf_reg': 1.9681791778262596, 'model_size_reg': 0.00032548508248135744, 'random_strength': 1.1465996242194638e-07, 'subsample': 0.7086850973891773, 'verbose': False}. Best is trial 14 with value: 0.7854901098183135.


Best trial: 14. Best value: 0.78549:  60%|█████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                 | 18/30 [08:32<06:58, 34.84s/it]

[I 2023-10-15 23:25:41,883] Trial 17 finished with value: 0.7840978089539382 and parameters: {'n_estimators': 710, 'learning_rate': 0.15833765662029883, 'depth': 3, 'l2_leaf_reg': 0.04483901286225518, 'model_size_reg': 0.005530856255913702, 'random_strength': 1.6274313799617203e-07, 'subsample': 0.8768093884595843, 'verbose': False}. Best is trial 14 with value: 0.7854901098183135.


Best trial: 14. Best value: 0.78549:  63%|███████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                           | 19/30 [09:06<06:19, 34.47s/it]

[I 2023-10-15 23:26:15,511] Trial 18 finished with value: 0.7679270949579905 and parameters: {'n_estimators': 312, 'learning_rate': 0.34642123273198316, 'depth': 5, 'l2_leaf_reg': 0.0009311296604800526, 'model_size_reg': 0.00017484035207746183, 'random_strength': 2.0003687266588266e-05, 'subsample': 0.8237347783680155, 'verbose': False}. Best is trial 14 with value: 0.7854901098183135.


Best trial: 14. Best value: 0.78549:  67%|████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                      | 20/30 [09:57<06:34, 39.40s/it]

[I 2023-10-15 23:27:06,408] Trial 19 finished with value: 0.7811643087109277 and parameters: {'n_estimators': 418, 'learning_rate': 0.17693355637855585, 'depth': 6, 'l2_leaf_reg': 3.8417355013157413, 'model_size_reg': 1.1594191536821805, 'random_strength': 1.3203394987059696e-05, 'subsample': 0.9955426609810797, 'verbose': False}. Best is trial 14 with value: 0.7854901098183135.


Best trial: 20. Best value: 0.785864:  70%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                | 21/30 [10:46<06:21, 42.38s/it]

[I 2023-10-15 23:27:55,719] Trial 20 finished with value: 0.7858636509474153 and parameters: {'n_estimators': 647, 'learning_rate': 0.062126136754221545, 'depth': 3, 'l2_leaf_reg': 0.1675463052495161, 'model_size_reg': 0.003576422478697353, 'random_strength': 4.2456958775047355e-07, 'subsample': 0.6625232716854229, 'verbose': False}. Best is trial 20 with value: 0.7858636509474153.


Best trial: 20. Best value: 0.785864:  73%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                           | 22/30 [11:38<06:02, 45.37s/it]

[I 2023-10-15 23:28:48,057] Trial 21 finished with value: 0.785812250445509 and parameters: {'n_estimators': 676, 'learning_rate': 0.05812344624790334, 'depth': 3, 'l2_leaf_reg': 0.1502395899267988, 'model_size_reg': 0.005369488182151018, 'random_strength': 4.221152602785005e-07, 'subsample': 0.6535383849828311, 'verbose': False}. Best is trial 20 with value: 0.7858636509474153.


Best trial: 20. Best value: 0.785864:  77%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                     | 23/30 [12:35<05:41, 48.82s/it]

[I 2023-10-15 23:29:44,946] Trial 22 finished with value: 0.7858243092461651 and parameters: {'n_estimators': 743, 'learning_rate': 0.051115117788009486, 'depth': 3, 'l2_leaf_reg': 0.10005121465572954, 'model_size_reg': 0.012258713863063308, 'random_strength': 3.0881614934981724e-07, 'subsample': 0.6708594943429566, 'verbose': False}. Best is trial 20 with value: 0.7858636509474153.


Best trial: 20. Best value: 0.785864:  80%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                | 24/30 [13:36<05:14, 52.34s/it]

[I 2023-10-15 23:30:45,496] Trial 23 finished with value: 0.7857194222943943 and parameters: {'n_estimators': 786, 'learning_rate': 0.03129805985812341, 'depth': 3, 'l2_leaf_reg': 0.09897637329068587, 'model_size_reg': 0.33508844808251115, 'random_strength': 8.386450639726086e-07, 'subsample': 0.647420290999925, 'verbose': False}. Best is trial 20 with value: 0.7858636509474153.


Best trial: 20. Best value: 0.785864:  83%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                           | 25/30 [14:32<04:27, 53.53s/it]

[I 2023-10-15 23:31:41,780] Trial 24 finished with value: 0.78561152286463 and parameters: {'n_estimators': 635, 'learning_rate': 0.05181577317020315, 'depth': 4, 'l2_leaf_reg': 0.0021688782464445748, 'model_size_reg': 0.014732472184980682, 'random_strength': 4.007342355974252e-07, 'subsample': 0.649597060641555, 'verbose': False}. Best is trial 20 with value: 0.7858636509474153.


Best trial: 20. Best value: 0.785864:  87%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                     | 26/30 [16:31<04:52, 73.14s/it]

[I 2023-10-15 23:33:40,686] Trial 25 finished with value: 0.7854577893812749 and parameters: {'n_estimators': 963, 'learning_rate': 0.019232709773457157, 'depth': 6, 'l2_leaf_reg': 0.09007774490683974, 'model_size_reg': 0.018545702744389787, 'random_strength': 1.3585482793368186e-08, 'subsample': 0.7974168410054703, 'verbose': False}. Best is trial 20 with value: 0.7858636509474153.


Best trial: 20. Best value: 0.785864:  90%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                | 27/30 [17:26<03:23, 67.75s/it]

[I 2023-10-15 23:34:35,851] Trial 26 finished with value: 0.78376063209038 and parameters: {'n_estimators': 629, 'learning_rate': 0.31293160122712205, 'depth': 3, 'l2_leaf_reg': 11.672261171965442, 'model_size_reg': 0.0029972224735246447, 'random_strength': 4.4893829867850975e-06, 'subsample': 0.6683659670755149, 'verbose': False}. Best is trial 20 with value: 0.7858636509474153.


Best trial: 20. Best value: 0.785864:  93%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏          | 28/30 [18:19<02:06, 63.22s/it]

[I 2023-10-15 23:35:28,500] Trial 27 finished with value: 0.7856555641870002 and parameters: {'n_estimators': 781, 'learning_rate': 0.07259319675088752, 'depth': 2, 'l2_leaf_reg': 0.15639409631405649, 'model_size_reg': 0.5332078082922016, 'random_strength': 7.922614396581367e-08, 'subsample': 0.5779107057140932, 'verbose': False}. Best is trial 20 with value: 0.7858636509474153.


Best trial: 20. Best value: 0.785864:  97%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌     | 29/30 [19:42<01:09, 69.38s/it]

[I 2023-10-15 23:36:52,243] Trial 28 finished with value: 0.7832165695027731 and parameters: {'n_estimators': 558, 'learning_rate': 0.04299399883266898, 'depth': 7, 'l2_leaf_reg': 0.017687755506732163, 'model_size_reg': 0.12615824538073864, 'random_strength': 3.66917851855476e-07, 'subsample': 0.7724669134773252, 'verbose': False}. Best is trial 20 with value: 0.7858636509474153.


Best trial: 20. Best value: 0.785864: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 30/30 [21:20<00:00, 42.67s/it]

[I 2023-10-15 23:38:29,294] Trial 29 finished with value: 0.7455749992195538 and parameters: {'n_estimators': 878, 'learning_rate': 0.4145251922299088, 'depth': 5, 'l2_leaf_reg': 0.0019277271850693459, 'model_size_reg': 8.459567828062162e-05, 'random_strength': 1.4311978698091337e-06, 'subsample': 0.6297096366163148, 'verbose': False}. Best is trial 20 with value: 0.7858636509474153.





In [7]:
study.best_params, study.best_value

({'n_estimators': 647,
  'learning_rate': 0.062126136754221545,
  'depth': 3,
  'l2_leaf_reg': 0.1675463052495161,
  'model_size_reg': 0.003576422478697353,
  'random_strength': 4.2456958775047355e-07,
  'subsample': 0.6625232716854229,
  'verbose': False},
 0.7858636509474153)

In [8]:
pipeline = make_pipeline(partial_pipeline, CatBoostClassifier(**study.best_params))
pipeline.fit(X, y)

In [9]:
submission = test.copy()
submission['defects'] = pipeline.predict_proba(submission)[:, 1]
submission.defects.to_csv(OUTPUT)