In [1]:
import optuna
import os
from pathlib import Path
import kaggle
from catboost import CatBoostClassifier
import pandas as pd
import numpy as np
from sklearn.preprocessing import FunctionTransformer, PowerTransformer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score

pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)

iskaggle = os.environ.get('KAGGLE_KERNEL_RUN_TYPE', '')
if iskaggle:
    path = Path('../input/playground-series-s3e23')
    orig_path = Path('../input/software-defect-prediction')
else:
    path = Path('../input/playground-series-s3e23')
    orig_path = Path('../input/software-defect-prediction')
    if not path.exists():
        kaggle.api.competition_download_cli(str(path))
        zipfile.ZipFile(f'{path}.zip').extractall(path)
    if not orig_path.exists():
        kaggle.api.competition_download_cli(str(path))
        zipfile.ZipFile(f'{path}.zip').extractall(path)

SEED = 42
SPLITS = 5
VERSION = 5
OUTPUT = f'cat_boost_clf_submisson_v{VERSION}.csv'
N_TRIALS = 30

np.random.seed(SEED)

  from .autonotebook import tqdm as notebook_tqdm




In [2]:
train = pd.read_csv(path/'train.csv', index_col = 'id')
test = pd.read_csv(path/'test.csv', index_col = 'id')
orig_train = pd.read_csv(orig_path/'jm1.csv')

In [3]:
for object_features in list(orig_train.loc[:, orig_train.dtypes == 'O']):
    orig_train[object_features] = orig_train[object_features].replace({'?' : np.nan}).astype('float64')

In [4]:
X = pd.concat([train, orig_train])
y = X.pop('defects')

In [5]:
partial_pipeline = make_pipeline(ColumnTransformer([('drop', 'drop',
                                  ['iv(g)', 't', 'b', 'n',
                                   'lOCode', 'v', 'branchCount',
                                   'e', 'i', 'lOComment'])],
                                remainder='passthrough'), SimpleImputer(), FunctionTransformer(func=np.log1p, validate=False), PowerTransformer())

In [6]:
def objective(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators',50,1000, log=True),
        'learning_rate': trial.suggest_float('learning_rate', 1e-4, 0.7,log=True),
        'depth': trial.suggest_int('depth',1, 12, log=True),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg',1e-8, 100,log=True),
        'model_size_reg': trial.suggest_float('model_size_reg',1e-8,100,log=True),
        'random_strength': trial.suggest_float('random_strength',1e-8,10,log=True),
        'subsample': trial.suggest_float("subsample", 0.1, 1),
        'verbose': trial.suggest_categorical('verbose', [False])
    }
    pipeline = make_pipeline(partial_pipeline, CatBoostClassifier(**params))
    scores = cross_val_score(pipeline, X, y, cv=SPLITS, scoring='roc_auc', n_jobs=-1)
    return scores.mean()

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=N_TRIALS, show_progress_bar=True)

[I 2023-10-15 23:48:43,218] A new study created in memory with name: no-name-38360b1d-8643-4f5c-a844-bcedf12d92f3
Best trial: 0. Best value: 0.785131:   3%|█████▍                                                                                                                                                              | 1/30 [00:09<04:49,  9.98s/it]

[I 2023-10-15 23:48:53,198] Trial 0 finished with value: 0.7851305010976195 and parameters: {'n_estimators': 84, 'learning_rate': 0.036288549427801804, 'depth': 9, 'l2_leaf_reg': 0.0006792275201233925, 'model_size_reg': 0.00038303754039768024, 'random_strength': 2.304579022122091e-08, 'subsample': 0.49865001146905263, 'verbose': False}. Best is trial 0 with value: 0.7851305010976195.


Best trial: 0. Best value: 0.785131:   7%|██████████▉                                                                                                                                                         | 2/30 [00:41<10:36, 22.72s/it]

[I 2023-10-15 23:49:24,830] Trial 1 finished with value: 0.7839258705247862 and parameters: {'n_estimators': 477, 'learning_rate': 0.16390018990311508, 'depth': 4, 'l2_leaf_reg': 2.4050997923277563e-08, 'model_size_reg': 3.507379710004656, 'random_strength': 8.196811352009398e-06, 'subsample': 0.1354472230353554, 'verbose': False}. Best is trial 0 with value: 0.7851305010976195.


Best trial: 0. Best value: 0.785131:  10%|████████████████▍                                                                                                                                                   | 3/30 [00:53<08:02, 17.86s/it]

[I 2023-10-15 23:49:36,901] Trial 2 finished with value: 0.7801781539254501 and parameters: {'n_estimators': 208, 'learning_rate': 0.0046016976005375946, 'depth': 2, 'l2_leaf_reg': 0.0013698377987390743, 'model_size_reg': 0.029607593148866556, 'random_strength': 8.889948334772857e-07, 'subsample': 0.4192462394509727, 'verbose': False}. Best is trial 0 with value: 0.7851305010976195.


Best trial: 0. Best value: 0.785131:  13%|█████████████████████▊                                                                                                                                              | 4/30 [01:24<09:55, 22.89s/it]

[I 2023-10-15 23:50:07,501] Trial 3 finished with value: 0.7405508082306629 and parameters: {'n_estimators': 529, 'learning_rate': 0.0004029521264193143, 'depth': 1, 'l2_leaf_reg': 0.023002552598998027, 'model_size_reg': 0.12188026033634645, 'random_strength': 1.8761145424150982e-08, 'subsample': 0.2646685488672368, 'verbose': False}. Best is trial 0 with value: 0.7851305010976195.


Best trial: 0. Best value: 0.785131:  17%|███████████████████████████▎                                                                                                                                        | 5/30 [01:45<09:17, 22.29s/it]

[I 2023-10-15 23:50:28,741] Trial 4 finished with value: 0.7847435239607319 and parameters: {'n_estimators': 314, 'learning_rate': 0.0526639766799753, 'depth': 2, 'l2_leaf_reg': 0.5743703225104324, 'model_size_reg': 6.995434215331852e-07, 'random_strength': 1.36241635788834e-07, 'subsample': 0.7894888316200115, 'verbose': False}. Best is trial 0 with value: 0.7851305010976195.


Best trial: 0. Best value: 0.785131:  20%|████████████████████████████████▊                                                                                                                                   | 6/30 [02:23<11:00, 27.54s/it]

[I 2023-10-15 23:51:06,468] Trial 5 finished with value: 0.7847041124727264 and parameters: {'n_estimators': 498, 'learning_rate': 0.310086519788163, 'depth': 3, 'l2_leaf_reg': 41.20069090843206, 'model_size_reg': 47.54907857216679, 'random_strength': 1.5046509628547675e-07, 'subsample': 0.2038352931074583, 'verbose': False}. Best is trial 0 with value: 0.7851305010976195.


Best trial: 0. Best value: 0.785131:  23%|██████████████████████████████████████▎                                                                                                                             | 7/30 [02:28<07:42, 20.12s/it]

[I 2023-10-15 23:51:11,311] Trial 6 finished with value: 0.7110282882858477 and parameters: {'n_estimators': 146, 'learning_rate': 0.00022821114451521745, 'depth': 1, 'l2_leaf_reg': 0.001256174761703252, 'model_size_reg': 0.004785056165556147, 'random_strength': 1.749768233979403e-05, 'subsample': 0.9831813306758427, 'verbose': False}. Best is trial 0 with value: 0.7851305010976195.


Best trial: 0. Best value: 0.785131:  27%|███████████████████████████████████████████▋                                                                                                                        | 8/30 [02:36<06:00, 16.39s/it]

[I 2023-10-15 23:51:19,707] Trial 7 finished with value: 0.7829509716017516 and parameters: {'n_estimators': 126, 'learning_rate': 0.011476599078706172, 'depth': 5, 'l2_leaf_reg': 3.6994523364715315e-07, 'model_size_reg': 7.82231062549053e-07, 'random_strength': 3.599063138625654e-07, 'subsample': 0.8085863244452605, 'verbose': False}. Best is trial 0 with value: 0.7851305010976195.


Best trial: 0. Best value: 0.785131:  30%|█████████████████████████████████████████████████▏                                                                                                                  | 9/30 [02:42<04:37, 13.24s/it]

[I 2023-10-15 23:51:26,011] Trial 8 finished with value: 0.7721031123837594 and parameters: {'n_estimators': 151, 'learning_rate': 0.02670314706111102, 'depth': 1, 'l2_leaf_reg': 0.0759779088979782, 'model_size_reg': 9.789997358885623e-06, 'random_strength': 9.888127594395646, 'subsample': 0.9751980967774301, 'verbose': False}. Best is trial 0 with value: 0.7851305010976195.


Best trial: 0. Best value: 0.785131:  33%|██████████████████████████████████████████████████████▎                                                                                                            | 10/30 [02:59<04:48, 14.42s/it]

[I 2023-10-15 23:51:43,100] Trial 9 finished with value: 0.7824995675990771 and parameters: {'n_estimators': 288, 'learning_rate': 0.011051499206290916, 'depth': 2, 'l2_leaf_reg': 0.0314060662729639, 'model_size_reg': 0.014400031783570523, 'random_strength': 2.166957616904918e-07, 'subsample': 0.2642866522772628, 'verbose': False}. Best is trial 0 with value: 0.7851305010976195.


Best trial: 0. Best value: 0.785131:  37%|███████████████████████████████████████████████████████████▊                                                                                                       | 11/30 [03:05<03:41, 11.66s/it]

[I 2023-10-15 23:51:48,489] Trial 10 finished with value: 0.770899207542094 and parameters: {'n_estimators': 55, 'learning_rate': 0.6903770413725362, 'depth': 8, 'l2_leaf_reg': 8.398047477141511e-06, 'model_size_reg': 1.2996175203409115e-08, 'random_strength': 0.0017212286405919013, 'subsample': 0.5532402627289247, 'verbose': False}. Best is trial 0 with value: 0.7851305010976195.


Best trial: 0. Best value: 0.785131:  40%|█████████████████████████████████████████████████████████████████▏                                                                                                 | 12/30 [04:53<12:17, 40.95s/it]

[I 2023-10-15 23:53:36,446] Trial 11 finished with value: 0.7834563028737707 and parameters: {'n_estimators': 933, 'learning_rate': 0.06286748964450488, 'depth': 7, 'l2_leaf_reg': 6.262194094662259, 'model_size_reg': 6.476740426019116e-05, 'random_strength': 0.00013286776405941453, 'subsample': 0.6317244418447733, 'verbose': False}. Best is trial 0 with value: 0.7851305010976195.


Best trial: 0. Best value: 0.785131:  43%|██████████████████████████████████████████████████████████████████████▋                                                                                            | 13/30 [05:09<09:26, 33.34s/it]

[I 2023-10-15 23:53:52,259] Trial 12 finished with value: 0.7848279877030858 and parameters: {'n_estimators': 81, 'learning_rate': 0.05662095059135318, 'depth': 11, 'l2_leaf_reg': 3.588726299645167e-05, 'model_size_reg': 0.0002188882128962836, 'random_strength': 1.184734900182265e-08, 'subsample': 0.697975844933284, 'verbose': False}. Best is trial 0 with value: 0.7851305010976195.


Best trial: 0. Best value: 0.785131:  47%|████████████████████████████████████████████████████████████████████████████                                                                                       | 14/30 [05:26<07:38, 28.64s/it]

[I 2023-10-15 23:54:10,036] Trial 13 finished with value: 0.7832129208014368 and parameters: {'n_estimators': 68, 'learning_rate': 0.002737983357317305, 'depth': 12, 'l2_leaf_reg': 2.4328413195685476e-05, 'model_size_reg': 0.0005830479363821902, 'random_strength': 1.007933111683627e-08, 'subsample': 0.44002041508395345, 'verbose': False}. Best is trial 0 with value: 0.7851305010976195.


Best trial: 0. Best value: 0.785131:  50%|█████████████████████████████████████████████████████████████████████████████████▌                                                                                 | 15/30 [05:50<06:47, 27.20s/it]

[I 2023-10-15 23:54:33,903] Trial 14 finished with value: 0.7842639970885406 and parameters: {'n_estimators': 85, 'learning_rate': 0.0761104610944574, 'depth': 12, 'l2_leaf_reg': 3.3323540324552535e-05, 'model_size_reg': 0.0002751057521142703, 'random_strength': 1.0182589095462386e-08, 'subsample': 0.653296138078697, 'verbose': False}. Best is trial 0 with value: 0.7851305010976195.


Best trial: 0. Best value: 0.785131:  53%|██████████████████████████████████████████████████████████████████████████████████████▉                                                                            | 16/30 [05:58<04:59, 21.39s/it]

[I 2023-10-15 23:54:41,787] Trial 15 finished with value: 0.784906113579551 and parameters: {'n_estimators': 93, 'learning_rate': 0.025132142215284835, 'depth': 8, 'l2_leaf_reg': 0.0002314184463882641, 'model_size_reg': 0.0005542181524415887, 'random_strength': 1.1688312577305426e-05, 'subsample': 0.4538914155389415, 'verbose': False}. Best is trial 0 with value: 0.7851305010976195.


Best trial: 0. Best value: 0.785131:  57%|████████████████████████████████████████████████████████████████████████████████████████████▎                                                                      | 17/30 [06:06<03:46, 17.42s/it]

[I 2023-10-15 23:54:49,977] Trial 16 finished with value: 0.7821519896916638 and parameters: {'n_estimators': 100, 'learning_rate': 0.0018579852937830566, 'depth': 7, 'l2_leaf_reg': 0.0009818366409998637, 'model_size_reg': 0.0021753346816252906, 'random_strength': 3.7739901951621936e-06, 'subsample': 0.49889465500857666, 'verbose': False}. Best is trial 0 with value: 0.7851305010976195.


Best trial: 0. Best value: 0.785131:  60%|█████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                 | 18/30 [06:11<02:41, 13.49s/it]

[I 2023-10-15 23:54:54,306] Trial 17 finished with value: 0.7825690866963273 and parameters: {'n_estimators': 55, 'learning_rate': 0.022216624201993097, 'depth': 5, 'l2_leaf_reg': 0.0003590514226131483, 'model_size_reg': 0.13254053959463274, 'random_strength': 0.00018591628910927068, 'subsample': 0.37716907978784026, 'verbose': False}. Best is trial 0 with value: 0.7851305010976195.


Best trial: 0. Best value: 0.785131:  63%|███████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                           | 19/30 [06:24<02:29, 13.59s/it]

[I 2023-10-15 23:55:08,128] Trial 18 finished with value: 0.7843010105624311 and parameters: {'n_estimators': 109, 'learning_rate': 0.006515185151943195, 'depth': 9, 'l2_leaf_reg': 3.9073813546542764e-07, 'model_size_reg': 2.1694157051379455e-05, 'random_strength': 0.002334391952511371, 'subsample': 0.3431927410086374, 'verbose': False}. Best is trial 0 with value: 0.7851305010976195.


Best trial: 19. Best value: 0.785628:  67%|████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                      | 20/30 [06:30<01:51, 11.12s/it]

[I 2023-10-15 23:55:13,514] Trial 19 finished with value: 0.7856276957070587 and parameters: {'n_estimators': 70, 'learning_rate': 0.14774846948936698, 'depth': 6, 'l2_leaf_reg': 0.006540458144923171, 'model_size_reg': 0.0029939513433718964, 'random_strength': 1.7259320203488801e-06, 'subsample': 0.572515814380104, 'verbose': False}. Best is trial 19 with value: 0.7856276957070587.


Best trial: 19. Best value: 0.785628:  70%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                | 21/30 [06:35<01:22,  9.22s/it]

[I 2023-10-15 23:55:18,290] Trial 20 finished with value: 0.7853063819611772 and parameters: {'n_estimators': 50, 'learning_rate': 0.19594568001299062, 'depth': 6, 'l2_leaf_reg': 0.006127596271131124, 'model_size_reg': 0.4596098875169447, 'random_strength': 1.3682677662222717e-06, 'subsample': 0.5490442779759255, 'verbose': False}. Best is trial 19 with value: 0.7856276957070587.


Best trial: 19. Best value: 0.785628:  73%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                           | 22/30 [06:39<01:03,  7.93s/it]

[I 2023-10-15 23:55:23,216] Trial 21 finished with value: 0.785532979476041 and parameters: {'n_estimators': 52, 'learning_rate': 0.17051294862863337, 'depth': 6, 'l2_leaf_reg': 0.011960903416528064, 'model_size_reg': 0.37306005914533563, 'random_strength': 1.507259046129325e-06, 'subsample': 0.5546064804770396, 'verbose': False}. Best is trial 19 with value: 0.7856276957070587.


Best trial: 19. Best value: 0.785628:  77%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                     | 23/30 [06:44<00:48,  6.86s/it]

[I 2023-10-15 23:55:27,582] Trial 22 finished with value: 0.785450785969059 and parameters: {'n_estimators': 51, 'learning_rate': 0.16187159702599033, 'depth': 5, 'l2_leaf_reg': 0.00822123279646648, 'model_size_reg': 1.0671267100234985, 'random_strength': 1.559700052292833e-06, 'subsample': 0.5667832790074944, 'verbose': False}. Best is trial 19 with value: 0.7856276957070587.


Best trial: 19. Best value: 0.785628:  80%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                | 24/30 [06:48<00:36,  6.03s/it]

[I 2023-10-15 23:55:31,681] Trial 23 finished with value: 0.7838973655589232 and parameters: {'n_estimators': 63, 'learning_rate': 0.6556820008557377, 'depth': 4, 'l2_leaf_reg': 0.18674132621920428, 'model_size_reg': 1.3461833686458622, 'random_strength': 1.075485302066094e-06, 'subsample': 0.5846489264729905, 'verbose': False}. Best is trial 19 with value: 0.7856276957070587.


Best trial: 24. Best value: 0.785657:  83%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                           | 25/30 [06:54<00:30,  6.08s/it]

[I 2023-10-15 23:55:37,877] Trial 24 finished with value: 0.7856567284821686 and parameters: {'n_estimators': 68, 'learning_rate': 0.12029161791941848, 'depth': 6, 'l2_leaf_reg': 0.009418069893643286, 'model_size_reg': 7.007768945351371, 'random_strength': 1.8341119786271264e-05, 'subsample': 0.6007074418762239, 'verbose': False}. Best is trial 24 with value: 0.7856567284821686.


Best trial: 24. Best value: 0.785657:  87%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                     | 26/30 [07:00<00:23,  5.88s/it]

[I 2023-10-15 23:55:43,281] Trial 25 finished with value: 0.7855234194825106 and parameters: {'n_estimators': 68, 'learning_rate': 0.09389435723096855, 'depth': 6, 'l2_leaf_reg': 0.18767051676941876, 'model_size_reg': 12.737070673000222, 'random_strength': 5.341024289278339e-05, 'subsample': 0.7054720984561461, 'verbose': False}. Best is trial 24 with value: 0.7856567284821686.


Best trial: 24. Best value: 0.785657:  90%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                | 27/30 [07:04<00:16,  5.44s/it]

[I 2023-10-15 23:55:47,687] Trial 26 finished with value: 0.7855281148460351 and parameters: {'n_estimators': 70, 'learning_rate': 0.3660498550546002, 'depth': 3, 'l2_leaf_reg': 0.6755502735563107, 'model_size_reg': 39.92534585253847, 'random_strength': 2.6611896783947394e-05, 'subsample': 0.6215799981415269, 'verbose': False}. Best is trial 24 with value: 0.7856567284821686.


Best trial: 24. Best value: 0.785657:  93%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏          | 28/30 [07:22<00:18,  9.13s/it]

[I 2023-10-15 23:56:05,431] Trial 27 finished with value: 0.7837514698844639 and parameters: {'n_estimators': 109, 'learning_rate': 0.11670413273149334, 'depth': 10, 'l2_leaf_reg': 0.005763545630530416, 'model_size_reg': 7.503852286957007, 'random_strength': 6.6178244666951625e-06, 'subsample': 0.5001954604852553, 'verbose': False}. Best is trial 24 with value: 0.7856567284821686.


Best trial: 24. Best value: 0.785657:  97%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌     | 29/30 [07:30<00:08,  8.87s/it]

[I 2023-10-15 23:56:13,687] Trial 28 finished with value: 0.7841105693542609 and parameters: {'n_estimators': 75, 'learning_rate': 0.2764030876496122, 'depth': 7, 'l2_leaf_reg': 0.038416031515949446, 'model_size_reg': 0.2663932413213687, 'random_strength': 4.361494541266555e-05, 'subsample': 0.36979582940955247, 'verbose': False}. Best is trial 24 with value: 0.7856567284821686.


Best trial: 24. Best value: 0.785657: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 30/30 [07:41<00:00, 15.38s/it]

[I 2023-10-15 23:56:24,562] Trial 29 finished with value: 0.7852399194673245 and parameters: {'n_estimators': 84, 'learning_rate': 0.044896498538627594, 'depth': 9, 'l2_leaf_reg': 0.008603379022941794, 'model_size_reg': 93.79741835622181, 'random_strength': 6.997844407359394e-08, 'subsample': 0.48105081919088627, 'verbose': False}. Best is trial 24 with value: 0.7856567284821686.





In [7]:
study.best_params, study.best_value

({'n_estimators': 68,
  'learning_rate': 0.12029161791941848,
  'depth': 6,
  'l2_leaf_reg': 0.009418069893643286,
  'model_size_reg': 7.007768945351371,
  'random_strength': 1.8341119786271264e-05,
  'subsample': 0.6007074418762239,
  'verbose': False},
 0.7856567284821686)

In [8]:
pipeline = make_pipeline(partial_pipeline, CatBoostClassifier(**study.best_params))
pipeline.fit(X, y)

In [9]:
submission = test.copy()
submission['defects'] = pipeline.predict_proba(submission)[:, 1]
submission.defects.to_csv(OUTPUT)