In [1]:
import optuna
import os
from pathlib import Path
import kaggle
from catboost import CatBoostClassifier
import pandas as pd
import numpy as np
from sklearn.preprocessing import FunctionTransformer, StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score

pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)

iskaggle = os.environ.get('KAGGLE_KERNEL_RUN_TYPE', '')
if iskaggle:
    path = Path('../input/playground-series-s3e23')
    orig_path = Path('../input/software-defect-prediction')
else:
    path = Path('../input/playground-series-s3e23')
    orig_path = Path('../input/software-defect-prediction')
    if not path.exists():
        kaggle.api.competition_download_cli(str(path))
        zipfile.ZipFile(f'{path}.zip').extractall(path)
    if not orig_path.exists():
        kaggle.api.competition_download_cli(str(path))
        zipfile.ZipFile(f'{path}.zip').extractall(path)

SEED = 42
SPLITS = 5
VERSION = 3
OUTPUT = f'cat_boost_clf_submisson_v{VERSION}.csv'

np.random.seed(SEED)

  from .autonotebook import tqdm as notebook_tqdm




In [2]:
train = pd.read_csv(path/'train.csv', index_col = 'id')
test = pd.read_csv(path/'test.csv', index_col = 'id')
orig_train = pd.read_csv(orig_path/'jm1.csv')

In [3]:
for object_features in list(orig_train.loc[:, orig_train.dtypes == 'O']):
    orig_train[object_features] = orig_train[object_features].replace({'?' : np.nan}).astype('float64')

In [4]:
X = pd.concat([train, orig_train])
y = X.pop('defects')

In [5]:
partial_pipeline = make_pipeline(SimpleImputer(), FunctionTransformer(func=np.log1p, validate=False), StandardScaler())

In [6]:
def objective(trial):
    params = {
        'task_type', 'GPU',
        'n_estimators': trial.suggest_int('n_estimators',50,1000, log=True),
        'learning_rate': trial.suggest_float('learning_rate', 1e-3, 0.1, log=True),
        'depth': trial.suggest_int('depth', 1, 10),
        'subsample': trial.suggest_float('subsample', 0.05, 1.0),
        'colsample_bylevel': trial.suggest_float('colsample_bylevel', 0.05, 1.0),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 1, 100),
        'verbose': trial.suggest_categorical('verbose', [False])
    }
    pipeline = make_pipeline(partial_pipeline, CatBoostClassifier(**params))
    scores = cross_val_score(pipeline, X, y, cv=SPLITS, scoring='roc_auc', n_jobs=-1)
    return scores.mean()

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=30, show_progress_bar=True)

[I 2023-10-14 22:18:31,653] A new study created in memory with name: no-name-e9c16479-eeb4-4867-a977-d811a79abc0f
Best trial: 0. Best value: 0.775943:   3%|█████▍                                                                                                                                                              | 1/30 [00:29<14:05, 29.15s/it]

[I 2023-10-14 22:19:00,802] Trial 0 finished with value: 0.7759434200883465 and parameters: {'n_estimators': 260, 'learning_rate': 0.004006292778237605, 'depth': 9, 'subsample': 0.546649559330401, 'colsample_bylevel': 0.11451087804995956, 'min_data_in_leaf': 24, 'verbose': False}. Best is trial 0 with value: 0.7759434200883465.


Best trial: 1. Best value: 0.776006:   7%|██████████▉                                                                                                                                                         | 2/30 [00:36<07:32, 16.16s/it]

[I 2023-10-14 22:19:07,870] Trial 1 finished with value: 0.7760057238661118 and parameters: {'n_estimators': 51, 'learning_rate': 0.001813377714006975, 'depth': 7, 'subsample': 0.31639266729391397, 'colsample_bylevel': 0.664959651401588, 'min_data_in_leaf': 13, 'verbose': False}. Best is trial 1 with value: 0.7760057238661118.


Best trial: 1. Best value: 0.776006:  10%|████████████████▍                                                                                                                                                   | 3/30 [00:50<06:55, 15.40s/it]

[I 2023-10-14 22:19:22,366] Trial 2 finished with value: 0.7726020787723564 and parameters: {'n_estimators': 227, 'learning_rate': 0.0011103943596803224, 'depth': 3, 'subsample': 0.15314057631706912, 'colsample_bylevel': 0.21405408524785852, 'min_data_in_leaf': 2, 'verbose': False}. Best is trial 1 with value: 0.7760057238661118.


Best trial: 1. Best value: 0.776006:  13%|█████████████████████▊                                                                                                                                              | 4/30 [00:59<05:36, 12.94s/it]

[I 2023-10-14 22:19:31,544] Trial 3 finished with value: 0.7759577158579025 and parameters: {'n_estimators': 130, 'learning_rate': 0.0032495053670559475, 'depth': 2, 'subsample': 0.9359329946410249, 'colsample_bylevel': 0.7897789821883607, 'min_data_in_leaf': 77, 'verbose': False}. Best is trial 1 with value: 0.7760057238661118.


Best trial: 4. Best value: 0.784271:  17%|███████████████████████████▎                                                                                                                                        | 5/30 [01:50<11:07, 26.69s/it]

[I 2023-10-14 22:20:22,603] Trial 4 finished with value: 0.7842708990434464 and parameters: {'n_estimators': 556, 'learning_rate': 0.08728480016636454, 'depth': 6, 'subsample': 0.2516232171944691, 'colsample_bylevel': 0.8627879514740929, 'min_data_in_leaf': 72, 'verbose': False}. Best is trial 4 with value: 0.7842708990434464.


Best trial: 4. Best value: 0.784271:  20%|████████████████████████████████▊                                                                                                                                   | 6/30 [02:54<15:43, 39.31s/it]

[I 2023-10-14 22:21:26,429] Trial 5 finished with value: 0.7836725518422083 and parameters: {'n_estimators': 721, 'learning_rate': 0.02404715933331585, 'depth': 10, 'subsample': 0.5669429467987871, 'colsample_bylevel': 0.06968224911317338, 'min_data_in_leaf': 24, 'verbose': False}. Best is trial 4 with value: 0.7842708990434464.


Best trial: 4. Best value: 0.784271:  23%|██████████████████████████████████████▎                                                                                                                             | 7/30 [04:18<20:41, 53.99s/it]

[I 2023-10-14 22:22:50,625] Trial 6 finished with value: 0.7804487521400422 and parameters: {'n_estimators': 382, 'learning_rate': 0.08881014383718945, 'depth': 10, 'subsample': 0.8497977380101018, 'colsample_bylevel': 0.411642406125607, 'min_data_in_leaf': 64, 'verbose': False}. Best is trial 4 with value: 0.7842708990434464.


Best trial: 4. Best value: 0.784271:  27%|███████████████████████████████████████████▋                                                                                                                        | 8/30 [05:00<18:17, 49.88s/it]

[I 2023-10-14 22:23:31,711] Trial 7 finished with value: 0.783916079036153 and parameters: {'n_estimators': 502, 'learning_rate': 0.04935544727196124, 'depth': 6, 'subsample': 0.10761781204276871, 'colsample_bylevel': 0.17579401690000662, 'min_data_in_leaf': 54, 'verbose': False}. Best is trial 4 with value: 0.7842708990434464.


Best trial: 4. Best value: 0.784271:  30%|█████████████████████████████████████████████████▏                                                                                                                  | 9/30 [06:08<19:26, 55.57s/it]

[I 2023-10-14 22:24:39,777] Trial 8 finished with value: 0.7790619317443591 and parameters: {'n_estimators': 789, 'learning_rate': 0.0020227073802107187, 'depth': 4, 'subsample': 0.7516901053296549, 'colsample_bylevel': 0.3575341398962296, 'min_data_in_leaf': 58, 'verbose': False}. Best is trial 4 with value: 0.7842708990434464.


Best trial: 4. Best value: 0.784271:  33%|██████████████████████████████████████████████████████▎                                                                                                            | 10/30 [06:16<13:37, 40.88s/it]

[I 2023-10-14 22:24:47,770] Trial 9 finished with value: 0.7692975373245956 and parameters: {'n_estimators': 66, 'learning_rate': 0.001508511438561552, 'depth': 9, 'subsample': 0.432569518336656, 'colsample_bylevel': 0.07805979426984036, 'min_data_in_leaf': 67, 'verbose': False}. Best is trial 4 with value: 0.7842708990434464.


Best trial: 4. Best value: 0.784271:  37%|███████████████████████████████████████████████████████████▊                                                                                                       | 11/30 [07:07<13:56, 44.02s/it]

[I 2023-10-14 22:25:38,926] Trial 10 finished with value: 0.7800221260059753 and parameters: {'n_estimators': 856, 'learning_rate': 0.010438680924309763, 'depth': 1, 'subsample': 0.2979123401659991, 'colsample_bylevel': 0.963270242943767, 'min_data_in_leaf': 93, 'verbose': False}. Best is trial 4 with value: 0.7842708990434464.


Best trial: 4. Best value: 0.784271:  40%|█████████████████████████████████████████████████████████████████▏                                                                                                 | 12/30 [07:44<12:33, 41.86s/it]

[I 2023-10-14 22:26:15,833] Trial 11 finished with value: 0.7839874781727826 and parameters: {'n_estimators': 431, 'learning_rate': 0.09684661496346914, 'depth': 6, 'subsample': 0.0575576405227086, 'colsample_bylevel': 0.5615014686298876, 'min_data_in_leaf': 42, 'verbose': False}. Best is trial 4 with value: 0.7842708990434464.


Best trial: 4. Best value: 0.784271:  43%|██████████████████████████████████████████████████████████████████████▋                                                                                            | 13/30 [08:09<10:25, 36.78s/it]

[I 2023-10-14 22:26:40,918] Trial 12 finished with value: 0.7840237327007571 and parameters: {'n_estimators': 403, 'learning_rate': 0.09286362870857254, 'depth': 5, 'subsample': 0.05599723015797775, 'colsample_bylevel': 0.6011586906485207, 'min_data_in_leaf': 39, 'verbose': False}. Best is trial 4 with value: 0.7842708990434464.


Best trial: 4. Best value: 0.784271:  47%|████████████████████████████████████████████████████████████████████████████                                                                                       | 14/30 [08:28<08:25, 31.59s/it]

[I 2023-10-14 22:27:00,517] Trial 13 finished with value: 0.7839819269231562 and parameters: {'n_estimators': 324, 'learning_rate': 0.039849966590204434, 'depth': 4, 'subsample': 0.21770237413986193, 'colsample_bylevel': 0.8139647829676299, 'min_data_in_leaf': 41, 'verbose': False}. Best is trial 4 with value: 0.7842708990434464.


Best trial: 4. Best value: 0.784271:  50%|█████████████████████████████████████████████████████████████████████████████████▌                                                                                 | 15/30 [09:14<08:55, 35.68s/it]

[I 2023-10-14 22:27:45,670] Trial 14 finished with value: 0.7837256150455901 and parameters: {'n_estimators': 538, 'learning_rate': 0.023517192781499142, 'depth': 7, 'subsample': 0.051140305911832495, 'colsample_bylevel': 0.969731227829037, 'min_data_in_leaf': 85, 'verbose': False}. Best is trial 4 with value: 0.7842708990434464.


Best trial: 15. Best value: 0.784646:  53%|██████████████████████████████████████████████████████████████████████████████████████▍                                                                           | 16/30 [09:29<06:54, 29.61s/it]

[I 2023-10-14 22:28:01,196] Trial 15 finished with value: 0.7846463120186973 and parameters: {'n_estimators': 174, 'learning_rate': 0.09402515463936785, 'depth': 5, 'subsample': 0.21859329156448387, 'colsample_bylevel': 0.6492425267347925, 'min_data_in_leaf': 37, 'verbose': False}. Best is trial 15 with value: 0.7846463120186973.


Best trial: 15. Best value: 0.784646:  57%|███████████████████████████████████████████████████████████████████████████████████████████▊                                                                      | 17/30 [09:51<05:54, 27.31s/it]

[I 2023-10-14 22:28:23,139] Trial 16 finished with value: 0.7843533128593162 and parameters: {'n_estimators': 187, 'learning_rate': 0.0520111724167494, 'depth': 7, 'subsample': 0.242190041184976, 'colsample_bylevel': 0.7167349516788148, 'min_data_in_leaf': 73, 'verbose': False}. Best is trial 15 with value: 0.7846463120186973.


Best trial: 15. Best value: 0.784646:  60%|█████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                | 18/30 [10:18<05:25, 27.15s/it]

[I 2023-10-14 22:28:49,924] Trial 17 finished with value: 0.784421612977931 and parameters: {'n_estimators': 158, 'learning_rate': 0.04622390406129897, 'depth': 8, 'subsample': 0.3869414510379515, 'colsample_bylevel': 0.6943619578833787, 'min_data_in_leaf': 100, 'verbose': False}. Best is trial 15 with value: 0.7846463120186973.


Best trial: 15. Best value: 0.784646:  63%|██████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                           | 19/30 [10:41<04:46, 26.06s/it]

[I 2023-10-14 22:29:13,431] Trial 18 finished with value: 0.7823679094658905 and parameters: {'n_estimators': 147, 'learning_rate': 0.017834591967084448, 'depth': 8, 'subsample': 0.4425609740936192, 'colsample_bylevel': 0.6668116761598207, 'min_data_in_leaf': 97, 'verbose': False}. Best is trial 15 with value: 0.7846463120186973.


Best trial: 15. Best value: 0.784646:  67%|████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                      | 20/30 [10:49<03:26, 20.61s/it]

[I 2023-10-14 22:29:21,350] Trial 19 finished with value: 0.7830154648044241 and parameters: {'n_estimators': 105, 'learning_rate': 0.03613029543355549, 'depth': 5, 'subsample': 0.38028923609040394, 'colsample_bylevel': 0.4650891910222109, 'min_data_in_leaf': 29, 'verbose': False}. Best is trial 15 with value: 0.7846463120186973.


Best trial: 15. Best value: 0.784646:  70%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                | 21/30 [11:05<02:53, 19.29s/it]

[I 2023-10-14 22:29:37,554] Trial 20 finished with value: 0.7841926089934026 and parameters: {'n_estimators': 93, 'learning_rate': 0.05938161496602078, 'depth': 8, 'subsample': 0.6408950115024992, 'colsample_bylevel': 0.5263300313128415, 'min_data_in_leaf': 49, 'verbose': False}. Best is trial 15 with value: 0.7846463120186973.


Best trial: 15. Best value: 0.784646:  73%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                           | 22/30 [11:25<02:35, 19.50s/it]

[I 2023-10-14 22:29:57,537] Trial 21 finished with value: 0.7840747332308282 and parameters: {'n_estimators': 170, 'learning_rate': 0.05632044116026806, 'depth': 7, 'subsample': 0.1998759756348396, 'colsample_bylevel': 0.7058708291946633, 'min_data_in_leaf': 84, 'verbose': False}. Best is trial 15 with value: 0.7846463120186973.


Best trial: 15. Best value: 0.784646:  77%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                     | 23/30 [11:59<02:45, 23.70s/it]

[I 2023-10-14 22:30:31,052] Trial 22 finished with value: 0.7840971060280075 and parameters: {'n_estimators': 205, 'learning_rate': 0.03279934059516806, 'depth': 8, 'subsample': 0.3173467760721964, 'colsample_bylevel': 0.7152993679602835, 'min_data_in_leaf': 89, 'verbose': False}. Best is trial 15 with value: 0.7846463120186973.


Best trial: 15. Best value: 0.784646:  80%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                | 24/30 [12:12<02:02, 20.45s/it]

[I 2023-10-14 22:30:43,929] Trial 23 finished with value: 0.7837203243112352 and parameters: {'n_estimators': 184, 'learning_rate': 0.061241511255132025, 'depth': 4, 'subsample': 0.21868646224255572, 'colsample_bylevel': 0.60279828136666, 'min_data_in_leaf': 99, 'verbose': False}. Best is trial 15 with value: 0.7846463120186973.


Best trial: 24. Best value: 0.784908:  83%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                           | 25/30 [12:39<01:52, 22.41s/it]

[I 2023-10-14 22:31:10,889] Trial 24 finished with value: 0.7849076085281711 and parameters: {'n_estimators': 255, 'learning_rate': 0.06326279329432156, 'depth': 7, 'subsample': 0.3776788176269785, 'colsample_bylevel': 0.7593104146787781, 'min_data_in_leaf': 77, 'verbose': False}. Best is trial 24 with value: 0.7849076085281711.


Best trial: 24. Best value: 0.784908:  87%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                     | 26/30 [13:40<02:15, 33.96s/it]

[I 2023-10-14 22:32:11,818] Trial 25 finished with value: 0.7844467817610218 and parameters: {'n_estimators': 271, 'learning_rate': 0.06551254174303534, 'depth': 9, 'subsample': 0.40469339684183714, 'colsample_bylevel': 0.7705364102018127, 'min_data_in_leaf': 78, 'verbose': False}. Best is trial 24 with value: 0.7849076085281711.


Best trial: 24. Best value: 0.784908:  90%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                | 27/30 [14:42<02:07, 42.34s/it]

[I 2023-10-14 22:33:13,705] Trial 26 finished with value: 0.7844458684824633 and parameters: {'n_estimators': 270, 'learning_rate': 0.07571584755400582, 'depth': 9, 'subsample': 0.4541182587705995, 'colsample_bylevel': 0.8470922443151384, 'min_data_in_leaf': 79, 'verbose': False}. Best is trial 24 with value: 0.7849076085281711.


Best trial: 27. Best value: 0.785299:  93%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏          | 28/30 [15:05<01:13, 36.78s/it]

[I 2023-10-14 22:33:37,495] Trial 27 finished with value: 0.7852990597094218 and parameters: {'n_estimators': 286, 'learning_rate': 0.07110901618444579, 'depth': 5, 'subsample': 0.4940134861055049, 'colsample_bylevel': 0.9090142712877507, 'min_data_in_leaf': 62, 'verbose': False}. Best is trial 27 with value: 0.7852990597094218.


Best trial: 27. Best value: 0.785299:  97%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌     | 29/30 [15:22<00:30, 30.82s/it]

[I 2023-10-14 22:33:54,424] Trial 28 finished with value: 0.7841275549958918 and parameters: {'n_estimators': 229, 'learning_rate': 0.031036545216701076, 'depth': 5, 'subsample': 0.47397145939865326, 'colsample_bylevel': 0.9019512672309632, 'min_data_in_leaf': 63, 'verbose': False}. Best is trial 27 with value: 0.7852990597094218.


Best trial: 27. Best value: 0.785299: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 30/30 [15:45<00:00, 31.51s/it]

[I 2023-10-14 22:34:16,898] Trial 29 finished with value: 0.7848925940470541 and parameters: {'n_estimators': 301, 'learning_rate': 0.07102455993430254, 'depth': 3, 'subsample': 0.5511355510787153, 'colsample_bylevel': 0.9213696062365356, 'min_data_in_leaf': 48, 'verbose': False}. Best is trial 27 with value: 0.7852990597094218.





In [7]:
study.best_params, study.best_value

({'n_estimators': 286,
  'learning_rate': 0.07110901618444579,
  'depth': 5,
  'subsample': 0.4940134861055049,
  'colsample_bylevel': 0.9090142712877507,
  'min_data_in_leaf': 62,
  'verbose': False},
 0.7852990597094218)

In [8]:
pipeline = make_pipeline(partial_pipeline, CatBoostClassifier(**study.best_params))
pipeline.fit(X, y)

In [9]:
submission = test.copy()
submission['defects'] = pipeline.predict_proba(submission)[:, 1]
submission.defects.to_csv(OUTPUT)