In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import FunctionTransformer
from sklearn.model_selection import RandomizedSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.model_selection import StratifiedKFold
from catboost import CatBoostClassifier

pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)

In [2]:
train = pd.read_csv(r'./input/playground-series-s3e23/train.csv', index_col = 'id')
test = pd.read_csv(r'./input/playground-series-s3e23/test.csv', index_col = 'id')
orig_train = pd.read_csv(r'./input/software-defect-prediction/jm1.csv')

In [3]:
for object_features in list(orig_train.loc[:, orig_train.dtypes == 'O']):
    orig_train[object_features] = orig_train[object_features].replace({'?' : np.nan}).astype('float64')

In [4]:
X = pd.concat([train, orig_train])
y = X.pop('defects')

seed = 42
splits = 5
skf = StratifiedKFold(n_splits = splits, random_state = seed, shuffle = True)
np.random.seed(seed)

In [5]:
log_transformer = FunctionTransformer(func=np.log1p, validate=False)

In [7]:
import optuna

  from .autonotebook import tqdm as notebook_tqdm


In [10]:
from sklearn.model_selection import cross_val_score

def objective(trial):
    params = {
        "iterations": 1000,
        "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.1, log=True),
        "depth": trial.suggest_int("depth", 1, 10),
        "subsample": trial.suggest_float("subsample", 0.05, 1.0),
        "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.05, 1.0),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 1, 100),
        "verbose": False
    }
    pipeline = make_pipeline(SimpleImputer(), log_transformer, CatBoostClassifier(**params))
    scores = cross_val_score(pipeline, X, y, cv=3, scoring='roc_auc', n_jobs=-1)
    return scores.mean()

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=15, show_progress_bar=True)

[I 2023-10-12 20:16:48,891] A new study created in memory with name: no-name-342c7ece-06c8-491c-a64d-32a8100c73e3
Best trial: 0. Best value: 0.784567:   7%|██████████▉                                                                                                                                                         | 1/15 [00:31<07:20, 31.43s/it]

[I 2023-10-12 20:17:20,320] Trial 0 finished with value: 0.7845668250338015 and parameters: {'learning_rate': 0.015433485632817637, 'depth': 3, 'subsample': 0.5994512892698309, 'colsample_bylevel': 0.6820943238369086, 'min_data_in_leaf': 96}. Best is trial 0 with value: 0.7845668250338015.


Best trial: 0. Best value: 0.784567:  13%|█████████████████████▊                                                                                                                                              | 2/15 [01:00<06:31, 30.15s/it]

[I 2023-10-12 20:17:49,575] Trial 1 finished with value: 0.7773225589598418 and parameters: {'learning_rate': 0.0016996688527254667, 'depth': 3, 'subsample': 0.4426191004810119, 'colsample_bylevel': 0.25846497536189517, 'min_data_in_leaf': 85}. Best is trial 0 with value: 0.7845668250338015.


Best trial: 0. Best value: 0.784567:  20%|████████████████████████████████▊                                                                                                                                   | 3/15 [01:51<07:56, 39.69s/it]

[I 2023-10-12 20:18:40,608] Trial 2 finished with value: 0.7841051731642699 and parameters: {'learning_rate': 0.00572805767748959, 'depth': 7, 'subsample': 0.4374185389371829, 'colsample_bylevel': 0.6030619871656083, 'min_data_in_leaf': 45}. Best is trial 0 with value: 0.7845668250338015.


Best trial: 0. Best value: 0.784567:  27%|███████████████████████████████████████████▋                                                                                                                        | 4/15 [03:32<11:42, 63.88s/it]

[I 2023-10-12 20:20:21,588] Trial 3 finished with value: 0.7842692701946148 and parameters: {'learning_rate': 0.005809626160975966, 'depth': 9, 'subsample': 0.7820104683235691, 'colsample_bylevel': 0.5335076909380585, 'min_data_in_leaf': 87}. Best is trial 0 with value: 0.7845668250338015.


Best trial: 0. Best value: 0.784567:  33%|██████████████████████████████████████████████████████▋                                                                                                             | 5/15 [04:14<09:17, 55.76s/it]

[I 2023-10-12 20:21:02,944] Trial 4 finished with value: 0.7835061517630825 and parameters: {'learning_rate': 0.007754491361604798, 'depth': 6, 'subsample': 0.8111977561305797, 'colsample_bylevel': 0.16910819912190966, 'min_data_in_leaf': 26}. Best is trial 0 with value: 0.7845668250338015.


Best trial: 0. Best value: 0.784567:  40%|█████████████████████████████████████████████████████████████████▌                                                                                                  | 6/15 [04:36<06:38, 44.32s/it]

[I 2023-10-12 20:21:25,047] Trial 5 finished with value: 0.7656533058824962 and parameters: {'learning_rate': 0.0032419422127156635, 'depth': 1, 'subsample': 0.8315346512983418, 'colsample_bylevel': 0.0728584795431291, 'min_data_in_leaf': 99}. Best is trial 0 with value: 0.7845668250338015.


Best trial: 0. Best value: 0.784567:  47%|████████████████████████████████████████████████████████████████████████████▌                                                                                       | 7/15 [05:13<05:35, 41.88s/it]

[I 2023-10-12 20:22:01,895] Trial 6 finished with value: 0.7822327610390428 and parameters: {'learning_rate': 0.006893512826319067, 'depth': 5, 'subsample': 0.7889114387580182, 'colsample_bylevel': 0.11984531235550597, 'min_data_in_leaf': 68}. Best is trial 0 with value: 0.7845668250338015.


Best trial: 7. Best value: 0.784981:  53%|███████████████████████████████████████████████████████████████████████████████████████▍                                                                            | 8/15 [05:45<04:32, 38.93s/it]

[I 2023-10-12 20:22:34,514] Trial 7 finished with value: 0.7849805823612998 and parameters: {'learning_rate': 0.06071565986189151, 'depth': 3, 'subsample': 0.4172312144014895, 'colsample_bylevel': 0.8284177282350561, 'min_data_in_leaf': 20}. Best is trial 7 with value: 0.7849805823612998.


Best trial: 8. Best value: 0.78499:  60%|███████████████████████████████████████████████████████████████████████████████████████████████████                                                                  | 9/15 [06:17<03:39, 36.66s/it]

[I 2023-10-12 20:23:06,202] Trial 8 finished with value: 0.7849904094356305 and parameters: {'learning_rate': 0.05936727513367089, 'depth': 3, 'subsample': 0.7759613496902761, 'colsample_bylevel': 0.2108093870520839, 'min_data_in_leaf': 2}. Best is trial 8 with value: 0.7849904094356305.


Best trial: 8. Best value: 0.78499:  67%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                      | 10/15 [06:54<03:03, 36.70s/it]

[I 2023-10-12 20:23:42,989] Trial 9 finished with value: 0.7847339208430387 and parameters: {'learning_rate': 0.02650843188807988, 'depth': 8, 'subsample': 0.6672513664728279, 'colsample_bylevel': 0.0844943337087976, 'min_data_in_leaf': 46}. Best is trial 8 with value: 0.7849904094356305.


Best trial: 8. Best value: 0.78499:  73%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                           | 11/15 [07:18<02:11, 32.90s/it]

[I 2023-10-12 20:24:07,281] Trial 10 finished with value: 0.7833242660043669 and parameters: {'learning_rate': 0.09664547599249979, 'depth': 1, 'subsample': 0.987289911202284, 'colsample_bylevel': 0.3892650383069759, 'min_data_in_leaf': 1}. Best is trial 8 with value: 0.7849904094356305.


Best trial: 8. Best value: 0.78499:  80%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                | 12/15 [07:53<01:40, 33.47s/it]

[I 2023-10-12 20:24:42,052] Trial 11 finished with value: 0.7846023420977174 and parameters: {'learning_rate': 0.07618393118605145, 'depth': 4, 'subsample': 0.18729757734581665, 'colsample_bylevel': 0.9085305572055458, 'min_data_in_leaf': 1}. Best is trial 8 with value: 0.7849904094356305.


Best trial: 8. Best value: 0.78499:  87%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                     | 13/15 [08:24<01:05, 32.81s/it]

[I 2023-10-12 20:25:13,325] Trial 12 finished with value: 0.7848360747348918 and parameters: {'learning_rate': 0.03462027728216542, 'depth': 3, 'subsample': 0.34201366411143314, 'colsample_bylevel': 0.784859906506003, 'min_data_in_leaf': 22}. Best is trial 8 with value: 0.7849904094356305.


Best trial: 8. Best value: 0.78499:  93%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████           | 14/15 [08:46<00:29, 29.63s/it]

[I 2023-10-12 20:25:35,631] Trial 13 finished with value: 0.7829285521277676 and parameters: {'learning_rate': 0.048271272243876694, 'depth': 2, 'subsample': 0.051334093974315875, 'colsample_bylevel': 0.4368400085690136, 'min_data_in_leaf': 19}. Best is trial 8 with value: 0.7849904094356305.


Best trial: 14. Best value: 0.785159: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 15/15 [09:27<00:00, 37.83s/it]

[I 2023-10-12 20:26:16,308] Trial 14 finished with value: 0.7851591124237811 and parameters: {'learning_rate': 0.020657979308493238, 'depth': 5, 'subsample': 0.5275845310391561, 'colsample_bylevel': 0.9516480399926811, 'min_data_in_leaf': 13}. Best is trial 14 with value: 0.7851591124237811.





In [11]:
study.best_params, study.best_value

({'learning_rate': 0.020657979308493238,
  'depth': 5,
  'subsample': 0.5275845310391561,
  'colsample_bylevel': 0.9516480399926811,
  'min_data_in_leaf': 13},
 0.7851591124237811)

In [12]:
pipeline = make_pipeline(SimpleImputer(), log_transformer, CatBoostClassifier(**study.best_params))
pipeline.fit(X, y)

0:	learn: 0.6812670	total: 68.1ms	remaining: 1m 8s
1:	learn: 0.6699953	total: 76.1ms	remaining: 38s
2:	learn: 0.6592704	total: 85.8ms	remaining: 28.5s
3:	learn: 0.6490881	total: 94.2ms	remaining: 23.4s
4:	learn: 0.6392913	total: 102ms	remaining: 20.4s
5:	learn: 0.6298177	total: 112ms	remaining: 18.6s
6:	learn: 0.6206378	total: 121ms	remaining: 17.1s
7:	learn: 0.6121842	total: 128ms	remaining: 15.9s
8:	learn: 0.6040736	total: 137ms	remaining: 15.1s
9:	learn: 0.5963201	total: 145ms	remaining: 14.4s
10:	learn: 0.5891448	total: 154ms	remaining: 13.9s
11:	learn: 0.5821423	total: 162ms	remaining: 13.3s
12:	learn: 0.5756564	total: 170ms	remaining: 12.9s
13:	learn: 0.5694192	total: 178ms	remaining: 12.5s
14:	learn: 0.5634226	total: 191ms	remaining: 12.5s
15:	learn: 0.5577499	total: 201ms	remaining: 12.4s
16:	learn: 0.5522611	total: 212ms	remaining: 12.3s
17:	learn: 0.5469793	total: 223ms	remaining: 12.2s
18:	learn: 0.5421200	total: 231ms	remaining: 11.9s
19:	learn: 0.5373592	total: 240ms	remai

In [13]:
submission = test.copy()
submission['defects'] = pipeline.predict_proba(submission)[:, 1]
submission.defects.to_csv('catboost_submission.csv')