In [1]:
import optuna
import os
from pathlib import Path
import kaggle
from lightgbm import LGBMClassifier
import pandas as pd
import numpy as np
from sklearn.preprocessing import FunctionTransformer, PowerTransformer, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import HistGradientBoostingClassifier


pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)

iskaggle = os.environ.get('KAGGLE_KERNEL_RUN_TYPE', '')
if iskaggle:
    path = Path('../input/playground-series-s3e23')
    orig_path = Path('../input/software-defect-prediction')
else:
    path = Path('../input/playground-series-s3e23')
    orig_path = Path('../input/software-defect-prediction')
    if not path.exists():
        kaggle.api.competition_download_cli(str(path))
        zipfile.ZipFile(f'{path}.zip').extractall(path)
    if not orig_path.exists():
        kaggle.api.competition_download_cli(str(path))
        zipfile.ZipFile(f'{path}.zip').extractall(path)

SEED = 42
SPLITS = 5
VERSION = 1
OUTPUT = f'light_gbm_gbdt_clf_submisson_v{VERSION}.csv'
N_TRIALS = 30

np.random.seed(SEED)

  from .autonotebook import tqdm as notebook_tqdm




In [2]:
train = pd.read_csv(path/'train.csv', index_col = 'id')
test = pd.read_csv(path/'test.csv', index_col = 'id')
orig_train = pd.read_csv(orig_path/'jm1.csv')

In [3]:
for object_features in list(orig_train.loc[:, orig_train.dtypes == 'O']):
    orig_train[object_features] = orig_train[object_features].replace({'?' : np.nan}).astype('float64')

In [4]:
X = pd.concat([train, orig_train])
y = X.pop('defects')

In [5]:
partial_pipeline = make_pipeline(SimpleImputer(), FunctionTransformer(func=np.log1p, validate=False), PowerTransformer())

In [6]:
def objective(trial):
    params = {
        'boosting_type ': trial.suggest_categorical('boosting_type', ['gbdt']),
        'num_leaves': trial.suggest_int('num_leaves', 10, 100),
        'max_depth': trial.suggest_int('max_depth', 1, 50),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1, log=True),
        'n_estimators': trial.suggest_int('n_estimators', 100, 500),
        'class_weight': trial.suggest_categorical('class_weight', ['balanced']),
        'n_jobs': trial.suggest_categorical('n_jobs', [-1]),
        'random_state': trial.suggest_categorical('random_state', [SEED]),
        'reg_lambda': trial.suggest_int('reg_lambda', 0, 100),
        'reg_alpha': trial.suggest_int('reg_alpha', 0, 100),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.1, 1.0, log=True),
        'subsample': trial.suggest_float('subsample', 0.1, 1.0, log=True),
        # 'verbose': trial.suggest_categorical('verbose', [0]),
        'device': trial.suggest_categorical('device', ['gpu']),
    }
    pipeline = make_pipeline(partial_pipeline, LGBMClassifier(**params))
    scores = cross_val_score(pipeline, X, y, cv=SPLITS, scoring='roc_auc', n_jobs=-1)
    return scores.mean()

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=N_TRIALS, show_progress_bar=True)

[I 2023-10-17 09:23:02,288] A new study created in memory with name: no-name-76069e80-6426-4a8e-b05a-345432c1f26d
  0%|                                                                                                                                                                                                                 | 0/30 [00:00<?, ?it/s]

[LightGBM] [Info] Number of positive: 20136, number of negative: 69982
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 3635
[LightGBM] [Info] Number of data points in the train set: 90118, number of used features: 21
[LightGBM] [Info] Using GPU Device: pthread-Intel(R) Core(TM) i7-8750H CPU @ 2.20GHz, Vendor: GenuineIntel
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 18 dense feature groups (1.72 MB) transferred to GPU in 0.022222 secs. 1 sparse feature groups
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000
[LightGBM] [Info] Number of positive: 20136, number of negative: 69982
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 3635
[LightGBM] [Info] Number of data points in the train set: 90118, number of used features: 21
[LightG

Best trial: 0. Best value: 0.783308:   3%|█████▎                                                                                                                                                           | 1/30 [11:55<5:45:57, 715.78s/it]

[I 2023-10-17 09:34:58,072] Trial 0 finished with value: 0.7833077944895666 and parameters: {'boosting_type': 'gbdt', 'num_leaves': 63, 'max_depth': 36, 'learning_rate': 0.0716688439641843, 'n_estimators': 230, 'class_weight': 'balanced', 'n_jobs': -1, 'random_state': 42, 'reg_lambda': 32, 'reg_alpha': 54, 'colsample_bytree': 0.2134521289805766, 'subsample': 0.3037268736722141, 'device': 'gpu'}. Best is trial 0 with value: 0.7833077944895666.


Best trial: 1. Best value: 0.785555:   7%|██████████▋                                                                                                                                                      | 2/30 [17:33<3:50:15, 493.42s/it]

[I 2023-10-17 09:40:35,835] Trial 1 finished with value: 0.7855549700663521 and parameters: {'boosting_type': 'gbdt', 'num_leaves': 11, 'max_depth': 34, 'learning_rate': 0.03906741487547384, 'n_estimators': 212, 'class_weight': 'balanced', 'n_jobs': -1, 'random_state': 42, 'reg_lambda': 10, 'reg_alpha': 18, 'colsample_bytree': 0.37320602231441363, 'subsample': 0.44773200548130776, 'device': 'gpu'}. Best is trial 1 with value: 0.7855549700663521.
[LightGBM] [Info] Number of positive: 20136, number of negative: 69982
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 3635
[LightGBM] [Info] Number of data points in the train set: 90118, number of used features: 21
[LightGBM] [Info] Using GPU Device: pthread-Intel(R) Core(TM) i7-8750H CPU @ 2.20GHz, Vendor: GenuineIntel
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 18 dense feature groups (1.7

Best trial: 1. Best value: 0.785555:  10%|████████████████                                                                                                                                                 | 3/30 [27:03<3:57:40, 528.15s/it]

[I 2023-10-17 09:50:05,314] Trial 2 finished with value: 0.7831881905727418 and parameters: {'boosting_type': 'gbdt', 'num_leaves': 40, 'max_depth': 5, 'learning_rate': 0.04928254924960047, 'n_estimators': 193, 'class_weight': 'balanced', 'n_jobs': -1, 'random_state': 42, 'reg_lambda': 12, 'reg_alpha': 3, 'colsample_bytree': 0.19171925103673507, 'subsample': 0.38159552815382597, 'device': 'gpu'}. Best is trial 1 with value: 0.7855549700663521.
[LightGBM] [Info] Number of positive: 20136, number of negative: 69983
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 3639
[LightGBM] [Info] Number of data points in the train set: 90119, number of used features: 21
[LightGBM] [Info] Using GPU Device: pthread-Intel(R) Core(TM) i7-8750H CPU @ 2.20GHz, Vendor: GenuineIntel
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 18 dense feature groups (1.72 

Best trial: 1. Best value: 0.785555:  13%|█████████████████████▍                                                                                                                                           | 4/30 [39:17<4:24:06, 609.46s/it]

[I 2023-10-17 10:02:19,435] Trial 3 finished with value: 0.7802341362492913 and parameters: {'boosting_type': 'gbdt', 'num_leaves': 77, 'max_depth': 6, 'learning_rate': 0.016617526588364778, 'n_estimators': 285, 'class_weight': 'balanced', 'n_jobs': -1, 'random_state': 42, 'reg_lambda': 42, 'reg_alpha': 27, 'colsample_bytree': 0.16169371583844175, 'subsample': 0.11643586246481127, 'device': 'gpu'}. Best is trial 1 with value: 0.7855549700663521.
[LightGBM] [Info] Number of positive: 20136, number of negative: 69982
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 3629
[LightGBM] [Info] Number of data points in the train set: 90118, number of used features: 21
[LightGBM] [Info] Using GPU Device: pthread-Intel(R) Core(TM) i7-8750H CPU @ 2.20GHz, Vendor: GenuineIntel
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 18 dense feature groups (1.7

Best trial: 1. Best value: 0.785555:  17%|██████████████████████████▊                                                                                                                                      | 5/30 [44:24<3:28:36, 500.66s/it]

[I 2023-10-17 10:07:27,169] Trial 4 finished with value: 0.7806789205137766 and parameters: {'boosting_type': 'gbdt', 'num_leaves': 92, 'max_depth': 21, 'learning_rate': 0.08095230387860818, 'n_estimators': 267, 'class_weight': 'balanced', 'n_jobs': -1, 'random_state': 42, 'reg_lambda': 56, 'reg_alpha': 87, 'colsample_bytree': 0.10650335877548193, 'subsample': 0.17695686893243773, 'device': 'gpu'}. Best is trial 1 with value: 0.7855549700663521.
[LightGBM] [Info] Number of positive: 20136, number of negative: 69982
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 3635
[LightGBM] [Info] Number of data points in the train set: 90118, number of used features: 21
[LightGBM] [Info] Using GPU Device: pthread-Intel(R) Core(TM) i7-8750H CPU @ 2.20GHz, Vendor: GenuineIntel
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 18 dense feature groups (1.7

Best trial: 1. Best value: 0.785555:  20%|████████████████████████████████▏                                                                                                                                | 6/30 [54:16<3:32:41, 531.72s/it]

[I 2023-10-17 10:17:19,173] Trial 5 finished with value: 0.7770229080934575 and parameters: {'boosting_type': 'gbdt', 'num_leaves': 48, 'max_depth': 7, 'learning_rate': 0.012439373921237019, 'n_estimators': 248, 'class_weight': 'balanced', 'n_jobs': -1, 'random_state': 42, 'reg_lambda': 79, 'reg_alpha': 50, 'colsample_bytree': 0.12323727922815307, 'subsample': 0.10696065643046433, 'device': 'gpu'}. Best is trial 1 with value: 0.7855549700663521.
[LightGBM] [Info] Number of positive: 20136, number of negative: 69983
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 3639
[LightGBM] [Info] Number of data points in the train set: 90119, number of used features: 21
[LightGBM] [Info] Using GPU Device: pthread-Intel(R) Core(TM) i7-8750H CPU @ 2.20GHz, Vendor: GenuineIntel
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 18 dense feature groups (1.7

Best trial: 1. Best value: 0.785555:  23%|█████████████████████████████████████                                                                                                                          | 7/30 [1:06:44<3:50:54, 602.36s/it]

[I 2023-10-17 10:29:46,978] Trial 6 finished with value: 0.7756665004867409 and parameters: {'boosting_type': 'gbdt', 'num_leaves': 85, 'max_depth': 42, 'learning_rate': 0.012833925054111328, 'n_estimators': 139, 'class_weight': 'balanced', 'n_jobs': -1, 'random_state': 42, 'reg_lambda': 47, 'reg_alpha': 17, 'colsample_bytree': 0.12665785029502782, 'subsample': 0.5433855079580213, 'device': 'gpu'}. Best is trial 1 with value: 0.7855549700663521.
[LightGBM] [Info] Number of positive: 20136, number of negative: 69982
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 3629
[LightGBM] [Info] Number of data points in the train set: 90118, number of used features: 21
[LightGBM] [Info] Using GPU Device: pthread-Intel(R) Core(TM) i7-8750H CPU @ 2.20GHz, Vendor: GenuineIntel
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 18 dense feature groups (1.7

Best trial: 1. Best value: 0.785555:  27%|██████████████████████████████████████████▍                                                                                                                    | 8/30 [1:27:20<4:54:48, 804.02s/it]

[I 2023-10-17 10:50:22,790] Trial 7 finished with value: 0.7816792663767682 and parameters: {'boosting_type': 'gbdt', 'num_leaves': 40, 'max_depth': 42, 'learning_rate': 0.014938901855574697, 'n_estimators': 261, 'class_weight': 'balanced', 'n_jobs': -1, 'random_state': 42, 'reg_lambda': 69, 'reg_alpha': 49, 'colsample_bytree': 0.2395132170612047, 'subsample': 0.9180751016136028, 'device': 'gpu'}. Best is trial 1 with value: 0.7855549700663521.
[LightGBM] [Info] Number of positive: 20136, number of negative: 69982
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 3635
[LightGBM] [Info] Number of data points in the train set: 90118, number of used features: 21
[LightGBM] [Info] Using GPU Device: pthread-Intel(R) Core(TM) i7-8750H CPU @ 2.20GHz, Vendor: GenuineIntel
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 18 dense feature groups (1.72

Best trial: 1. Best value: 0.785555:  27%|██████████████████████████████████████████▍                                                                                                                    | 8/30 [1:46:33<4:53:01, 799.14s/it]


[W 2023-10-17 11:09:35,394] Trial 8 failed with parameters: {'boosting_type': 'gbdt', 'num_leaves': 67, 'max_depth': 34, 'learning_rate': 0.061128978968157645, 'n_estimators': 411, 'class_weight': 'balanced', 'n_jobs': -1, 'random_state': 42, 'reg_lambda': 37, 'reg_alpha': 12, 'colsample_bytree': 0.12290651382978056, 'subsample': 0.5355079909913046, 'device': 'gpu'} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "/home/strawhatdragon100/miniconda3/envs/kaggle/lib/python3.11/site-packages/optuna/study/_optimize.py", line 200, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^
  File "/tmp/ipykernel_953/251557043.py", line 19, in objective
    scores = cross_val_score(pipeline, X, y, cv=SPLITS, scoring='roc_auc', n_jobs=-1)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/strawhatdragon100/miniconda3/envs/kaggle/lib/python3.11/site-packages/sklearn/model_selec

KeyboardInterrupt: 

In [None]:
study.best_params, study.best_value

In [None]:
pipeline = make_pipeline(partial_pipeline, LGBMClassifier(**study.best_params))
pipeline.fit(X, y)

In [None]:
submission = test.copy()
submission['defects'] = pipeline.predict_proba(submission)[:, 1]
submission.defects.to_csv(OUTPUT)