In [1]:
import optuna
from xgboost import XGBClassifier
import pandas as pd
import numpy as np
from sklearn.preprocessing import FunctionTransformer
from sklearn.model_selection import RandomizedSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score

pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)

iskaggle = os.environ.get('KAGGLE_KERNEL_RUN_TYPE', '')
if iskaggle:
    path = Path('../input/playground-series-s3e23')
    orig_path = Path('../input/software-defect-prediction')
else:
    path = Path('playground-series-s3e23')
    orig_path = Path('software-defect-prediction')
    if not path.exists():
        kaggle.api.competition_download_cli(str(path))
        zipfile.ZipFile(f'{path}.zip').extractall(path)
    if not orig_path.exists():
        kaggle.api.competition_download_cli(str(path))
        zipfile.ZipFile(f'{path}.zip').extractall(path)

In [2]:
train = pd.read_csv(path/'train.csv', index_col = 'id')
test = pd.read_csv(path/'test.csv', index_col = 'id')
orig_train = pd.read_csv(orig_path/'jm1.csv')

In [3]:
for object_features in list(orig_train.loc[:, orig_train.dtypes == 'O']):
    orig_train[object_features] = orig_train[object_features].replace({'?' : np.nan}).astype('float64')

In [4]:
X = pd.concat([train, orig_train])
y = X.pop('defects')

seed = 42
splits = 5
skf = StratifiedKFold(n_splits = splits, random_state = seed, shuffle = True)
np.random.seed(seed)

In [5]:
log_transformer = FunctionTransformer(func=np.log1p, validate=False)

In [None]:
partial_pipeline = make_pipeline(SimpleImputer(), log_transformer, StandardScaler())

In [8]:
def objective(trial):
    params = {
        'objective': 'binary:logistic',
        'tree_method':'gpu_hist',  # this parameter means using the GPU when training our model to speedup the training process
        'n_jobs': trial.suggest_categorical('n_jobs', [-1]),
        'random_state': trial.suggest_categorical('random_state', [seed]),
        'lambda': trial.suggest_float('lambda', 1e-3, 10.0, log=True),
        'alpha': trial.suggest_float('alpha', 1e-3, 10.0, log=True),
        'colsample_bytree': trial.suggest_categorical('colsample_bytree', [0.3,0.4,0.5,0.6,0.7,0.8,0.9, 1.0]),
        'subsample': trial.suggest_categorical('subsample', [0.4,0.5,0.6,0.7,0.8,1.0]),
        'learning_rate': trial.suggest_categorical('learning_rate', [0.008,0.01,0.012,0.014,0.016,0.018, 0.02]),
        'n_estimators': trial.suggest_int('n_estimators', 100, 600, 100),
        'max_depth': trial.suggest_categorical('max_depth', [5,7,9,11,13,15,17]),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 300),
    }

    pipeline = make_pipeline(partial_pipeline, XGBClassifier(**params))
    scores = cross_val_score(pipeline, X, y, cv=3, scoring='roc_auc', n_jobs=-1)
    return scores.mean()

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=15, show_progress_bar=True)

[I 2023-10-12 18:36:40,248] A new study created in memory with name: no-name-59e05e7a-8026-48fd-99d9-99e36dff9805
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
Best trial: 0. Best value: 0.780257:   7%|██████████▋                                                                                                                                                      | 1/15 [05:18<1:14:21, 318.68s/it]

[I 2023-10-12 18:41:58,927] Trial 0 finished with value: 0.7802572415437462 and parameters: {'lambda': 0.038721308691319005, 'alpha': 0.002234181988950396, 'colsample_bytree': 0.3, 'subsample': 0.6, 'learning_rate': 0.01, 'max_depth': 11, 'random_state': 2020, 'min_child_weight': 181}. Best is trial 0 with value: 0.7802572415437462.


  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
Best trial: 0. Best value: 0.780257:  13%|█████████████████████▋                                                                                                                                             | 2/15 [08:54<55:55, 258.10s/it]

[I 2023-10-12 18:45:34,620] Trial 1 finished with value: 0.7721428013359716 and parameters: {'lambda': 0.6834867117482265, 'alpha': 0.12267535284709595, 'colsample_bytree': 0.6, 'subsample': 0.6, 'learning_rate': 0.018, 'max_depth': 5, 'random_state': 2020, 'min_child_weight': 25}. Best is trial 0 with value: 0.7802572415437462.


  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
Best trial: 0. Best value: 0.780257:  20%|████████████████████████████████▌                                                                                                                                  | 3/15 [14:30<58:42, 293.53s/it]

[I 2023-10-12 18:51:10,320] Trial 2 finished with value: 0.7755921033622606 and parameters: {'lambda': 0.5775091086995751, 'alpha': 0.005885672302969961, 'colsample_bytree': 0.7, 'subsample': 0.5, 'learning_rate': 0.014, 'max_depth': 9, 'random_state': 2020, 'min_child_weight': 140}. Best is trial 0 with value: 0.7802572415437462.


  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
Best trial: 0. Best value: 0.780257:  27%|███████████████████████████████████████████▍                                                                                                                       | 4/15 [18:45<51:05, 278.65s/it]

[I 2023-10-12 18:55:26,143] Trial 3 finished with value: 0.7706074537719186 and parameters: {'lambda': 0.055847379400529806, 'alpha': 0.03506842552149459, 'colsample_bytree': 0.3, 'subsample': 0.6, 'learning_rate': 0.02, 'max_depth': 15, 'random_state': 2020, 'min_child_weight': 124}. Best is trial 0 with value: 0.7802572415437462.


  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
Best trial: 0. Best value: 0.780257:  33%|██████████████████████████████████████████████████████▎                                                                                                            | 5/15 [25:03<52:24, 314.46s/it]

[I 2023-10-12 19:01:44,118] Trial 4 finished with value: 0.7656943595604613 and parameters: {'lambda': 0.039247031603664474, 'alpha': 0.0013869649333067607, 'colsample_bytree': 0.8, 'subsample': 0.5, 'learning_rate': 0.016, 'max_depth': 9, 'random_state': 2020, 'min_child_weight': 69}. Best is trial 0 with value: 0.7802572415437462.


  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
Best trial: 0. Best value: 0.780257:  40%|█████████████████████████████████████████████████████████████████▏                                                                                                 | 6/15 [28:01<40:11, 267.98s/it]

[I 2023-10-12 19:04:41,859] Trial 5 finished with value: 0.7761839832359626 and parameters: {'lambda': 0.0021370979960928065, 'alpha': 2.2960004946864636, 'colsample_bytree': 0.7, 'subsample': 0.7, 'learning_rate': 0.018, 'max_depth': 9, 'random_state': 2020, 'min_child_weight': 244}. Best is trial 0 with value: 0.7802572415437462.


  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
Best trial: 0. Best value: 0.780257:  47%|████████████████████████████████████████████████████████████████████████████                                                                                       | 7/15 [33:34<38:34, 289.25s/it]

[I 2023-10-12 19:10:14,905] Trial 6 finished with value: 0.7497577307163484 and parameters: {'lambda': 0.0018594709498623127, 'alpha': 3.4340702757965134, 'colsample_bytree': 0.9, 'subsample': 0.7, 'learning_rate': 0.01, 'max_depth': 11, 'random_state': 2020, 'min_child_weight': 15}. Best is trial 0 with value: 0.7802572415437462.


  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
Best trial: 0. Best value: 0.780257:  53%|██████████████████████████████████████████████████████████████████████████████████████▉                                                                            | 8/15 [36:33<29:38, 254.04s/it]

[I 2023-10-12 19:13:13,542] Trial 7 finished with value: 0.7712720705742625 and parameters: {'lambda': 0.010643526239697231, 'alpha': 0.0040352045790819175, 'colsample_bytree': 0.5, 'subsample': 0.7, 'learning_rate': 0.014, 'max_depth': 9, 'random_state': 2020, 'min_child_weight': 100}. Best is trial 0 with value: 0.7802572415437462.


  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
Best trial: 0. Best value: 0.780257:  60%|█████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                 | 9/15 [38:33<21:13, 212.31s/it]

[I 2023-10-12 19:15:14,116] Trial 8 finished with value: 0.7797511845172513 and parameters: {'lambda': 0.08884658574565824, 'alpha': 0.022248122370719117, 'colsample_bytree': 0.9, 'subsample': 0.5, 'learning_rate': 0.02, 'max_depth': 5, 'random_state': 2020, 'min_child_weight': 250}. Best is trial 0 with value: 0.7802572415437462.


  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
Best trial: 0. Best value: 0.780257:  67%|████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                      | 10/15 [42:27<18:14, 218.92s/it]

[I 2023-10-12 19:19:07,822] Trial 9 finished with value: 0.7776352490070865 and parameters: {'lambda': 0.06049269262691474, 'alpha': 0.05122697490434363, 'colsample_bytree': 0.6, 'subsample': 1.0, 'learning_rate': 0.014, 'max_depth': 15, 'random_state': 2020, 'min_child_weight': 268}. Best is trial 0 with value: 0.7802572415437462.


  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):


[I 2023-10-12 19:21:34,455] Trial 10 finished with value: 0.7822475281551701 and parameters: {'lambda': 7.152659399330523, 'alpha': 0.0017930225817956794, 'colsample_bytree': 0.3, 'subsample': 0.4, 'learning_rate': 0.01, 'max_depth': 11, 'random_state': 2020, 'min_child_weight': 193}. Best is trial 10 with value: 0.7822475281551701.


Best trial: 11. Best value: 0.782281:  80%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                | 12/15 [47:21<09:04, 181.65s/it]

[I 2023-10-12 19:24:01,452] Trial 11 finished with value: 0.7822810650792745 and parameters: {'lambda': 6.92479678295337, 'alpha': 0.0016970742112315522, 'colsample_bytree': 0.3, 'subsample': 0.4, 'learning_rate': 0.01, 'max_depth': 11, 'random_state': 2020, 'min_child_weight': 193}. Best is trial 11 with value: 0.7822810650792745.


  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
Best trial: 11. Best value: 0.782281:  87%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                     | 13/15 [49:50<05:43, 171.90s/it]

[I 2023-10-12 19:26:30,931] Trial 12 finished with value: 0.781638096898329 and parameters: {'lambda': 9.536789257111334, 'alpha': 0.0010880289210026032, 'colsample_bytree': 0.3, 'subsample': 0.4, 'learning_rate': 0.012, 'max_depth': 17, 'random_state': 2020, 'min_child_weight': 194}. Best is trial 11 with value: 0.7822810650792745.


  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
Best trial: 11. Best value: 0.782281:  93%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎          | 14/15 [52:26<02:47, 167.17s/it]

[I 2023-10-12 19:29:07,153] Trial 13 finished with value: 0.7819635680288797 and parameters: {'lambda': 8.998327524304006, 'alpha': 0.011142578300533888, 'colsample_bytree': 1.0, 'subsample': 0.4, 'learning_rate': 0.008, 'max_depth': 13, 'random_state': 2020, 'min_child_weight': 196}. Best is trial 11 with value: 0.7822810650792745.


  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
Best trial: 11. Best value: 0.782281: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 15/15 [54:48<00:00, 219.20s/it]

[I 2023-10-12 19:31:28,261] Trial 14 finished with value: 0.7819319954355936 and parameters: {'lambda': 2.0129141475638384, 'alpha': 0.006031039568808718, 'colsample_bytree': 0.4, 'subsample': 0.8, 'learning_rate': 0.01, 'max_depth': 7, 'random_state': 2020, 'min_child_weight': 298}. Best is trial 11 with value: 0.7822810650792745.





In [9]:
study.best_params, study.best_value

({'lambda': 6.92479678295337,
  'alpha': 0.0016970742112315522,
  'colsample_bytree': 0.3,
  'subsample': 0.4,
  'learning_rate': 0.01,
  'max_depth': 11,
  'random_state': 2020,
  'min_child_weight': 193},
 0.7822810650792745)

In [10]:
pipeline = make_pipeline(SimpleImputer(), log_transformer, XGBClassifier(**study.best_params))
pipeline.fit(X, y)

  if is_sparse(data):


In [11]:
submission = test.copy()
submission['defects'] = pipeline.predict_proba(submission)[:, 1]
submission.defects.to_csv('xgboost_submission.csv')