In [1]:
from pathlib import Path
import os
import kaggle
import pandas as pd
import numpy as np
from sklearn.preprocessing import FunctionTransformer, PowerTransformer
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.model_selection import StratifiedKFold
import optuna

pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)

iskaggle = os.environ.get('KAGGLE_KERNEL_RUN_TYPE', '')
if iskaggle:
    path = Path('../input/playground-series-s3e23')
    orig_path = Path('../input/software-defect-prediction')
else:
    path = Path('../input/playground-series-s3e23')
    orig_path = Path('../input/software-defect-prediction')
    if not path.exists():
        kaggle.api.competition_download_cli(str(path))
        zipfile.ZipFile(f'{path}.zip').extractall(path)
    if not orig_path.exists():
        kaggle.api.competition_download_cli(str(path))
        zipfile.ZipFile(f'{path}.zip').extractall(path)

VERSION = 5
OUTPUT = f'extra_tree_clf_submisson_v{VERSION}.csv'



  from .autonotebook import tqdm as notebook_tqdm


In [2]:
train = pd.read_csv(path/'train.csv', index_col = 'id')
test = pd.read_csv(path/'test.csv', index_col = 'id')
orig_train = pd.read_csv(orig_path/'jm1.csv')

In [3]:
for object_features in list(orig_train.loc[:, orig_train.dtypes == 'O']):
    orig_train[object_features] = orig_train[object_features].replace({'?' : np.nan}).astype('float64')

In [4]:
X = pd.concat([train, orig_train])
y = X.pop('defects')

seed = 42
splits = 5
np.random.seed(seed)

In [5]:
partial_pipeline = make_pipeline(ColumnTransformer([('drop', 'drop',
                                  ['iv(g)', 't', 'b', 'n',
                                   'lOCode', 'v', 'branchCount',
                                   'e', 'i', 'lOComment'])],
                                remainder='passthrough'), SimpleImputer(), FunctionTransformer(func=np.log1p, validate=False), PowerTransformer())

In [6]:
from sklearn.model_selection import cross_val_score

def objective(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 600, 100),
        'n_jobs': trial.suggest_categorical('n_jobs', [-1]),
        'random_state': trial.suggest_categorical('random_state', [seed]),
        'max_depth': trial.suggest_int('max_depth', 4, 50),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 150),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 60),
        'criterion': trial.suggest_categorical("criterion", ["gini"]),
        'max_features': trial.suggest_categorical('max_features', ["sqrt", "log2"]),
    }

    pipeline = make_pipeline(partial_pipeline, ExtraTreesClassifier(**params))
    scores = cross_val_score(pipeline, X, y, cv=splits, scoring='roc_auc', n_jobs=-1)
    return scores.mean()

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=30, show_progress_bar=True)

[I 2023-10-14 17:50:49,053] A new study created in memory with name: no-name-4fdde989-0b97-47ad-9590-103c91aa327f
Best trial: 0. Best value: 0.780492:   3%|█████▍                                                                                                                                                              | 1/30 [00:30<14:43, 30.45s/it]

[I 2023-10-14 17:51:19,505] Trial 0 finished with value: 0.7804917426976707 and parameters: {'n_estimators': 500, 'n_jobs': -1, 'random_state': 42, 'max_depth': 26, 'min_samples_split': 144, 'min_samples_leaf': 14, 'criterion': 'gini', 'max_features': 'sqrt'}. Best is trial 0 with value: 0.7804917426976707.


Best trial: 0. Best value: 0.780492:   7%|██████████▉                                                                                                                                                         | 2/30 [00:42<09:11, 19.69s/it]

[I 2023-10-14 17:51:31,660] Trial 1 finished with value: 0.7796452845399605 and parameters: {'n_estimators': 200, 'n_jobs': -1, 'random_state': 42, 'max_depth': 38, 'min_samples_split': 105, 'min_samples_leaf': 32, 'criterion': 'gini', 'max_features': 'sqrt'}. Best is trial 0 with value: 0.7804917426976707.


Best trial: 0. Best value: 0.780492:  10%|████████████████▍                                                                                                                                                   | 3/30 [01:02<08:58, 19.95s/it]

[I 2023-10-14 17:51:51,921] Trial 2 finished with value: 0.7796925703639597 and parameters: {'n_estimators': 400, 'n_jobs': -1, 'random_state': 42, 'max_depth': 13, 'min_samples_split': 76, 'min_samples_leaf': 17, 'criterion': 'gini', 'max_features': 'log2'}. Best is trial 0 with value: 0.7804917426976707.


Best trial: 0. Best value: 0.780492:  13%|█████████████████████▊                                                                                                                                              | 4/30 [01:17<07:46, 17.94s/it]

[I 2023-10-14 17:52:06,771] Trial 3 finished with value: 0.7790733995197823 and parameters: {'n_estimators': 300, 'n_jobs': -1, 'random_state': 42, 'max_depth': 21, 'min_samples_split': 119, 'min_samples_leaf': 52, 'criterion': 'gini', 'max_features': 'sqrt'}. Best is trial 0 with value: 0.7804917426976707.


Best trial: 4. Best value: 0.781251:  17%|███████████████████████████▎                                                                                                                                        | 5/30 [01:24<05:43, 13.75s/it]

[I 2023-10-14 17:52:13,101] Trial 4 finished with value: 0.7812508602581131 and parameters: {'n_estimators': 100, 'n_jobs': -1, 'random_state': 42, 'max_depth': 14, 'min_samples_split': 128, 'min_samples_leaf': 5, 'criterion': 'gini', 'max_features': 'log2'}. Best is trial 4 with value: 0.7812508602581131.


Best trial: 4. Best value: 0.781251:  20%|████████████████████████████████▊                                                                                                                                   | 6/30 [01:38<05:36, 14.04s/it]

[I 2023-10-14 17:52:27,705] Trial 5 finished with value: 0.7790074161956337 and parameters: {'n_estimators': 300, 'n_jobs': -1, 'random_state': 42, 'max_depth': 50, 'min_samples_split': 108, 'min_samples_leaf': 56, 'criterion': 'gini', 'max_features': 'sqrt'}. Best is trial 4 with value: 0.7812508602581131.


Best trial: 4. Best value: 0.781251:  23%|██████████████████████████████████████▎                                                                                                                             | 7/30 [02:11<07:44, 20.19s/it]

[I 2023-10-14 17:53:00,549] Trial 6 finished with value: 0.7802252706241868 and parameters: {'n_estimators': 600, 'n_jobs': -1, 'random_state': 42, 'max_depth': 23, 'min_samples_split': 132, 'min_samples_leaf': 19, 'criterion': 'gini', 'max_features': 'sqrt'}. Best is trial 4 with value: 0.7812508602581131.


Best trial: 4. Best value: 0.781251:  27%|███████████████████████████████████████████▋                                                                                                                        | 8/30 [02:29<07:09, 19.51s/it]

[I 2023-10-14 17:53:18,596] Trial 7 finished with value: 0.7782642262021832 and parameters: {'n_estimators': 400, 'n_jobs': -1, 'random_state': 42, 'max_depth': 12, 'min_samples_split': 30, 'min_samples_leaf': 50, 'criterion': 'gini', 'max_features': 'sqrt'}. Best is trial 4 with value: 0.7812508602581131.


Best trial: 4. Best value: 0.781251:  30%|█████████████████████████████████████████████████▏                                                                                                                  | 9/30 [03:01<08:13, 23.49s/it]

[I 2023-10-14 17:53:50,842] Trial 8 finished with value: 0.7807121997090128 and parameters: {'n_estimators': 500, 'n_jobs': -1, 'random_state': 42, 'max_depth': 42, 'min_samples_split': 53, 'min_samples_leaf': 9, 'criterion': 'gini', 'max_features': 'sqrt'}. Best is trial 4 with value: 0.7812508602581131.


Best trial: 4. Best value: 0.781251:  33%|██████████████████████████████████████████████████████▎                                                                                                            | 10/30 [03:16<06:56, 20.84s/it]

[I 2023-10-14 17:54:05,757] Trial 9 finished with value: 0.7790968035095102 and parameters: {'n_estimators': 300, 'n_jobs': -1, 'random_state': 42, 'max_depth': 24, 'min_samples_split': 41, 'min_samples_leaf': 55, 'criterion': 'gini', 'max_features': 'log2'}. Best is trial 4 with value: 0.7812508602581131.


Best trial: 4. Best value: 0.781251:  37%|███████████████████████████████████████████████████████████▊                                                                                                       | 11/30 [03:21<05:00, 15.79s/it]

[I 2023-10-14 17:54:10,107] Trial 10 finished with value: 0.7751291023180628 and parameters: {'n_estimators': 100, 'n_jobs': -1, 'random_state': 42, 'max_depth': 7, 'min_samples_split': 81, 'min_samples_leaf': 3, 'criterion': 'gini', 'max_features': 'log2'}. Best is trial 4 with value: 0.7812508602581131.


Best trial: 4. Best value: 0.781251:  40%|█████████████████████████████████████████████████████████████████▏                                                                                                 | 12/30 [04:22<08:56, 29.79s/it]

[I 2023-10-14 17:55:11,917] Trial 11 finished with value: 0.7600738594852803 and parameters: {'n_estimators': 600, 'n_jobs': -1, 'random_state': 42, 'max_depth': 36, 'min_samples_split': 6, 'min_samples_leaf': 1, 'criterion': 'gini', 'max_features': 'log2'}. Best is trial 4 with value: 0.7812508602581131.


Best trial: 4. Best value: 0.781251:  43%|██████████████████████████████████████████████████████████████████████▋                                                                                            | 13/30 [04:30<06:32, 23.08s/it]

[I 2023-10-14 17:55:19,557] Trial 12 finished with value: 0.7797465950615263 and parameters: {'n_estimators': 100, 'n_jobs': -1, 'random_state': 42, 'max_depth': 48, 'min_samples_split': 62, 'min_samples_leaf': 26, 'criterion': 'gini', 'max_features': 'log2'}. Best is trial 4 with value: 0.7812508602581131.


Best trial: 4. Best value: 0.781251:  47%|████████████████████████████████████████████████████████████████████████████                                                                                       | 14/30 [05:05<07:07, 26.70s/it]

[I 2023-10-14 17:55:54,601] Trial 13 finished with value: 0.7810106767647726 and parameters: {'n_estimators': 500, 'n_jobs': -1, 'random_state': 42, 'max_depth': 36, 'min_samples_split': 52, 'min_samples_leaf': 7, 'criterion': 'gini', 'max_features': 'log2'}. Best is trial 4 with value: 0.7812508602581131.


Best trial: 4. Best value: 0.781251:  50%|█████████████████████████████████████████████████████████████████████████████████▌                                                                                 | 15/30 [05:35<06:55, 27.70s/it]

[I 2023-10-14 17:56:24,636] Trial 14 finished with value: 0.7797678518643755 and parameters: {'n_estimators': 500, 'n_jobs': -1, 'random_state': 42, 'max_depth': 32, 'min_samples_split': 85, 'min_samples_leaf': 26, 'criterion': 'gini', 'max_features': 'log2'}. Best is trial 4 with value: 0.7812508602581131.


Best trial: 4. Best value: 0.781251:  53%|██████████████████████████████████████████████████████████████████████████████████████▉                                                                            | 16/30 [05:46<05:18, 22.76s/it]

[I 2023-10-14 17:56:35,930] Trial 15 finished with value: 0.7794198392310536 and parameters: {'n_estimators': 200, 'n_jobs': -1, 'random_state': 42, 'max_depth': 17, 'min_samples_split': 12, 'min_samples_leaf': 39, 'criterion': 'gini', 'max_features': 'log2'}. Best is trial 4 with value: 0.7812508602581131.


Best trial: 4. Best value: 0.781251:  57%|████████████████████████████████████████████████████████████████████████████████████████████▎                                                                      | 17/30 [05:52<03:49, 17.66s/it]

[I 2023-10-14 17:56:41,715] Trial 16 finished with value: 0.7692147953876102 and parameters: {'n_estimators': 200, 'n_jobs': -1, 'random_state': 42, 'max_depth': 4, 'min_samples_split': 30, 'min_samples_leaf': 9, 'criterion': 'gini', 'max_features': 'log2'}. Best is trial 4 with value: 0.7812508602581131.


Best trial: 17. Best value: 0.781337:  60%|█████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                | 18/30 [06:19<04:06, 20.55s/it]

[I 2023-10-14 17:57:08,982] Trial 17 finished with value: 0.7813366107780195 and parameters: {'n_estimators': 400, 'n_jobs': -1, 'random_state': 42, 'max_depth': 30, 'min_samples_split': 96, 'min_samples_leaf': 8, 'criterion': 'gini', 'max_features': 'log2'}. Best is trial 17 with value: 0.7813366107780195.


Best trial: 17. Best value: 0.781337:  63%|██████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                           | 19/30 [06:58<04:44, 25.91s/it]

[I 2023-10-14 17:57:47,366] Trial 18 finished with value: 0.7793260180377273 and parameters: {'n_estimators': 400, 'n_jobs': -1, 'random_state': 42, 'max_depth': 30, 'min_samples_split': 149, 'min_samples_leaf': 42, 'criterion': 'gini', 'max_features': 'log2'}. Best is trial 17 with value: 0.7813366107780195.


Best trial: 17. Best value: 0.781337:  67%|████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                      | 20/30 [07:23<04:15, 25.54s/it]

[I 2023-10-14 17:58:12,057] Trial 19 finished with value: 0.7804800766747313 and parameters: {'n_estimators': 100, 'n_jobs': -1, 'random_state': 42, 'max_depth': 18, 'min_samples_split': 98, 'min_samples_leaf': 14, 'criterion': 'gini', 'max_features': 'log2'}. Best is trial 17 with value: 0.7813366107780195.


Best trial: 17. Best value: 0.781337:  70%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                | 21/30 [08:00<04:22, 29.20s/it]

[I 2023-10-14 17:58:49,754] Trial 20 finished with value: 0.7799591564593461 and parameters: {'n_estimators': 200, 'n_jobs': -1, 'random_state': 42, 'max_depth': 29, 'min_samples_split': 125, 'min_samples_leaf': 23, 'criterion': 'gini', 'max_features': 'log2'}. Best is trial 17 with value: 0.7813366107780195.


Best trial: 17. Best value: 0.781337:  73%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                           | 22/30 [08:55<04:56, 37.02s/it]

[I 2023-10-14 17:59:45,042] Trial 21 finished with value: 0.7811267302136781 and parameters: {'n_estimators': 500, 'n_jobs': -1, 'random_state': 42, 'max_depth': 34, 'min_samples_split': 61, 'min_samples_leaf': 7, 'criterion': 'gini', 'max_features': 'log2'}. Best is trial 17 with value: 0.7813366107780195.


Best trial: 22. Best value: 0.781725:  77%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                     | 23/30 [10:06<05:30, 47.17s/it]

[I 2023-10-14 18:00:55,892] Trial 22 finished with value: 0.7817253634085903 and parameters: {'n_estimators': 400, 'n_jobs': -1, 'random_state': 42, 'max_depth': 42, 'min_samples_split': 94, 'min_samples_leaf': 6, 'criterion': 'gini', 'max_features': 'log2'}. Best is trial 22 with value: 0.7817253634085903.


Best trial: 23. Best value: 0.783608:  80%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                | 24/30 [11:06<05:04, 50.79s/it]

[I 2023-10-14 18:01:55,115] Trial 23 finished with value: 0.7836076785813241 and parameters: {'n_estimators': 400, 'n_jobs': -1, 'random_state': 42, 'max_depth': 44, 'min_samples_split': 92, 'min_samples_leaf': 1, 'criterion': 'gini', 'max_features': 'log2'}. Best is trial 23 with value: 0.7836076785813241.


Best trial: 24. Best value: 0.783621:  83%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                           | 25/30 [12:16<04:43, 56.75s/it]

[I 2023-10-14 18:03:05,775] Trial 24 finished with value: 0.783621221190528 and parameters: {'n_estimators': 400, 'n_jobs': -1, 'random_state': 42, 'max_depth': 46, 'min_samples_split': 94, 'min_samples_leaf': 1, 'criterion': 'gini', 'max_features': 'log2'}. Best is trial 24 with value: 0.783621221190528.


Best trial: 25. Best value: 0.783645:  87%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                     | 26/30 [13:03<03:34, 53.75s/it]

[I 2023-10-14 18:03:52,516] Trial 25 finished with value: 0.7836453346373033 and parameters: {'n_estimators': 400, 'n_jobs': -1, 'random_state': 42, 'max_depth': 45, 'min_samples_split': 90, 'min_samples_leaf': 1, 'criterion': 'gini', 'max_features': 'log2'}. Best is trial 25 with value: 0.7836453346373033.


Best trial: 26. Best value: 0.783913:  90%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                | 27/30 [14:02<02:45, 55.31s/it]

[I 2023-10-14 18:04:51,458] Trial 26 finished with value: 0.7839128458038738 and parameters: {'n_estimators': 300, 'n_jobs': -1, 'random_state': 42, 'max_depth': 45, 'min_samples_split': 112, 'min_samples_leaf': 1, 'criterion': 'gini', 'max_features': 'log2'}. Best is trial 26 with value: 0.7839128458038738.


Best trial: 26. Best value: 0.783913:  93%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏          | 28/30 [14:41<01:40, 50.34s/it]

[I 2023-10-14 18:05:30,223] Trial 27 finished with value: 0.7807419588392888 and parameters: {'n_estimators': 300, 'n_jobs': -1, 'random_state': 42, 'max_depth': 45, 'min_samples_split': 114, 'min_samples_leaf': 13, 'criterion': 'gini', 'max_features': 'log2'}. Best is trial 26 with value: 0.7839128458038738.


Best trial: 26. Best value: 0.783913:  97%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌     | 29/30 [15:17<00:46, 46.25s/it]

[I 2023-10-14 18:06:06,913] Trial 28 finished with value: 0.7810192553564007 and parameters: {'n_estimators': 300, 'n_jobs': -1, 'random_state': 42, 'max_depth': 40, 'min_samples_split': 135, 'min_samples_leaf': 11, 'criterion': 'gini', 'max_features': 'log2'}. Best is trial 26 with value: 0.7839128458038738.


Best trial: 26. Best value: 0.783913: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 30/30 [16:17<00:00, 32.59s/it]

[I 2023-10-14 18:07:06,677] Trial 29 finished with value: 0.7802684629387135 and parameters: {'n_estimators': 500, 'n_jobs': -1, 'random_state': 42, 'max_depth': 46, 'min_samples_split': 72, 'min_samples_leaf': 18, 'criterion': 'gini', 'max_features': 'log2'}. Best is trial 26 with value: 0.7839128458038738.





In [7]:
study.best_params, study.best_value

({'n_estimators': 300,
  'n_jobs': -1,
  'random_state': 42,
  'max_depth': 45,
  'min_samples_split': 112,
  'min_samples_leaf': 1,
  'criterion': 'gini',
  'max_features': 'log2'},
 0.7839128458038738)

In [8]:
pipeline = make_pipeline(partial_pipeline, ExtraTreesClassifier(**study.best_params))
pipeline.fit(X, y)

In [9]:
submission = test.copy()
submission['defects'] = pipeline.predict_proba(submission)[:, 1]
submission.defects.to_csv(OUTPUT)