In [1]:
import optuna
import os
import kaggle
from pathlib import Path
import pandas as pd
import numpy as np
from sklearn.preprocessing import FunctionTransformer, StandardScaler
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.model_selection import cross_val_score


pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)


iskaggle = os.environ.get('KAGGLE_KERNEL_RUN_TYPE', '')
if iskaggle:
    path = Path('../input/playground-series-s3e23')
    orig_path = Path('../input/software-defect-prediction')
else:
    path = Path('../input/playground-series-s3e23')
    orig_path = Path('../input/software-defect-prediction')
    if not path.exists():
        kaggle.api.competition_download_cli(str(path))
        zipfile.ZipFile(f'{path}.zip').extractall(path)
    if not orig_path.exists():
        kaggle.api.competition_download_cli(str(path))
        zipfile.ZipFile(f'{path}.zip').extractall(path)


VERSION = 5
OUTPUT = f'extra_tree_clf_submisson_v{VERSION}.csv'

  from .autonotebook import tqdm as notebook_tqdm




In [2]:
train = pd.read_csv(path/'train.csv', index_col = 'id')
test = pd.read_csv(path/'test.csv', index_col = 'id')
orig_train = pd.read_csv(orig_path/'jm1.csv')

In [3]:
for object_features in list(orig_train.loc[:, orig_train.dtypes == 'O']):
    orig_train[object_features] = orig_train[object_features].replace({'?' : np.nan}).astype('float64')

In [4]:
X = pd.concat([train, orig_train])
y = X.pop('defects')

seed = 42
splits = 5

np.random.seed(seed)

In [5]:
partial_pipeline = make_pipeline(SimpleImputer(), FunctionTransformer(func=np.log1p, validate=False), StandardScaler())

In [6]:
def objective(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 600, 100),
        'n_jobs': trial.suggest_categorical('n_jobs', [-1]),
        'random_state': trial.suggest_categorical('random_state', [seed]),
        'max_depth': trial.suggest_int('max_depth', 4, 50),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 150),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 60),
        'criterion': trial.suggest_categorical("criterion", ["gini"]),
        'max_features': trial.suggest_categorical('max_features', ["sqrt", "log2"]),
    }

    pipeline = make_pipeline(partial_pipeline, ExtraTreesClassifier(**params))
    scores = cross_val_score(pipeline, X, y, cv=splits, scoring='roc_auc', n_jobs=-1)
    return scores.mean()

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=30, show_progress_bar=True)

[I 2023-10-14 17:26:50,029] A new study created in memory with name: no-name-06784de6-1647-4460-9137-0d9e113e1baa
Best trial: 0. Best value: 0.777077:   3%|█████▍                                                                                                                                                              | 1/30 [00:26<12:56, 26.77s/it]

[I 2023-10-14 17:27:16,793] Trial 0 finished with value: 0.7770765089285907 and parameters: {'n_estimators': 400, 'n_jobs': -1, 'random_state': 42, 'max_depth': 27, 'min_samples_split': 39, 'min_samples_leaf': 57, 'criterion': 'gini', 'max_features': 'sqrt'}. Best is trial 0 with value: 0.7770765089285907.


Best trial: 1. Best value: 0.778301:   7%|██████████▉                                                                                                                                                         | 2/30 [00:47<10:46, 23.10s/it]

[I 2023-10-14 17:27:37,326] Trial 1 finished with value: 0.7783006270366193 and parameters: {'n_estimators': 300, 'n_jobs': -1, 'random_state': 42, 'max_depth': 13, 'min_samples_split': 59, 'min_samples_leaf': 9, 'criterion': 'gini', 'max_features': 'log2'}. Best is trial 1 with value: 0.7783006270366193.


Best trial: 2. Best value: 0.778565:  10%|████████████████▍                                                                                                                                                   | 3/30 [01:38<16:09, 35.91s/it]

[I 2023-10-14 17:28:28,489] Trial 2 finished with value: 0.7785646754260049 and parameters: {'n_estimators': 600, 'n_jobs': -1, 'random_state': 42, 'max_depth': 37, 'min_samples_split': 129, 'min_samples_leaf': 23, 'criterion': 'gini', 'max_features': 'log2'}. Best is trial 2 with value: 0.7785646754260049.


Best trial: 2. Best value: 0.778565:  13%|█████████████████████▊                                                                                                                                              | 4/30 [01:46<10:49, 25.00s/it]

[I 2023-10-14 17:28:36,750] Trial 3 finished with value: 0.7772203176413137 and parameters: {'n_estimators': 100, 'n_jobs': -1, 'random_state': 42, 'max_depth': 26, 'min_samples_split': 110, 'min_samples_leaf': 44, 'criterion': 'gini', 'max_features': 'log2'}. Best is trial 2 with value: 0.7785646754260049.


Best trial: 2. Best value: 0.778565:  17%|███████████████████████████▎                                                                                                                                        | 5/30 [02:28<12:59, 31.19s/it]

[I 2023-10-14 17:29:18,933] Trial 4 finished with value: 0.7784647402853275 and parameters: {'n_estimators': 500, 'n_jobs': -1, 'random_state': 42, 'max_depth': 34, 'min_samples_split': 137, 'min_samples_leaf': 21, 'criterion': 'gini', 'max_features': 'sqrt'}. Best is trial 2 with value: 0.7785646754260049.


Best trial: 2. Best value: 0.778565:  20%|████████████████████████████████▊                                                                                                                                   | 6/30 [02:50<11:06, 27.78s/it]

[I 2023-10-14 17:29:40,092] Trial 5 finished with value: 0.7773547646992697 and parameters: {'n_estimators': 300, 'n_jobs': -1, 'random_state': 42, 'max_depth': 33, 'min_samples_split': 146, 'min_samples_leaf': 49, 'criterion': 'gini', 'max_features': 'sqrt'}. Best is trial 2 with value: 0.7785646754260049.


Best trial: 2. Best value: 0.778565:  23%|██████████████████████████████████████▎                                                                                                                             | 7/30 [03:05<09:05, 23.72s/it]

[I 2023-10-14 17:29:55,456] Trial 6 finished with value: 0.7774927630041856 and parameters: {'n_estimators': 200, 'n_jobs': -1, 'random_state': 42, 'max_depth': 25, 'min_samples_split': 133, 'min_samples_leaf': 40, 'criterion': 'gini', 'max_features': 'log2'}. Best is trial 2 with value: 0.7785646754260049.


Best trial: 2. Best value: 0.778565:  27%|███████████████████████████████████████████▋                                                                                                                        | 8/30 [03:49<11:01, 30.05s/it]

[I 2023-10-14 17:30:39,069] Trial 7 finished with value: 0.7781917283886509 and parameters: {'n_estimators': 600, 'n_jobs': -1, 'random_state': 42, 'max_depth': 18, 'min_samples_split': 43, 'min_samples_leaf': 25, 'criterion': 'gini', 'max_features': 'sqrt'}. Best is trial 2 with value: 0.7785646754260049.


Best trial: 2. Best value: 0.778565:  30%|█████████████████████████████████████████████████▏                                                                                                                  | 9/30 [04:28<11:35, 33.14s/it]

[I 2023-10-14 17:31:19,002] Trial 8 finished with value: 0.7773480195528815 and parameters: {'n_estimators': 600, 'n_jobs': -1, 'random_state': 42, 'max_depth': 24, 'min_samples_split': 13, 'min_samples_leaf': 53, 'criterion': 'gini', 'max_features': 'sqrt'}. Best is trial 2 with value: 0.7785646754260049.


Best trial: 2. Best value: 0.778565:  33%|██████████████████████████████████████████████████████▎                                                                                                            | 10/30 [04:55<10:20, 31.03s/it]

[I 2023-10-14 17:31:45,288] Trial 9 finished with value: 0.7770405842569208 and parameters: {'n_estimators': 400, 'n_jobs': -1, 'random_state': 42, 'max_depth': 20, 'min_samples_split': 50, 'min_samples_leaf': 57, 'criterion': 'gini', 'max_features': 'sqrt'}. Best is trial 2 with value: 0.7785646754260049.


Best trial: 10. Best value: 0.781223:  37%|███████████████████████████████████████████████████████████▍                                                                                                      | 11/30 [05:45<11:43, 37.01s/it]

[I 2023-10-14 17:32:35,870] Trial 10 finished with value: 0.7812228020938061 and parameters: {'n_estimators': 500, 'n_jobs': -1, 'random_state': 42, 'max_depth': 48, 'min_samples_split': 99, 'min_samples_leaf': 2, 'criterion': 'gini', 'max_features': 'log2'}. Best is trial 10 with value: 0.7812228020938061.


Best trial: 11. Best value: 0.782509:  40%|████████████████████████████████████████████████████████████████▊                                                                                                 | 12/30 [06:39<12:38, 42.16s/it]

[I 2023-10-14 17:33:29,790] Trial 11 finished with value: 0.7825094059676625 and parameters: {'n_estimators': 500, 'n_jobs': -1, 'random_state': 42, 'max_depth': 50, 'min_samples_split': 99, 'min_samples_leaf': 1, 'criterion': 'gini', 'max_features': 'log2'}. Best is trial 11 with value: 0.7825094059676625.


Best trial: 11. Best value: 0.782509:  43%|██████████████████████████████████████████████████████████████████████▏                                                                                           | 13/30 [07:38<13:23, 47.26s/it]

[I 2023-10-14 17:34:28,811] Trial 12 finished with value: 0.7824522151499135 and parameters: {'n_estimators': 500, 'n_jobs': -1, 'random_state': 42, 'max_depth': 50, 'min_samples_split': 92, 'min_samples_leaf': 1, 'criterion': 'gini', 'max_features': 'log2'}. Best is trial 11 with value: 0.7825094059676625.


Best trial: 11. Best value: 0.782509:  47%|███████████████████████████████████████████████████████████████████████████▌                                                                                      | 14/30 [08:39<13:40, 51.30s/it]

[I 2023-10-14 17:35:29,428] Trial 13 finished with value: 0.7823718104761588 and parameters: {'n_estimators': 500, 'n_jobs': -1, 'random_state': 42, 'max_depth': 50, 'min_samples_split': 86, 'min_samples_leaf': 1, 'criterion': 'gini', 'max_features': 'log2'}. Best is trial 11 with value: 0.7825094059676625.


Best trial: 11. Best value: 0.782509:  50%|█████████████████████████████████████████████████████████████████████████████████                                                                                 | 15/30 [09:27<12:33, 50.26s/it]

[I 2023-10-14 17:36:17,275] Trial 14 finished with value: 0.7791699087664055 and parameters: {'n_estimators': 500, 'n_jobs': -1, 'random_state': 42, 'max_depth': 42, 'min_samples_split': 75, 'min_samples_leaf': 13, 'criterion': 'gini', 'max_features': 'log2'}. Best is trial 11 with value: 0.7825094059676625.


Best trial: 11. Best value: 0.782509:  53%|██████████████████████████████████████████████████████████████████████████████████████▍                                                                           | 16/30 [10:03<10:44, 46.05s/it]

[I 2023-10-14 17:36:53,561] Trial 15 finished with value: 0.7793928685907925 and parameters: {'n_estimators': 400, 'n_jobs': -1, 'random_state': 42, 'max_depth': 43, 'min_samples_split': 110, 'min_samples_leaf': 11, 'criterion': 'gini', 'max_features': 'log2'}. Best is trial 11 with value: 0.7825094059676625.


Best trial: 11. Best value: 0.782509:  57%|███████████████████████████████████████████████████████████████████████████████████████████▊                                                                      | 17/30 [10:43<09:34, 44.19s/it]

[I 2023-10-14 17:37:33,411] Trial 16 finished with value: 0.7780074256265281 and parameters: {'n_estimators': 500, 'n_jobs': -1, 'random_state': 42, 'max_depth': 44, 'min_samples_split': 84, 'min_samples_leaf': 33, 'criterion': 'gini', 'max_features': 'log2'}. Best is trial 11 with value: 0.7825094059676625.


Best trial: 11. Best value: 0.782509:  60%|█████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                | 18/30 [10:54<06:50, 34.20s/it]

[I 2023-10-14 17:37:44,369] Trial 17 finished with value: 0.7695855092356829 and parameters: {'n_estimators': 400, 'n_jobs': -1, 'random_state': 42, 'max_depth': 4, 'min_samples_split': 69, 'min_samples_leaf': 16, 'criterion': 'gini', 'max_features': 'log2'}. Best is trial 11 with value: 0.7825094059676625.


Best trial: 11. Best value: 0.782509:  63%|██████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                           | 19/30 [11:58<07:54, 43.09s/it]

[I 2023-10-14 17:38:48,178] Trial 18 finished with value: 0.7800057702461386 and parameters: {'n_estimators': 600, 'n_jobs': -1, 'random_state': 42, 'max_depth': 38, 'min_samples_split': 102, 'min_samples_leaf': 6, 'criterion': 'gini', 'max_features': 'log2'}. Best is trial 11 with value: 0.7825094059676625.


Best trial: 11. Best value: 0.782509:  67%|████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                      | 20/30 [12:22<06:15, 37.56s/it]

[I 2023-10-14 17:39:12,849] Trial 19 finished with value: 0.777950613007911 and parameters: {'n_estimators': 300, 'n_jobs': -1, 'random_state': 42, 'max_depth': 47, 'min_samples_split': 122, 'min_samples_leaf': 31, 'criterion': 'gini', 'max_features': 'log2'}. Best is trial 11 with value: 0.7825094059676625.


Best trial: 11. Best value: 0.782509:  70%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                | 21/30 [12:42<04:48, 32.05s/it]

[I 2023-10-14 17:39:32,037] Trial 20 finished with value: 0.7785562367851422 and parameters: {'n_estimators': 200, 'n_jobs': -1, 'random_state': 42, 'max_depth': 31, 'min_samples_split': 3, 'min_samples_leaf': 17, 'criterion': 'gini', 'max_features': 'log2'}. Best is trial 11 with value: 0.7825094059676625.


Best trial: 11. Best value: 0.782509:  73%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                           | 22/30 [13:41<05:22, 40.28s/it]

[I 2023-10-14 17:40:31,516] Trial 21 finished with value: 0.7823853121075968 and parameters: {'n_estimators': 500, 'n_jobs': -1, 'random_state': 42, 'max_depth': 50, 'min_samples_split': 88, 'min_samples_leaf': 1, 'criterion': 'gini', 'max_features': 'log2'}. Best is trial 11 with value: 0.7825094059676625.


Best trial: 11. Best value: 0.782509:  77%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                     | 23/30 [14:34<05:07, 43.98s/it]

[I 2023-10-14 17:41:24,113] Trial 22 finished with value: 0.7798543958708846 and parameters: {'n_estimators': 500, 'n_jobs': -1, 'random_state': 42, 'max_depth': 46, 'min_samples_split': 89, 'min_samples_leaf': 6, 'criterion': 'gini', 'max_features': 'log2'}. Best is trial 11 with value: 0.7825094059676625.


Best trial: 11. Best value: 0.782509:  80%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                | 24/30 [15:40<05:03, 50.67s/it]

[I 2023-10-14 17:42:30,386] Trial 23 finished with value: 0.7813594994869443 and parameters: {'n_estimators': 600, 'n_jobs': -1, 'random_state': 42, 'max_depth': 50, 'min_samples_split': 116, 'min_samples_leaf': 2, 'criterion': 'gini', 'max_features': 'log2'}. Best is trial 11 with value: 0.7825094059676625.


Best trial: 11. Best value: 0.782509:  83%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                           | 25/30 [16:25<04:05, 49.13s/it]

[I 2023-10-14 17:43:15,936] Trial 24 finished with value: 0.7796882312019255 and parameters: {'n_estimators': 400, 'n_jobs': -1, 'random_state': 42, 'max_depth': 40, 'min_samples_split': 97, 'min_samples_leaf': 7, 'criterion': 'gini', 'max_features': 'log2'}. Best is trial 11 with value: 0.7825094059676625.


Best trial: 11. Best value: 0.782509:  87%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                     | 26/30 [17:28<03:32, 53.12s/it]

[I 2023-10-14 17:44:18,367] Trial 25 finished with value: 0.7819311924611687 and parameters: {'n_estimators': 500, 'n_jobs': -1, 'random_state': 42, 'max_depth': 45, 'min_samples_split': 65, 'min_samples_leaf': 1, 'criterion': 'gini', 'max_features': 'log2'}. Best is trial 11 with value: 0.7825094059676625.


Best trial: 11. Best value: 0.782509:  90%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                | 27/30 [18:13<02:32, 50.72s/it]

[I 2023-10-14 17:45:03,496] Trial 26 finished with value: 0.7790177030504081 and parameters: {'n_estimators': 500, 'n_jobs': -1, 'random_state': 42, 'max_depth': 50, 'min_samples_split': 76, 'min_samples_leaf': 14, 'criterion': 'gini', 'max_features': 'log2'}. Best is trial 11 with value: 0.7825094059676625.


Best trial: 11. Best value: 0.782509:  93%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏          | 28/30 [18:52<01:34, 47.11s/it]

[I 2023-10-14 17:45:42,188] Trial 27 finished with value: 0.779883887933486 and parameters: {'n_estimators': 400, 'n_jobs': -1, 'random_state': 42, 'max_depth': 40, 'min_samples_split': 91, 'min_samples_leaf': 6, 'criterion': 'gini', 'max_features': 'log2'}. Best is trial 11 with value: 0.7825094059676625.


Best trial: 11. Best value: 0.782509:  97%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌     | 29/30 [19:39<00:47, 47.20s/it]

[I 2023-10-14 17:46:29,580] Trial 28 finished with value: 0.7787958038800475 and parameters: {'n_estimators': 600, 'n_jobs': -1, 'random_state': 42, 'max_depth': 47, 'min_samples_split': 110, 'min_samples_leaf': 18, 'criterion': 'gini', 'max_features': 'log2'}. Best is trial 11 with value: 0.7825094059676625.


Best trial: 11. Best value: 0.782509: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 30/30 [20:16<00:00, 40.56s/it]

[I 2023-10-14 17:47:06,735] Trial 29 finished with value: 0.7785635138411242 and parameters: {'n_estimators': 400, 'n_jobs': -1, 'random_state': 42, 'max_depth': 36, 'min_samples_split': 26, 'min_samples_leaf': 10, 'criterion': 'gini', 'max_features': 'sqrt'}. Best is trial 11 with value: 0.7825094059676625.





In [7]:
study.best_params, study.best_value

({'n_estimators': 500,
  'n_jobs': -1,
  'random_state': 42,
  'max_depth': 50,
  'min_samples_split': 99,
  'min_samples_leaf': 1,
  'criterion': 'gini',
  'max_features': 'log2'},
 0.7825094059676625)

In [8]:
pipeline = make_pipeline(partial_pipeline, ExtraTreesClassifier(**study.best_params))
pipeline.fit(X, y)

In [9]:
submission = test.copy()
submission['defects'] = pipeline.predict_proba(submission)[:, 1]
submission.defects.to_csv(OUTPUT)