In [1]:
from pathlib import Path
import os
import kaggle
import pandas as pd
import numpy as np
from sklearn.preprocessing import FunctionTransformer, PowerTransformer
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.model_selection import StratifiedKFold
import optuna

pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)

iskaggle = os.environ.get('KAGGLE_KERNEL_RUN_TYPE', '')
if iskaggle:
    path = Path('../input/playground-series-s3e23')
    orig_path = Path('../input/software-defect-prediction')
else:
    path = Path('../input/playground-series-s3e23')
    orig_path = Path('../input/software-defect-prediction')
    if not path.exists():
        kaggle.api.competition_download_cli(str(path))
        zipfile.ZipFile(f'{path}.zip').extractall(path)
    if not orig_path.exists():
        kaggle.api.competition_download_cli(str(path))
        zipfile.ZipFile(f'{path}.zip').extractall(path)

SEED = 42
SPLITS = 5
VERSION = 6
OUTPUT = f'extra_tree_clf_submisson_v{VERSION}.csv'

np.random.seed(SEED)



  from .autonotebook import tqdm as notebook_tqdm


In [2]:
train = pd.read_csv(path/'train.csv', index_col = 'id')
test = pd.read_csv(path/'test.csv', index_col = 'id')
orig_train = pd.read_csv(orig_path/'jm1.csv')

In [3]:
for object_features in list(orig_train.loc[:, orig_train.dtypes == 'O']):
    orig_train[object_features] = orig_train[object_features].replace({'?' : np.nan}).astype('float64')

In [4]:
X = pd.concat([train, orig_train])
y = X.pop('defects')

In [5]:
partial_pipeline = make_pipeline(ColumnTransformer([('drop', 'drop',
                                  ['iv(g)', 't', 'b', 'n',
                                   'lOCode', 'v', 'branchCount',
                                   'e', 'i', 'lOComment'])],
                                remainder='passthrough'), SimpleImputer(), FunctionTransformer(func=np.log1p, validate=False), PowerTransformer())

In [6]:
from sklearn.model_selection import cross_val_score

def objective(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 600, 100),
        'n_jobs': trial.suggest_categorical('n_jobs', [-1]),
        'random_state': trial.suggest_categorical('random_state', [SEED]),
        'max_depth': trial.suggest_int('max_depth', 4, 50),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 150),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 60),
        'criterion': trial.suggest_categorical("criterion", ["gini"]),
        'max_features': trial.suggest_categorical('max_features', [1.0]),
    }

    pipeline = make_pipeline(partial_pipeline, ExtraTreesClassifier(**params))
    scores = cross_val_score(pipeline, X, y, cv=SPLITS, scoring='roc_auc', n_jobs=-1)
    return scores.mean()

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=30, show_progress_bar=True)

[I 2023-10-17 15:25:48,290] A new study created in memory with name: no-name-bb62fa7d-71db-4cac-961c-ec22fc4dd7d3
Best trial: 0. Best value: 0.783927:   3%|█████▍                                                                                                                                                              | 1/30 [01:26<41:40, 86.21s/it]

[I 2023-10-17 15:27:14,500] Trial 0 finished with value: 0.7839273647200006 and parameters: {'n_estimators': 500, 'n_jobs': -1, 'random_state': 42, 'max_depth': 19, 'min_samples_split': 69, 'min_samples_leaf': 18, 'criterion': 'gini', 'max_features': 1.0}. Best is trial 0 with value: 0.7839273647200006.


Best trial: 1. Best value: 0.784094:   7%|██████████▉                                                                                                                                                         | 2/30 [02:39<36:39, 78.55s/it]

[I 2023-10-17 15:28:27,686] Trial 1 finished with value: 0.7840938716669501 and parameters: {'n_estimators': 400, 'n_jobs': -1, 'random_state': 42, 'max_depth': 47, 'min_samples_split': 43, 'min_samples_leaf': 34, 'criterion': 'gini', 'max_features': 1.0}. Best is trial 1 with value: 0.7840938716669501.


Best trial: 2. Best value: 0.784205:  10%|████████████████▍                                                                                                                                                   | 3/30 [03:35<30:39, 68.14s/it]

[I 2023-10-17 15:29:23,448] Trial 2 finished with value: 0.7842050667843055 and parameters: {'n_estimators': 300, 'n_jobs': -1, 'random_state': 42, 'max_depth': 39, 'min_samples_split': 82, 'min_samples_leaf': 33, 'criterion': 'gini', 'max_features': 1.0}. Best is trial 2 with value: 0.7842050667843055.


Best trial: 2. Best value: 0.784205:  13%|█████████████████████▊                                                                                                                                              | 4/30 [05:10<34:08, 78.79s/it]

[I 2023-10-17 15:30:58,551] Trial 3 finished with value: 0.7829620740590801 and parameters: {'n_estimators': 500, 'n_jobs': -1, 'random_state': 42, 'max_depth': 46, 'min_samples_split': 44, 'min_samples_leaf': 23, 'criterion': 'gini', 'max_features': 1.0}. Best is trial 2 with value: 0.7842050667843055.


Best trial: 4. Best value: 0.785563:  17%|███████████████████████████▎                                                                                                                                        | 5/30 [05:29<23:49, 57.19s/it]

[I 2023-10-17 15:31:17,447] Trial 4 finished with value: 0.7855625436802366 and parameters: {'n_estimators': 100, 'n_jobs': -1, 'random_state': 42, 'max_depth': 14, 'min_samples_split': 2, 'min_samples_leaf': 32, 'criterion': 'gini', 'max_features': 1.0}. Best is trial 4 with value: 0.7855625436802366.


Best trial: 4. Best value: 0.785563:  20%|████████████████████████████████▊                                                                                                                                   | 6/30 [05:54<18:30, 46.25s/it]

[I 2023-10-17 15:31:42,464] Trial 5 finished with value: 0.7774813710084968 and parameters: {'n_estimators': 400, 'n_jobs': -1, 'random_state': 42, 'max_depth': 4, 'min_samples_split': 55, 'min_samples_leaf': 20, 'criterion': 'gini', 'max_features': 1.0}. Best is trial 4 with value: 0.7855625436802366.


Best trial: 6. Best value: 0.785617:  23%|██████████████████████████████████████▎                                                                                                                             | 7/30 [07:34<24:30, 63.95s/it]

[I 2023-10-17 15:33:22,840] Trial 6 finished with value: 0.785616690675097 and parameters: {'n_estimators': 600, 'n_jobs': -1, 'random_state': 42, 'max_depth': 14, 'min_samples_split': 44, 'min_samples_leaf': 54, 'criterion': 'gini', 'max_features': 1.0}. Best is trial 6 with value: 0.785616690675097.


Best trial: 6. Best value: 0.785617:  27%|███████████████████████████████████████████▋                                                                                                                        | 8/30 [08:32<22:47, 62.18s/it]

[I 2023-10-17 15:34:21,227] Trial 7 finished with value: 0.7831967415318588 and parameters: {'n_estimators': 300, 'n_jobs': -1, 'random_state': 42, 'max_depth': 23, 'min_samples_split': 69, 'min_samples_leaf': 18, 'criterion': 'gini', 'max_features': 1.0}. Best is trial 6 with value: 0.785616690675097.


Best trial: 6. Best value: 0.785617:  30%|█████████████████████████████████████████████████▏                                                                                                                  | 9/30 [09:10<19:02, 54.42s/it]

[I 2023-10-17 15:34:58,582] Trial 8 finished with value: 0.7843594244343708 and parameters: {'n_estimators': 200, 'n_jobs': -1, 'random_state': 42, 'max_depth': 22, 'min_samples_split': 61, 'min_samples_leaf': 37, 'criterion': 'gini', 'max_features': 1.0}. Best is trial 6 with value: 0.785616690675097.


Best trial: 6. Best value: 0.785617:  33%|██████████████████████████████████████████████████████▎                                                                                                            | 10/30 [09:28<14:23, 43.17s/it]

[I 2023-10-17 15:35:16,576] Trial 9 finished with value: 0.7847708535914928 and parameters: {'n_estimators': 100, 'n_jobs': -1, 'random_state': 42, 'max_depth': 43, 'min_samples_split': 81, 'min_samples_leaf': 60, 'criterion': 'gini', 'max_features': 1.0}. Best is trial 6 with value: 0.785616690675097.


Best trial: 6. Best value: 0.785617:  37%|███████████████████████████████████████████████████████████▊                                                                                                       | 11/30 [10:18<14:20, 45.29s/it]

[I 2023-10-17 15:36:06,648] Trial 10 finished with value: 0.7822131341466472 and parameters: {'n_estimators': 600, 'n_jobs': -1, 'random_state': 42, 'max_depth': 6, 'min_samples_split': 132, 'min_samples_leaf': 56, 'criterion': 'gini', 'max_features': 1.0}. Best is trial 6 with value: 0.785616690675097.


Best trial: 11. Best value: 0.78566:  40%|█████████████████████████████████████████████████████████████████▏                                                                                                 | 12/30 [10:36<11:07, 37.06s/it]

[I 2023-10-17 15:36:24,889] Trial 11 finished with value: 0.7856603015316205 and parameters: {'n_estimators': 100, 'n_jobs': -1, 'random_state': 42, 'max_depth': 12, 'min_samples_split': 10, 'min_samples_leaf': 48, 'criterion': 'gini', 'max_features': 1.0}. Best is trial 11 with value: 0.7856603015316205.


Best trial: 11. Best value: 0.78566:  43%|██████████████████████████████████████████████████████████████████████▋                                                                                            | 13/30 [12:30<17:04, 60.29s/it]

[I 2023-10-17 15:38:18,630] Trial 12 finished with value: 0.7847282658759613 and parameters: {'n_estimators': 600, 'n_jobs': -1, 'random_state': 42, 'max_depth': 34, 'min_samples_split': 7, 'min_samples_leaf': 48, 'criterion': 'gini', 'max_features': 1.0}. Best is trial 11 with value: 0.7856603015316205.


Best trial: 11. Best value: 0.78566:  47%|████████████████████████████████████████████████████████████████████████████                                                                                       | 14/30 [13:04<13:57, 52.35s/it]

[I 2023-10-17 15:38:52,636] Trial 13 finished with value: 0.7853982297760211 and parameters: {'n_estimators': 200, 'n_jobs': -1, 'random_state': 42, 'max_depth': 12, 'min_samples_split': 25, 'min_samples_leaf': 2, 'criterion': 'gini', 'max_features': 1.0}. Best is trial 11 with value: 0.7856603015316205.


Best trial: 11. Best value: 0.78566:  50%|█████████████████████████████████████████████████████████████████████████████████▌                                                                                 | 15/30 [13:44<12:08, 48.57s/it]

[I 2023-10-17 15:39:32,458] Trial 14 finished with value: 0.7846678022656134 and parameters: {'n_estimators': 200, 'n_jobs': -1, 'random_state': 42, 'max_depth': 30, 'min_samples_split': 26, 'min_samples_leaf': 47, 'criterion': 'gini', 'max_features': 1.0}. Best is trial 11 with value: 0.7856603015316205.


Best trial: 15. Best value: 0.785717:  53%|██████████████████████████████████████████████████████████████████████████████████████▍                                                                           | 16/30 [15:02<13:25, 57.51s/it]

[I 2023-10-17 15:40:50,728] Trial 15 finished with value: 0.785717273334426 and parameters: {'n_estimators': 500, 'n_jobs': -1, 'random_state': 42, 'max_depth': 12, 'min_samples_split': 101, 'min_samples_leaf': 47, 'criterion': 'gini', 'max_features': 1.0}. Best is trial 15 with value: 0.785717273334426.


Best trial: 15. Best value: 0.785717:  57%|███████████████████████████████████████████████████████████████████████████████████████████▊                                                                      | 17/30 [16:00<12:29, 57.66s/it]

[I 2023-10-17 15:41:48,729] Trial 16 finished with value: 0.784529188541709 and parameters: {'n_estimators': 500, 'n_jobs': -1, 'random_state': 42, 'max_depth': 8, 'min_samples_split': 107, 'min_samples_leaf': 45, 'criterion': 'gini', 'max_features': 1.0}. Best is trial 15 with value: 0.785717273334426.


Best trial: 15. Best value: 0.785717:  60%|█████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                | 18/30 [17:13<12:28, 62.35s/it]

[I 2023-10-17 15:43:02,011] Trial 17 finished with value: 0.7850017143072006 and parameters: {'n_estimators': 400, 'n_jobs': -1, 'random_state': 42, 'max_depth': 18, 'min_samples_split': 102, 'min_samples_leaf': 44, 'criterion': 'gini', 'max_features': 1.0}. Best is trial 15 with value: 0.785717273334426.


Best trial: 15. Best value: 0.785717:  63%|██████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                           | 19/30 [18:11<11:10, 60.92s/it]

[I 2023-10-17 15:43:59,588] Trial 18 finished with value: 0.7848527712961231 and parameters: {'n_estimators': 300, 'n_jobs': -1, 'random_state': 42, 'max_depth': 29, 'min_samples_split': 147, 'min_samples_leaf': 41, 'criterion': 'gini', 'max_features': 1.0}. Best is trial 15 with value: 0.785717273334426.


Best trial: 15. Best value: 0.785717:  67%|████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                      | 20/30 [18:27<07:53, 47.36s/it]

[I 2023-10-17 15:44:15,352] Trial 19 finished with value: 0.7855543691465776 and parameters: {'n_estimators': 100, 'n_jobs': -1, 'random_state': 42, 'max_depth': 10, 'min_samples_split': 102, 'min_samples_leaf': 26, 'criterion': 'gini', 'max_features': 1.0}. Best is trial 15 with value: 0.785717273334426.


Best trial: 15. Best value: 0.785717:  70%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                | 21/30 [19:56<08:58, 59.85s/it]

[I 2023-10-17 15:45:44,319] Trial 20 finished with value: 0.7852595876303765 and parameters: {'n_estimators': 500, 'n_jobs': -1, 'random_state': 42, 'max_depth': 17, 'min_samples_split': 122, 'min_samples_leaf': 50, 'criterion': 'gini', 'max_features': 1.0}. Best is trial 15 with value: 0.785717273334426.


Best trial: 15. Best value: 0.785717:  73%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                           | 22/30 [21:34<09:32, 71.52s/it]

[I 2023-10-17 15:47:23,064] Trial 21 finished with value: 0.7855928319861306 and parameters: {'n_estimators': 600, 'n_jobs': -1, 'random_state': 42, 'max_depth': 14, 'min_samples_split': 20, 'min_samples_leaf': 53, 'criterion': 'gini', 'max_features': 1.0}. Best is trial 15 with value: 0.785717273334426.


Best trial: 15. Best value: 0.785717:  77%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                     | 23/30 [23:17<09:25, 80.81s/it]

[I 2023-10-17 15:49:05,527] Trial 22 finished with value: 0.7850146822266214 and parameters: {'n_estimators': 600, 'n_jobs': -1, 'random_state': 42, 'max_depth': 24, 'min_samples_split': 39, 'min_samples_leaf': 59, 'criterion': 'gini', 'max_features': 1.0}. Best is trial 15 with value: 0.785717273334426.


Best trial: 15. Best value: 0.785717:  80%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                | 24/30 [24:17<07:28, 74.73s/it]

[I 2023-10-17 15:50:06,097] Trial 23 finished with value: 0.7854921218774176 and parameters: {'n_estimators': 500, 'n_jobs': -1, 'random_state': 42, 'max_depth': 10, 'min_samples_split': 88, 'min_samples_leaf': 40, 'criterion': 'gini', 'max_features': 1.0}. Best is trial 15 with value: 0.785717273334426.


Best trial: 15. Best value: 0.785717:  83%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                           | 25/30 [25:45<06:33, 78.74s/it]

[I 2023-10-17 15:51:34,190] Trial 24 finished with value: 0.7854691635300302 and parameters: {'n_estimators': 600, 'n_jobs': -1, 'random_state': 42, 'max_depth': 15, 'min_samples_split': 34, 'min_samples_leaf': 54, 'criterion': 'gini', 'max_features': 1.0}. Best is trial 15 with value: 0.785717273334426.


Best trial: 15. Best value: 0.785717:  87%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                     | 26/30 [26:08<04:07, 61.82s/it]

[I 2023-10-17 15:51:56,540] Trial 25 finished with value: 0.7774706538564815 and parameters: {'n_estimators': 400, 'n_jobs': -1, 'random_state': 42, 'max_depth': 4, 'min_samples_split': 14, 'min_samples_leaf': 53, 'criterion': 'gini', 'max_features': 1.0}. Best is trial 15 with value: 0.785717273334426.


Best trial: 15. Best value: 0.785717:  90%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                | 27/30 [27:32<03:25, 68.51s/it]

[I 2023-10-17 15:53:20,649] Trial 26 finished with value: 0.7844544115082686 and parameters: {'n_estimators': 500, 'n_jobs': -1, 'random_state': 42, 'max_depth': 20, 'min_samples_split': 120, 'min_samples_leaf': 10, 'criterion': 'gini', 'max_features': 1.0}. Best is trial 15 with value: 0.785717273334426.


Best trial: 15. Best value: 0.785717:  93%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏          | 28/30 [29:05<02:31, 75.92s/it]

[I 2023-10-17 15:54:53,846] Trial 27 finished with value: 0.7845802907578572 and parameters: {'n_estimators': 600, 'n_jobs': -1, 'random_state': 42, 'max_depth': 26, 'min_samples_split': 50, 'min_samples_leaf': 42, 'criterion': 'gini', 'max_features': 1.0}. Best is trial 15 with value: 0.785717273334426.


Best trial: 15. Best value: 0.785717:  97%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌     | 29/30 [29:26<00:59, 59.32s/it]

[I 2023-10-17 15:55:14,451] Trial 28 finished with value: 0.7845210716755463 and parameters: {'n_estimators': 200, 'n_jobs': -1, 'random_state': 42, 'max_depth': 8, 'min_samples_split': 92, 'min_samples_leaf': 51, 'criterion': 'gini', 'max_features': 1.0}. Best is trial 15 with value: 0.785717273334426.


Best trial: 15. Best value: 0.785717: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 30/30 [30:47<00:00, 61.58s/it]

[I 2023-10-17 15:56:35,584] Trial 29 finished with value: 0.7841462613035993 and parameters: {'n_estimators': 500, 'n_jobs': -1, 'random_state': 42, 'max_depth': 20, 'min_samples_split': 67, 'min_samples_leaf': 28, 'criterion': 'gini', 'max_features': 1.0}. Best is trial 15 with value: 0.785717273334426.





In [7]:
study.best_params, study.best_value

({'n_estimators': 500,
  'n_jobs': -1,
  'random_state': 42,
  'max_depth': 12,
  'min_samples_split': 101,
  'min_samples_leaf': 47,
  'criterion': 'gini',
  'max_features': 1.0},
 0.785717273334426)

In [8]:
pipeline = make_pipeline(partial_pipeline, ExtraTreesClassifier(**study.best_params))
pipeline.fit(X, y)

In [9]:
submission = test.copy()
submission['defects'] = pipeline.predict_proba(submission)[:, 1]
submission.defects.to_csv(OUTPUT)