In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import FunctionTransformer
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.model_selection import StratifiedKFold
import optuna

pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)

In [3]:
train = pd.read_csv(r'./input/playground-series-s3e23/train.csv', index_col = 'id')
test = pd.read_csv(r'./input/playground-series-s3e23/test.csv', index_col = 'id')
orig_train = pd.read_csv(r'./input/software-defect-prediction/jm1.csv')

In [4]:
for object_features in list(orig_train.loc[:, orig_train.dtypes == 'O']):
    orig_train[object_features] = orig_train[object_features].replace({'?' : np.nan}).astype('float64')

In [5]:
X = pd.concat([train, orig_train])
y = X.pop('defects')

seed = 42
splits = 5
skf = StratifiedKFold(n_splits = splits, random_state = seed, shuffle = True)
np.random.seed(seed)

In [6]:
log_transformer = FunctionTransformer(func=np.log1p, validate=False)

In [7]:
from sklearn.model_selection import cross_val_score

def objective(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 10, 500),
        'n_jobs': -1,
        'random_state': seed,
        'max_depth': trial.suggest_int('max_depth', 4, 50),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 150),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 60),
        'criterion': trial.suggest_categorical("criterion", ["gini", "entropy", "log_loss"]),
        'max_features': trial.suggest_categorical('max_features', ["sqrt", "log2"]),
    }

    pipeline = make_pipeline(SimpleImputer(), log_transformer, ExtraTreesClassifier(**params))
    scores = cross_val_score(pipeline, X, y, cv=3, scoring='roc_auc', n_jobs=-1)
    return scores.mean()

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=15, show_progress_bar=True)

[I 2023-10-12 16:51:34,190] A new study created in memory with name: no-name-af626da0-b2da-4284-8f4b-2d7539ec5e57
Best trial: 0. Best value: 0.775981:   7%|██████████▉                                                                                                                                                         | 1/15 [00:02<00:41,  2.93s/it]

[I 2023-10-12 16:51:37,122] Trial 0 finished with value: 0.7759806654108612 and parameters: {'n_estimators': 18, 'max_depth': 24, 'min_samples_split': 30, 'min_samples_leaf': 9, 'criterion': 'log_loss', 'max_features': 'sqrt'}. Best is trial 0 with value: 0.7759806654108612.


Best trial: 1. Best value: 0.777835:  13%|█████████████████████▊                                                                                                                                              | 2/15 [00:10<01:16,  5.89s/it]

[I 2023-10-12 16:51:45,089] Trial 1 finished with value: 0.7778353630301144 and parameters: {'n_estimators': 137, 'max_depth': 23, 'min_samples_split': 31, 'min_samples_leaf': 24, 'criterion': 'gini', 'max_features': 'sqrt'}. Best is trial 1 with value: 0.7778353630301144.


Best trial: 2. Best value: 0.777879:  20%|████████████████████████████████▊                                                                                                                                   | 3/15 [00:17<01:16,  6.37s/it]

[I 2023-10-12 16:51:52,031] Trial 2 finished with value: 0.7778792612121787 and parameters: {'n_estimators': 124, 'max_depth': 17, 'min_samples_split': 21, 'min_samples_leaf': 22, 'criterion': 'entropy', 'max_features': 'log2'}. Best is trial 2 with value: 0.7778792612121787.


Best trial: 2. Best value: 0.777879:  27%|███████████████████████████████████████████▋                                                                                                                        | 4/15 [00:28<01:28,  8.05s/it]

[I 2023-10-12 16:52:02,646] Trial 3 finished with value: 0.7695122175818098 and parameters: {'n_estimators': 481, 'max_depth': 4, 'min_samples_split': 119, 'min_samples_leaf': 30, 'criterion': 'log_loss', 'max_features': 'log2'}. Best is trial 2 with value: 0.7778792612121787.


Best trial: 2. Best value: 0.777879:  33%|██████████████████████████████████████████████████████▋                                                                                                             | 5/15 [00:30<00:58,  5.85s/it]

[I 2023-10-12 16:52:04,594] Trial 4 finished with value: 0.7769766490729174 and parameters: {'n_estimators': 36, 'max_depth': 20, 'min_samples_split': 21, 'min_samples_leaf': 38, 'criterion': 'entropy', 'max_features': 'log2'}. Best is trial 2 with value: 0.7778792612121787.


Best trial: 2. Best value: 0.777879:  40%|█████████████████████████████████████████████████████████████████▌                                                                                                  | 6/15 [00:46<01:24,  9.37s/it]

[I 2023-10-12 16:52:20,788] Trial 5 finished with value: 0.7777931117814975 and parameters: {'n_estimators': 307, 'max_depth': 41, 'min_samples_split': 12, 'min_samples_leaf': 24, 'criterion': 'log_loss', 'max_features': 'sqrt'}. Best is trial 2 with value: 0.7778792612121787.


Best trial: 6. Best value: 0.779226:  47%|████████████████████████████████████████████████████████████████████████████▌                                                                                       | 7/15 [01:05<01:39, 12.38s/it]

[I 2023-10-12 16:52:39,366] Trial 6 finished with value: 0.7792264077471254 and parameters: {'n_estimators': 315, 'max_depth': 21, 'min_samples_split': 44, 'min_samples_leaf': 7, 'criterion': 'log_loss', 'max_features': 'log2'}. Best is trial 6 with value: 0.7792264077471254.


Best trial: 6. Best value: 0.779226:  53%|███████████████████████████████████████████████████████████████████████████████████████▍                                                                            | 8/15 [01:06<01:01,  8.77s/it]

[I 2023-10-12 16:52:40,411] Trial 7 finished with value: 0.7767766926041785 and parameters: {'n_estimators': 15, 'max_depth': 35, 'min_samples_split': 114, 'min_samples_leaf': 22, 'criterion': 'entropy', 'max_features': 'sqrt'}. Best is trial 6 with value: 0.7792264077471254.


Best trial: 6. Best value: 0.779226:  60%|██████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                 | 9/15 [01:21<01:04, 10.74s/it]

[I 2023-10-12 16:52:55,474] Trial 8 finished with value: 0.7759903868103123 and parameters: {'n_estimators': 401, 'max_depth': 10, 'min_samples_split': 135, 'min_samples_leaf': 17, 'criterion': 'log_loss', 'max_features': 'log2'}. Best is trial 6 with value: 0.7792264077471254.


Best trial: 6. Best value: 0.779226:  67%|████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                      | 10/15 [01:27<00:46,  9.35s/it]

[I 2023-10-12 16:53:01,732] Trial 9 finished with value: 0.7788808703384685 and parameters: {'n_estimators': 104, 'max_depth': 32, 'min_samples_split': 65, 'min_samples_leaf': 9, 'criterion': 'log_loss', 'max_features': 'log2'}. Best is trial 6 with value: 0.7792264077471254.


Best trial: 6. Best value: 0.779226:  73%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                           | 11/15 [01:39<00:40, 10.02s/it]

[I 2023-10-12 16:53:13,251] Trial 10 finished with value: 0.7769056576660063 and parameters: {'n_estimators': 256, 'max_depth': 45, 'min_samples_split': 68, 'min_samples_leaf': 56, 'criterion': 'gini', 'max_features': 'log2'}. Best is trial 6 with value: 0.7792264077471254.


Best trial: 11. Best value: 0.7815:  80%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                | 12/15 [01:51<00:32, 10.80s/it]

[I 2023-10-12 16:53:25,843] Trial 11 finished with value: 0.7815004068611028 and parameters: {'n_estimators': 176, 'max_depth': 34, 'min_samples_split': 69, 'min_samples_leaf': 1, 'criterion': 'log_loss', 'max_features': 'log2'}. Best is trial 11 with value: 0.7815004068611028.


Best trial: 12. Best value: 0.781881:  87%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                     | 13/15 [02:13<00:28, 14.11s/it]

[I 2023-10-12 16:53:47,555] Trial 12 finished with value: 0.7818807576268298 and parameters: {'n_estimators': 329, 'max_depth': 34, 'min_samples_split': 87, 'min_samples_leaf': 1, 'criterion': 'log_loss', 'max_features': 'log2'}. Best is trial 12 with value: 0.7818807576268298.


Best trial: 12. Best value: 0.781881:  93%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏          | 14/15 [02:24<00:13, 13.23s/it]

[I 2023-10-12 16:53:58,760] Trial 13 finished with value: 0.779750927329914 and parameters: {'n_estimators': 186, 'max_depth': 50, 'min_samples_split': 92, 'min_samples_leaf': 4, 'criterion': 'log_loss', 'max_features': 'log2'}. Best is trial 12 with value: 0.7818807576268298.


Best trial: 12. Best value: 0.781881: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 15/15 [02:39<00:00, 10.64s/it]

[I 2023-10-12 16:54:13,746] Trial 14 finished with value: 0.7818459509140733 and parameters: {'n_estimators': 228, 'max_depth': 32, 'min_samples_split': 90, 'min_samples_leaf': 1, 'criterion': 'log_loss', 'max_features': 'log2'}. Best is trial 12 with value: 0.7818807576268298.





In [10]:
study.best_params, study.best_value

({'n_estimators': 329,
  'max_depth': 34,
  'min_samples_split': 87,
  'min_samples_leaf': 1,
  'criterion': 'log_loss',
  'max_features': 'log2'},
 0.7818807576268298)

In [9]:
pipeline = make_pipeline(SimpleImputer(), log_transformer, ExtraTreesClassifier(**study.best_params))
pipeline.fit(X, y)

In [11]:
submission = test.copy()
submission['defects'] = pipeline.predict_proba(submission)[:, 1]
submission.defects.to_csv('extra_tree_submission.csv')