In [1]:
from pathlib import Path
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import FunctionTransformer, PowerTransformer
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.model_selection import StratifiedKFold
import optuna

# Lift pandas' display limits on column count and column width so that
# displayed frames are not truncated.
for display_option in ('display.max_columns', 'display.max_colwidth'):
    pd.set_option(display_option, None)

# Resolve where the competition data (`path`) and the original dataset
# (`orig_path`) live. A non-empty KAGGLE_KERNEL_RUN_TYPE environment variable
# selects the `../input` directories; otherwise local directories are used and
# any missing one is downloaded.
iskaggle = os.environ.get('KAGGLE_KERNEL_RUN_TYPE', '')
if iskaggle:
    path = Path('../input/playground-series-s3e23')
    orig_path = Path('../input/software-defect-prediction')
else:
    path = Path('playground-series-s3e23')
    orig_path = Path('software-defect-prediction')
    if not (path.exists() and orig_path.exists()):
        # Imported lazily so the notebook does not need the kaggle client
        # when both directories are already on disk.
        import zipfile

        import kaggle
    if not path.exists():
        kaggle.api.competition_download_cli(str(path))
        # Context manager closes the archive handle after extraction.
        with zipfile.ZipFile(f'{path}.zip') as archive:
            archive.extractall(path)
    if not orig_path.exists():
        # The original data is a Kaggle *dataset*, not a competition, so it
        # needs the dataset endpoint (the previous code re-downloaded `path`).
        # NOTE(review): owner slug 'semustafacevik' is assumed — confirm it
        # matches the dataset attached to the Kaggle version of this notebook.
        kaggle.api.dataset_download_cli(
            f'semustafacevik/{orig_path}', path=str(orig_path), unzip=True
        )

In [2]:
# The competition files are indexed by their `id` column.
train, test = (
    pd.read_csv(path / f'{split_name}.csv', index_col='id')
    for split_name in ('train', 'test')
)
# The original dataset is read without an explicit index column.
orig_train = pd.read_csv(orig_path / 'jm1.csv')

In [3]:
# Columns parsed with object dtype use '?' as a placeholder: turn those
# placeholders into NaN and cast the column to float64.
object_columns = orig_train.columns[orig_train.dtypes == 'O']
for column_name in object_columns:
    without_placeholders = orig_train[column_name].replace({'?' : np.nan})
    orig_train[column_name] = without_placeholders.astype('float64')

In [4]:
# Shared settings: random seed and number of cross-validation folds.
seed = 42
splits = 5
np.random.seed(seed)

# Stack the competition rows on top of the original dataset, then split the
# `defects` target column off into `y` (pop removes it from `X`).
X = pd.concat([train, orig_train], axis=0)
y = X.pop('defects')

In [5]:
# Preprocessing steps reused by every model pipeline in this notebook.
partial_pipeline = make_pipeline(
    SimpleImputer(),                                     # fill missing values (default settings)
    FunctionTransformer(func=np.log1p, validate=False),  # element-wise log(1 + x)
    PowerTransformer(),                                  # power transform (default settings)
)

In [6]:
from sklearn.model_selection import cross_val_score

def objective(trial):
    """Score one Optuna trial with cross-validated ROC AUC.

    Samples ExtraTreesClassifier hyperparameters from ``trial``, appends the
    classifier to ``partial_pipeline`` and evaluates the result on ``X``/``y``.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the hyperparameters.

    Returns
    -------
    float
        Mean ROC AUC over the ``splits`` cross-validation folds.
    """
    params = {
        # `step` is passed by keyword; passing it positionally is deprecated
        # in recent Optuna releases.
        'n_estimators': trial.suggest_int('n_estimators', 100, 600, step=100),
        # Single-choice categoricals: the values are fixed, but registering
        # them with the trial keeps them in `study.best_params`, which is
        # later unpacked straight into ExtraTreesClassifier.
        'n_jobs': trial.suggest_categorical('n_jobs', [-1]),
        'random_state': trial.suggest_categorical('random_state', [seed]),
        'max_depth': trial.suggest_int('max_depth', 4, 50),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 150),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 60),
        'criterion': trial.suggest_categorical("criterion", ["gini"]),
        'max_features': trial.suggest_categorical('max_features', ["sqrt", "log2"]),
    }

    pipeline = make_pipeline(partial_pipeline, ExtraTreesClassifier(**params))
    # X is the competition data followed by the original dataset, so folds
    # taken in row order would not mix the two sources evenly. Shuffle with a
    # fixed seed so every trial is scored on the same, well-mixed folds.
    cv = StratifiedKFold(n_splits=splits, shuffle=True, random_state=seed)
    scores = cross_val_score(pipeline, X, y, cv=cv, scoring='roc_auc', n_jobs=-1)
    return scores.mean()

# Seed the sampler explicitly: np.random.seed() does not control Optuna's
# sampler, so without this the sequence of suggested hyperparameters (and
# therefore the best trial) changes from run to run.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=seed),
)
study.optimize(objective, n_trials=30, show_progress_bar=True)

[I 2023-10-15 00:35:56,925] A new study created in memory with name: no-name-9fbc0ecd-3fe0-4d8a-8a40-12d7dc23dadc


  0%|          | 0/30 [00:00<?, ?it/s]

[W 2023-10-15 00:35:56,942] Trial 0 failed with parameters: {'n_estimators': 300, 'n_jobs': -1, 'random_state': 42, 'max_depth': 48, 'min_samples_split': 65, 'min_samples_leaf': 31, 'criterion': 'gini', 'max_features': 'log2'} because of the following error: NameError("name 'split' is not defined").
Traceback (most recent call last):
  File "/opt/conda/lib/python3.10/site-packages/optuna/study/_optimize.py", line 200, in _run_trial
    value_or_values = func(trial)
  File "/tmp/ipykernel_32/2101311118.py", line 16, in objective
    scores = cross_val_score(pipeline, X, y, cv=split, scoring='roc_auc', n_jobs=-1)
NameError: name 'split' is not defined
[W 2023-10-15 00:35:56,943] Trial 0 failed with value None.


NameError: name 'split' is not defined

In [None]:
# Display the hyperparameters and objective value of the best trial.
(study.best_trial.params, study.best_trial.value)

In [None]:
# Rebuild the classifier from the best trial's parameters and fit the full
# pipeline on all of the training data.
best_classifier = ExtraTreesClassifier(**study.best_params)
pipeline = make_pipeline(partial_pipeline, best_classifier)
pipeline.fit(X, y)

In [None]:
# Second predict_proba column = probability of the class listed second in
# the fitted classifier's classes.
defect_probability = pipeline.predict_proba(test)[:, 1]
# `assign` returns a new frame, so `test` itself is left untouched.
submission = test.assign(defects=defect_probability)
# Only the index and the `defects` column are written out.
submission['defects'].to_csv('extra_tree_submission_v4.csv')