In [1]:
import numpy as np
import pandas as pd

In [2]:
df_train = pd.read_csv("../../data/train.csv")
df_train.drop(columns=["id"], inplace=True)
df_train["defects"].replace({True: 1, False: 0}, inplace=True)
X = df_train.drop(columns=["defects"])
y = df_train["defects"]

In [3]:
from sklearn.model_selection import train_test_split

In [4]:
from xgboost import XGBClassifier

In [5]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score

In [6]:
from functools import partial

In [7]:
import optuna

In [8]:
def objective(trial, _X, _y):
    X_train, X_test, y_train, y_test = train_test_split(
        _X, 
        _y, 
        train_size=0.8, 
        random_state=1
    )
    
    param = {
        "learning_rate": trial.suggest_float("learning_rate", 1e-8, 1.0),
        "gamma": trial.suggest_int("gamma", 1, 1000),
        "max_depth": trial.suggest_int("max_depth", 2, 500),
        "min_child_weight": trial.suggest_int("min_child_weight", 2, 1000),
        "max_delta_step": trial.suggest_int("max_delta_step", 2, 100),
        "subsample": trial.suggest_float("subsample", 1e-8, 1.0),
        "lambda": trial.suggest_float("lambda", 1e-8, 10.0, log=True),
        "alpha": trial.suggest_float("alpha", 1e-8, 10.0, log=True),
        "max_leaves": trial.suggest_int("max_leaves", 2, 1000),
        "eval_metric": trial.suggest_categorical("eval_metric", ["auc"]), 
        "feature_selector": trial.suggest_categorical("feature_selector", ["cyclic", "shuffle", "random", "greedy", "thrifty"]), 
        "tree_method": trial.suggest_categorical("tree_method", ["exact", "approx", "hist"]), 
        "n_estimators": trial.suggest_int("n_estimators", 2, 256), 
        "n_jobs": -1
    }
    
    xgb = XGBClassifier(**param).fit(X_train, y_train)
    preds = xgb.predict(X_test)
    pred_labels = np.rint(preds)
    accuracy = roc_auc_score(y_test, pred_labels)
    
    return accuracy

In [9]:
study = optuna.create_study(direction="maximize")

[I 2023-10-14 09:33:53,872] A new study created in memory with name: no-name-e1413257-2e18-45c5-8bcd-0c8c1666e9a5


In [10]:
objective_function = partial(objective, _X=X, _y=y)


        
        "colsample_bytree": trial.suggest_float("colsample_bytree", 1e-8, 10.0, log=True),
        "objective": "binary",
        "tree_method": "hist",
        "early_stopping_rounds": 256,

In [11]:
study.optimize(objective_function, n_trials=1000, n_jobs=-1)

  if is_sparse(dtype):
  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_sparse(dtype):
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_sparse(dtype):
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_cate

In [12]:
len(study.trials)

1000

In [13]:
study.best_trial

FrozenTrial(number=711, state=TrialState.COMPLETE, values=[0.7193485827129803], datetime_start=datetime.datetime(2023, 10, 14, 9, 40, 55, 689558), datetime_complete=datetime.datetime(2023, 10, 14, 9, 41, 0, 541564), params={'learning_rate': 0.9010174049691081, 'gamma': 849, 'max_depth': 296, 'min_child_weight': 409, 'max_delta_step': 14, 'subsample': 0.0987448171749641, 'lambda': 1.1056176464420966e-07, 'alpha': 3.803997821347391, 'max_leaves': 487, 'eval_metric': 'auc', 'feature_selector': 'shuffle', 'tree_method': 'hist', 'n_estimators': 141}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'learning_rate': FloatDistribution(high=1.0, log=False, low=1e-08, step=None), 'gamma': IntDistribution(high=1000, log=False, low=1, step=1), 'max_depth': IntDistribution(high=500, log=False, low=2, step=1), 'min_child_weight': IntDistribution(high=1000, log=False, low=2, step=1), 'max_delta_step': IntDistribution(high=100, log=False, low=2, step=1), 'subsample': FloatDistri

In [14]:
study.best_params

{'learning_rate': 0.9010174049691081,
 'gamma': 849,
 'max_depth': 296,
 'min_child_weight': 409,
 'max_delta_step': 14,
 'subsample': 0.0987448171749641,
 'lambda': 1.1056176464420966e-07,
 'alpha': 3.803997821347391,
 'max_leaves': 487,
 'eval_metric': 'auc',
 'feature_selector': 'shuffle',
 'tree_method': 'hist',
 'n_estimators': 141}

In [15]:
model = XGBClassifier(**study.best_params).fit(X, y)

  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(data):
Parameters: { "feature_selector" } are not used.



In [16]:
df_test = pd.read_csv("../../data/test.csv")
id = df_test["id"]
df_test.drop(columns=["id"], inplace=True)

In [17]:
result = model.predict_proba(df_test)

  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)


In [18]:
df_submission = pd.DataFrame(data={'id': id ,'defects': result[:, 1]})
df_submission.to_csv('submission.csv', index=False)