In [1]:
import numpy as np
import pandas as pd

In [2]:
df_train = pd.read_csv("../../data/train.csv")
df_train.drop(columns=["id"], inplace=True)
df_train["defects"].replace({True: 1, False: 0}, inplace=True)
X = df_train.drop(columns=["defects"])
y = df_train["defects"]

In [3]:
from sklearn.model_selection import train_test_split

In [4]:
from sklearn.ensemble import AdaBoostClassifier

{"lambda_l1": 7.764448658123637,
 "lambda_l2": 0.044259970070964404,
 "num_leaves": 138,
 "feature_fraction": 0.5200523998976143,
 "bagging_fraction": 0.6350022368129746,
 "bagging_freq": 1,
 "min_child_samples": 79}

In [5]:
from sklearn.metrics import roc_auc_score

In [6]:
def objective(trial, _X, _y):
    X_train, X_test, y_train, y_test = train_test_split(_X, _y, train_size=0.8, random_state=1)
    
    param = {
        "n_estimators": trial.suggest_int("n_estimators", 2, 200), 
        "learning_rate": trial.suggest_float("learning_rate", 1e-8, 10.0)
    }
    
    model = AdaBoostClassifier(**param).fit(X_train, y_train)
    preds = model.predict(X_test)
    pred_labels = np.rint(preds)
    return roc_auc_score(y_test, pred_labels)

In [7]:
from functools import partial

In [8]:
import optuna

In [9]:
study = optuna.create_study(direction="maximize")

[I 2023-10-14 08:18:00,021] A new study created in memory with name: no-name-f4d671b2-aff4-4d02-97b6-bab0a9e88782


In [10]:
objective_function = partial(objective, _X=X, _y=y)

In [11]:
study.optimize(objective_function, n_trials=1000, n_jobs=-1)

[I 2023-10-14 08:18:46,117] Trial 10 finished with value: 0.29249090410237943 and parameters: {'n_estimators': 2, 'learning_rate': 3.9781622453503527}. Best is trial 10 with value: 0.29249090410237943.
[I 2023-10-14 08:18:46,159] Trial 5 finished with value: 0.7075090958976206 and parameters: {'n_estimators': 2, 'learning_rate': 1.7104680346906391}. Best is trial 5 with value: 0.7075090958976206.
[I 2023-10-14 08:18:50,255] Trial 1 finished with value: 0.7075090958976206 and parameters: {'n_estimators': 23, 'learning_rate': 2.8083176411914383}. Best is trial 5 with value: 0.7075090958976206.
[I 2023-10-14 08:18:54,900] Trial 13 finished with value: 0.7075090958976206 and parameters: {'n_estimators': 46, 'learning_rate': 5.96643476278888}. Best is trial 5 with value: 0.7075090958976206.
[I 2023-10-14 08:18:57,059] Trial 9 finished with value: 0.7075090958976206 and parameters: {'n_estimators': 59, 'learning_rate': 6.358574051815053}. Best is trial 5 with value: 0.7075090958976206.
[I 20

In [13]:
study.best_trial

FrozenTrial(number=5, state=TrialState.COMPLETE, values=[0.7075090958976206], datetime_start=datetime.datetime(2023, 10, 14, 8, 18, 45, 364467), datetime_complete=datetime.datetime(2023, 10, 14, 8, 18, 46, 158472), params={'n_estimators': 2, 'learning_rate': 1.7104680346906391}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'n_estimators': IntDistribution(high=200, log=False, low=2, step=1), 'learning_rate': FloatDistribution(high=10.0, log=False, low=1e-08, step=None)}, trial_id=5, value=None)

In [14]:
study.best_params

{'n_estimators': 2, 'learning_rate': 1.7104680346906391}

In [15]:
model = AdaBoostClassifier(**study.best_params)

In [17]:
model.fit(X=X, y=y)

In [18]:
df_test = pd.read_csv("../../data/test.csv")
id = df_test["id"]
df_test.drop(columns=["id"], inplace=True)

In [21]:
result = model.predict_proba(df_test)

In [24]:
df_submission = pd.DataFrame(data={'id': id ,'defects': result[:, 1]})
df_submission.to_csv('submission.csv', index=False)