In [1]:
import numpy as np
import pandas as pd

In [2]:
df_train = pd.read_csv("../../data/train.csv")
df_train.drop(columns=["id"], inplace=True)
df_train["defects"].replace({True: 1, False: 0}, inplace=True)
X = df_train.drop(columns=["defects"])
y = df_train["defects"]

In [3]:
from sklearn.model_selection import train_test_split

In [4]:
import lightgbm as lgb

In [5]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score

In [6]:
def objective(trial, _X, _y):
    X_train, X_test, y_train, y_test = train_test_split(
        _X, 
        _y, 
        train_size=0.8, 
        random_state=1
    )
    d_train = lgb.Dataset(X_train, label=y_train)
    
    param = {
        "objective": "binary",
        "metric": "ROC_auc",
        "lambda_l1": trial.suggest_float("lambda_l1", 1e-8, 10.0, log=True),
        "lambda_l2": trial.suggest_float("lambda_l2", 1e-8, 10.0, log=True),
        "num_leaves": trial.suggest_int("num_leaves", 2, 256),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.4, 1.0),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.4, 1.0),
        "bagging_freq": trial.suggest_int("bagging_freq", 1, 7),
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 100)
    }
    
    gbm = lgb.train(param, d_train)
    preds = gbm.predict(X_test)
    pred_labels = np.rint(preds)
    accuracy = roc_auc_score(y_test, pred_labels)
    
    return accuracy

In [7]:
from functools import partial

In [8]:
import optuna

In [9]:
study = optuna.create_study(direction="maximize")

[I 2023-10-14 08:03:14,777] A new study created in memory with name: no-name-dc9ef266-d8bc-4037-9118-d97eaac4ee83


In [10]:
objective_function = partial(objective, _X=X, _y=y)

In [11]:
study.optimize(objective_function, n_trials=1000, n_jobs=-1)

[I 2023-10-14 08:03:16,318] Trial 5 finished with value: 0.6646522550985932 and parameters: {'lambda_l1': 2.3440644714060762e-08, 'lambda_l2': 3.931601376774572e-05, 'num_leaves': 6, 'feature_fraction': 0.8830596616662695, 'bagging_fraction': 0.4154582288543228, 'bagging_freq': 3, 'min_child_samples': 50}. Best is trial 5 with value: 0.6646522550985932.
[I 2023-10-14 08:03:16,517] Trial 1 finished with value: 0.6655488017072535 and parameters: {'lambda_l1': 1.1295103791125665, 'lambda_l2': 0.4083512058628499, 'num_leaves': 8, 'feature_fraction': 0.9661378408099235, 'bagging_fraction': 0.7213562046605133, 'bagging_freq': 5, 'min_child_samples': 80}. Best is trial 1 with value: 0.6655488017072535.
[I 2023-10-14 08:03:16,806] Trial 13 finished with value: 0.6663169726443412 and parameters: {'lambda_l1': 1.3741022286495795e-05, 'lambda_l2': 3.0222268774502662e-06, 'num_leaves': 9, 'feature_fraction': 0.8713857240407138, 'bagging_fraction': 0.5840693760417287, 'bagging_freq': 6, 'min_child_

In [12]:
len(study.trials)

1000

In [13]:
study.best_trial

FrozenTrial(number=409, state=TrialState.COMPLETE, values=[0.6689659036721459], datetime_start=datetime.datetime(2023, 10, 14, 8, 4, 56, 290783), datetime_complete=datetime.datetime(2023, 10, 14, 8, 4, 59, 75009), params={'lambda_l1': 0.23534144048125305, 'lambda_l2': 1.8923172081845285e-08, 'num_leaves': 2, 'feature_fraction': 0.7718968020503838, 'bagging_fraction': 0.5297487428232918, 'bagging_freq': 7, 'min_child_samples': 47}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'lambda_l1': FloatDistribution(high=10.0, log=True, low=1e-08, step=None), 'lambda_l2': FloatDistribution(high=10.0, log=True, low=1e-08, step=None), 'num_leaves': IntDistribution(high=256, log=False, low=2, step=1), 'feature_fraction': FloatDistribution(high=1.0, log=False, low=0.4, step=None), 'bagging_fraction': FloatDistribution(high=1.0, log=False, low=0.4, step=None), 'bagging_freq': IntDistribution(high=7, log=False, low=1, step=1), 'min_child_samples': IntDistribution(high=100, log

In [14]:
study.best_params

{'lambda_l1': 0.23534144048125305,
 'lambda_l2': 1.8923172081845285e-08,
 'num_leaves': 2,
 'feature_fraction': 0.7718968020503838,
 'bagging_fraction': 0.5297487428232918,
 'bagging_freq': 7,
 'min_child_samples': 47}

In [15]:
study.best_params

{'lambda_l1': 0.23534144048125305,
 'lambda_l2': 1.8923172081845285e-08,
 'num_leaves': 2,
 'feature_fraction': 0.7718968020503838,
 'bagging_fraction': 0.5297487428232918,
 'bagging_freq': 7,
 'min_child_samples': 47}

In [16]:
d_train = lgb.Dataset(data=X, label=y)

In [17]:
model = lgb.train(params=study.best_params, train_set=d_train)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004493 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3585
[LightGBM] [Info] Number of data points in the train set: 101763, number of used features: 21
[LightGBM] [Info] Start training from score 0.226644


In [18]:
df_test = pd.read_csv("../../data/test.csv")
id = df_test["id"]
df_test.drop(columns=["id"], inplace=True)

In [19]:
result = model.predict(df_test)

In [20]:
df_submission = pd.DataFrame(data={'id': id ,'defects': result})
df_submission.to_csv('submission.csv', index=False)

In [21]:
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score