<a href="https://colab.research.google.com/github/Tiabet/DACON_WebClick/blob/main/WebClick_LGBM_v2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [16]:
pip install optuna

Collecting optuna
  Downloading optuna-3.6.1-py3-none-any.whl (380 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m380.1/380.1 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.13.1-py3-none-any.whl (233 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m233.4/233.4 kB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting colorlog (from optuna)
  Downloading colorlog-6.8.2-py3-none-any.whl (11 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.3.5-py3-none-any.whl (78 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.6/78.6 kB[0m [31m10.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: Mako, colorlog, alembic, optuna
Successfully installed Mako-1.3.5 alembic-1.13.1 colorlog-6.8.2 optuna-3.6.1


In [24]:
pip install -U lightgbm

Collecting lightgbm
  Downloading lightgbm-4.3.0-py3-none-manylinux_2_28_x86_64.whl (3.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m15.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: lightgbm
  Attempting uninstall: lightgbm
    Found existing installation: lightgbm 4.1.0
    Uninstalling lightgbm-4.1.0:
      Successfully uninstalled lightgbm-4.1.0
Successfully installed lightgbm-4.3.0


In [35]:
import pandas as pd
import os
import random
import numpy as np
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [33]:
train = pd.read_parquet('/content/drive/MyDrive/data/train_100000.parquet')
test = pd.read_parquet('/content/drive/MyDrive/data/test_100000.parquet')

In [None]:
train['F01'].value_counts()

In [None]:
test['F01'].value_counts()

In [36]:
# 동일한 결과 보장을 위해 Seed값을 고정합니다
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)

seed_everything(42) # Seed를 42로 고정

In [37]:
train = train.groupby('Click').apply(lambda x: x.sample(min(len(x), 5569860)))

In [38]:
train_x = train.drop('Click', axis = 1)
train_y = train['Click']

In [39]:
del train

In [40]:
# Label encode categorical columns
def label_encode_columns(train_df, test_df):
    le = LabelEncoder()
    for column in train_df.columns:
        if train_df[column].dtype == 'object' or isinstance(train_df[column].dtype, pd.CategoricalDtype):
            # Fit the LabelEncoder on the combined data to ensure consistency
            combined_data = pd.concat([train_df[column], test_df[column]], axis=0).astype(str)
            le.fit(combined_data)
            train_df[column] = le.transform(train_df[column].astype(str))
            test_df[column] = le.transform(test_df[column].astype(str))
    return train_df, test_df

# Apply label encoding to train_x and test_x
train_x, test_x = label_encode_columns(train_x, test)

In [41]:
len(train_x)==len(train_y)

True

In [42]:
# Split the dataset into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(train_x, train_y, test_size=0.2, random_state=42)

Optuna 하이퍼파라미터 튜닝

LGBMClassifier(colsample_bytree=0.6829731847990743,
               learning_rate=0.041926565437435385, max_bin=1023,
               min_child_samples=14, n_estimators=6159, n_jobs=-1,
               num_leaves=225, reg_alpha=0.00706357318094864,
               reg_lambda=0.4980866507512539, verbose=-1)

In [57]:
import optuna
import lightgbm as lgb
from lightgbm import early_stopping
from sklearn import metrics

def objective(trial):
    param = {
        'objective': 'binary',
        'verbosity': -1,
        'boosting_type': 'gbdt',
        'n_jobs':-1,
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 0.9),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1),
        'max_bin': 1023,
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 30),
        'n_estimators': trial.suggest_int('n_estimators', 3000, 8000),
        'num_leaves': trial.suggest_int('num_leaves', 100, 300),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.001, 0.1),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.1, 1.0),
    }

    clf = lgb.LGBMClassifier(**param)
    # Include eval_set and early_stopping_rounds
    clf.fit(X_train, y_train, eval_set=[(X_val, y_val)], callbacks=[early_stopping(stopping_rounds=10)])
    preds = clf.predict_proba(X_val)[:, 1]
    auc = metrics.roc_auc_score(y_val, preds)
    return auc


In [58]:
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)


[I 2024-05-27 12:25:39,669] A new study created in memory with name: no-name-a3381fe2-a8e4-4431-bfef-4c4f391621b6


Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[1168]	valid_0's binary_logloss: 0.611641


[I 2024-05-27 12:31:01,293] Trial 0 finished with value: 0.7238284782938716 and parameters: {'colsample_bytree': 0.7542950498068064, 'learning_rate': 0.05980373813541492, 'min_child_samples': 26, 'n_estimators': 6366, 'num_leaves': 138, 'reg_alpha': 0.006934983447654428, 'reg_lambda': 0.7040424711264828}. Best is trial 0 with value: 0.7238284782938716.


Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[366]	valid_0's binary_logloss: 0.612234


[I 2024-05-27 12:33:10,608] Trial 1 finished with value: 0.7230682874533236 and parameters: {'colsample_bytree': 0.534451000344625, 'learning_rate': 0.09573555229240897, 'min_child_samples': 17, 'n_estimators': 7890, 'num_leaves': 243, 'reg_alpha': 0.06924498769351944, 'reg_lambda': 0.44878418658848407}. Best is trial 0 with value: 0.7238284782938716.


Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[2206]	valid_0's binary_logloss: 0.611513


[I 2024-05-27 12:43:14,904] Trial 2 finished with value: 0.724024724775642 and parameters: {'colsample_bytree': 0.7629020005212874, 'learning_rate': 0.030519849738200946, 'min_child_samples': 9, 'n_estimators': 5959, 'num_leaves': 155, 'reg_alpha': 0.06930045775882353, 'reg_lambda': 0.9045824073499228}. Best is trial 2 with value: 0.724024724775642.


Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[1606]	valid_0's binary_logloss: 0.611803


[I 2024-05-27 12:50:15,349] Trial 3 finished with value: 0.7236515128661177 and parameters: {'colsample_bytree': 0.852711648698804, 'learning_rate': 0.04578916459688215, 'min_child_samples': 10, 'n_estimators': 6770, 'num_leaves': 123, 'reg_alpha': 0.049996530538146434, 'reg_lambda': 0.2198878813470332}. Best is trial 2 with value: 0.724024724775642.


Training until validation scores don't improve for 10 rounds


[W 2024-05-27 12:51:49,555] Trial 4 failed with parameters: {'colsample_bytree': 0.6024471871948847, 'learning_rate': 0.05097697750794569, 'min_child_samples': 27, 'n_estimators': 4799, 'num_leaves': 261, 'reg_alpha': 0.04049687105507829, 'reg_lambda': 0.15990929535025375} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/optuna/study/_optimize.py", line 196, in _run_trial
    value_or_values = func(trial)
  File "<ipython-input-57-c996bde8f01f>", line 24, in objective
    clf.fit(X_train, y_train, eval_set=[(X_val, y_val)], callbacks=[early_stopping(stopping_rounds=10)])
  File "/usr/local/lib/python3.10/dist-packages/lightgbm/sklearn.py", line 1142, in fit
    _y = self._le.transform(y)
  File "/usr/local/lib/python3.10/dist-packages/lightgbm/sklearn.py", line 842, in fit
    elif isinstance(collection, list):
  File "/usr/local/lib/python3.10/dist-packages/lightgbm/engine.py", line 276, in train
  

KeyboardInterrupt: 

In [59]:
!pip install flaml

Collecting flaml
  Downloading FLAML-2.1.2-py3-none-any.whl (296 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/296.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━[0m [32m184.3/296.7 kB[0m [31m5.5 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m296.7/296.7 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: flaml
Successfully installed flaml-2.1.2


In [60]:
from flaml import AutoML

In [61]:
# Initialize AutoML
automl = AutoML()

automl_settings = {
    "time_budget": 7200,  # Total time budget in seconds
    "metric": 'roc_auc',  # Evaluation metric
    "task": 'classification',  # Task type
    "log_file_name": 'automl.log',  # Log file
    "estimator_list": ['lgbm', 'xgboost', 'catboost', 'rf', 'extra_tree'],  # List of estimators to use
    "estimator_list": ['lgbm'],
    "eval_method": "holdout",  # Use holdout validation method
    # "split_ratio": 0.2,  # Ratio of data to be used as validation set
    "early_stop": 10
}


In [None]:
# Fit the model
automl.fit(X_train=X_train, y_train=y_train, X_val=X_val, y_val=y_val, **automl_settings)

[flaml.automl.logger: 05-27 13:56:02] {1680} INFO - task = classification
[flaml.automl.logger: 05-27 13:56:02] {1688} INFO - Data split method: stratified
[flaml.automl.logger: 05-27 13:56:02] {1691} INFO - Evaluation method: holdout
[flaml.automl.logger: 05-27 13:56:09] {1789} INFO - Minimizing error metric: 1-roc_auc
[flaml.automl.logger: 05-27 13:56:09] {1901} INFO - List of ML learners in AutoML Run: ['lgbm']
[flaml.automl.logger: 05-27 13:56:09] {2219} INFO - iteration 0, current learner lgbm
[flaml.automl.logger: 05-27 13:56:10] {2345} INFO - Estimated sufficient time budget=9194750s. Estimated necessary time budget=9195s.
[flaml.automl.logger: 05-27 13:56:10] {2392} INFO -  at 89.1s,	estimator lgbm's best error=0.3827,	best estimator lgbm's best error=0.3827
[flaml.automl.logger: 05-27 13:56:10] {2219} INFO - iteration 1, current learner lgbm
[flaml.automl.logger: 05-27 13:56:11] {2392} INFO -  at 90.2s,	estimator lgbm's best error=0.3827,	best estimator lgbm's best error=0.382

In [None]:
sample_submission = pd.read_csv('drive/MyDrive/data/sample_submission.csv')

# Predict probabilities on the test set
y_pred_test_proba = automl.predict_proba(test_x)[:, 1]

# Output the probabilities for test set
print(y_pred_test_proba)

In [None]:
sample_submission['Click'] = y_pred_test_proba
sample_submission

In [None]:
sample_submission.to_csv("drive/MyDrive/data/automl_lgbm_prediction_v2.csv",index = False)