In [None]:
import os
import numpy as np
from pathlib import Path
import pandas as pd
import optuna
import xgboost as xgb
from functools import partial
import lightgbm as lgbm
import catboost as cb
from joblib import Parallel, delayed
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import make_pipeline, make_union, FeatureUnion, Pipeline
from sklearn.preprocessing import   StandardScaler,RobustScaler, OneHotEncoder, FunctionTransformer, PowerTransformer, QuantileTransformer, \
                                    PolynomialFeatures
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import SelectKBest, RFE, RFECV, SelectFromModel
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score

In [None]:
class Config:
    """Central configuration for the notebook: data paths, seed and default model parameters.

    Class attributes:
        CONTEST_TAG: Kaggle competition slug; also the name of the data sub-directory.
        ROOT_DIR: Directory that contains the ``input/`` folder.
        FILE_NAME: Label of this experiment.
    """
    CONTEST_TAG = 'playground-series-s3e24'
    ROOT_DIR = '../'
    FILE_NAME = 'lgbm-gbdt-v1'

    def __init__(self):
        # Non-empty only when the KAGGLE_KERNEL_RUN_TYPE environment variable is set.
        self.iskaggle = os.environ.get('KAGGLE_KERNEL_RUN_TYPE', '')
        self.seed = 42
        self.n_jobs = 8
        self.splits = 5
        self.target_col = 'smoking'
        self.index_col = 'id'
        self.optuna_trials = 30

        # Kaggle and local runs share the same directory layout, so the paths are
        # set unconditionally. This guarantees that `submission_path` exists in
        # both environments.
        data_dir = Path(f'{self.ROOT_DIR}/input/{self.CONTEST_TAG}')
        self.train_path = data_dir / 'train.csv'
        self.test_path = data_dir / 'test.csv'
        self.submission_path = data_dir / 'sample_submission.csv'

        # Default keyword arguments for a random forest classifier.
        self.default_rf_params = {
            'criterion': 'gini',
            'n_jobs': self.n_jobs,
            'random_state': self.seed
        }
        # Default keyword arguments for a CatBoost classifier.
        self.default_cb_params = {
            'random_state': self.seed,
            'eval_metric': 'AUC',
            'loss_function': 'Logloss',
            'verbose': False
        }
        # Default keyword arguments for a LightGBM (gbdt) classifier.
        self.default_lgbm_gbdt_params = {
            'n_estimators': 10000,
            'random_state': self.seed,
            'n_jobs': self.n_jobs,
            'objective': 'binary',
            'verbose': -1,
            'boosting_type': 'gbdt',
            'bagging_freq': 1
        }

# Shared configuration instance read by the cells below.
CFG = Config()
# Estimator class used for feature selection, hyper-parameter search and the final fit.
CLF = lgbm.LGBMClassifier

# Baseline keyword arguments passed to CLF; tuned values are merged on top of these.
default_params = CFG.default_lgbm_gbdt_params

In [None]:
# Read the train and test CSV files, using the id column as the row index of each frame.
train_df, test_df = (
    pd.read_csv(csv_path, index_col=CFG.index_col)
    for csv_path in (CFG.train_path, CFG.test_path)
)

In [None]:
# Columns routed to the categorical branch of the preprocessor.
cat_cols = [
    'Urine protein',
    'hearing(left)',
    'hearing(right)',
    'dental caries',
]

# Columns routed to the numerical branch of the preprocessor.
num_cols = [
    'age',
    'height(cm)',
    'weight(kg)',
    'waist(cm)',
    'eyesight(left)',
    'eyesight(right)',
    'systolic',
    'relaxation',
    'fasting blood sugar',
    'Cholesterol',
    'triglyceride',
    'HDL',
    'LDL',
    'hemoglobin',
    'serum creatinine',
    'AST',
    'ALT',
    'Gtp',
]

In [None]:
# Separate the target column from the training features; the test frame is used as-is.
y_train = train_df[CFG.target_col]
X_train = train_df.drop(columns=[CFG.target_col])

X_test = test_df

In [None]:
def convert_to_categorial(X):
    """Return a copy of ``X`` in which every column is a pandas Categorical.

    The input frame is left untouched; categories are derived per column
    from the values present in that column.
    """
    converted = X.copy()
    for name, column in X.items():
        converted[name] = pd.Categorical(column)
    return converted


# Alternative views of the numeric columns. They are combined by the FeatureUnion
# below, so every numeric column contributes features from each transformer.
transformers = [
    ('robust', RobustScaler()),
    # log1p of every numeric column; generated feature names get a 'log_' prefix.
    ('log', FunctionTransformer(func=np.log1p, feature_names_out=lambda x, features: [f'log_{feature}' for feature in features])), 
    ('power', PowerTransformer()), 
    ('quant', QuantileTransformer()), 
    # Polynomial expansion with scikit-learn's default settings.
    ('poly', PolynomialFeatures())
]

# Numeric branch: a single step that applies all transformers above side by side.
numerical_transformer = Pipeline(steps=[
       ('feature_union', FeatureUnion(transformer_list=transformers))
])

# Alternative categorical encoding (one-hot), currently disabled:
# categorical_transformer = Pipeline(steps=[
#     ('1hot_encoder', OneHotEncoder())
# ])

# Categorical branch: cast each categorical column to a pandas Categorical.
# NOTE(review): the categorical dtype may not survive once ColumnTransformer stacks
# this branch with the numeric arrays — confirm the model actually sees categoricals.
categorical_transformer = Pipeline(steps=[
    ('pd_categorical', FunctionTransformer(func=convert_to_categorial))
])

# Route cat_cols and num_cols to their branches; the branches are run with n_jobs=-1.
preprocessor  = ColumnTransformer([
    ('cat', categorical_transformer, cat_cols),
    ('num', numerical_transformer, num_cols)
], n_jobs=-1)


# Keep only the features selected by SelectFromModel around a CLF built with the default parameters.
feature_selection = Pipeline(steps=[
    ('select_from_model', SelectFromModel(CLF(**default_params)))
])

# Stand-alone classifier pipeline. It is not part of `pipeline` below (the
# 'classifier' step is commented out); the step is named 'rf' although it wraps CLF.
clf = Pipeline(steps = [
    ('rf', CLF(**default_params))
])

# Preprocessing + feature selection only; the model itself is fitted separately.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('feature_selection', feature_selection),
#     ('classifier', clf)
])


# Transform data.
# Always start from the raw frames rather than from X_train / X_test: this cell
# rebinds those two names to the transformed matrices, so feeding them back in
# would make a second run of the cell operate on already-transformed data.
X_train = pipeline.fit_transform(train_df.drop(CFG.target_col, axis=1), y_train)
X_test = pipeline.transform(test_df)

In [None]:
# Report the dimensions of the transformed train and test matrices.
train_shape, test_shape = X_train.shape, X_test.shape
print(f'X_train shape: {train_shape}')
print(f'X_test shape: {test_shape}')

In [8]:
def _rows_at(data, positions):
    """Return the rows of ``data`` found at the given integer positions.

    pandas objects are indexed through ``.iloc`` so that integer row labels
    (for example an ``id`` index) are never mistaken for positions.
    """
    return data.iloc[positions] if hasattr(data, 'iloc') else data[positions]


def objective(trial: optuna.Trial, X, y):
    """Optuna objective: mean ROC AUC of CLF over a stratified K-fold split.

    Args:
        trial: Optuna trial used to sample the hyper-parameters and to report
            intermediate scores for pruning.
        X: Feature matrix (array-like or pandas object); rows are selected by position.
        y: Binary target (array-like or pandas object); rows are selected by position.

    Returns:
        The mean of the per-fold ROC AUC scores.
    """
    optuna_params = {
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
        "num_leaves": trial.suggest_int("num_leaves", 20, 3000, step=20),
        "max_depth": trial.suggest_int("max_depth", 3, 12),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 200, 10000, step=100),
        "max_bin": trial.suggest_int("max_bin", 200, 300),
        "lambda_l1": trial.suggest_int("lambda_l1", 0, 100, step=5),
        "lambda_l2": trial.suggest_int("lambda_l2", 0, 100, step=5),
        "min_gain_to_split": trial.suggest_float("min_gain_to_split", 0, 15),
        "bagging_fraction": trial.suggest_float(
            "bagging_fraction", 0.2, 0.95, step=0.1
        ),
        "feature_fraction": trial.suggest_float(
            "feature_fraction", 0.2, 0.95, step=0.1
        ),
    }
    # Sampled values take precedence over the shared defaults.
    params = {**default_params, **optuna_params}

    cv = StratifiedKFold(n_splits=CFG.splits, shuffle=True, random_state=1121218)
    # Size the score buffer from the splitter so it always matches the fold count.
    cv_scores = np.empty(cv.get_n_splits())
    for idx, (train_idx, test_idx) in enumerate(cv.split(X, y)):
        # The splitter yields positions, so select rows positionally. Plain
        # `y[train_idx]` on a Series with an integer index is a label lookup.
        X_tr, X_te = _rows_at(X, train_idx), _rows_at(X, test_idx)
        y_tr, y_te = _rows_at(y, train_idx), _rows_at(y, test_idx)

        model = CLF(**params)
        # NOTE(review): the pruning callback is attached in every fold, so one trial
        # reports the same boosting steps once per fold — confirm this is intended.
        model.fit(
            X_tr,
            y_tr,
            eval_set=[(X_te, y_te)],
            eval_metric="auc",
            callbacks=[
                lgbm.early_stopping(100),
                optuna.integration.LightGBMPruningCallback(trial, "auc")
            ]
        )
        preds = model.predict_proba(X_te)[:, 1]
        cv_scores[idx] = roc_auc_score(y_te, preds)
    return np.mean(cv_scores)

# Seed the sampler so that a fresh run proposes the same sequence of trials.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=CFG.seed),
)
# Trials run one at a time: every LightGBM fit already uses CFG.n_jobs threads,
# and running trials concurrently would make the trial order (and therefore the
# sampler's suggestions) depend on thread scheduling.
study.optimize(
    partial(objective, X=X_train, y=y_train),
    n_trials=CFG.optuna_trials,
    show_progress_bar=True,
    n_jobs=1,
)



Early stopping, best iteration is:
[6373]	valid_0's auc: 0.864738	valid_0's binary_logloss: 0.446127




Training until validation scores don't improve for 100 rounds




Early stopping, best iteration is:
[8109]	valid_0's auc: 0.856284	valid_0's binary_logloss: 0.456585
[I 2023-11-09 19:12:14,318] Trial 2 finished with value: 0.8599538830015405 and parameters: {'learning_rate': 0.026557121954838586, 'num_leaves': 280, 'max_depth': 7, 'min_data_in_leaf': 6000, 'max_bin': 209, 'lambda_l1': 100, 'lambda_l2': 90, 'min_gain_to_split': 0.7685838995507621, 'bagging_fraction': 0.7, 'feature_fraction': 0.6000000000000001}. Best is trial 9 with value: 0.8633359382973204.


In [9]:
# Start from the shared defaults and let the best trial's values override them.
params = dict(default_params)
params.update(study.best_params)
print(params)

{'n_estimators': 10000, 'random_state': 42, 'n_jobs': 8, 'objective': 'binary', 'verbose': -1, 'boosting_type': 'gbdt', 'bagging_freq': 1, 'learning_rate': 0.031115894639831713, 'num_leaves': 1000, 'max_depth': 9, 'min_data_in_leaf': 2900, 'max_bin': 256, 'lambda_l1': 0, 'lambda_l2': 0, 'min_gain_to_split': 2.1168688361634964, 'bagging_fraction': 0.6000000000000001, 'feature_fraction': 0.9}


In [10]:
# Final model: a CLF built from the merged parameters and fitted on the transformed
# training matrix. From here on `pipeline` names this classifier, not the
# preprocessing Pipeline defined earlier.
pipeline = CLF(**params)
# NOTE(review): no eval_set or early-stopping callback is passed here, unlike the
# cross-validation fits — confirm that training all n_estimators is intended.
pipeline.fit(X_train, y_train)

In [11]:
# Predicted probability of the positive class for every test row.
predictions = pipeline.predict_proba(X_test)[:, 1]
sample_submission = pd.read_csv(CFG.submission_path, index_col=CFG.index_col)

# Attach the test ids to the predictions and let pandas align them with the
# submission template by id, instead of relying on both files sharing row order.
assert len(predictions) == len(sample_submission), 'prediction count differs from the submission template'
sample_submission[CFG.target_col] = pd.Series(predictions, index=test_df.index)
assert sample_submission[CFG.target_col].notna().all(), 'submission ids do not match the test ids'

sample_submission.to_csv('submission.csv')

In [12]:
# Upload submission.csv through the Kaggle CLI; {CFG.CONTEST_TAG} is interpolated by IPython.
# NOTE(review): the message says "v3" while Config.FILE_NAME is 'lgbm-gbdt-v1' — confirm the label.
!kaggle competitions submit -c {CFG.CONTEST_TAG} -f submission.csv -m "LGBM gdbt v3"

100%|██████████████████████████████████████| 2.69M/2.69M [00:01<00:00, 2.23MB/s]
Successfully submitted to Binary Prediction of Smoker Status using Bio-Signals