## Train baseline model

We start with a quick-and-dirty model trained without any additional data preparation, just to see what score we get out of the box. LightGBM is used for the baseline because tree-based models free us from feature preparation and scaling. Optuna is used to optimize the hyperparameters.

In [48]:
import os

import lightgbm as lgb
import numpy as np
import optuna
import pandas as pd
from sklearn.metrics import accuracy_score

In [49]:
# Seed shared by the Optuna sampler and the LightGBM booster.
RANDOM_SEED = 42
# Name of the label column in the feature-engineered frames.
TARGET_KEY = "target"

# setup db access
# Each setting can be overridden through an environment variable so that real
# credentials do not have to be written into the notebook; the literals below
# are only fallbacks that keep the previous behaviour when nothing is set.
db_user = os.environ.get("OPTUNA_DB_USER", "postgres-user")
db_password = os.environ.get("OPTUNA_DB_PASSWORD", r"...")
# db_host="2.tcp.eu.ngrok.io:10902"
db_host = os.environ.get("OPTUNA_DB_HOST", "10.0.1.56:5432")
db_name = os.environ.get("OPTUNA_DB_NAME", "optuna_nn_project")
# NOTE(review): user and password are interpolated into the URL verbatim. If
# they contain URL-reserved characters (e.g. "@", "/", ":") they presumably
# need percent-encoding first — confirm against the SQLAlchemy URL rules.
STORAGE = (
    # f"sqlite:///optuna_studies.db"
    f"postgresql://{db_user}:{db_password}@{db_host}/{db_name}"
)

In [50]:
# Feature-engineered train / test splits, stored as parquet under ./data.
# The trailing `.sample(...)` notes are kept as a switch for quick runs on a
# random subsample.
df_train_fe = pd.read_parquet(path="./data/train_fe")  # .sample(n=30000)
df_test_fe = pd.read_parquet(path="./data/test_fe")  # .sample(n=30000)

# max_cols is set high so that the per-column summary is printed for this
# very wide frame.
df_train_fe.info(max_cols=900)

<class 'pandas.core.frame.DataFrame'>
Index: 146953 entries, 1525928 to 132580172
Data columns (total 816 columns):
 #    Column                            Non-Null Count   Dtype  
---   ------                            --------------   -----  
 0    Ama_rchrgmnt_sum_max_mnt1         146914 non-null  float16
 1    content_clc_mea_mnt1              146914 non-null  float16
 2    content_cnt_max_mnt1              146914 non-null  float16
 3    voice_out_short_part_max_mnt1     146916 non-null  float16
 4    voice_mts_in_nrest_part_std_mnt1  146916 non-null  float16
 5    num_act_days_max_mnt1             146916 non-null  float16
 6    sms_roam_clc_min_mnt1             146914 non-null  float16
 7    voice_in_cmpttrs_avg_durmin_mnt1  146916 non-null  float16
 8    com_num_part_mea_mnt1             146914 non-null  float16
 9    pay_avg_mea_mnt1                  146916 non-null  float16
 10   voice_out_tar_dur_std_mnt1        146902 non-null  float16
 11   voice_out_tar_dur_min_mnt1       

In [51]:
# Features: every column except the target.
X_train, X_test = (
    frame.drop(columns=[TARGET_KEY]) for frame in (df_train_fe, df_test_fe)
)
# Labels: lgb requires class to be zero-based, so the stored target is shifted
# down by one (presumably it is 1-based in the parquet files — TODO confirm).
y_train, y_test = (
    frame[TARGET_KEY] - 1 for frame in (df_train_fe, df_test_fe)
)

In [52]:
def optimize_with_max_trials(
    study: "optuna.study.Study",
    objective: callable,
    n_trials: int,
    states: tuple[optuna.trial.TrialState, ...] = (optuna.trial.TrialState.COMPLETE,),
    callbacks=None,
    # rest optuna options
    **kwargs,
):
    """
    Optimize ``study`` until it holds ``n_trials`` trials in one of ``states``.

    By default ``n_trials`` in ``study.optimize`` is a per-worker budget, which
    causes problems when several processes work on the same study:
    - you have to know exactly how many workers there will be to pick the
      correct value
    - a worker that reaches its own ``n_trials`` early sits idle although it
      could still do useful work
    - if the process is restarted, the count starts from scratch and does not
      account for trials that finished earlier

    Here the trials already stored in the study are counted first, and a
    ``MaxTrialsCallback`` stops the optimization once the study-wide count is
    reached.

    Source: https://github.com/optuna/optuna/issues/1883#issuecomment-702688136

    Args:
        study: Study to optimize.
        objective: Objective function passed to ``study.optimize``.
        n_trials: Target number of trials (in ``states``) for the whole study.
        states: Trial states that count towards ``n_trials``.
        callbacks: Optional extra callbacks for ``study.optimize``. The given
            sequence is not modified.
        **kwargs: Any other ``study.optimize`` options.

    Returns:
        None. Returns immediately, without optimizing, when the study already
        holds at least ``n_trials`` trials in ``states``.
    """
    n_finished = len(study.get_trials(deepcopy=False, states=states))
    if n_finished >= n_trials:
        return

    # Count the same states as the check above; otherwise the callback would
    # fall back to its own default and disagree with ``states``.
    stop_at_budget = optuna.study.MaxTrialsCallback(n_trials, states=states)

    study.optimize(
        objective,
        n_trials=n_trials,
        # Build a fresh list: appending to the argument would mutate the
        # caller's list (or a shared default) and pile up one extra
        # MaxTrialsCallback per call.
        callbacks=[*(callbacks or ()), stop_at_budget],
        **kwargs,
    )


def run_optuna(
    study_name: str,
    objective: callable,
    storage: str = None,
    n_trials=100,
    direction="minimize",
    seed: int = None,
):
    """
    Create (or reopen) a stored Optuna study, optimize it and report the best trial.

    Args:
        study_name: Name of the study inside the storage; an existing study
            with this name is loaded instead of raising.
        objective: Objective function handed to the optimizer.
        storage: Database URL for the RDB storage. A local SQLite file
            ``optuna_studies.db`` is used when this is empty or None.
        n_trials: Trial budget forwarded to ``optimize_with_max_trials``.
        direction: Optimization direction, ``"minimize"`` or ``"maximize"``.
        seed: Seed for the TPE sampler.

    Returns:
        The study object after optimization.
    """
    storage_url = storage or "sqlite:///optuna_studies.db"

    tpe_sampler = optuna.samplers.TPESampler(seed=seed)
    rdb_storage = optuna.storages.RDBStorage(
        storage_url,
        {
            # handle disconnections on google colab
            # https://github.com/optuna/optuna/issues/622
            "pool_pre_ping": True
        },
    )

    study = optuna.create_study(
        storage=rdb_storage,
        sampler=tpe_sampler,
        study_name=study_name,
        direction=direction,
        load_if_exists=True,
    )
    optimize_with_max_trials(study, objective, n_trials=n_trials)

    # Report the winning configuration and its score.
    print("Best hyperparameters: ", study.best_params)
    print("Best score: ", study.best_value)

    return study

In [53]:
def objective(
    trial,
    X_train: pd.DataFrame,
    y_train: pd.DataFrame,
    X_test: pd.DataFrame,
    y_test: pd.DataFrame,
    seed: int = None,
):
    """
    Optuna objective: train one multiclass LightGBM booster and return its accuracy.

    Args:
        trial: Optuna trial used to sample the hyperparameters.
        X_train: Training features.
        y_train: Training labels (the notebook passes the zero-based target column).
        X_test: Features used both as the early-stopping validation set and
            for the final score.
        y_test: Labels matching ``X_test``.
        seed: Value stored under the ``"seed"`` key of the LightGBM parameters.

    Returns:
        Accuracy of the arg-max class predictions on ``X_test``.

    NOTE(review): the same split drives early stopping, hyperparameter
    selection and the reported score, so the score is presumably optimistic —
    consider a separate validation split.
    """
    fixed_params = {
        "objective": "multiclass",
        "num_class": 5,
        "metric": "multi_logloss",
        "verbosity": -1,
    }
    # The order of the suggest_* calls is kept stable on purpose: a seeded
    # sampler draws values in call order.
    # NOTE(review): LightGBM documents `eta` as an alias of `learning_rate`
    # and `min_child_samples` as an alias of `min_data_in_leaf`; with both of
    # the latter set, only one presumably takes effect — confirm.
    sampled_params = {
        "eta": trial.suggest_float("eta", 1e-8, 1.0, log=True),
        "boosting_type": trial.suggest_categorical("boosting_type", ["gbdt", "dart"]),
        "lambda_l1": trial.suggest_float("lambda_l1", 1e-8, 10.0, log=True),
        "lambda_l2": trial.suggest_float("lambda_l2", 1e-8, 10.0, log=True),
        "num_leaves": trial.suggest_int("num_leaves", 2, 50),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 0, 50, step=10),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.4, 1.0),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.4, 1.0),
        "bagging_freq": trial.suggest_int("bagging_freq", 1, 7),
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
    }
    booster_params = {**fixed_params, **sampled_params, "seed": seed}

    train_set = lgb.Dataset(X_train, y_train)
    valid_set = lgb.Dataset(X_test, y_test)
    # Stop once the validation metric has not improved for 10 rounds.
    stop_early = lgb.early_stopping(stopping_rounds=10, verbose=0)

    booster = lgb.train(
        booster_params,
        train_set,
        valid_sets=[valid_set],
        callbacks=[stop_early],
    )

    # One row of class scores per sample; the predicted class is the arg-max.
    class_scores = booster.predict(X_test, num_iteration=booster.best_iteration)
    predicted_class = np.argmax(class_scores, axis=1)

    return accuracy_score(y_test, predicted_class)


# Results recorded from earlier runs, one entry per Optuna study name.
# Only the last entry is active; point `study_name` at another name to resume
# that study instead.
#
# "baseline_lgb": 10k random sample without stratification
# Best hyperparameters:  {'eta': 0.2946716146643395, 'boosting_type': 'gbdt', 'lambda_l1': 9.397345510495862, 'lambda_l2': 1.0215006016609774e-07, 'num_leaves': 6, 'min_data_in_leaf': 30, 'feature_fraction': 0.6779327568245341, 'bagging_fraction': 0.8997515432347486, 'bagging_freq': 5, 'min_child_samples': 34}
# Best score:  0.4394
#
# "baseline_lgb_30k": 30k random sample without stratification
# Best hyperparameters:  {'eta': 0.09723214967428116, 'boosting_type': 'gbdt', 'lambda_l1': 1.9100406735143505, 'lambda_l2': 3.5369799631250935e-07, 'num_leaves': 22, 'min_data_in_leaf': 20, 'feature_fraction': 0.4009733972022196, 'bagging_fraction': 0.8462707615712436, 'bagging_freq': 3, 'min_child_samples': 96}
# Best score:  0.45436666666666664
#
# "baseline_lgb_full": full dataset
# Best hyperparameters:  {'eta': 0.19997109376050565, 'boosting_type': 'gbdt', 'lambda_l1': 5.735491139313952e-07, 'lambda_l2': 3.1791646476628225e-06, 'num_leaves': 20, 'min_data_in_leaf': 30, 'feature_fraction': 0.8166299199026185, 'bagging_fraction': 0.9867527250605056, 'bagging_freq': 7, 'min_child_samples': 27}
# Best score:  0.46992574536754916
study_name = "baseline_lgb_full"

In [54]:
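# Uncomment to delete the stored study, so that the next run creates it anew
# instead of resuming the existing trials.
# optuna.delete_study(
#     study_name=study_name,
#     storage=STORAGE,
# )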

In [55]:
def _baseline_objective(trial):
    """Bind the train/test split and the seed so Optuna only supplies the trial."""
    return objective(
        trial,
        X_train=X_train,
        y_train=y_train,
        X_test=X_test,
        y_test=y_test,
        seed=RANDOM_SEED,
    )


# The objective returns accuracy, hence the "maximize" direction.
study = run_optuna(
    study_name,
    _baseline_objective,
    storage=STORAGE,
    n_trials=100,
    direction="maximize",
    seed=RANDOM_SEED,
)

[I 2024-05-31 01:16:00,810] A new study created in RDB with name: baseline_lgb_full
[I 2024-05-31 01:16:49,226] Trial 0 finished with value: 0.26639131514207687 and parameters: {'eta': 9.915644566638385e-06, 'boosting_type': 'gbdt', 'lambda_l1': 0.0024430162614261413, 'lambda_l2': 2.5361081166471375e-07, 'num_leaves': 9, 'min_data_in_leaf': 0, 'feature_fraction': 0.9197056874649611, 'bagging_fraction': 0.7606690070459252, 'bagging_freq': 5, 'min_child_samples': 6}. Best is trial 0 with value: 0.26639131514207687.
[I 2024-05-31 01:17:08,722] Trial 1 finished with value: 0.4467691335454894 and parameters: {'eta': 0.574485163632042, 'boosting_type': 'gbdt', 'lambda_l1': 4.329370014459266e-07, 'lambda_l2': 4.4734294104626844e-07, 'num_leaves': 16, 'min_data_in_leaf': 30, 'feature_fraction': 0.6591670111852694, 'bagging_fraction': 0.5747374841188252, 'bagging_freq': 5, 'min_child_samples': 18}. Best is trial 1 with value: 0.4467691335454894.
[I 2024-05-31 01:18:07,356] Trial 2 finished with

Best hyperparameters:  {'eta': 0.19997109376050565, 'boosting_type': 'gbdt', 'lambda_l1': 5.735491139313952e-07, 'lambda_l2': 3.1791646476628225e-06, 'num_leaves': 20, 'min_data_in_leaf': 30, 'feature_fraction': 0.8166299199026185, 'bagging_fraction': 0.9867527250605056, 'bagging_freq': 7, 'min_child_samples': 27}
Best score:  0.46992574536754916
