# Test fe dataset in combination with bnum

In this notebook we will test how the two datasets fe and bnum work together to train a model. We will compare two strategies:

- train a model on the bnum dataset, add its predictions as a feature to the fe dataset, and then train the final model on that dataset
- add bnum features to fe dataset and train one model for all features

We also want to compare using 40 versus 300 columns of the bnum dataset to find out which gives us the better compromise.

## TLDR

- using predictions as a feature gives slightly worse results than adding the bnum features directly (`0.4734` vs `0.4756`), probably because a single predicted class carries less information for the final model than the underlying features
- combining the top300 bnum features produces better results than the top40 (`0.4843` vs `0.4756`)
- combining the two datasets delivered an improvement of about 1.4 percentage points over the baseline model (`0.4843` vs `0.4699`).


In [1]:
import pandas as pd
import project.bnum as bnum
import project.project_api as project_api
import utils.model_lgb as model_lgb

In [2]:
# Notebook-wide settings.
RANDOM_SEED = 42
TARGET_KEY = "target"

# Every input file sits under the same directory; the individual paths below
# are assembled from it so the location is spelled out only once.
DATA_DIR = "./data"

# fe dataset, train / test splits.
train_fe_path = f"{DATA_DIR}/train_fe"
test_fe_path = f"{DATA_DIR}/test_fe"

# bnum dataset, train / test splits.
train_bnum_path = f"{DATA_DIR}/train_bnum"
test_bnum_path = f"{DATA_DIR}/test_bnum"

# bnum selection files (JSON): the initial selection and the two
# feature subsets (top 40 / top 300) compared in this notebook.
bnum_selection_path = f"{DATA_DIR}/bnum_initial_names.json"
top40_features_path = f"{DATA_DIR}/bnum_top40_features.json"
top300_features_path = f"{DATA_DIR}/bnum_top300_features.json"

Load `fe` dataset:


In [3]:
# Read both fe splits from parquet.
fe_train = pd.read_parquet(train_fe_path)
fe_test = pd.read_parquet(test_fe_path)

# Split each frame into features (everything except the target column) and
# labels. The labels are shifted down by one because lgb requires zero-based
# classes.
X_train, y_train = fe_train.drop(columns=[TARGET_KEY]), fe_train[TARGET_KEY] - 1
X_test, y_test = fe_test.drop(columns=[TARGET_KEY]), fe_test[TARGET_KEY] - 1

# The full frames are no longer needed once X / y have been extracted.
del fe_train, fe_test

# Report the shapes of the four resulting objects, one per line.
print(
    f"Train X: {X_train.shape}",
    f"Train y: {y_train.shape}",
    f"Test X: {X_test.shape}",
    f"Test y: {y_test.shape}",
    sep="\n",
)

Train X: (146953, 815)
Train y: (146953,)
Test X: (44307, 815)
Test y: (44307,)


Load `bnum` dataset:


In [4]:
# just reuse previous dataset but select only top features
X_bnum40_train, y_bnum40_train = bnum.preprocess(
    name="bnum_initial_train",
    bnum_path=train_bnum_path,
    fe_path=train_fe_path,
    bnum_selection_path=bnum_selection_path,
    feature_selection_path=top40_features_path,
)
X_bnum40_test, y_bnum40_test = bnum.preprocess(
    name="bnum_initial_test",
    bnum_path=test_bnum_path,
    fe_path=test_fe_path,
    bnum_selection_path=bnum_selection_path,
    feature_selection_path=top40_features_path,
)

In [5]:
# Load the saved bnum top-40 model. `model_lgb.load` returns a pair; only the
# first element (a callable producing predictions) is used here.
# A load of the top-300 counterpart ("2024_06_12_bnum_top300_acc_3934") was
# left commented out at this point and is not used in this notebook.
lgb_top40_pred, _ = model_lgb.load("2024_06_12_bnum_top40_acc_351")

# Predict on the train split first, then on the test split.
# NOTE(review): if this model was fitted on X_bnum40_train, the train-split
# predictions are in-sample and may look better than they generalize — confirm.
bnum40_train_preds, bnum40_test_preds = (
    lgb_top40_pred(features) for features in (X_bnum40_train, X_bnum40_test)
)

In [6]:
# Attach the bnum top-40 model predictions to the fe features as one extra
# column, `bnum_class`, aligned on the row index.
#
# - how="left" keeps every fe row; rows without a bnum prediction get NaN.
# - validate="many_to_one" makes pandas raise MergeError if the bnum index
#   contains duplicate keys. Without it, duplicates would silently multiply
#   fe rows and leave the result misaligned with y_train / y_test.
# - fillna(-1) marks missing predictions with -1.
#   NOTE(review): fillna runs on the whole merged frame, so any NaN already
#   present in the fe features is replaced by -1 as well — confirm intended.
X_train_with_top40_preds = X_train.merge(
    pd.DataFrame(
        {"bnum_class": bnum40_train_preds},
        index=X_bnum40_train.index,
    ),
    how="left",
    left_index=True,
    right_index=True,
    validate="many_to_one",
).fillna(-1)
X_test_with_top40_preds = X_test.merge(
    pd.DataFrame(
        {"bnum_class": bnum40_test_preds},
        index=X_bnum40_test.index,
    ),
    how="left",
    left_index=True,
    right_index=True,
    validate="many_to_one",
).fillna(-1)

In [7]:
# Strategy 1: fe features plus the bnum top-40 prediction column.
#
# Result recorded from the captured run (an existing study of this name was
# reused):
#   Best score: 0.4734240639176654
#   Best hyperparameters:
#     'eta': 0.15834994984280545, 'boosting_type': 'gbdt',
#     'lambda_l1': 0.000299385503046467, 'lambda_l2': 2.458176583063629,
#     'num_leaves': 22, 'min_data_in_leaf': 50,
#     'feature_fraction': 0.9569277875988987,
#     'bagging_fraction': 0.9432542035138796, 'bagging_freq': 3
#
# NOTE(review): if train_lgb selects hyperparameters using X_test / y_test,
# the best score is an optimistic estimate — confirm.
study_name = "test_fe_with_bnum_top40_preds"

study = project_api.train_lgb(
    X_train=X_train_with_top40_preds,
    y_train=y_train,
    X_test=X_test_with_top40_preds,
    y_test=y_test,
    study_name=study_name,
)

[I 2024-07-08 17:48:34,538] Using an existing study with name 'test_fe_with_bnum_top40_preds' instead of creating a new one.


Best hyperparameters:  {'eta': 0.15834994984280545, 'boosting_type': 'gbdt', 'lambda_l1': 0.000299385503046467, 'lambda_l2': 2.458176583063629, 'num_leaves': 22, 'min_data_in_leaf': 50, 'feature_fraction': 0.9569277875988987, 'bagging_fraction': 0.9432542035138796, 'bagging_freq': 3}
Best score:  0.4734240639176654


In [8]:
# Strategy 2 input: append the bnum top-40 feature columns to the fe features,
# aligned on the row index.
#
# - how="left" keeps every fe row; rows absent from the bnum frame get NaN in
#   the bnum columns.
# - validate="many_to_one" makes pandas raise MergeError if the bnum index
#   contains duplicate keys. Without it, duplicates would silently multiply
#   fe rows and leave the result misaligned with y_train / y_test.
# - fillna(-1) replaces the resulting NaN with -1.
#   NOTE(review): fillna runs on the whole merged frame, so any NaN already
#   present in the fe features is replaced by -1 as well — confirm intended.
X_train_with_top40_features = X_train.merge(
    X_bnum40_train,
    how="left",
    left_index=True,
    right_index=True,
    validate="many_to_one",
).fillna(-1)
X_test_with_top40_features = X_test.merge(
    X_bnum40_test,
    how="left",
    left_index=True,
    right_index=True,
    validate="many_to_one",
).fillna(-1)

In [9]:
# Strategy 2: fe features plus the bnum top-40 feature columns.
#
# Result recorded from the captured run (an existing study of this name was
# reused):
#   Best score: 0.4755907644390277
#   Best hyperparameters:
#     'eta': 0.20698979175140425, 'boosting_type': 'gbdt',
#     'lambda_l1': 1.3608943424323632e-07, 'lambda_l2': 6.894263711482318e-08,
#     'num_leaves': 12, 'min_data_in_leaf': 50,
#     'feature_fraction': 0.9391108952876936,
#     'bagging_fraction': 0.8290092482261175, 'bagging_freq': 6
#
# NOTE(review): if train_lgb selects hyperparameters using X_test / y_test,
# the best score is an optimistic estimate — confirm.
study_name = "test_fe_with_bnum_top40_features"

study = project_api.train_lgb(
    X_train=X_train_with_top40_features,
    y_train=y_train,
    X_test=X_test_with_top40_features,
    y_test=y_test,
    study_name=study_name,
)

[I 2024-07-08 17:48:36,585] Using an existing study with name 'test_fe_with_bnum_top40_features' instead of creating a new one.


Best hyperparameters:  {'eta': 0.20698979175140425, 'boosting_type': 'gbdt', 'lambda_l1': 1.3608943424323632e-07, 'lambda_l2': 6.894263711482318e-08, 'num_leaves': 12, 'min_data_in_leaf': 50, 'feature_fraction': 0.9391108952876936, 'bagging_fraction': 0.8290092482261175, 'bagging_freq': 6}
Best score:  0.4755907644390277


In [10]:
# just reuse previous dataset but select only top features
X_bnum300_train, y_bnum300_train = bnum.preprocess(
    name="bnum_initial_train",
    bnum_path=train_bnum_path,
    fe_path=train_fe_path,
    bnum_selection_path=bnum_selection_path,
    feature_selection_path=top300_features_path,
)
X_bnum300_test, y_bnum300_test = bnum.preprocess(
    name="bnum_initial_test",
    bnum_path=test_bnum_path,
    fe_path=test_fe_path,
    bnum_selection_path=bnum_selection_path,
    feature_selection_path=top300_features_path,
)

X_train_with_top300_features = X_train.merge(
    X_bnum300_train,
    how="left",
    left_index=True,
    right_index=True,
).fillna(-1)
X_test_with_top300_features = X_test.merge(
    X_bnum300_test,
    how="left",
    left_index=True,
    right_index=True,
).fillna(-1)

In [11]:
# Strategy 2 with the larger selection: fe features plus the bnum top-300
# feature columns.
#
# Result recorded from the captured run (an existing study of this name was
# reused):
#   Best score: 0.484302706118672
#   Best hyperparameters:
#     'eta': 0.14136817539870614, 'boosting_type': 'gbdt',
#     'lambda_l1': 0.9466691463188805, 'lambda_l2': 6.022126036738851e-06,
#     'num_leaves': 28, 'min_data_in_leaf': 50,
#     'feature_fraction': 0.9163666848899368,
#     'bagging_fraction': 0.9234875985008758, 'bagging_freq': 4
#
# NOTE(review): if train_lgb selects hyperparameters using X_test / y_test,
# the best score is an optimistic estimate — confirm.
study_name = "test_fe_with_bnum_top300_features"

study = project_api.train_lgb(
    X_train=X_train_with_top300_features,
    y_train=y_train,
    X_test=X_test_with_top300_features,
    y_test=y_test,
    study_name=study_name,
)

[I 2024-07-08 17:48:41,058] Using an existing study with name 'test_fe_with_bnum_top300_features' instead of creating a new one.


Best hyperparameters:  {'eta': 0.14136817539870614, 'boosting_type': 'gbdt', 'lambda_l1': 0.9466691463188805, 'lambda_l2': 6.022126036738851e-06, 'num_leaves': 28, 'min_data_in_leaf': 50, 'feature_fraction': 0.9163666848899368, 'bagging_fraction': 0.9234875985008758, 'bagging_freq': 4}
Best score:  0.484302706118672


## Conclusion

We got an improvement of about 1.4 percentage points over the baseline model: `0.4843` vs `0.4699`.
