## Combine fe, bnum and dpi datasets in a single model

In this notebook we will test how the two datasets fe and dpi work together to train a model.

After that we will combine all three datasets (fe, bnum and dpi) to see how they perform in a single model.


In [1]:
import pandas as pd
from utils import io
import project.bnum as bnum
import project.dpi as dpi
import project.project_api as project_api
import utils.model_lgb as model_lgb

In [2]:
# Global settings. TARGET_KEY is the label column of the fe parquet files.
# NOTE(review): RANDOM_SEED is not referenced by the cells visible here.
RANDOM_SEED = 42
TARGET_KEY = "target"

# fe: train/test parquet datasets and the top-75 feature list.
train_fe_path = "./data/train_fe"
test_fe_path = "./data/test_fe"
fe_top75_features_path = "./data/fe_top75_features.json"

# bnum: train/test inputs, initial name selection and the top-300 feature list.
train_bnum_path = "./data/train_bnum"
test_bnum_path = "./data/test_bnum"
bnum_selection_path = "./data/bnum_initial_names.json"
bnum_top300_features_path = "./data/bnum_top300_features.json"

# dpi: train/test inputs, initial name selection and the top-200 feature list.
train_dpi_path = "./data/train_dpi"
test_dpi_path = "./data/test_dpi"
dpi_selection_path = "./data/dpi_initial_names.json"
dpi_top200_features_path = "./data/dpi_top200_features.json"

## Check how fe and dpi datasets perform together


Load `fe` dataset:


In [3]:
def _load_features_and_target(parquet_path):
    """Read a parquet dataset and split it into (features, zero-based labels)."""
    frame = pd.read_parquet(parquet_path)
    # lgb requires zero-based classes, so the stored labels are shifted by one
    labels = frame[TARGET_KEY] - 1
    features = frame.drop(columns=[TARGET_KEY])
    # `frame` goes out of scope on return, releasing the full dataframe
    return features, labels


X_train, y_train = _load_features_and_target(train_fe_path)

print(f"Train X: {X_train.shape}")
print(f"Train y: {y_train.shape}")

X_test, y_test = _load_features_and_target(test_fe_path)

print(f"Test X: {X_test.shape}")
print(f"Test y: {y_test.shape}")

Train X: (146953, 815)
Train y: (146953,)
Test X: (44307, 815)
Test y: (44307,)


Load `dpi` dataset:


In [4]:
# Reuse the previously prepared dpi dataset, keeping only the top features.
# Train and test share the same selection files; only the dataset name and
# the input paths differ between the two calls.
dpi_top200_selection = {
    "dpi_selection_path": dpi_selection_path,
    "feature_selection_path": dpi_top200_features_path,
}

X_dpi200_train, y_dpi200_train = dpi.preprocess(
    name="dpi_initial_train",
    dpi_path=train_dpi_path,
    fe_path=train_fe_path,
    **dpi_top200_selection,
)
X_dpi200_test, y_dpi200_test = dpi.preprocess(
    name="dpi_initial_test",
    dpi_path=test_dpi_path,
    fe_path=test_fe_path,
    **dpi_top200_selection,
)

In [5]:
# Attach the top-200 dpi features to the fe features, aligning rows on the
# dataframe index. A left join keeps every fe row; fe rows without a dpi match
# get NaN in the dpi columns.
X_train_with_dpi_features = X_train.merge(
    X_dpi200_train,
    how="left",
    left_index=True,
    right_index=True,
)
X_test_with_dpi_features = X_test.merge(
    X_dpi200_test,
    how="left",
    left_index=True,
    right_index=True,
)

# A left join never drops rows, but a non-unique index on the dpi side would
# silently duplicate fe rows and misalign the features with the labels.
# Fail fast here instead of training on shifted targets.
assert len(X_train_with_dpi_features) == len(y_train), (
    "fe+dpi train merge changed the row count: "
    f"{len(X_train_with_dpi_features)} rows vs {len(y_train)} labels"
)
assert len(X_test_with_dpi_features) == len(y_test), (
    "fe+dpi test merge changed the row count: "
    f"{len(X_test_with_dpi_features)} rows vs {len(y_test)} labels"
)

In [6]:
# Result recorded from a previous run of this study:
#   Best score:  0.5312704538786196
#   Best hyperparameters:  {'boosting_type': 'gbdt', 'eta': 0.17850812760602092, 'num_leaves': 40, 'min_data_in_leaf': 50, 'feature_fraction': 0.7940659849414571, 'bagging_fraction': 0.8811304163286809, 'bagging_freq': 3, 'lambda_l1': 1.783324288962636, 'lambda_l2': 0.0005565598864854791}
study_name = "test_fe_with_dpi_top200_features"

# fe features extended with the top-200 dpi features, plus the fe labels.
fe_dpi_data = dict(
    X_train=X_train_with_dpi_features,
    y_train=y_train,
    X_test=X_test_with_dpi_features,
    y_test=y_test,
)

study = project_api.train_lgb(study_name=study_name, **fe_dpi_data)

[I 2024-06-24 20:23:28,596] Using an existing study with name 'test_fe_with_dpi_top200_features' instead of creating a new one.


Best hyperparameters:  {'boosting_type': 'gbdt', 'eta': 0.17850812760602092, 'num_leaves': 40, 'min_data_in_leaf': 50, 'feature_fraction': 0.7940659849414571, 'bagging_fraction': 0.8811304163286809, 'bagging_freq': 3, 'lambda_l1': 1.783324288962636, 'lambda_l2': 0.0005565598864854791}
Best score:  0.5312704538786196


## Train model on top features from all datasets


In [7]:
# Keep only the fe columns listed in the top-75 feature file, for both splits.
fe_top75_features = io.read_json(fe_top75_features_path)
X_fe75_train, X_fe75_test = (
    fe_split[fe_top75_features] for fe_split in (X_train, X_test)
)

In [8]:
# Reuse the previously prepared bnum dataset, keeping only the top features.
# Train and test share the same selection files; only the dataset name and
# the input paths differ between the two calls.
bnum_top300_selection = {
    "bnum_selection_path": bnum_selection_path,
    "feature_selection_path": bnum_top300_features_path,
}

X_bnum300_train, y_bnum300_train = bnum.preprocess(
    name="bnum_initial_train",
    bnum_path=train_bnum_path,
    fe_path=train_fe_path,
    **bnum_top300_selection,
)
X_bnum300_test, y_bnum300_test = bnum.preprocess(
    name="bnum_initial_test",
    bnum_path=test_bnum_path,
    fe_path=test_fe_path,
    **bnum_top300_selection,
)

In [9]:
# Every join below is a left join on the dataframe index: the fe rows are kept
# and the bnum / dpi columns are attached to them.
index_left_join = {"how": "left", "left_index": True, "right_index": True}

# Train: fe top-75, then bnum top-300, then dpi top-200.
X_all_train = X_fe75_train.merge(X_bnum300_train, **index_left_join)
X_all_train = X_all_train.merge(X_dpi200_train, **index_left_join)

print(f"Train X: {X_all_train.shape}")
print(f"Train y: {y_train.shape}")

# Test: same sequence of joins on the test splits.
X_all_test = X_fe75_test.merge(X_bnum300_test, **index_left_join)
X_all_test = X_all_test.merge(X_dpi200_test, **index_left_join)

print(f"Test X: {X_all_test.shape}")
print(f"Test y: {y_test.shape}")

Train X: (146953, 575)
Train y: (146953,)
Test X: (44307, 575)
Test y: (44307,)


In [10]:
# Train on the combined top features of fe, bnum and dpi with the fe labels.
study_name = "all_in_one_baseline"

all_in_one_data = dict(
    X_train=X_all_train,
    y_train=y_train,
    X_test=X_all_test,
    y_test=y_test,
)

study = project_api.train_lgb(study_name=study_name, **all_in_one_data)

[I 2024-06-24 20:23:33,382] Using an existing study with name 'all_in_one_baseline' instead of creating a new one.
[I 2024-06-24 20:23:34,483] Trial 5 pruned. 
[I 2024-06-24 20:23:35,303] Trial 6 pruned. 
[I 2024-06-24 20:23:36,077] Trial 7 pruned. 
[I 2024-06-24 20:23:37,750] Trial 8 pruned. 
[I 2024-06-24 20:36:00,675] Trial 9 finished with value: 0.26639131514207687 and parameters: {'boosting_type': 'dart', 'eta': 1.8841183049085085e-08, 'num_leaves': 46, 'min_data_in_leaf': 10, 'feature_fraction': 0.7975133706123891, 'bagging_fraction': 0.5870266456536466, 'bagging_freq': 4, 'lambda_l1': 0.0008325158565947976, 'lambda_l2': 4.609885087947832e-07}. Best is trial 1 with value: 0.4680073126142596.
[I 2024-06-24 20:36:25,287] Trial 10 finished with value: 0.5071884803755614 and parameters: {'boosting_type': 'gbdt', 'eta': 0.32808889626606236, 'num_leaves': 45, 'min_data_in_leaf': 30, 'feature_fraction': 0.9531245410138701, 'bagging_fraction': 0.4530955012311517, 'bagging_freq': 2, 'lamb

Best hyperparameters:  {'boosting_type': 'gbdt', 'eta': 0.1788056306137465, 'num_leaves': 46, 'min_data_in_leaf': 50, 'feature_fraction': 0.6912219810888475, 'bagging_fraction': 0.981859546023039, 'bagging_freq': 4, 'lambda_l1': 5.279524203599129e-06, 'lambda_l2': 9.244959331553632e-06}
Best score:  0.5360552508632948


## Dataset combination summary

| Dataset                                                     | Accuracy | Delta  |
| ----------------------------------------------------------- | -------- | ------ |
| **baseline**: fe (814 features)                             | 46.99%   | 0      |
| bnum (2719 features)                                        | 39.67%   | -7.32% |
| dpi (2888 features)                                        | 50.39%   | 3.40%  |
| fe (814 features) + bnum (300 features)                     | 48.43%   | 1.44%  |
| fe (814 features) + dpi (200 features)                      | 53.13%   | 6.14%  |
| fe (75 features) + bnum (300 features) + dpi (200 features) | 53.61%   | 6.62%  |
