## Combine fe, bnum and dpi datasets in a single model

In this notebook we will test how the two datasets fe and dpi work together to train a model.

After that we will combine the top features from all three datasets (`fe`, `bnum` and `dpi`) and see how the resulting model performs.


In [1]:
import pandas as pd
from utils import io
import project.bnum as bnum
import project.dpi as dpi
import project.project_api as project_api
import utils.model_lgb as model_lgb

In [2]:
# Seed forwarded to LightGBM training and to BorutaShap so results are repeatable.
RANDOM_SEED = 42
# Label column of the `fe` datasets; it is split off into y_train / y_test.
TARGET_KEY = "target"
# Number of target classes, passed to LightGBM as the multiclass class count.
CLASS_NUM = 5

# Train/test inputs for each of the three feature sources (fe, bnum, dpi).
train_fe_path = "./data/train_fe"
test_fe_path = "./data/test_fe"
train_bnum_path = "./data/train_bnum"
test_bnum_path = "./data/test_bnum"
train_dpi_path = "./data/train_dpi"
test_dpi_path = "./data/test_dpi"
# JSON feature-name lists: the `*_initial_names` files are handed to the bnum/dpi
# preprocessors as column selections, the `*_topN_features` files narrow each
# source down to its top-N features (presumably produced by earlier notebooks —
# TODO confirm).
fe_top75_features_path = "./data/fe_top75_features.json"
bnum_selection_path = "./data/bnum_initial_names.json"
bnum_top300_features_path = "./data/bnum_top300_features.json"
dpi_selection_path = "./data/dpi_initial_names.json"
dpi_top200_features_path = "./data/dpi_top200_features.json"

## Check how fe and dpi datasets perform together


Load `fe` dataset:


In [3]:
# Read both splits of the `fe` dataset before touching either of them.
fe_train_frame = pd.read_parquet(train_fe_path)
fe_test_frame = pd.read_parquet(test_fe_path)

# Train split: the target column becomes y, every other column is a feature.
# LightGBM needs class ids that start at 0, hence the shift by one.
y_train = fe_train_frame[TARGET_KEY].sub(1)
X_train = fe_train_frame.drop(columns=TARGET_KEY)
del fe_train_frame

print(f"Train X: {X_train.shape}", f"Train y: {y_train.shape}", sep="\n")

# Test split: same treatment as the train split.
y_test = fe_test_frame[TARGET_KEY].sub(1)
X_test = fe_test_frame.drop(columns=TARGET_KEY)
del fe_test_frame

print(f"Test X: {X_test.shape}", f"Test y: {y_test.shape}", sep="\n")

Train X: (146953, 815)
Train y: (146953,)
Test X: (44307, 815)
Test y: (44307,)


Load `dpi` dataset:


In [4]:
# Build the dpi feature matrices through the shared preprocessing helper, keeping
# only the columns listed in the top-200 feature file.
# NOTE(review): `name` looks like a key for a dataset prepared in an earlier
# notebook ("reuse previous dataset") — confirm in project.dpi.
X_dpi200_train, y_dpi200_train = dpi.preprocess(
    name="dpi_initial_train",
    dpi_path=train_dpi_path,
    fe_path=train_fe_path,
    dpi_selection_path=dpi_selection_path,
    feature_selection_path=dpi_top200_features_path,
)
# Labels come from the `fe` dataset (y_train), so this copy is dropped right away.
del y_dpi200_train
X_dpi200_test, y_dpi200_test = dpi.preprocess(
    name="dpi_initial_test",
    dpi_path=test_dpi_path,
    fe_path=test_fe_path,
    dpi_selection_path=dpi_selection_path,
    feature_selection_path=dpi_top200_features_path,
)
# Same for the test labels: y_test from the `fe` dataset is used instead.
del y_dpi200_test

In [5]:
# Attach the selected dpi features to the full `fe` feature matrix, matching rows
# by index. A left merge keeps every `fe` row; rows without a dpi match get NaN
# in the dpi columns.
X_train_with_dpi_features = X_train.merge(
    X_dpi200_train,
    how="left",
    left_index=True,
    right_index=True,
)
X_test_with_dpi_features = X_test.merge(
    X_dpi200_test,
    how="left",
    left_index=True,
    right_index=True,
)

# Guard: repeated index values on the dpi side would duplicate `fe` rows in the
# merge result and silently misalign the features with y_train / y_test.
assert len(X_train_with_dpi_features) == len(X_train), (
    "train merge changed the number of rows; dpi index is not unique"
)
assert len(X_test_with_dpi_features) == len(X_test), (
    "test merge changed the number of rows; dpi index is not unique"
)

In [6]:
# Run the LightGBM study on all `fe` features plus the top-200 dpi features.
# The cell output shows that an already stored study with this name is reused.
# Result recorded from that study (identical to the printed output below):
# Best hyperparameters:  {'boosting_type': 'gbdt', 'eta': 0.17850812760602092, 'num_leaves': 40, 'min_data_in_leaf': 50, 'feature_fraction': 0.7940659849414571, 'bagging_fraction': 0.8811304163286809, 'bagging_freq': 3, 'lambda_l1': 1.783324288962636, 'lambda_l2': 0.0005565598864854791}
# Best score:  0.5312704538786196
study_name = "test_fe_with_dpi_top200_features"

study = project_api.train_lgb(
    study_name=study_name,
    X_train=X_train_with_dpi_features,
    y_train=y_train,
    X_test=X_test_with_dpi_features,
    y_test=y_test,
)
# The merged frames are not used by any later cell; drop the references.
del X_train_with_dpi_features
del X_test_with_dpi_features

[I 2024-06-26 11:29:04,665] Using an existing study with name 'test_fe_with_dpi_top200_features' instead of creating a new one.


Best hyperparameters:  {'boosting_type': 'gbdt', 'eta': 0.17850812760602092, 'num_leaves': 40, 'min_data_in_leaf': 50, 'feature_fraction': 0.7940659849414571, 'bagging_fraction': 0.8811304163286809, 'bagging_freq': 3, 'lambda_l1': 1.783324288962636, 'lambda_l2': 0.0005565598864854791}
Best score:  0.5312704538786196


## Train model on top features from all datasets


In [7]:
# Narrow both `fe` matrices to the feature names stored in the top-75 JSON file.
fe_top75_features = io.read_json(fe_top75_features_path)
X_fe75_train, X_fe75_test = (
    frame[fe_top75_features] for frame in (X_train, X_test)
)

In [8]:
# Build the bnum feature matrices through the shared preprocessing helper, keeping
# only the columns listed in the top-300 feature file.
# NOTE(review): `name` looks like a key for a dataset prepared in an earlier
# notebook ("reuse previous dataset") — confirm in project.bnum.
X_bnum300_train, y_bnum300_train = bnum.preprocess(
    name="bnum_initial_train",
    bnum_path=train_bnum_path,
    fe_path=train_fe_path,
    bnum_selection_path=bnum_selection_path,
    feature_selection_path=bnum_top300_features_path,
)
# Labels come from the `fe` dataset (y_train), so this copy is dropped right away.
del y_bnum300_train
X_bnum300_test, y_bnum300_test = bnum.preprocess(
    name="bnum_initial_test",
    bnum_path=test_bnum_path,
    fe_path=test_fe_path,
    bnum_selection_path=bnum_selection_path,
    feature_selection_path=bnum_top300_features_path,
)
# Same for the test labels: y_test from the `fe` dataset is used instead.
del y_bnum300_test


In [9]:
# Build the combined matrices: top-75 fe + top-300 bnum + top-200 dpi features,
# matched on the row index. Left merges keep every `fe` row; rows without a
# bnum/dpi match get NaN in those columns.
X_all_train = X_fe75_train.merge(
    X_bnum300_train,
    how="left",
    left_index=True,
    right_index=True,
).merge(
    X_dpi200_train,
    how="left",
    left_index=True,
    right_index=True,
)
# Guard: repeated index values in bnum/dpi would duplicate rows in the merge
# result and silently misalign the features with the labels.
assert len(X_all_train) == len(y_train), (
    "train merge changed the number of rows; bnum/dpi index is not unique"
)

print(f"Train X: {X_all_train.shape}")
print(f"Train y: {y_train.shape}")

X_all_test = X_fe75_test.merge(
    X_bnum300_test,
    how="left",
    left_index=True,
    right_index=True,
).merge(
    X_dpi200_test,
    how="left",
    left_index=True,
    right_index=True,
)
assert len(X_all_test) == len(y_test), (
    "test merge changed the number of rows; bnum/dpi index is not unique"
)

print(f"Test X: {X_all_test.shape}")
print(f"Test y: {y_test.shape}")

# Only the combined matrices are used from here on; release the per-source frames.
del X_train
del X_fe75_train
del X_bnum300_train
del X_dpi200_train

del X_test
del X_fe75_test
del X_dpi200_test
del X_bnum300_test

Train X: (146953, 575)
Train y: (146953,)
Test X: (44307, 575)
Test y: (44307,)


In [10]:
# Run the LightGBM study on the combined 575-column feature matrix.
# The cell output shows that an already stored study with this name is reused.
# Result recorded from that study (identical to the printed output below):
# Best hyperparameters:  {'boosting_type': 'gbdt', 'eta': 0.1788056306137465, 'num_leaves': 46, 'min_data_in_leaf': 50, 'feature_fraction': 0.6912219810888475, 'bagging_fraction': 0.981859546023039, 'bagging_freq': 4, 'lambda_l1': 5.279524203599129e-06, 'lambda_l2': 9.244959331553632e-06}
# Best score:  0.5360552508632948
study_name = "all_in_one_baseline"

# NOTE: `study` is read later via `study.best_params` and is rebound by the
# `all_in_one_boruta*` cells, so cell order matters.
study = project_api.train_lgb(
    study_name=study_name,
    X_train=X_all_train,
    y_train=y_train,
    X_test=X_all_test,
    y_test=y_test,
)

[I 2024-06-26 11:29:11,324] Using an existing study with name 'all_in_one_baseline' instead of creating a new one.


Best hyperparameters:  {'boosting_type': 'gbdt', 'eta': 0.1788056306137465, 'num_leaves': 46, 'min_data_in_leaf': 50, 'feature_fraction': 0.6912219810888475, 'bagging_fraction': 0.981859546023039, 'bagging_freq': 4, 'lambda_l1': 5.279524203599129e-06, 'lambda_l2': 9.244959331553632e-06}
Best score:  0.5360552508632948


## Dataset combination summary

| Dataset                                                     | Accuracy | Delta  |
| ----------------------------------------------------------- | -------- | ------ |
| **baseline**: fe (814 features)                             | 46.99%   | 0      |
| bnum (2719 features)                                        | 39.67%   | -7.32% |
| dpi (2888 features)                                         | 50.39%   | 3.4%   |
| fe (814 features) + bnum (300 features)                     | 48.43%   | 1.44%  |
| fe (814 features) + dpi (200 features)                      | 53.13%   | 6.14%  |
| fe (75 features) + bnum (300 features) + dpi (200 features) | 53.61%   | 6.62%  |


## Feature selection

In [11]:
from BorutaShap import BorutaShap
from lightgbm import LGBMClassifier


def calc_boruta():
    """Run BorutaShap feature selection on the combined train set.

    Fits BorutaShap (SHAP importance, 50 trials) around a LightGBM classifier
    configured with the best hyperparameters of the `all_in_one_baseline`
    study, writes the per-feature report to
    `./data/borutashap_feature_importance.csv` and reads it back.

    Reads the notebook globals `X_all_train`, `y_train`, `CLASS_NUM` and
    `RANDOM_SEED`.

    Returns:
        pd.DataFrame: one row per feature with the columns `Features`,
        `Average Feature Importance`, `Standard Deviation Importance` and
        `Decision`.
    """
    # results_to_csv() appends ".csv" itself, so keep the stem in one place.
    report_stem = "./data/borutashap_feature_importance"

    feature_selector = BorutaShap(
        importance_measure="shap",
        classification=True,
        model=LGBMClassifier(
            **{
                # Best trial of the `all_in_one_baseline` study.
                "boosting_type": "gbdt",
                "eta": 0.1788056306137465,
                "num_leaves": 46,
                "min_data_in_leaf": 50,
                "feature_fraction": 0.6912219810888475,
                "bagging_fraction": 0.981859546023039,
                "bagging_freq": 4,
                "lambda_l1": 5.279524203599129e-06,
                "lambda_l2": 9.244959331553632e-06,
                "num_class": CLASS_NUM,
                "objective": "multiclass",
                "metric": "multi_logloss",
                "seed": RANDOM_SEED,
                "verbosity": -1,
            }
        ),
    )

    feature_selector.fit(
        # NaNs (e.g. rows without a bnum/dpi match after the left merges) are
        # replaced by -1 — presumably because BorutaShap cannot handle missing
        # values here; TODO confirm.
        X=X_all_train.fillna(-1),
        y=y_train,
        # Unfortunately train_or_test="test" does not make the library measure
        # importance on the test set
        # (https://github.com/Ekeany/Boruta-Shap/issues/126). The library also
        # performs its own train/test split internally, so our held-out test
        # set cannot be passed in.
        train_or_test="train",
        random_state=RANDOM_SEED,
        n_trials=50,
    )

    feature_selector.results_to_csv(report_stem)

    # The CSV carries the row index as an unnamed first column; read it back as
    # the index so it does not resurface as a spurious "Unnamed: 0" data column.
    return pd.read_csv(f"{report_stem}.csv", index_col=0)


# Obtain the BorutaShap report through the project's caching helper.
df_boruta = io.run_cached("./data/borutashap_feature_importance.parquet", calc_boruta)

# Split the feature names by BorutaShap's verdict.
attr_important = df_boruta.loc[
    df_boruta["Decision"] == "Accepted", "Features"
].values.tolist()
attr_tentative = df_boruta.loc[
    df_boruta["Decision"] == "Tentative", "Features"
].values.tolist()
attr_rejected = df_boruta.loc[
    df_boruta["Decision"] == "Rejected", "Features"
].values.tolist()

print(f"{len(attr_important)} attributes confirmed important: {attr_important}")
print(f"\n\n{len(attr_tentative)} attributes confirmed tentative: {attr_tentative}")
print(f"\n\n{len(attr_rejected)} attributes confirmed unimportant: {attr_rejected}")

# Show the full report as the cell's rich output.
df_boruta

227 attributes confirmed important: ['lt', 'SUM_of_Volume_kb_814', 'imei_mean_days_usage', 'SUM_of_Volume_kb_254', 'SUM_of_Duration_sec_814', 'SUM_of_Volume_kb_240', 'SUM_of_Count_events_814', 'voice_in_mts_avg_dur_mea_mnt3', 'SUM_of_Count_events_240', 'SUM_of_Duration_sec_240', 'myvf_day_usage', 'imei_mean_long_days_usage', 'imei_mean_day_announced', 'SUM_of_Volume_kb_777', 'Balance_uah', 'voice_in_fix_tar_dur_mea_mnt3', 'TM_ID', 'SUM_of_Count_events_320', 'voice_in_mts_avg_dur_min_mnt3', 'min_paym_6_month', 'imei_max_price', 'SUM_of_Duration_sec_1414', 'conn_out_uniq_cnt_max_mnt3', 'voice_in_cmpttrs_avg_durmea_mnt3', 'conn_out_uniq_cnt_mea_mnt3', 'SUM_of_Duration_sec_1503', 'all_clc_mea_mnt3', 'DNZ_DAYS_from_last_year5', 'Internet', 'SUM_of_Volume_kb_267', 'daily_session_cnt_rate_1020', 'MAX_of_day_cnt_267', 'SUM_of_Count_events_1414', 'imei_mean_price', 'SUM_of_Volume_kb_246', 'conn_in_uniq_cnt_mea_mnt3', 'tsoa_call_cnt', 'loc_market_share', 'DNZ_MAX_days_closed_loan_year5', 'block_

Unnamed: 0,Features,Average Feature Importance,Standard Deviation Importance,Decision
0,lt,19.708844,2.917435,Accepted
1,SUM_of_Volume_kb_814,13.598462,2.249856,Accepted
2,imei_mean_days_usage,8.243601,1.285403,Accepted
3,SUM_of_Volume_kb_254,7.473078,1.242717,Accepted
4,SUM_of_Duration_sec_814,5.491028,0.854473,Accepted
...,...,...,...,...
574,call_dur_380674660466,-0.235718,0.000728,Rejected
575,cnt_sms_e-wings,-0.235718,0.000728,Rejected
576,Mean_Shadow,-0.249103,0.049691,Shadow
577,Median_Shadow,-0.256503,0.046306,Shadow


In [12]:
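# Feature_Selector.plot(which_features='all')
# NOTE: `Feature_Selector` only exists inside `calc_boruta()`, so this plot cannot
# run as written; the fitted selector would have to be returned from that function.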

In [13]:
# Two candidate feature sets: accepted features only, and accepted plus tentative.
boruta_features = attr_important
boruta_with_tentative = [*attr_important, *attr_tentative]

In [14]:
def eval_model(
    study_name: str,
    X_train: pd.DataFrame,
    y_train: pd.Series,
    X_test: pd.DataFrame,
    y_test: pd.Series,
    params: dict,
    name_prefix: str = "2024_06_26_",
) -> None:
    """Train a multiclass LightGBM model and report train and test metrics.

    Args:
        study_name: Suffix of the name handed to the trainer. A falsy value
            (e.g. an empty string) passes `name=None` instead.
        X_train: Training feature matrix.
        y_train: Training labels (zero-based class ids).
        X_test: Test feature matrix.
        y_test: Test labels (zero-based class ids).
        params: LightGBM hyperparameters, e.g. `study.best_params`.
        name_prefix: Prefix put in front of `study_name` to form the model
            name; defaults to the date tag used throughout this notebook.

    Returns:
        None. The reports are produced by `project_api.report`.
    """
    model_name = f"{name_prefix}{study_name}" if study_name else None

    # Only the predict callable is needed here; the model object is unused.
    predict, _model = model_lgb.train_multiclass(
        X_train=X_train,
        y_train=y_train,
        X_test=X_test,
        y_test=y_test,
        params=params,
        num_class=CLASS_NUM,
        seed=RANDOM_SEED,
        name=model_name,
    )

    # Report on the train split first, then on the test split, so the gap
    # between the two (overfitting) is visible side by side.
    splits = (
        ("Train dataset:", X_train, y_train),
        ("\n\nTest dataset:", X_test, y_test),
    )
    for heading, features, labels in splits:
        print(heading)
        # `y_test` is the report's name for the ground-truth labels.
        project_api.report(
            y_test=labels,
            y_pred=predict(features),
        )

In [15]:
# Compare three feature sets under the same hyperparameters: all 575 columns,
# the Boruta-accepted columns, and accepted plus tentative columns.
# NOTE: `study` must still be the `all_in_one_baseline` study here. Later cells
# rebind `study` to the `all_in_one_boruta*` studies, so re-running this cell
# after them would silently use different parameters.
print("Baseline:")
eval_model(
    study_name="all_in_one_baseline_acc_5361",
    X_train=X_all_train,
    y_train=y_train,
    X_test=X_all_test,
    y_test=y_test,
    params=study.best_params,
)

print("\n\nBoruta:")
eval_model(
    study_name="all_in_one_baseline_boruta_acc_5361",
    X_train=X_all_train[boruta_features],
    y_train=y_train,
    X_test=X_all_test[boruta_features],
    y_test=y_test,
    params=study.best_params,
)

print("\n\nBoruta with tentative features:")
eval_model(
    study_name="all_in_one_baseline_boruta_w_tentative_acc_5361",
    X_train=X_all_train[boruta_with_tentative],
    y_train=y_train,
    X_test=X_all_test[boruta_with_tentative],
    y_test=y_test,
    params=study.best_params,
)

Baseline:
Train dataset:
Accuracy: 0.6828441746680911


Test dataset:
Accuracy: 0.5360552508632948


Boruta:
Train dataset:
Accuracy: 0.6791150912196416


Test dataset:
Accuracy: 0.5296454284875979


Boruta with tentative features:
Train dataset:
Accuracy: 0.6819731478772124


Test dataset:
Accuracy: 0.5310898955018395


In [16]:
# Run a separate LightGBM study restricted to the Boruta-accepted features.
# The cell output shows that an already stored study with this name is reused.
# Result recorded from that study (identical to the printed output below):
# Best hyperparameters:  {'boosting_type': 'dart', 'eta': 0.24512561327347876, 'num_leaves': 57, 'min_data_in_leaf': 65, 'feature_fraction': 0.5, 'bagging_fraction': 0.6000000000000001, 'bagging_freq': 9, 'lambda_l1': 0.0005743588371627974, 'lambda_l2': 0.8829846607784294}
# Best score:  0.5270499018213827
study_name = "all_in_one_boruta"

# NOTE: this rebinds `study`, which until now held the `all_in_one_baseline` study.
study = project_api.train_lgb(
    study_name=study_name,
    X_train=X_all_train[boruta_features],
    y_train=y_train,
    X_test=X_all_test[boruta_features],
    y_test=y_test,
)

[I 2024-06-26 11:29:28,713] Using an existing study with name 'all_in_one_boruta' instead of creating a new one.


Best hyperparameters:  {'boosting_type': 'dart', 'eta': 0.24512561327347876, 'num_leaves': 57, 'min_data_in_leaf': 65, 'feature_fraction': 0.5, 'bagging_fraction': 0.6000000000000001, 'bagging_freq': 9, 'lambda_l1': 0.0005743588371627974, 'lambda_l2': 0.8829846607784294}
Best score:  0.5270499018213827


In [17]:
# Evaluate the Boruta-accepted feature set with the hyperparameters of the most
# recently assigned `study` (the `all_in_one_boruta` study when cells run in order).
print("Boruta:")
eval_model(
    study_name="all_in_one_boruta_acc_5270",
    X_train=X_all_train[boruta_features],
    y_train=y_train,
    X_test=X_all_test[boruta_features],
    y_test=y_test,
    params=study.best_params,
)

Boruta:
Train dataset:
Accuracy: 0.6306165916993869


Test dataset:
Accuracy: 0.5263502381113594


In [18]:
# Run a separate LightGBM study on the accepted plus tentative Boruta features.
# The cell output shows that an already stored study with this name is reused.
# Result recorded from that study (identical to the printed output below):
# Best hyperparameters:  {'boosting_type': 'dart', 'eta': 0.20851366138236294, 'num_leaves': 82, 'min_data_in_leaf': 65, 'feature_fraction': 0.4, 'bagging_fraction': 1.0, 'bagging_freq': 3, 'lambda_l1': 0.0020068560603866297, 'lambda_l2': 8.288993588619931e-06}
# Best score:  0.5301871036179385
study_name = "all_in_one_boruta_tentative"

# NOTE: this rebinds `study` again, replacing the `all_in_one_boruta` study.
study = project_api.train_lgb(
    study_name=study_name,
    X_train=X_all_train[boruta_with_tentative],
    y_train=y_train,
    X_test=X_all_test[boruta_with_tentative],
    y_test=y_test,
)

[I 2024-06-26 11:29:34,036] Using an existing study with name 'all_in_one_boruta_tentative' instead of creating a new one.


Best hyperparameters:  {'boosting_type': 'dart', 'eta': 0.20851366138236294, 'num_leaves': 82, 'min_data_in_leaf': 65, 'feature_fraction': 0.4, 'bagging_fraction': 1.0, 'bagging_freq': 3, 'lambda_l1': 0.0020068560603866297, 'lambda_l2': 8.288993588619931e-06}
Best score:  0.5301871036179385


In [19]:
# Evaluate the accepted plus tentative feature set with the hyperparameters of
# the most recently assigned `study` (the `all_in_one_boruta_tentative` study
# when cells run in order).
print("Boruta with tentative features:")
eval_model(
    study_name="all_in_one_tentative_acc_5301",
    X_train=X_all_train[boruta_with_tentative],
    y_train=y_train,
    X_test=X_all_test[boruta_with_tentative],
    y_test=y_test,
    params=study.best_params,
)

Boruta with tentative features:
Train dataset:
Accuracy: 0.6635931216103108


Test dataset:
Accuracy: 0.5291037533572573


## Conclusion

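BorutaShap has shown good results. We dropped more than half of the features (575 → 227) while losing well under one percentage point of test accuracy (53.61% → 52.96% with the baseline hyperparameters). Tentative features don't have a significant impact on model accuracy, so we will drop them too. Unfortunately, feature selection hasn't helped to reduce the model's overfitting on the train set: train accuracy stays at 63–68% while test accuracy stays around 53%.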