# Extra feature interaction for dpi dataset

In this notebook we will check whether additional feature interaction per app may improve model accuracy.

## TLDR

- tried to add more feature interaction for dpi dataset but without any significant approach
- also tried to brute force all possible combination (not included in notebook) but this approach requires to much computation power


In [1]:
import pandas as pd
import utils.model_lgb as model_lgb
import project.project_api as project_api
import project.dpi as dpi

In [2]:
RANDOM_SEED = 42
TARGET_KEY = "target"
CLASS_NUM = 5

## Logical approach

In [3]:
def load_data():
    top200_features_path = "./data/dpi_top200_features.json"
    dpi_selection_path = "./data/dpi_initial_names.json"

    # just reuse previous dataset but select only top features
    X_train, y_train = dpi.preprocess(
        name="dpi_initial_train",
        dpi_path="./data/train_dpi",
        fe_path="./data/train_fe",
        dpi_selection_path=dpi_selection_path,
        feature_selection_path=top200_features_path,
    )
    print(f"Train X: {X_train.shape}")
    print(f"Train y: {y_train.shape}")

    X_test, y_test = dpi.preprocess(
        name="dpi_initial_test",
        dpi_path="./data/test_dpi",
        fe_path="./data/test_fe",
        dpi_selection_path=dpi_selection_path,
        feature_selection_path=top200_features_path,
    )
    print(f"Test X: {X_test.shape}")
    print(f"Test y: {y_test.shape}")

    return X_train, y_train, X_test, y_test


X_train, y_train, X_test, y_test = load_data()

Train X: (119798, 200)
Train y: (119798, 1)
Test X: (36050, 200)
Test y: (36050, 1)


In [4]:
apps = set()

for col in X_train.columns:
    apps.add(col.split("_")[-1])


apps.remove("app")
apps.remove("cnt")
apps.remove("kb")
apps.remove("kbs")
apps.remove("sec")
apps.remove("sum")

apps

{'1020',
 '1091',
 '1119',
 '1125',
 '1157',
 '1288',
 '1298',
 '1301',
 '1314',
 '1328',
 '1339',
 '1345',
 '1355',
 '1365',
 '1373',
 '1394',
 '1397',
 '1399',
 '1400',
 '1414',
 '1462',
 '1465',
 '1472',
 '1479',
 '1503',
 '1523',
 '175',
 '1871',
 '1901',
 '1914',
 '1998',
 '2010',
 '2018',
 '2027',
 '240',
 '246',
 '254',
 '2542',
 '2555',
 '2558',
 '257',
 '258',
 '262',
 '267',
 '274',
 '315',
 '320',
 '381',
 '546',
 '588',
 '589',
 '590',
 '598',
 '604',
 '677',
 '690',
 '696',
 '697',
 '710',
 '716',
 '739',
 '777',
 '814',
 '815',
 '819',
 '829',
 '833',
 '834',
 '847',
 '850',
 '851',
 '868',
 '882',
 '884',
 '888',
 '897',
 '933',
 '948',
 '964',
 '992'}

In [5]:
# MAX_of_day_cnt
# SUM_of_Duration_sec
# SUM_of_Count_events
# SUM_of_Volume_kb


def create_features(df: pd.DataFrame):
    for app_id in apps:
        key_days = f"MAX_of_day_cnt_{app_id}"
        key_ses_dur = f"SUM_of_Duration_sec_{app_id}"
        key_ses_cnt = f"SUM_of_Count_events_{app_id}"
        key_vl_kb = f"SUM_of_Volume_kb_{app_id}"

        if key_days in df:
            if key_ses_dur in df:
                df[f"daily_session_dur_{app_id}"] = df[key_ses_dur] / df[key_days]
            if key_ses_cnt in df:
                df[f"daily_session_cnt_{app_id}"] = df[key_ses_cnt] / df[key_days]
            if key_vl_kb in df:
                df[f"daily_volume_kb_{app_id}"] = df[key_vl_kb] / df[key_days]

        if key_ses_cnt in df:
            if key_ses_dur in df:
                df[f"avg_session_dur_{app_id}"] = df[key_ses_dur] / df[key_ses_cnt]
            if key_vl_kb in df:
                df[f"avg_session_volume_kb_{app_id}"] = df[key_vl_kb] / df[key_ses_cnt]

        if key_ses_dur in df:
            if key_vl_kb in df:
                df[f"vl_kb_per_sec_{app_id}"] = df[key_vl_kb] / df[key_ses_dur]


create_features(X_train)
create_features(X_test)

In [6]:
X_train

Unnamed: 0_level_0,SUM_of_Volume_kb_814,SUM_of_Volume_kb_254,SUM_of_Duration_sec_814,SUM_of_Volume_kb_240,SUM_of_Count_events_814,SUM_of_Duration_sec_240,SUM_of_Duration_sec_1414,SUM_of_Count_events_240,SUM_of_Volume_kb_246,SUM_of_Count_events_677,...,avg_session_volume_kb_254,vl_kb_per_sec_254,vl_kb_per_sec_1373,avg_session_volume_kb_819,daily_session_dur_710,daily_volume_kb_710,vl_kb_per_sec_710,avg_session_dur_381,avg_session_volume_kb_381,vl_kb_per_sec_381
abon_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1525928,0.0000,74.75000,0.00000,144.125000,0.000000,103.750000,0.00000,59.968750,1.418945,71.625000,...,2.619141,1.296875,1.071289,26.953125,,,,1.856445,3.939453,2.123047
1530471,99.8125,104.75000,87.00000,67.312500,38.375000,69.750000,0.00000,33.843750,7.726562,39.406250,...,2.189453,0.985352,0.770508,,,,,1.323242,3.732422,2.820312
1541528,147.5000,221.87500,85.43750,203.875000,43.156250,160.750000,41.09375,107.562500,14.578125,97.312500,...,2.775391,1.501953,0.933594,4.093750,,,,1.190430,2.351562,1.975586
1545231,0.0000,0.00000,0.00000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,10.265625,...,,,,,,,,,,
1553098,67.9375,68.06250,37.56250,184.250000,4.210938,102.937500,0.00000,60.375000,0.000000,68.562500,...,3.091797,1.125977,1.148438,16.109375,,,,1.676758,2.882812,1.718750
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
131293833,206.5000,251.00000,110.56250,143.500000,55.000000,108.187500,192.87500,68.500000,1.036133,84.562500,...,3.388672,1.540039,1.084961,10.367188,,,,1.302734,2.375000,1.823242
131293849,173.8750,206.12500,103.81250,150.375000,52.281250,116.187500,223.37500,75.500000,19.875000,88.562500,...,2.785156,1.398438,1.019531,3.951172,,,,1.650391,2.806641,1.700195
131295221,173.3750,180.25000,92.25000,107.687500,47.250000,67.375000,121.81250,42.187500,20.531250,57.875000,...,3.607422,1.600586,0.866211,4.984375,,,,2.148438,3.050781,1.418945
131296448,244.3750,268.25000,135.75000,153.875000,100.562500,116.125000,205.12500,71.375000,73.312500,94.312500,...,2.476562,1.401367,1.071289,4.730469,,,,1.373047,2.449219,1.783203


In [7]:
def evaluate_dataset(name: str):
    print(f"X: {X_train.shape}")

    train_params = {
        "boosting_type": "gbdt",
        "eta": 0.15688979353218008,
        "num_leaves": 46,
        "min_data_in_leaf": 10,
        "feature_fraction": 0.43856302939789465,
        "bagging_fraction": 0.9440401915645933,
        "bagging_freq": 7,
        "lambda_l1": 1.7400459332852637e-07,
        "lambda_l2": 1.8508205572756913,
    }
    predict_dpi, model_dpi = model_lgb.train_multiclass(
        X_train=X_train,
        y_train=y_train,
        X_test=X_test,
        y_test=y_test,
        params=train_params,
        num_class=CLASS_NUM,
        seed=RANDOM_SEED,
        name=f"2024_06_29_experiment_{name}",
    )

    print("Train dataset:")
    project_api.report(
        y_test=y_train,
        y_pred=predict_dpi(X_train),
    )

    print("\n\nTest dataset:")
    project_api.report(
        y_test=y_test,
        y_pred=predict_dpi(X_test),
    )


evaluate_dataset("extra_dpi_features")

X: (119798, 291)
Train dataset:
Accuracy: 0.4913103724603082


Test dataset:
Accuracy: 0.4416643550624133


## Conclusion

Extra aggregations per app haven't shown any significance.