In [1]:
def topx_indexes(dataframe, nclasses):
    """Return the ``nclasses`` most frequent values of the "APP" column.

    Rows are counted per "APP" value, the counts are ordered from largest to
    smallest, and the labels of the first ``nclasses`` entries are returned.

    Args:
        dataframe: pandas DataFrame that contains an "APP" column.
        nclasses: number of leading (most frequent) labels to keep.

    Returns:
        A pandas Index holding the selected "APP" values, most frequent first.
    """
    return (
        dataframe.groupby("APP")
        .size()
        .sort_values(ascending=False)
        .head(nclasses)
        .index
    )

def QUIC_dataset(nclasses = 0):
    """Load CESNET-QUIC22 (size "XS") and return its train/val/test DataFrames.

    The dataset is read from ``~/datasets/CESNET-QUIC22/`` and initialised with
    ``AppSelection.ALL_KNOWN``, test period ``"W-2022-44"`` and a validation set
    obtained via ``ValidationApproach.SPLIT_FROM_TRAIN`` with a split fraction
    of 0.2. All three frames are requested with ``flatten_ppi=True``.

    Args:
        nclasses: when non-zero, every returned frame is restricted to the
            ``nclasses`` "APP" values that have the most rows in the training
            frame. The default ``0`` disables this filtering.

    Returns:
        Tuple ``(train_dataframe, val_dataframe, test_dataframe)``.
    """
    from cesnet_datazoo.datasets import CESNET_QUIC22
    from cesnet_datazoo.config import DatasetConfig, AppSelection, ValidationApproach

    dataset = CESNET_QUIC22("~/datasets/CESNET-QUIC22/", size="XS")
    dataset.set_dataset_config_and_initialize(
        DatasetConfig(
            dataset=dataset,
            apps_selection=AppSelection.ALL_KNOWN,
            test_period_name="W-2022-44",
            val_approach=ValidationApproach.SPLIT_FROM_TRAIN,
            train_val_split_fraction=0.2,
        )
    )

    train_dataframe = dataset.get_train_df(flatten_ppi=True)
    val_dataframe = dataset.get_val_df(flatten_ppi=True)
    test_dataframe = dataset.get_test_df(flatten_ppi=True)

    # No class limit requested: hand the frames back untouched.
    if nclasses == 0:
        return (train_dataframe, val_dataframe, test_dataframe)

    # The kept classes are chosen from the training frame only and then applied
    # to all three splits.
    topx_groups = topx_indexes(train_dataframe, nclasses)

    def only_top_apps(frame):
        return frame[frame["APP"].isin(topx_groups)]

    return (
        only_top_apps(train_dataframe),
        only_top_apps(val_dataframe),
        only_top_apps(test_dataframe),
    )

# Load all three splits; nclasses is left at its default of 0, so no top-N class filtering is applied.
train_dataframe, val_dataframe, test_dataframe = QUIC_dataset()



Loading data from dataloader


100%|██████████| 8162/8162 [00:12<00:00, 668.50it/s] 


Loading data from dataloader


100%|██████████| 192/192 [00:06<00:00, 31.67it/s]


Loading data from dataloader


100%|██████████| 957/957 [00:10<00:00, 93.41it/s] 


In [6]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import numpy as np

# Margin-based sample selection.
#
# Start from a small labelled seed, then walk the training rows batch by batch.
# For every batch the current forest scores each sample; a sample is added to
# the training pool only when the gap between its two highest class
# probabilities is below the threshold `th`. The forest is then refit on the
# enlarged pool. This is repeated for every threshold in `arr`, printing the
# final pool size and the test accuracy.

SEED = 42              # fixed so the forests (and therefore the selection) are reproducible
BATCH_SIZE = 400       # seed size and number of rows scored between refits
TEST_LIMIT = 200_000   # evaluate on at most this many test rows
FOREST_PARAMS = {"max_depth": 15, "n_jobs": -1, "random_state": SEED}

clf = RandomForestClassifier(**FOREST_PARAMS)

iters = 100_000

X = train_dataframe.drop(columns="APP").to_numpy()
y = train_dataframe["APP"].to_numpy()

X_test = test_dataframe.drop(columns="APP").to_numpy()[:TEST_LIMIT]
y_test = test_dataframe["APP"].to_numpy()[:TEST_LIMIT]

# Never read past the end of the training data when it has fewer than `iters` rows.
n_stream = min(iters, len(X))
n_seed = min(BATCH_SIZE, n_stream)

# Pre-allocated pool. Features stay float64 as before; labels keep the dtype of
# `y` so they are not silently cast to float (and non-numeric labels work).
X_chosen = np.empty((n_stream, X.shape[1]))
y_chosen = np.empty(n_stream, dtype=y.dtype)

ch_i = n_seed

X_chosen[:n_seed] = X[:n_seed]
y_chosen[:n_seed] = y[:n_seed]

# A margin needs a best and a runner-up class; fail early with a clear message
# instead of an index error deep inside the loop.
if np.unique(y_chosen[:n_seed]).size < 2:
    raise ValueError("the seed set must contain at least two classes to compute a margin")

arr = [0.1, 0.2, 0.4, 0.6]

for th in arr:
    # Every threshold restarts from the same seed; rows past `ch_i` are stale
    # leftovers of the previous threshold and get overwritten as needed.
    ch_i = n_seed
    clf = RandomForestClassifier(**FOREST_PARAMS)
    clf.fit(X_chosen[:ch_i], y_chosen[:ch_i])

    for i in range(n_seed, n_stream, BATCH_SIZE):
        stop = min(i + BATCH_SIZE, n_stream)
        batch_X = X[i:stop]
        batch_y = y[i:stop]

        probas = clf.predict_proba(batch_X)

        # After partitioning at -2 the last two columns hold the runner-up and
        # the maximum probability of every row (in that order).
        top2 = np.partition(probas, -2, axis=1)[:, -2:]
        uncertain = (top2[:, 1] - top2[:, 0]) < th

        n_new = int(uncertain.sum())
        if n_new == 0:
            # Pool unchanged: with a fixed random_state a refit would rebuild
            # the identical forest, so skip it.
            continue

        X_chosen[ch_i:ch_i + n_new] = batch_X[uncertain]
        y_chosen[ch_i:ch_i + n_new] = batch_y[uncertain]
        ch_i += n_new

        clf = RandomForestClassifier(**FOREST_PARAMS)
        clf.fit(X_chosen[:ch_i], y_chosen[:ch_i])

    predict_arr = clf.predict(X_test)
    print(ch_i)
    print(accuracy_score(y_test, predict_arr))

[0.515      0.07266667]


ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()