In [1]:
from cesnet_datazoo.datasets import CESNET_QUIC22
from cesnet_datazoo.config import DatasetConfig, AppSelection, ValidationApproach

dataset = CESNET_QUIC22("~/datasets/CESNET-QUIC22/", size="XS")

common_params = {
    "dataset": dataset,
    "apps_selection": AppSelection.ALL_KNOWN,
    "train_period_name": "W-2022-44",
    "val_approach": ValidationApproach.SPLIT_FROM_TRAIN,
    "train_val_split_fraction": 0.2,
    "use_packet_histograms": True,
}
dataset_config = DatasetConfig(**common_params)
dataset.set_dataset_config_and_initialize(dataset_config)
train_dataframe = dataset.get_train_df(flatten_ppi=True)
val_dataframe = dataset.get_val_df(flatten_ppi=True)
test_dataframe = dataset.get_test_df(flatten_ppi=True)

Loading data from dataloader


100%|██████████| 8162/8162 [00:07<00:00, 1023.94it/s]


Loading data from dataloader


100%|██████████| 192/192 [00:03<00:00, 48.48it/s]


Loading data from dataloader


100%|██████████| 1247/1247 [00:08<00:00, 146.86it/s]


In [7]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

X_train = train_dataframe.drop(columns="APP").to_numpy()
y_train = train_dataframe["APP"].to_numpy()

# parameters = {
#     'min_samples_split': [10, 15, 20],
#     'min_samples_leaf': [3, 5, 8],
#     'max_leaf_nodes': [25, 40],
#     'max_features': [134]
# }

# model = DecisionTreeClassifier()

# clf = GridSearchCV(model, parameters)

# clf.fit(X_train, y_train)

# print(clf.best_estimator_)

In [38]:
import numpy as np
from scipy.stats import mode
from sklearn.metrics import f1_score, accuracy_score
from sklearn.ensemble import BaggingClassifier

X_test = test_dataframe.drop(columns="APP").to_numpy()[:10000]
y_test = test_dataframe["APP"].to_numpy()[:10000]


def create_balanced_test_data():
    grouped = test_dataframe.groupby('APP')

    X_arr = np.ndarray(shape = (10100, X_train.shape[1]))
    y_arr = np.ndarray(shape = (10100,))

    for index, i in enumerate(grouped):
        X_temp = i[1].drop(columns="APP").to_numpy()
        y_temp = i[1]["APP"].to_numpy()

        X_arr[index*100:(index*100)+100] = X_temp[:100]
        y_arr[index*100:(index*100)+100] = y_temp[:100]

    return (X_arr, y_arr)

(X_bal, y_bal) = create_balanced_test_data()

class Forest():
    def initialize(self, s_per_tree, tree_params, max_trees):
        self.max_trees = max_trees
        self.count = 0
        self.tree_params = tree_params
        self.clf_arr = []
        self.clf_eval = np.empty(self.max_trees, dtype=float)
        self.s_tree = s_per_tree
        self.last_tree = -1
        self.second_phase = False
        self.tree_iter = 0

    def get_tree_eval(self, index):
        predict_arr = self.clf_arr[index].predict(X_bal)
        return f1_score(y_bal, predict_arr, average="weighted")

    def prune_tree(self):
        min = 100
        min_i = 0
        for i, val in enumerate(self.clf_eval):
            if val < min:
                min = val
                min_i = i

        self.clf_arr[min_i] = DecisionTreeClassifier(**self.tree_params)

        return min_i

    def add(self):
        if not self.count % self.s_tree:
            self.tree_iter += 1

            if self.count != 0:
                self.clf_eval[self.last_tree] = self.get_tree_eval(self.last_tree)

            if self.last_tree + 1 >= self.max_trees or self.second_phase == True:
                self.second_phase = True
                self.last_tree = self.prune_tree()
            else:
                self.clf_arr.append(DecisionTreeClassifier(**self.tree_params))
                self.last_tree += 1

        index_from = self.tree_iter * self.s_tree
        index_to = (self.count % self.s_tree) + index_from + 1

        self.count += 1

        self.clf_arr[self.last_tree].fit(X_train[index_from:index_to], y_train[index_from:index_to])

    def eval(self):
        predictions = np.ndarray(shape = (len(self.clf_arr), len(X_test)))

        for index, tree in enumerate(self.clf_arr):
            predictions[index] = tree.predict(X_test)

        stacked_predictions = np.stack(predictions, axis=0)
        predict_arr = mode(stacked_predictions, axis=0).mode

        return f1_score(y_test, predict_arr, average="weighted")
    
params = {
    "max_features": 133,
    "max_leaf_nodes": 25,
    "min_samples_leaf": 3,
    "min_samples_split": 10
}

f = Forest()
f.initialize(100, params, 100)

for i in range(100000):
    f.add()
    # print(f"{i}")
    if i % 100 == 0:
        print(f"{i} {f.eval()}")

print(f"{i} {f.eval()}")



0 0.0007085377821393523
100 0.20030790745489713
200 0.2076075373486598
300 0.24309475780724407
400 0.25508903432100977
500 0.2578902759563432
600 0.26430205107578597
700 0.26808983832261546
800 0.30440077907572194
900 0.2981984633172763
1000 0.2937273194309538
1100 0.3016498291772848
1200 0.2932905024527232
1300 0.3007185773307761
1400 0.3087756797488525
1500 0.31297519296190907
1600 0.3225152619536853
1700 0.3210950637717273
1800 0.32398719478064764
1900 0.32527377164643917
2000 0.3307764831792211
2100 0.3292328679932558
2200 0.32767889969576036
2300 0.3324371242474167
2400 0.33516875574669874
2500 0.34348921402826144
2600 0.34170026263561415
2700 0.33722803885071073
2800 0.342692095080785
2900 0.3446022625983693
3000 0.34310768823938925
3100 0.3470640859932813
3200 0.34380733423534837
3300 0.35038079299140645
3400 0.34721306466861185
3500 0.3507774130893352
3600 0.3481191851503808
3700 0.3508943978891847
3800 0.35015957979364404
3900 0.3487094321877655
4000 0.34858687921193016
4100 0

KeyboardInterrupt: 