In [3]:
from cesnet_datazoo.datasets import CESNET_QUIC22
from cesnet_datazoo.config import DatasetConfig, AppSelection, ValidationApproach

In [4]:
dataset = CESNET_QUIC22("~/datasets/CESNET-QUIC22/", size="XS")

In [5]:
common_params = {
    "dataset": dataset,
    "apps_selection": AppSelection.ALL_KNOWN,
    "train_period_name": "W-2022-44",
    "val_approach": ValidationApproach.SPLIT_FROM_TRAIN,
    "train_val_split_fraction": 0.2,
    "use_packet_histograms": True,
}
dataset_config = DatasetConfig(**common_params)
dataset.set_dataset_config_and_initialize(dataset_config)
train_dataframe = dataset.get_train_df(flatten_ppi=True)
val_dataframe = dataset.get_val_df(flatten_ppi=True)
test_dataframe = dataset.get_test_df(flatten_ppi=True)

Loading data from dataloader


100%|██████████| 8162/8162 [00:08<00:00, 984.97it/s] 


Loading data from dataloader


100%|██████████| 192/192 [00:03<00:00, 49.10it/s]


Loading data from dataloader


100%|██████████| 1247/1247 [00:08<00:00, 151.31it/s]


In [6]:
print(train_dataframe.loc[0])

IPT_1             0.0
IPT_2             3.0
IPT_3             0.0
IPT_4             0.0
IPT_5           111.0
                ...  
IPT_BIN5_REV      1.0
IPT_BIN6_REV      0.0
IPT_BIN7_REV      0.0
IPT_BIN8_REV      2.0
APP              28.0
Name: 0, Length: 134, dtype: float64


In [7]:
from sklearn.ensemble import RandomForestClassifier

In [8]:
X = train_dataframe.drop(columns="APP").to_numpy()
y = train_dataframe["APP"].to_numpy()

clf = RandomForestClassifier()
clf.fit(X, y)

In [9]:
from sklearn.metrics import f1_score, accuracy_score, precision_score, \
                            recall_score, confusion_matrix, classification_report
import numpy as np

X_test = test_dataframe.drop(columns="APP").to_numpy()
y_test = test_dataframe["APP"].to_numpy()

count = 0
test_len = len(X_test)
predict_arr = clf.predict(X_test[:test_len])

print(classification_report(y_test[:test_len], predict_arr, zero_division=np.nan))

# correct / incorrect
print(f"accuracy_score: {accuracy_score(y_test[:test_len], predict_arr):.4f}")
# a metric which takes into account true positives, false negatives / poisitives
print(f"f1_score: {f1_score(y_test[:test_len], predict_arr, average='weighted'):.4f}")
# how good the model is at not labeling negative samples as positive
print(f"precision: {precision_score(y_test[:test_len], predict_arr, average='weighted', zero_division=np.nan):.4f}")
# how good the model is at finding positive samples
print(f"recall: {recall_score(y_test[:test_len], predict_arr, average='weighted', zero_division=np.nan):.4f}")
# matrix with rows of class and the columns are the amount of classifications known to be in the 
# row class but predicted to be in column group meaning the diagonal should have the highest numbers
print(f"confusion matrix: {confusion_matrix(y_test[:test_len], predict_arr)}")

              precision    recall  f1-score   support

           0       0.66      0.25      0.36       490
           1       0.82      0.66      0.73       493
           2       0.85      0.27      0.41       190
           3       0.88      0.81      0.84      5422
           4       0.68      0.64      0.66      6043
           5       1.00      1.00      1.00     25570
           6       0.89      0.39      0.54       257
           7       0.93      0.48      0.63       547
           8       0.88      0.67      0.76      1154
           9       0.75      0.19      0.30       853
          10       0.92      0.76      0.83      1789
          11       0.59      0.51      0.55      5403
          12       0.98      0.85      0.91      5190
          13       0.72      0.81      0.76     21233
          14       0.83      0.96      0.89     27161
          15       0.99      0.98      0.98      4714
          16       0.81      0.70      0.75       350
          17       0.94    

In [10]:
test_periods = ["W-2022-45", "W-2022-46", "W-2022-47"]
print("\nTesting for later weeks")
print("------------------------")
for period in test_periods:
    print(f"\nPeriod: {period}")
    dataset_config = DatasetConfig(**common_params, test_period_name=period)
    dataset.set_dataset_config_and_initialize(dataset_config)
    test_dataframe = dataset.get_test_df(flatten_ppi=True)[:test_len]

    X_test = test_dataframe.drop(columns="APP").to_numpy()
    y_test = test_dataframe["APP"].to_numpy()

    predict_arr = clf.predict(X_test)
    print(f"accuracy_score: {accuracy_score(y_test[:test_len], predict_arr):.4f}")
    
    


Testing for later weeks
------------------------

Period: W-2022-45
Loading data from dataloader


100%|██████████| 1247/1247 [00:08<00:00, 151.67it/s]


accuracy_score: 0.8406

Period: W-2022-46
Loading data from dataloader


100%|██████████| 985/985 [00:07<00:00, 138.85it/s]


accuracy_score: 0.7810

Period: W-2022-47
Loading data from dataloader


100%|██████████| 1289/1289 [00:08<00:00, 148.62it/s]


accuracy_score: 0.7744
