In [125]:
from cesnet_datazoo.datasets import CESNET_QUIC22
from cesnet_datazoo.config import DatasetConfig, AppSelection, ValidationApproach

# Open the CESNET-QUIC22 dataset stored under the home directory, size "XS".
dataset = CESNET_QUIC22("~/datasets/CESNET-QUIC22/", size="XS")

# Keyword arguments forwarded verbatim to DatasetConfig below.
common_params = dict(
    dataset=dataset,
    apps_selection=AppSelection.ALL_KNOWN,
    train_period_name="W-2022-44",
    val_approach=ValidationApproach.SPLIT_FROM_TRAIN,
    train_val_split_fraction=0.2,
    use_packet_histograms=True,
)

dataset_config = DatasetConfig(**common_params)
dataset.set_dataset_config_and_initialize(dataset_config)

# Materialise the three splits as dataframes (requested with flatten_ppi=True).
train_dataframe = dataset.get_train_df(flatten_ppi=True)
val_dataframe = dataset.get_val_df(flatten_ppi=True)
test_dataframe = dataset.get_test_df(flatten_ppi=True)

Loading data from dataloader


100%|██████████| 8162/8162 [00:14<00:00, 563.64it/s] 


Loading data from dataloader


100%|██████████| 192/192 [00:05<00:00, 32.58it/s]


Loading data from dataloader


100%|██████████| 1247/1247 [00:12<00:00, 96.22it/s] 


In [126]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score
from numpy.random import Generator
import numpy as np
import random as rd

# Split the training frame into one sub-frame per distinct APP value.
grouped = train_dataframe.groupby('APP')

# X_arr[k] holds the feature rows of the k-th group (APP column removed);
# y_arr[k] holds the matching APP labels. Both follow the groupby iteration order.
X_arr = [app_frame.drop(columns="APP").to_numpy() for _, app_frame in grouped]
y_arr = [app_frame["APP"].to_numpy() for _, app_frame in grouped]

def create_training_data(length, index, class_arrays=None, rng=None):
    """Build a shuffled one-vs-rest training set for the class at ``index``.

    Up to ``int(length / 2)`` rows of the target class (fewer if the class is
    smaller) are drawn at random without replacement and labelled 1. The
    remaining rows are labelled 0: for each of them a different class is picked
    uniformly at random and one of its rows is picked uniformly at random
    (with replacement).

    Args:
        length: Total number of rows in the returned set.
        index: Position of the target class inside ``class_arrays``.
        class_arrays: Sequence of 2-D arrays, one per class, all with the same
            number of columns. Defaults to the module-level ``X_arr``.
        rng: Optional ``numpy.random.Generator``; pass a seeded one for
            reproducible output. A fresh unseeded generator is used otherwise.

    Returns:
        Tuple ``(X, y)``: ``X`` is a float array of shape
        ``(length, n_features)`` and ``y`` a float array of shape ``(length,)``
        holding 1 for target-class rows and 0 otherwise, shuffled consistently.

    Raises:
        ValueError: If negative rows are needed but no other non-empty class
            exists to draw them from.
    """
    if class_arrays is None:
        class_arrays = X_arr  # legacy behaviour: use the notebook-level per-class arrays
    if rng is None:
        rng = np.random.default_rng()

    positives = class_arrays[index]
    # Take the feature width from the data instead of assuming 133 columns.
    n_features = positives.shape[1]
    count = min(positives.shape[0], int(length / 2))

    X = np.empty((length, n_features))
    y = np.zeros(length)

    # Use the random subset (the original drew one but then copied the first rows).
    chosen_rows = rng.choice(positives.shape[0], size=count, replace=False)
    X[:count] = positives[chosen_rows]
    y[:count] = 1

    n_negative = length - count
    if n_negative > 0:
        # Candidate classes come from the data, not from a fixed 0..100 range.
        other_classes = [
            c for c in range(len(class_arrays))
            if c != index and class_arrays[c].shape[0] > 0
        ]
        if not other_classes:
            raise ValueError("no other non-empty class available to draw negative samples from")

        picked_classes = rng.choice(other_classes, size=n_negative)
        for row, c in enumerate(picked_classes, start=count):
            pool = class_arrays[int(c)]
            X[row] = pool[rng.integers(pool.shape[0])]

    # Shuffle features and labels with the same permutation so they stay aligned.
    order = rng.permutation(length)
    return (X[order], y[order])

def create_50_training_data(length, index, nclass, class_arrays=None, rng=None):
    """Build a shuffled one-vs-rest set with ``length`` rows from every class.

    The result has ``length * nclass`` rows: ``length`` rows of the target
    class labelled 1, followed (before shuffling) by one block of ``length``
    randomly drawn rows for each of the other ``nclass - 1`` classes, all
    labelled 0.

    Target rows are drawn without replacement when the class has at least
    ``length`` rows and with replacement otherwise. Negative rows are always
    drawn with replacement.

    Args:
        length: Number of rows contributed by each class.
        index: Position of the target class; must satisfy ``0 <= index < nclass``.
        nclass: Number of leading classes of ``class_arrays`` to use.
        class_arrays: Sequence of 2-D arrays, one per class, all with the same
            number of columns. Defaults to the module-level ``X_arr``.
        rng: Optional ``numpy.random.Generator``; pass a seeded one for
            reproducible output. A fresh unseeded generator is used otherwise.

    Returns:
        Tuple ``(X, y)``: ``X`` is a float array of shape
        ``(length * nclass, n_features)`` and ``y`` a float array of 1/0 labels
        of shape ``(length * nclass,)``, shuffled consistently.

    Raises:
        ValueError: If ``index`` is outside ``range(nclass)``.
    """
    if class_arrays is None:
        class_arrays = X_arr  # legacy behaviour: use the notebook-level per-class arrays
    if rng is None:
        rng = np.random.default_rng()
    if not 0 <= index < nclass:
        raise ValueError("index must be in range(nclass)")

    length_total = length * nclass
    positives = class_arrays[index]
    # Take the feature width from the data instead of assuming 133 columns.
    n_features = positives.shape[1]

    X = np.empty((length_total, n_features))
    y = np.zeros(length_total)

    # Positive block; fall back to sampling with replacement for small classes.
    n_available = positives.shape[0]
    chosen_rows = rng.choice(n_available, size=length, replace=n_available < length)
    X[:length] = positives[chosen_rows]
    y[:length] = 1

    # One dedicated block per remaining class. The original rewrote the whole
    # negative range for every class, so only the last class survived.
    offset = length
    for c in range(nclass):
        if c == index:
            continue
        pool = class_arrays[c]
        X[offset:offset + length] = pool[rng.integers(pool.shape[0], size=length)]
        offset += length

    # Shuffle features and labels with the same permutation so they stay aligned.
    order = rng.permutation(length_total)
    return (X[order], y[order])

def get_predict(clf, target_class=None, features=None, labels=None, limit=100000):
    """Predict on the first ``limit`` evaluation rows and build binary targets.

    Args:
        clf: Fitted estimator exposing ``predict``.
        target_class: Label value treated as the positive class. When omitted,
            the module-level loop variable ``i`` is used (legacy behaviour).
        features: Evaluation feature matrix. Defaults to the module-level
            ``X_test``.
        labels: Evaluation labels aligned with ``features``. Defaults to the
            module-level ``y_test``.
        limit: Maximum number of leading rows to evaluate; ``None`` uses all
            rows. Shorter inputs are used in full instead of raising.

    Returns:
        Tuple ``(predict_arr, y_binary)``: the estimator's predictions for the
        evaluated rows, and a float array that is 1 where the label equals
        ``target_class`` and 0 elsewhere.
    """
    if target_class is None:
        # Legacy fallback: the calling cell's loop counter.
        # NOTE(review): this compares labels against a group position, which
        # presumes APP labels coincide with their groupby position -- confirm.
        target_class = i
    if features is None:
        features = X_test
    if labels is None:
        labels = y_test

    # Slicing clamps to the available rows, unlike indexing up to a fixed 100000.
    labels = np.asarray(labels)[:limit]
    y_binary = (labels == target_class).astype(np.float64)

    predict_arr = clf.predict(features[:limit])

    return (predict_arr, y_binary)

# Held-out features and labels; get_predict() reads these module-level names.
X_test = test_dataframe.drop(columns="APP").to_numpy()
y_test = test_dataframe["APP"].to_numpy()
clf_arr = []

print(len(X_arr))

# Fit one binary random forest per APP group and report how it scores on the test rows.
# The loop variable has to stay named `i`: get_predict() reads the global `i`
# to decide which label counts as positive.
for i, _group_rows in enumerate(X_arr):
    clf = RandomForestClassifier()
    clf_bal = RandomForestClassifier()  # NOTE(review): never fitted or used in this cell
    X, y = create_training_data(10000, i)
    clf.fit(X, y)

    predict_arr, y_binary = get_predict(clf)

    accuracy = accuracy_score(y_binary, predict_arr)
    print(f"accuracy: {accuracy:.4f}")
    precision = precision_score(y_binary, predict_arr)
    print(f"precision_score: {precision:.4f}\n")

    clf_arr.append(clf)


101
accuracy: 0.9999
precision_score: 0.2857

accuracy: 0.9998
precision_score: 0.5926

accuracy: 0.9999
precision_score: 0.2500

accuracy: 0.9947
precision_score: 0.2021

accuracy: 0.9774
precision_score: 0.0535

accuracy: 1.0000
precision_score: 0.9972

accuracy: 0.9998
precision_score: 0.3182

accuracy: 0.9995
precision_score: 0.1395

accuracy: 0.9997
precision_score: 0.3261

accuracy: 0.9928
precision_score: 0.0223

accuracy: 0.9986
precision_score: 0.3575

accuracy: 0.9774
precision_score: 0.0538

accuracy: 0.9984
precision_score: 0.6481

accuracy: 0.9848
precision_score: 0.2490

accuracy: 0.9883
precision_score: 0.4864

accuracy: 0.9999
precision_score: 0.9796

accuracy: 0.9999
precision_score: 0.4444

accuracy: 0.9934
precision_score: 0.7226

accuracy: 0.9981
precision_score: 0.4429

accuracy: 0.9932
precision_score: 0.8599

accuracy: 0.9992
precision_score: 0.2752

accuracy: 0.9983
precision_score: 0.1921

accuracy: 0.9998
precision_score: 0.7934

accuracy: 0.9999
precision_sco

In [128]:
# Full training set: features without the APP column, and the APP labels.
X = train_dataframe.drop(columns="APP").to_numpy()
y = train_dataframe["APP"].to_numpy()

# predict_arrays[k] is the k-th one-vs-rest forest's prediction for every training row.
predict_arrays = [forest.predict(X) for forest in clf_arr]

In [136]:
# Selection thresholds, named instead of being repeated inline.
# NOTE(review): the values 7 and 200 are carried over unchanged; their rationale
# is not recorded in this notebook.
MAX_POSITIVE_VOTES = 7        # rows claimed by more than this many forests are kept
BASE_SAMPLES_PER_CLASS = 200  # leading rows of every class that are always kept

nclass = len(X_arr)

# For each training row, count how many one-vs-rest forests predicted 1.
# Accumulating per classifier counts the votes once (the original ran the same
# nested Python loop twice) without stacking all predictions into one big matrix.
votes = np.zeros(len(X), dtype=np.int64)
for class_predictions in predict_arrays:
    votes += (np.asarray(class_predictions) == 1)

# Keep rows that were claimed by many forests or by none at all.
keep_mask = (votes > MAX_POSITIVE_VOTES) | (votes == 0)

# Base sample: the first BASE_SAMPLES_PER_CLASS rows of every class.
base_X = [X_arr[c][:BASE_SAMPLES_PER_CLASS] for c in range(nclass)]
base_y = [y_arr[c][:BASE_SAMPLES_PER_CLASS] for c in range(nclass)]
base_sample_count = sum(len(part) for part in base_y)

# Base rows first, then the selected rows in their original order.
# NOTE: a base row that also satisfies keep_mask appears twice, as it did before.
# Both arrays are float64, matching the arrays the previous version allocated.
X_pruned = np.concatenate(base_X + [X[keep_mask]]).astype(np.float64, copy=False)
y_pruned = np.concatenate(base_y + [y[keep_mask]]).astype(np.float64, copy=False)
total_count = len(y_pruned)

clf = RandomForestClassifier()
clf.fit(X_pruned, y_pruned)

In [137]:
from sklearn.metrics import f1_score, accuracy_score, precision_score, \
                            recall_score, confusion_matrix, classification_report

# Predict a label for every row of X_test with whatever estimator `clf` currently
# refers to (expected: the forest fitted on X_pruned / y_pruned in the previous cell).
predict_arr = clf.predict(X_test)

In [138]:
# Per-class precision / recall / F1 / support table.
print(classification_report(y_test, predict_arr, zero_division=np.nan))

# Share of test rows whose predicted label equals the true label.
overall_accuracy = accuracy_score(y_test, predict_arr)
print(f"accuracy_score: {overall_accuracy:.4f}")

# Support-weighted F1: combines true positives, false negatives and false positives.
weighted_f1 = f1_score(y_test, predict_arr, average='weighted')
print(f"f1_score: {weighted_f1:.4f}")

# Support-weighted precision: how rarely negative samples are labelled positive.
weighted_precision = precision_score(y_test, predict_arr, average='weighted', zero_division=np.nan)
print(f"precision: {weighted_precision:.4f}")

# Support-weighted recall: how many of the positive samples are found.
weighted_recall = recall_score(y_test, predict_arr, average='weighted', zero_division=np.nan)
print(f"recall: {weighted_recall:.4f}")

# Rows are true classes and columns are predicted classes, so a good model
# concentrates its counts on the diagonal.
class_confusion = confusion_matrix(y_test, predict_arr)
print(f"confusion matrix: {class_confusion}")

              precision    recall  f1-score   support

           0       0.13      0.56      0.21       490
           1       0.26      0.81      0.39       493
           2       0.07      0.56      0.12       190
           3       0.60      0.63      0.62      5422
           4       0.43      0.29      0.35      6043
           5       0.99      0.97      0.98     25570
           6       0.05      0.78      0.09       257
           7       0.07      0.77      0.12       547
           8       0.39      0.63      0.48      1154
           9       0.24      0.24      0.24       853
          10       0.20      0.71      0.31      1789
          11       0.42      0.11      0.17      5403
          12       0.78      0.85      0.81      5190
          13       0.55      0.43      0.49     21233
          14       0.73      0.73      0.73     27161
          15       0.92      0.98      0.95      4714
          16       0.28      0.87      0.42       350
          17       0.94    