# Analysis of datasets used in this work
#### Author: Jakub Čoček (xcocek00)

In [21]:
# -- IMPORTS --

import os
# The working directory is switched BEFORE the import below; presumably
# dataset_analysis lives in this directory and is resolved through the
# current working directory -- confirm. Do not reorder these two statements.
os.chdir('/workplace/xcocek00/common/')
from dataset_analysis import (
    count_flows,
    count_flows_filtered,
    calculate_avrg_length,
    calculate_avrg_length_filtered,
    calculate_avrg_num_pkts,
)

# sets csv limit
# Raises the csv module's maximum field size to sys.maxsize (presumably the
# dataset CSVs contain fields longer than the default limit -- confirm).
# csv.field_size_limit() returns the previously active limit; because it is
# the last expression of the cell, the notebook echoes that value as output.
import csv
import sys
csv.field_size_limit(sys.maxsize)

9223372036854775807


---

### Mirage19

In [22]:
# Mirage19: per-split statistics, printed once over all flows and once with
# the packet threshold handed to the filtered helpers.
train = "/workplace/datasets/mirage19/processed/splits/train.csv"
val = "/workplace/datasets/mirage19/processed/splits/val.csv"
test = "/workplace/datasets/mirage19/processed/splits/test.csv"

splits = (("train", train), ("val", val), ("test", test))
min_pkts = 30  # threshold passed to the helpers; shown as ">30pkts" in labels
separator = "---------------------------------"

# number of flows: unfiltered block first, then the filtered block
for name, path in splits:
    print(f"[{name}] Total number of flows: {count_flows(path)}")
print("")
for name, path in splits:
    print(f"[{name}, >{min_pkts}pkts] Total number of flows: {count_flows_filtered(path, min_pkts)}")

print(separator)

# average flow length (printed with an "s" suffix, two decimals)
for name, path in splits:
    print(f"[{name}] Average length: {calculate_avrg_length(path):.2f}s")
print("")
for name, path in splits:
    print(f"[{name}, >{min_pkts}pkts] Average length: {calculate_avrg_length_filtered(path, min_pkts):.2f}s")

print(separator)

# average packets per flow; a threshold of 0 is used for the unfiltered block
for name, path in splits:
    print(f"[{name}] Average number of pkts: {calculate_avrg_num_pkts(path, 0):.2f}")
print("")
for name, path in splits:
    print(f"[{name}, >{min_pkts}pkts] Average number of pkts: {calculate_avrg_num_pkts(path, min_pkts):.2f}")

[train] Total number of flows: 51978
[val] Total number of flows: 5776
[test] Total number of flows: 6418

[train, >30pkts] Total number of flows: 38959
[val, >30pkts] Total number of flows: 4402
[test, >30pkts] Total number of flows: 4818
---------------------------------
[train] Average length: 28.62s
[val] Average length: 28.40s
[test] Average length: 26.66s

[train, >30pkts] Average length: 20.52s
[val, >30pkts] Average length: 20.72s
[test, >30pkts] Average length: 19.18s
---------------------------------
[train] Average number of pkts: 30.33
[val] Average number of pkts: 30.43
[test] Average number of pkts: 30.33

[train, >30pkts] Average number of pkts: 31.89
[val, >30pkts] Average number of pkts: 31.89
[test, >30pkts] Average number of pkts: 31.89


---

### Mirage22

In [23]:
def _print_filtered_stats(min_pkts, train, val, test):
    """Print flow count, average length and average packet count per split.

    ``min_pkts`` is used both for the ">Npkts" tag in every label and as the
    threshold argument of every filtered helper call, so the label can no
    longer disagree with the value actually passed (the original cell passed
    100 instead of 1000 for the train split of the >1000pkts section).

    Args:
        min_pkts: packet threshold forwarded to the helper functions.
        train, val, test: paths of the split CSV files.
    """
    tag = f">{min_pkts}pkts"
    splits = (("train", train), ("val", val), ("test", test))

    # flows -- plain count_flows is used here, as in the original cell;
    # presumably the splits-N files are already filtered (TODO confirm)
    for name, path in splits:
        print(f"[{name}, {tag}] Total number of flows: {count_flows(path)}")

    print("---------------------------------")

    # length
    for name, path in splits:
        print(f"[{name}, {tag}] Average length: {calculate_avrg_length_filtered(path, min_pkts):.2f}s")

    print("---------------------------------")

    # avrg pkts
    for name, path in splits:
        print(f"[{name}, {tag}] Average number of pkts: {calculate_avrg_num_pkts(path, min_pkts):.2f}")


# datasets (>10pkts)
train = "/workplace/datasets/mirage22/processed/splits-10/train.csv"
val = "/workplace/datasets/mirage22/processed/splits-10/val.csv"
test = "/workplace/datasets/mirage22/processed/splits-10/test.csv"

_print_filtered_stats(10, train, val, test)

# visual divider between the two threshold sections
print("")
print("---------------------------------")
print("---------------------------------")
print("")

# datasets (>1000pkts)
train = "/workplace/datasets/mirage22/processed/splits-1000/train.csv"
val = "/workplace/datasets/mirage22/processed/splits-1000/val.csv"
test = "/workplace/datasets/mirage22/processed/splits-1000/test.csv"

_print_filtered_stats(1000, train, val, test)


[train, >10pkts] Total number of flows: 21685
[val, >10pkts] Total number of flows: 2410
[test, >10pkts] Total number of flows: 2678
---------------------------------
[train, >10pkts] Average length: 333.28s
[val, >10pkts] Average length: 316.36s
[test, >10pkts] Average length: 309.81s
---------------------------------
[train, >10pkts] Average number of pkts: 6708.65
[val, >10pkts] Average number of pkts: 6370.42
[test, >10pkts] Average number of pkts: 7258.81

---------------------------------
---------------------------------

[train, >1000pkts] Total number of flows: 3700
[val, >1000pkts] Total number of flows: 412
[test, >1000pkts] Total number of flows: 457
---------------------------------
[train, >1000pkts] Average length: 614.51s
[val, >1000pkts] Average length: 617.08s
[test, >1000pkts] Average length: 588.74s
---------------------------------
[train, >1000pkts] Average number of pkts: 39466.81
[val, >1000pkts] Average number of pkts: 37671.55
[test, >1000pkts] Average number 

---

### Ucdavis

In [27]:
# Ucdavis: per-split statistics. No packet filtering is applied here -- every
# helper that takes a threshold is called with 0, as in the original cell.
pretraining = "/workplace/datasets/ucdavis/final-splits/pretraining.csv"
val = "/workplace/datasets/ucdavis/final-splits/val.csv"
human = "/workplace/datasets/ucdavis/final-splits/human.csv"
script = "/workplace/datasets/ucdavis/final-splits/script.csv"

splits = (
    ("pretraining", pretraining),
    ("val", val),
    ("human", human),
    ("script", script),
)

# flows
for name, path in splits:
    print(f"[{name}] Total number of flows: {count_flows(path)}")

print("---------------------------------")

# length
for name, path in splits:
    print(f"[{name}] Average length: {calculate_avrg_length_filtered(path, 0):.2f}s")

print("---------------------------------")

# avrg pkts
# Fix: the human/script lines were previously labelled "Average length"
# although they print the result of calculate_avrg_num_pkts.
for name, path in splits:
    print(f"[{name}] Average number of pkts: {calculate_avrg_num_pkts(path, 0):.2f}")

[pretraining] Total number of flows: 6439
[val] Total number of flows: 1288
[human] Total number of flows: 83
[script] Total number of flows: 150
---------------------------------
[pretraining] Average length: 42.92s
[val] Average length: 43.50s
[human] Average length: 31.33s
[script] Average length: 42.94s
---------------------------------
[pretraining] Average number of pkts: 6652.93
[val] Average number of pkts: 6811.09
[human] Average length: 7666.37
[script] Average length: 7130.64


---

### UTMobileNetTraffic21

In [30]:
# UTMobileNetTraffic21: per-split statistics, printed once over all flows and
# once with the packet threshold handed to the filtered helpers.
train = "/workplace/datasets/utmobilenet21/final-splits/train.csv"
val = "/workplace/datasets/utmobilenet21/final-splits/val.csv"
test = "/workplace/datasets/utmobilenet21/final-splits/test.csv"

splits = (("train", train), ("val", val), ("test", test))
min_pkts = 30  # threshold passed to the helpers; shown as ">30pkts" in labels
separator = "---------------------------------"

# number of flows: unfiltered block first, then the filtered block
for name, path in splits:
    print(f"[{name}] Total number of flows: {count_flows(path)}")
print("")
for name, path in splits:
    print(f"[{name}, >{min_pkts}pkts] Total number of flows: {count_flows_filtered(path, min_pkts)}")

print(separator)

# average flow length (printed with an "s" suffix, two decimals)
for name, path in splits:
    print(f"[{name}] Average length: {calculate_avrg_length(path):.2f}s")
print("")
for name, path in splits:
    print(f"[{name}, >{min_pkts}pkts] Average length: {calculate_avrg_length_filtered(path, min_pkts):.2f}s")

print(separator)

# average packets per flow; a threshold of 0 is used for the unfiltered block
for name, path in splits:
    print(f"[{name}] Average number of pkts: {calculate_avrg_num_pkts(path, 0):.2f}")
print("")
for name, path in splits:
    print(f"[{name}, >{min_pkts}pkts] Average number of pkts: {calculate_avrg_num_pkts(path, min_pkts):.2f}")

[train] Total number of flows: 7568
[val] Total number of flows: 946
[test] Total number of flows: 946

[train, >30pkts] Total number of flows: 5028
[val, >30pkts] Total number of flows: 656
[test, >30pkts] Total number of flows: 648
---------------------------------
[train] Average length: 13.47s
[val] Average length: 67.40s
[test] Average length: 1.59s

[train, >30pkts] Average length: 18.23s
[val, >30pkts] Average length: 93.25s
[test, >30pkts] Average length: 1.05s
---------------------------------
[train] Average number of pkts: 2537.27
[val] Average number of pkts: 1554.43
[test] Average number of pkts: 1810.65

[train, >30pkts] Average number of pkts: 3808.52
[val, >30pkts] Average number of pkts: 2232.41
[test, >30pkts] Average number of pkts: 2633.73
