## VNET Data Preprocessing Test
This notebook was used for developing and testing VNET data preprocessing, after finishing, its contents have been copied to `vnet_preprocessing.py` file.

In [1]:
import numpy as np
import pandas as pd

from collections import defaultdict

pd.set_option('display.max_columns', None)

In [2]:
%cd ..

c:\Users\Goldy\Desktop\PRO projekty\workstuff\goldschmidt-playground\vnetwindow


In [3]:
data = pd.read_csv('data/48.flows', delimiter='|', header='infer')
windows = pd.read_csv('data/windows.csv')

In [4]:
data.head()

Unnamed: 0,IN_BYTES,IN_PKTS,PROTOCOL,TCP_FLAGS,L4_SRC_PORT,IPV4_SRC_ADDR,IPV6_SRC_ADDR,L4_DST_PORT,IPV4_DST_ADDR,IPV6_DST_ADDR,OUT_BYTES,OUT_PKTS,MIN_IP_PKT_LEN,MAX_IP_PKT_LEN,ICMP_TYPE,MIN_TTL,MAX_TTL,DIRECTION,FLOW_START_MILLISECONDS,FLOW_END_MILLISECONDS,SRC_FRAGMENTS,DST_FRAGMENTS,CLIENT_TCP_FLAGS,SERVER_TCP_FLAGS,SRC_TO_DST_AVG_THROUGHPUT,DST_TO_SRC_AVG_THROUGHPUT,NUM_PKTS_UP_TO_128_BYTES,NUM_PKTS_128_TO_256_BYTES,NUM_PKTS_256_TO_512_BYTES,NUM_PKTS_512_TO_1024_BYTES,NUM_PKTS_1024_TO_1514_BYTES,NUM_PKTS_OVER_1514_BYTES,LONGEST_FLOW_PKT,SHORTEST_FLOW_PKT,RETRANSMITTED_IN_PKTS,RETRANSMITTED_OUT_PKTS,OOORDER_IN_PKTS,OOORDER_OUT_PKTS,DURATION_IN,DURATION_OUT,TCP_WIN_MIN_IN,TCP_WIN_MAX_IN,TCP_WIN_MSS_IN,TCP_WIN_SCALE_IN,TCP_WIN_MIN_OUT,TCP_WIN_MAX_OUT,TCP_WIN_MSS_OUT,TCP_WIN_SCALE_OUT,FLOW_VERDICT,SRC_TO_DST_IAT_MIN,SRC_TO_DST_IAT_MAX,SRC_TO_DST_IAT_AVG,SRC_TO_DST_IAT_STDDEV,DST_TO_SRC_IAT_MIN,DST_TO_SRC_IAT_MAX,DST_TO_SRC_IAT_AVG,DST_TO_SRC_IAT_STDDEV,APPLICATION_ID
0,32,1,17,0,46963,217.73.28.78,::,10001,18.195.24.157,::,0,0,0,32,0,0,0,1,1660934932096,1660934932096,0,0,0,0,256000,0,1,0,0,0,0,0,32,32,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,369099017
1,52,1,6,16,80,81.89.49.98,::,9594,185.191.171.24,::,0,0,0,52,0,0,0,1,1660934932096,1660934932096,0,0,16,0,416000,0,1,0,0,0,0,0,52,52,0,0,0,0,0,0,235,235,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,369098759
2,64,1,17,0,58825,81.89.56.212,::,53,8.8.8.8,::,0,0,0,64,0,0,0,1,1660934932096,1660934932096,0,0,0,0,512000,0,1,0,0,0,0,0,64,64,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,369098878
3,531,1,17,0,30300,86.110.248.222,::,30300,152.44.35.66,::,0,0,0,531,0,0,0,1,1660934932097,1660934932097,0,0,0,0,4248000,0,0,0,0,1,0,0,531,531,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,369098752
4,52,1,6,16,9001,46.229.238.172,::,34240,162.19.64.52,::,0,0,0,52,0,0,0,1,1660934932097,1660934932097,0,0,16,0,416000,0,1,0,0,0,0,0,52,52,0,0,0,0,0,0,501,501,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,369098752


In [62]:
data['DIRECTION'].value_counts()

0    142765
1     43650
Name: DIRECTION, dtype: int64

### Routines For Data Preprocessing

In [52]:
### Constants definitions
# Features not directly used for classification - provide additional information, but drop them
FEATURES_DROP = ['IPV4_SRC_ADDR', 'IPV6_SRC_ADDR',  # IPs are not directly useful, only as metadata
    'IPV4_DST_ADDR', 'IPV6_DST_ADDR',               # IPs are not directly useful, only as metadata
    'DIRECTION',                                    # Not useful for intrusion detection purposes
    'FLOW_VERDICT',                                 # These are all 0s
    'APPLICATION_ID'                                # Interesting metdata, but we cannot find an use for ID
    ]

# Features which were processed in the preprocessing function and are no longer needed for classification
FEATURES_DROP_PROCESSED = ['TCP_FLAGS', 'FLOW_START_MILLISECONDS', 'FLOW_END_MILLISECONDS', 'CLIENT_TCP_FLAGS',
    'SERVER_TCP_FLAGS']

# Categorical values to be encoded automatically
FEATURES_CATEGORICAL = ['PROTOCOL', 'L4_SRC_PORT', 'L4_DST_PORT']

# Numerical variables which should be scaled
FEATURES_NUMERICAL = ['IN_BYTES', 'IN_PKTS', 'OUT_BYTES', 'OUT_PKTS', 'MIN_IP_PKT_LEN', 'MAX_IP_PKT_LEN',
    'MIN_TTL', 'MAX_TTL', 'SRC_FRAGMENTS', 'DST_FRAGMENTS', 'SRC_TO_DST_AVG_THROUGHPUT', 'DST_TO_SRC_AVG_THROUGHPUT',
    'NUM_PKTS_UP_TO_128_BYTES', 'NUM_PKTS_128_TO_256_BYTES', 'NUM_PKTS_256_TO_512_BYTES', 'NUM_PKTS_512_TO_1024_BYTES',
    'NUM_PKTS_1024_TO_1514_BYTES', 'NUM_PKTS_OVER_1514_BYTES', 'LONGEST_FLOW_PKT', 'SHORTEST_FLOW_PKT',
    'RETRANSMITTED_IN_PKTS', 'RETRANSMITTED_OUT_PKTS', 'OOORDER_IN_PKTS', 'OOORDER_OUT_PKTS', 'DURATION_IN',
    'DURATION_OUT', 'TCP_WIN_MIN_IN', 'TCP_WIN_MAX_IN', 'TCP_WIN_MSS_IN', 'TCP_WIN_SCALE_IN', 'TCP_WIN_MIN_OUT',
    'TCP_WIN_MAX_OUT', 'TCP_WIN_MSS_OUT', 'TCP_WIN_SCALE_OUT', 'SRC_TO_DST_IAT_MIN', 'SRC_TO_DST_IAT_MAX',
    'SRC_TO_DST_IAT_AVG', 'SRC_TO_DST_IAT_STDDEV', 'DST_TO_SRC_IAT_MIN', 'DST_TO_SRC_IAT_MAX', 'DST_TO_SRC_IAT_AVG',
    'DST_TO_SRC_IAT_STDDEV']

FEATURES_NUMERICAL_COMPUTED = ['IN_BPS', 'IN_BPP', 'IN_PPS', 'OUT_BPS', 'OUT_BPP', 'OUT_PPS']

# Protocol to text data default dictionary
PROTO_MAPPER = defaultdict(lambda : 'OTHER', {
    1   : 'ICMP',
    6   : 'TCP',
    17  : 'UDP',
    47  : 'GRE',
    50  : 'ESP', 
    58  : 'ICMP6'
})

# Port categorization
# Sources:
#    - https://opensource.com/article/18/10/common-network-ports
#    - https://hostpapasupport.com/commonly-used-ports/
#    - https://en.wikipedia.org/wiki/List_of_TCP_and_UDP_port_numbers
#    - https://www.networkinghowtos.com/howto/common-vpn-ports-and-protocols
#    - https://www.rfc-editor.org/rfc/rfc7194.html 
#    - https://en.wikipedia.org/wiki/Denial-of-service_attack#Amplification
#    - https://manuals.playstation.net/document/en/psvita/psn/firewall.html
#    - https://www.speedguide.net/ports.php

PCAT_NOPORT      = {0}                                                           # No TCP/UDP port used
PCAT_NOENC_WEB   = {80, 8080}                                                    # Web traffic
PCAT_TLS         = {443}                                                         # TLS traffic, probably web
PCAT_DNS         = {53}                                                          # Separate category due to traffic volume
PCAT_EMAIL       = {25, 26, 109, 110, 143, 209, 218, 220, 587, 993, 2095, 2096}  # Email traffic
PCAT_VPN         = {500, 1194, 1701, 1723, 4500}                                 # VPNs like L2TP, IPSec, OpenVPN...
PCAT_DATA        = {20, 21, 22, 69, 115, 989, 990, 2077, 2078}                   # Data transfer FTP, STFP, TFTP
PCAT_SHELL       = {22, 23, 513, 514}                                            # Shell services - SSH, RSH,...
PCAT_PLAYSTATION = {465, 3478, 3479, 3480}                                       # Playstation Network
PCAT_CHAT        = {194, 517, 518, 2351, 6667, 6697}                             # IRC chat
PCAT_QUERY       = {17, 19, 123, 1900, 3283, 4462, 4463, 5683, 6881, 6882,       # Query-response for amplification
                    6883, 6883, 6884, 6885, 6886, 6887, 6888, 6889, 11211, 26000}

# In order to make port categorization computationally easier, create a search dict from the above sets
PCAT_SEARCHDICT = defaultdict(lambda: 'OTHER', dict.fromkeys(PCAT_NOENC_WEB, 'WEB_NOENC') | dict.fromkeys(PCAT_TLS, 'TLS') | \
    dict.fromkeys(PCAT_DNS, 'DNS') | dict.fromkeys(PCAT_EMAIL, 'EMAIL') | dict.fromkeys(PCAT_VPN, 'VPN') | \
    dict.fromkeys(PCAT_DATA, 'DATA') | dict.fromkeys(PCAT_SHELL, 'SHELL') | \
    dict.fromkeys(PCAT_PLAYSTATION, 'PLAYSTATION') | dict.fromkeys(PCAT_CHAT, 'CHAT') | dict.fromkeys(PCAT_QUERY, 'QUERY'))

In [56]:
from sklearn.preprocessing import MinMaxScaler

### Data preprocessing functions
def map_tcp_flags(tcp_flags_ser : pd.Series, col_prefix: str) -> pd.DataFrame: 
    """Map a series of TCP flags into DataFrame, with most important flags parsed into separate columns."""
    
    col_prefix = col_prefix + '_' if col_prefix else col_prefix
    colnames = [col_prefix + 'FLAG_' + colname for colname in ['FIN', 'SYN', 'RST', 'PSH', 'ACK', 'URG', 'ECE', 'CWR']]
    deenc_flags_df = pd.DataFrame(np.zeros((len(tcp_flags_ser), 8), dtype=np.int8), columns=colnames)

    for row_idx in range(0, len(tcp_flags_ser)):
        for flag_idx in range(8):
            flag_value = 1 << flag_idx
            if (tcp_flags_ser.iloc[row_idx] & flag_value) != 0:
                deenc_flags_df.iloc[row_idx, flag_idx] = 1

    return deenc_flags_df


def categorize_port(port : int) -> str:
    """Performs port categorization into one of 12 pre-specified groups"""

    portcat = PCAT_SEARCHDICT[port]

    # Perform correction to other special, dynamic port category
    # Dynamic ports are defined from 49152 to 65535, since ports are 16-bit, upper bound is not necessary
    if portcat == 'OTHER' and port >= 49152:
        portcat == 'DYNAMIC'

    return portcat


def vnet_preprocess(data : pd.DataFrame) -> pd.DataFrame:
    """Preprocesses dataset exported by nProbe into ML-acceptable form."""
    data = data.copy(deep=True)

    # Compute "normalized" flow durations for each side in seconds or future fields computation
    # This means that if the duration is 0 (0 or 1 packet sent), it it set to 1ms to allow divisions
    data['IN_DUR_S']  = data['DURATION_IN'].apply(lambda x : x / 1000.0 if x > 0 else 0.001)
    data['OUT_DUR_S'] = data['DURATION_OUT'].apply(lambda x : x / 1000.0 if x > 0 else 0.001)

    # Map protocols into their respective names to limit their number for one-hot encoding
    data['PROTOCOL'] = data['PROTOCOL'].apply(lambda x : PROTO_MAPPER[x])

    # Map TCP flags
    data = pd.concat([data, map_tcp_flags(data['TCP_FLAGS'], '')], axis=1)
    data = pd.concat([data, map_tcp_flags(data['CLIENT_TCP_FLAGS'], 'CLIENT')], axis=1)
    data = pd.concat([data, map_tcp_flags(data['SERVER_TCP_FLAGS'], 'SERVER')], axis=1)

    # Perform port categorization
    data['L4_SRC_PORT'] = data['L4_SRC_PORT'].apply(categorize_port)
    data['L4_DST_PORT'] = data['L4_DST_PORT'].apply(categorize_port)

    # Decode ICMP types and codes
    data['ICMP_CODE'] = data['ICMP_TYPE'] % 256
    data['ICMP_TYPE'] = data['ICMP_TYPE'] // 256

    # Compute other statistics
    data['IN_BPS'] = data['IN_BYTES'] / data['IN_DUR_S']    # Client's bytes-per-second
    data['IN_BPP'] = data['IN_BYTES'] / data['IN_PKTS']     # Client's bytes-per-packet
    data['IN_PPS'] = data['IN_PKTS'] / data['IN_DUR_S']     # Client's packets-per-second
    data['OUT_BPS'] = data['OUT_BYTES'] / data['OUT_DUR_S'] # Server's bytes-per-second
    data['OUT_BPP'] = data['OUT_BYTES'] / data['OUT_PKTS']  # Server's bytes-per-packet
    data['OUT_PPS'] = data['OUT_PKTS'] / data['OUT_DUR_S']  # Server's bytes-per-second

    # Fix bytes-per-packet values if number of packets was 0. This is a pretty ugly
    # principle, as many divisions by 0 occur. Maybe the computation itself would need fix
    data['IN_BPP']  = data['IN_BPP'].apply(lambda x : 0 if np.isnan(x) else x)
    data['OUT_BPP'] = data['OUT_BPP'].apply(lambda x : 0 if np.isnan(x) else x)
    
    # Perform one-hot encoding of categorical variables
    data = pd.get_dummies(data, columns=FEATURES_CATEGORICAL)

    # Scale existing and newly computed numerical variables
    features_to_scale = FEATURES_NUMERICAL + FEATURES_NUMERICAL_COMPUTED

    data[features_to_scale] = MinMaxScaler().fit_transform(data[features_to_scale])

    # Drop features which cannot be used for classification and other auxilliary features
    data = data.drop(columns=FEATURES_DROP + FEATURES_DROP_PROCESSED)
    data = data.drop(columns=['IN_DUR_S', 'OUT_DUR_S'])

    return data

In [57]:
datax = vnet_preprocess(data)

In [58]:
datax.head()

Unnamed: 0,IN_BYTES,IN_PKTS,OUT_BYTES,OUT_PKTS,MIN_IP_PKT_LEN,MAX_IP_PKT_LEN,ICMP_TYPE,MIN_TTL,MAX_TTL,SRC_FRAGMENTS,DST_FRAGMENTS,SRC_TO_DST_AVG_THROUGHPUT,DST_TO_SRC_AVG_THROUGHPUT,NUM_PKTS_UP_TO_128_BYTES,NUM_PKTS_128_TO_256_BYTES,NUM_PKTS_256_TO_512_BYTES,NUM_PKTS_512_TO_1024_BYTES,NUM_PKTS_1024_TO_1514_BYTES,NUM_PKTS_OVER_1514_BYTES,LONGEST_FLOW_PKT,SHORTEST_FLOW_PKT,RETRANSMITTED_IN_PKTS,RETRANSMITTED_OUT_PKTS,OOORDER_IN_PKTS,OOORDER_OUT_PKTS,DURATION_IN,DURATION_OUT,TCP_WIN_MIN_IN,TCP_WIN_MAX_IN,TCP_WIN_MSS_IN,TCP_WIN_SCALE_IN,TCP_WIN_MIN_OUT,TCP_WIN_MAX_OUT,TCP_WIN_MSS_OUT,TCP_WIN_SCALE_OUT,SRC_TO_DST_IAT_MIN,SRC_TO_DST_IAT_MAX,SRC_TO_DST_IAT_AVG,SRC_TO_DST_IAT_STDDEV,DST_TO_SRC_IAT_MIN,DST_TO_SRC_IAT_MAX,DST_TO_SRC_IAT_AVG,DST_TO_SRC_IAT_STDDEV,FLAG_FIN,FLAG_SYN,FLAG_RST,FLAG_PSH,FLAG_ACK,FLAG_URG,FLAG_ECE,FLAG_CWR,CLIENT_FLAG_FIN,CLIENT_FLAG_SYN,CLIENT_FLAG_RST,CLIENT_FLAG_PSH,CLIENT_FLAG_ACK,CLIENT_FLAG_URG,CLIENT_FLAG_ECE,CLIENT_FLAG_CWR,SERVER_FLAG_FIN,SERVER_FLAG_SYN,SERVER_FLAG_RST,SERVER_FLAG_PSH,SERVER_FLAG_ACK,SERVER_FLAG_URG,SERVER_FLAG_ECE,SERVER_FLAG_CWR,ICMP_CODE,IN_BPS,IN_BPP,IN_PPS,OUT_BPS,OUT_BPP,OUT_PPS,PROTOCOL_ESP,PROTOCOL_GRE,PROTOCOL_ICMP,PROTOCOL_ICMP6,PROTOCOL_OTHER,PROTOCOL_TCP,PROTOCOL_UDP,L4_SRC_PORT_CHAT,L4_SRC_PORT_DATA,L4_SRC_PORT_DNS,L4_SRC_PORT_EMAIL,L4_SRC_PORT_OTHER,L4_SRC_PORT_PLAYSTATION,L4_SRC_PORT_QUERY,L4_SRC_PORT_SHELL,L4_SRC_PORT_TLS,L4_SRC_PORT_VPN,L4_SRC_PORT_WEB_NOENC,L4_DST_PORT_CHAT,L4_DST_PORT_DATA,L4_DST_PORT_DNS,L4_DST_PORT_EMAIL,L4_DST_PORT_OTHER,L4_DST_PORT_PLAYSTATION,L4_DST_PORT_QUERY,L4_DST_PORT_SHELL,L4_DST_PORT_TLS,L4_DST_PORT_VPN,L4_DST_PORT_WEB_NOENC
0,1.22428e-07,0.0,0.0,0.0,0.0,0.002898,0,0.0,0.0,0.0,0.0,0.000494,0.0,5.2e-05,0.0,0.0,0.0,0.0,0.0,0.002898,0.005113,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.000494,0.008108,0.02082,0.0,0.0,0.0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
1,3.264746e-07,0.0,0.0,0.0,0.0,0.007728,0,0.0,0.0,0.0,0.0,0.000803,0.0,5.2e-05,0.0,0.0,0.0,0.0,0.0,0.007728,0.013634,0.0,0.0,0.0,0.0,0.0,0.0,0.003586,0.003586,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0.000803,0.021622,0.02082,0.0,0.0,0.0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0
2,4.489025e-07,0.0,0.0,0.0,0.0,0.010625,0,0.0,0.0,0.0,0.0,0.000989,0.0,5.2e-05,0.0,0.0,0.0,0.0,0.0,0.010625,0.018747,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.000988,0.02973,0.02082,0.0,0.0,0.0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
3,5.213391e-06,0.0,0.0,0.0,0.0,0.1234,0,0.0,0.0,0.0,0.0,0.008204,0.0,0.0,0.0,0.0,0.000199,0.0,0.0,0.1234,0.217725,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.008204,0.34527,0.02082,0.0,0.0,0.0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
4,3.264746e-07,0.0,0.0,0.0,0.0,0.007728,0,0.0,0.0,0.0,0.0,0.000803,0.0,5.2e-05,0.0,0.0,0.0,0.0,0.0,0.007728,0.013634,0.0,0.0,0.0,0.0,0.0,0.0,0.007645,0.007645,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0.000803,0.021622,0.02082,0.0,0.0,0.0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0


In [61]:
datax.describe()

Unnamed: 0,IN_BYTES,IN_PKTS,OUT_BYTES,OUT_PKTS,MIN_IP_PKT_LEN,MAX_IP_PKT_LEN,ICMP_TYPE,MIN_TTL,MAX_TTL,SRC_FRAGMENTS,DST_FRAGMENTS,SRC_TO_DST_AVG_THROUGHPUT,DST_TO_SRC_AVG_THROUGHPUT,NUM_PKTS_UP_TO_128_BYTES,NUM_PKTS_128_TO_256_BYTES,NUM_PKTS_256_TO_512_BYTES,NUM_PKTS_512_TO_1024_BYTES,NUM_PKTS_1024_TO_1514_BYTES,NUM_PKTS_OVER_1514_BYTES,LONGEST_FLOW_PKT,SHORTEST_FLOW_PKT,RETRANSMITTED_IN_PKTS,RETRANSMITTED_OUT_PKTS,OOORDER_IN_PKTS,OOORDER_OUT_PKTS,DURATION_IN,DURATION_OUT,TCP_WIN_MIN_IN,TCP_WIN_MAX_IN,TCP_WIN_MSS_IN,TCP_WIN_SCALE_IN,TCP_WIN_MIN_OUT,TCP_WIN_MAX_OUT,TCP_WIN_MSS_OUT,TCP_WIN_SCALE_OUT,SRC_TO_DST_IAT_MIN,SRC_TO_DST_IAT_MAX,SRC_TO_DST_IAT_AVG,SRC_TO_DST_IAT_STDDEV,DST_TO_SRC_IAT_MIN,DST_TO_SRC_IAT_MAX,DST_TO_SRC_IAT_AVG,DST_TO_SRC_IAT_STDDEV,FLAG_FIN,FLAG_SYN,FLAG_RST,FLAG_PSH,FLAG_ACK,FLAG_URG,FLAG_ECE,FLAG_CWR,CLIENT_FLAG_FIN,CLIENT_FLAG_SYN,CLIENT_FLAG_RST,CLIENT_FLAG_PSH,CLIENT_FLAG_ACK,CLIENT_FLAG_URG,CLIENT_FLAG_ECE,CLIENT_FLAG_CWR,SERVER_FLAG_FIN,SERVER_FLAG_SYN,SERVER_FLAG_RST,SERVER_FLAG_PSH,SERVER_FLAG_ACK,SERVER_FLAG_URG,SERVER_FLAG_ECE,SERVER_FLAG_CWR,ICMP_CODE,IN_BPS,IN_BPP,IN_PPS,OUT_BPS,OUT_BPP,OUT_PPS,PROTOCOL_ESP,PROTOCOL_GRE,PROTOCOL_ICMP,PROTOCOL_ICMP6,PROTOCOL_OTHER,PROTOCOL_TCP,PROTOCOL_UDP,L4_SRC_PORT_CHAT,L4_SRC_PORT_DATA,L4_SRC_PORT_DNS,L4_SRC_PORT_EMAIL,L4_SRC_PORT_OTHER,L4_SRC_PORT_PLAYSTATION,L4_SRC_PORT_QUERY,L4_SRC_PORT_SHELL,L4_SRC_PORT_TLS,L4_SRC_PORT_VPN,L4_SRC_PORT_WEB_NOENC,L4_DST_PORT_CHAT,L4_DST_PORT_DATA,L4_DST_PORT_DNS,L4_DST_PORT_EMAIL,L4_DST_PORT_OTHER,L4_DST_PORT_PLAYSTATION,L4_DST_PORT_QUERY,L4_DST_PORT_SHELL,L4_DST_PORT_TLS,L4_DST_PORT_VPN,L4_DST_PORT_WEB_NOENC
count,186415.0,186415.0,186415.0,186415.0,186415.0,186415.0,186415.0,186415.0,186415.0,186415.0,186415.0,186415.0,186415.0,186415.0,186415.0,186415.0,186415.0,186415.0,186415.0,186415.0,186415.0,186415.0,186415.0,186415.0,186415.0,186415.0,186415.0,186415.0,186415.0,186415.0,186415.0,186415.0,186415.0,186415.0,186415.0,186415.0,186415.0,186415.0,186415.0,186415.0,186415.0,186415.0,186415.0,186415.0,186415.0,186415.0,186415.0,186415.0,186415.0,186415.0,186415.0,186415.0,186415.0,186415.0,186415.0,186415.0,186415.0,186415.0,186415.0,186415.0,186415.0,186415.0,186415.0,186415.0,186415.0,186415.0,186415.0,186415.0,186415.0,186415.0,186415.0,186415.0,186415.0,186415.0,186415.0,186415.0,186415.0,186415.0,186415.0,186415.0,186415.0,186415.0,186415.0,186415.0,186415.0,186415.0,186415.0,186415.0,186415.0,186415.0,186415.0,186415.0,186415.0,186415.0,186415.0,186415.0,186415.0,186415.0,186415.0,186415.0,186415.0,186415.0,186415.0
mean,0.0001373902,0.000273,0.0001437386,0.0002,0.015849,0.086248,5.969761,0.145175,0.145743,2.4e-05,1.2e-05,0.000916,0.000849,0.000726,0.000253,0.000681,0.000259,0.000217,1.3e-05,0.086248,0.021807,0.000335,0.000166,0.000114,7.6e-05,0.155901,0.108889,0.175763,0.176691,0.031854,0.099994,0.100271,0.100517,0.020285,0.058682,0.032841,0.083589,0.044774,0.036581,0.013046,0.054228,0.021775,0.02825,0.138369,0.255441,0.086833,0.367873,0.468138,0.0,0.010402,0.010305,0.127801,0.253826,0.057544,0.357005,0.44822,0.0,0.009833,0.009307,0.086565,0.124518,0.044669,0.246638,0.395027,0.0,0.003841,0.0011,0.035861,0.001276,0.093645,0.014287,0.000965,0.1049,0.004013,0.002875,0.001084,0.03872,0.000139,0.000687,0.557793,0.398702,2.7e-05,0.000703,0.00449,0.019982,0.867548,0.000488,0.001819,0.002848,0.085186,0.005552,0.011356,4.8e-05,0.001287,0.142891,0.021425,0.618453,0.001298,0.005874,0.014688,0.159708,0.006346,0.027981
std,0.00346003,0.004043,0.003052219,0.003214,0.021552,0.121436,28.853558,0.188017,0.189191,0.003029,0.002999,0.004642,0.011804,0.007966,0.00487,0.007932,0.004905,0.003663,0.003004,0.121436,0.033225,0.007331,0.006168,0.006035,0.00422,0.275138,0.244315,0.340055,0.340823,0.064714,0.220924,0.266685,0.266944,0.05704,0.179317,0.117191,0.166788,0.120238,0.09657,0.076439,0.139136,0.079975,0.087527,0.345287,0.43611,0.281591,0.482228,0.498985,0.0,0.101456,0.100989,0.333869,0.4352,0.232879,0.479117,0.497313,0.0,0.098673,0.096024,0.281197,0.330172,0.206577,0.431055,0.488858,0.0,0.061856,0.033144,0.395665,0.005265,0.151004,0.015074,0.011862,0.197972,0.013916,0.053545,0.0329,0.192927,0.011809,0.026195,0.49665,0.489632,0.005179,0.0265,0.066857,0.13994,0.338982,0.022089,0.042605,0.053295,0.279159,0.074306,0.10596,0.006948,0.035858,0.349962,0.144798,0.485767,0.036007,0.076417,0.1203,0.366336,0.079409,0.164918
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,4.387002e-07,0.0,0.0,0.0,0.0,0.00966,0.0,0.0,0.0,0.0,0.0,2.6e-05,0.0,5.2e-05,0.0,0.0,0.0,0.0,0.0,0.00966,0.008948,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.2e-05,0.025676,0.000376,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,1.101852e-06,0.0,2.880509e-07,8e-06,0.016899,0.018353,0.0,0.0,0.0,0.0,0.0,0.000209,6e-06,0.000105,0.0,0.0,0.0,0.0,0.0,0.018353,0.013634,0.0,0.0,0.0,0.0,0.0,0.0,0.003326,0.003464,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000618,0.034459,0.02082,7e-06,0.034667,6.3e-05,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,6.386658e-06,6e-05,2.337644e-06,2.4e-05,0.021969,0.118087,0.0,0.243137,0.243137,0.0,0.0,0.000942,9.3e-05,0.000314,0.000136,0.0,0.0,0.0,0.0,0.118087,0.020026,0.0,0.0,0.0,0.0,0.179727,0.025008,0.093782,0.093782,0.0,0.0,0.007706,0.007721,0.0,0.0,0.0,0.070234,0.022742,0.013708,0.0,0.012462,0.002741,0.000674,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.001066,0.083784,0.02082,0.000317,0.089467,0.009091,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,255.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,18.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


Preprocessing notes and ideas
- Durations are often too short or 0s, normalizing to 1ms if duration is 0 is used for proper computations
- Throughput computations are not equal to bps computations. What are throughput computaitions after all?
- Completely dropped features:
   - `IPV4_SRC_ADDR`, `IPV6_SRC_ADDR`, `IPV4_DST_ADDR`, `IPV6_DST_ADDR` (IPs are not directly used for classification, only for windowing)
   - `DIRECTION` (currently only considered for metadata, not seeing usage for ID)
   - `FLOW_VERDICT` (all 0s, not informational)
   - `APPLICATION_ID` (interesting, but not currently seeing into what the value represents)
    