In [4]:
from pennylane import numpy as np
import pandas as pd
import pennylane as qml

from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler

import time
import sys, os

# Make the project-local `utils` package importable: it is expected under
# <two levels above the working directory>/data_preparation/ids.
utils_directory = os.path.join(
    os.path.dirname(os.path.dirname(os.getcwd())), 'data_preparation', 'ids'
)
# Guard the append so re-running this cell does not keep growing sys.path.
if utils_directory not in sys.path:
    sys.path.append(utils_directory)

from utils.flow_windows import Flows
from utils.probabilities import Probabilities
from utils.density_matrices import DensityMatrix

In [None]:
# Directory prefix (trailing slash included) that every input/output path in
# this notebook is built from.
data_file_path = 'datasets/mirai/'
# The timing log lives in the same directory; derive it from data_file_path so
# the two locations cannot drift apart when the dataset directory changes.
timing_log_file = f'{data_file_path}timing_log.txt'

In [7]:
# Length of one flow window (units are not visible here - presumably seconds; TODO confirm).
FLOW_DURATION = 1

In [8]:
def log_to_file(message, filename):
    """Append ``message`` plus a trailing newline to the file at ``filename``.

    The file is opened in append mode, so it is created when missing and any
    existing content is preserved.
    """
    with open(filename, mode='a') as log_handle:
        log_handle.writelines([message + "\n"])

# DM (density-matrix) features

In [None]:
# Load the preprocessed Mirai capture from the dataset directory and show it.
dm_csv_path = f"{data_file_path}mirai_pcap_preprocessed_sdm.csv"
df_dm = pd.read_csv(dm_csv_path)
df_dm

In [11]:
# Drop columns whose name starts with "Unnamed", then replace missing values with 0.
unnamed_mask = df_dm.columns.str.contains("^Unnamed")
df_dm = df_dm.loc[:, ~unnamed_mask].fillna(0)

In [12]:
# Independent copy of the protocol column, taken before the flow/probability steps run.
protocol_series = df_dm.loc[:, 'ip.proto'].copy()

In [None]:
# Passed to Flows as `method`.
flow_method = "time"

# Passed to Probabilities as `bases`, `base` and `method` respectively.
prob_bases = [
    "ip.proto",
    "tcp.dstport",
    "udp.dstport",
]
prob_base = "ip.proto"
prob_method = "bayesian"

In [14]:
# Run the flow-window step. The window length comes from the FLOW_DURATION
# constant rather than a hard-coded literal, so changing the constant actually
# changes the windows.
flows = Flows(df=df_dm, method=flow_method, flow_duration=FLOW_DURATION)
df_dm_flows = flows.main()

# Run the probability step on the flow frame produced above.
probs = Probabilities(
    df_flows=df_dm_flows, bases=prob_bases, base=prob_base, method=prob_method
)
df_dm_probs = probs.main()

In [15]:
# Bookkeeping columns that must not be treated as features.
time_columns = [
    "frame.time_epoch",
    "frame.date_time",
    "flow_window",
]
probability_columns = [
    "dm_prob",
    "dm_prob_softmax",
]
non_feature_columns = [*time_columns, *probability_columns]

In [16]:
def make_ml_dataset_from_dm(dm, feature_name):
    """Flatten the splits returned by ``dm.main()`` and save them as one CSV.

    ``dm.main()`` must return ``(X_train, X_test, y_train, y_test)``. Each
    sample array is reshaped to two dimensions (one row per sample), the labels
    are attached as a ``label`` column, and the train rows followed by the
    test rows are written to ``{data_file_path}DM_{feature_name}.csv`` without
    the index. ``data_file_path`` is read from the notebook namespace.

    Returns nothing; the CSV file is the only result.
    """
    X_train, X_test, y_train, y_test = dm.main()

    def _as_labelled_frame(samples, labels):
        # One flattened row per sample, with the label as the last column.
        flat_samples = pd.DataFrame(samples.reshape(samples.shape[0], -1))
        label_frame = pd.DataFrame(labels, columns=["label"])
        return pd.concat([flat_samples, label_frame], axis=1)

    combined = pd.concat(
        [_as_labelled_frame(X_train, y_train), _as_labelled_frame(X_test, y_test)],
        axis=0,
    )
    output_path = f"{data_file_path}DM_{feature_name}.csv"
    combined.to_csv(output_path, index=False)

In [None]:
# time_columns / probability_columns / non_feature_columns are already defined
# in the earlier column-definition cell; reuse them instead of redefining them.
# The exclusion set is built once rather than once per column.
excluded_columns = set(non_feature_columns) | {'label'}

feature_columns = [col for col in df_dm_probs.columns if col not in excluded_columns]
df_columns = df_dm_probs[feature_columns].columns.to_list()

In [22]:
# Inspect the frame returned by Probabilities.main().
df_dm_probs

Unnamed: 0,frame.time_epoch,frame.len,ip.proto,ip.ttl,ip.len,tcp.srcport,tcp.dstport,tcp.seq,tcp.len,tcp.window_size,...,icmp.type_0.0,icmp.type_3.0,icmp.type_8.0,icmp.code_0.0,arp.opcode_1.0,arp.opcode_2.0,frame.date_time,flow_window,dm_prob,dm_prob_softmax
0,1.540446e+09,60,6.0,255.0,44.0,21074.0,80.0,0.0,0.0,5840.0,...,False,False,False,False,False,False,2018-10-25 13:46:22.933899,1,0.079283,0.019014
1,1.540446e+09,60,6.0,255.0,44.0,20532.0,8280.0,0.0,0.0,5840.0,...,False,False,False,False,False,False,2018-10-25 13:46:22.933904,1,0.079283,0.019014
2,1.540446e+09,86,1.0,64.0,72.0,21074.0,80.0,33948381.0,0.0,5840.0,...,False,True,False,True,False,False,2018-10-25 13:46:22.934426,1,0.202751,0.021513
3,1.540446e+09,86,1.0,64.0,72.0,20532.0,8280.0,30023305.0,0.0,5840.0,...,False,True,False,True,False,False,2018-10-25 13:46:22.934636,1,0.202751,0.021513
4,1.540446e+09,60,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,False,False,False,False,True,False,2018-10-25 13:46:23.291054,1,0.000000,0.017565
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
764132,1.540454e+09,60,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,False,False,False,False,True,False,2018-10-25 15:45:19.837515,7137,0.000000,0.005484
764133,1.540454e+09,60,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,False,False,False,False,True,False,2018-10-25 15:45:19.839396,7137,0.000000,0.005484
764134,1.540454e+09,60,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,False,False,False,False,True,False,2018-10-25 15:45:19.840611,7137,0.000000,0.005484
764135,1.540454e+09,60,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,False,False,False,False,True,False,2018-10-25 15:45:19.842369,7137,0.000000,0.005484


In [None]:
# Persist the frame with flow and probability columns, without the row index.
combined_stats_path = f"{data_file_path}DM_STATS_COMBINED.csv"
df_dm_probs.to_csv(combined_stats_path, index=False)

In [None]:
feature_name = 'mirai'

# perf_counter() is monotonic, so the measured duration cannot be skewed by
# system clock adjustments while the cell runs.
start_time = time.perf_counter()
dm_uniform = DensityMatrix(
    df_probs=df_dm_probs,
    non_feature_columns=non_feature_columns,
    prob_mode="uniform",
    seed=0,
)
# Pass feature_name (not a repeated literal) so the file that is written and
# the file named in the timing log always match.
make_ml_dataset_from_dm(dm_uniform, feature_name)
log_message = f"DM_{feature_name}.csv: {time.perf_counter() - start_time}"
log_to_file(log_message, timing_log_file)

# Stats-based features

In [None]:
# Show every column when frames are rendered from here on.
pd.options.display.max_columns = None

# Work on a separate copy so df_dm_probs itself stays untouched.
df_stats = df_dm_probs.copy(deep=True)
df_stats

In [None]:
def process_group(group):
    """Summarise one group of rows as a flat Series of per-column statistics.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows to summarise. Must contain a ``label`` column. The time columns
        (``frame.time_epoch``, ``frame.date_time``, ``flow_window``) and the
        probability columns (``dm_prob``, ``dm_prob_softmax``) are ignored
        when present.

    Returns
    -------
    pandas.Series
        ``min``, ``max``, ``median``, ``mean`` and ``var`` of every remaining
        column, indexed ``"<column>_<stat>"`` and ordered statistic-major
        (all ``min`` values first, then all ``max`` values, and so on).

    Raises
    ------
    KeyError
        If ``group`` has no ``label`` column.
    """
    time_columns = ["frame.time_epoch", "frame.date_time", "flow_window"]
    probability_columns = ["dm_prob", "dm_prob_softmax"]
    excluded = set(time_columns + probability_columns)

    feature_columns = [col for col in group.columns if col not in excluded]
    features = group[feature_columns].drop(columns=['label'])

    stats = ['min', 'max', 'median', 'mean', 'var']
    stats_df = features.agg(stats)

    # Derive the names from the aggregated frame itself (rows = statistics,
    # columns = features) so each name is guaranteed to line up with the value
    # at the same position of the row-major flattened array.
    col_names = [
        f'{col}_{stat}' for stat in stats_df.index for col in stats_df.columns
    ]

    # The flattened array is used directly; wrapping it in a single-element
    # np.concatenate added nothing but a detour through the numpy wrapper.
    return pd.Series(stats_df.to_numpy().flatten(), index=col_names)


def stats_features(df):
    """Build one row of summary statistics per ``flow_window`` value.

    Rows of ``df`` are grouped by ``flow_window``. ``process_group`` produces
    the statistics for each group, and the ``label`` of the group's first row
    is joined on as a ``label`` column. The result is indexed by
    ``flow_window``.
    """
    def _first_label(window):
        # Label of the first row in the group, wrapped so apply() yields a frame.
        return pd.Series({'label': window['label'].iloc[0]})

    by_window = df.groupby("flow_window")
    window_stats = by_window.apply(process_group)
    window_labels = by_window.apply(_first_label)
    return window_stats.join(window_labels)


# perf_counter() is monotonic, so the measured duration cannot be skewed by
# system clock adjustments while the cell runs.
start_time = time.perf_counter()
# Aggregate from df_dm_probs rather than from df_stats itself: the previous
# `df_stats = stats_features(df_stats)` consumed its own output when the cell
# was run a second time. Columns containing any NaN are dropped afterwards.
df_stats = stats_features(df_dm_probs).dropna(axis=1)
end_time = time.perf_counter()

# NOTE(review): the logged name differs from the file written below
# (mirai_stats_preprocessed.csv); left unchanged in case the log is parsed elsewhere.
log_message = f"ids_mirai_stats.csv: {end_time - start_time}"
log_to_file(log_message, timing_log_file)

# index=False: the flow_window index is not written to the CSV.
df_stats.to_csv(f'{data_file_path}mirai_stats_preprocessed.csv', index=False)