In [1]:
import pandas as pd
import glob
import ast 
import os


In [2]:
# Root of the CSV export, appended to the current working directory by the loading cell.
CSV_DIRECTORY = "/Data/CICDataset/CICBellEXFDNS2021/Dataset/CSV"

# Sub-directories of CSV_DIRECTORY, one per capture scenario.
BENIGN_DIRECTORY = "/Benign"
LIGHT_ATTACK_DIRECTORY = "/Attack_Light_Benign/Attacks"
LIGHT_BENIGN_DIRECTORY = "/Attack_Light_Benign/Benign"
HEAVY_ATTACK_DIRECTORY = "/Attack_Heavy_Benign/Attacks"
HEAVY_BENIGN_DIRECTORY = "/Attack_Heavy_Benign/Benign"

# File-name prefixes of the two feature families.
STATEFUL_PREFIX = "/stateful_features-"
STATELESS_PREFIX = "/stateless_features-"

# Category names looked up in attack file names to label each file.
DATA_TYPE_CATEGORIES = ["audio", "compressed", "exe", "image", "text", "video"]

# Scenario -> sub-directory lookup.
directories = {
    "benign": BENIGN_DIRECTORY,
    "light": {
        "attack": LIGHT_ATTACK_DIRECTORY,
        "benign": LIGHT_BENIGN_DIRECTORY,
    },
    "heavy": {
        "attack": HEAVY_ATTACK_DIRECTORY,
        "benign": HEAVY_BENIGN_DIRECTORY,
    },
}

In [3]:
# The dataset is expected below the directory the notebook is started from.
path = os.getcwd()


def _list_csv_files(sub_directory):
    """Return the sorted paths of all ``*.csv`` files in one dataset sub-directory.

    ``glob.glob`` yields files in arbitrary, OS-dependent order; sorting keeps
    the row order of the concatenated frames (and of the exported CSVs)
    identical from run to run and machine to machine.
    """
    pattern = os.path.join(path + CSV_DIRECTORY + sub_directory, "*.csv")
    return sorted(glob.glob(pattern))


benign_csv_files = _list_csv_files(BENIGN_DIRECTORY)
light_attack_csv_files = _list_csv_files(LIGHT_ATTACK_DIRECTORY)
light_benign_csv_files = _list_csv_files(LIGHT_BENIGN_DIRECTORY)
heavy_attack_csv_files = _list_csv_files(HEAVY_ATTACK_DIRECTORY)
heavy_benign_csv_files = _list_csv_files(HEAVY_BENIGN_DIRECTORY)


In [4]:
# Data for the ensemble model: every attack CSV path, light captures first, then heavy.

all_attack_files = [*light_attack_csv_files, *heavy_attack_csv_files]

len(all_attack_files)

24

In [5]:
def read_benign_files(benign_csv_files):
    """Load benign feature CSVs and split them into stateless and stateful frames.

    Each file is read with ``pd.read_csv`` and tagged with four extra columns:

    * ``original_index`` - the row's position inside its source file,
    * ``data_type`` - always ``"benign"``,
    * ``attack`` - always ``0``,
    * ``origin`` - ``"light-benign"``, ``"heavy-benign"`` or ``"only-benign"``.

    The ``light`` / ``heavy`` / ``stateless`` markers are searched in the file
    *name* only, so a parent directory that happens to contain one of those
    words cannot mislabel or misroute a file.

    Parameters
    ----------
    benign_csv_files : iterable of str
        Paths of the CSV files to load.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(benign_stateless_df, benign_stateful_df)``. Files whose name contains
        ``"stateless"`` go into the first frame, all other files into the second.

    Raises
    ------
    ValueError
        If the input yields no stateless file or no stateful file.
    """
    stateless_df_list = []
    stateful_df_list = []
    for f in benign_csv_files:
        # Only the file name carries the markers; the directory part is ignored.
        file_name = os.path.basename(f)
        if "light" in file_name:
            origin = "light-benign"
        elif "heavy" in file_name:
            origin = "heavy-benign"
        else:
            origin = "only-benign"

        df = pd.read_csv(f)
        df["original_index"] = df.index
        df["data_type"] = "benign"
        df["attack"] = 0
        df["origin"] = origin

        if "stateless" in file_name:
            stateless_df_list.append(df)
        else:
            stateful_df_list.append(df)

    # pd.concat would fail with an opaque "No objects to concatenate" message.
    if not stateless_df_list or not stateful_df_list:
        raise ValueError(
            "read_benign_files needs at least one stateless and one stateful CSV file; "
            f"got {len(stateless_df_list)} stateless and {len(stateful_df_list)} stateful."
        )

    benign_stateless_df = pd.concat(stateless_df_list)
    benign_stateful_df = pd.concat(stateful_df_list)
    return benign_stateless_df, benign_stateful_df

In [6]:
# One (stateless, stateful) pair of frames per benign capture scenario.
benign_stateless_df, benign_stateful_df = read_benign_files(
    benign_csv_files
)
light_benign_stateless_df, light_benign_stateful_df = read_benign_files(
    light_benign_csv_files
)
heavy_benign_stateless_df, heavy_benign_stateful_df = read_benign_files(
    heavy_benign_csv_files
)

In [7]:
def read_attack_files(attack_csv_files, data_type_categories=None):
    """Load attack feature CSVs and split them into stateless and stateful frames.

    A file is used only if its name contains ``"stateless"`` or ``"stateful"``
    and one of the data-type categories; any other file is skipped. Each loaded
    frame is tagged with four extra columns, appended in the same order for
    both kinds of file:

    * ``data_type`` - the first category found in the file name,
    * ``original_index`` - the row's position inside its source file,
    * ``attack`` - always ``1``,
    * ``origin`` - ``"heavy-attack"`` if the file name contains ``"heavy"``,
      otherwise ``"light-attack"``.

    All markers are searched in the file *name* only. Matching against the full
    path would let a parent directory containing e.g. ``"text"`` or ``"exe"``
    make one file match several categories and be loaded more than once.

    Parameters
    ----------
    attack_csv_files : iterable of str
        Paths of the CSV files to load.
    data_type_categories : sequence of str, optional
        Category names to look for. Defaults to the notebook-level
        ``DATA_TYPE_CATEGORIES``.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(attack_stateless_df, attack_stateful_df)``.

    Raises
    ------
    ValueError
        If the input yields no usable stateless file or no usable stateful file.
    """
    if data_type_categories is None:
        data_type_categories = DATA_TYPE_CATEGORIES

    stateless_df_list = []
    stateful_df_list = []
    for f in attack_csv_files:
        # Only the file name carries the markers; the directory part is ignored.
        file_name = os.path.basename(f)
        if "stateless" in file_name:
            target_list = stateless_df_list
        elif "stateful" in file_name:
            target_list = stateful_df_list
        else:
            continue

        # First matching category wins, so each file is loaded at most once.
        data_type = next(
            (category for category in data_type_categories if category in file_name),
            None,
        )
        if data_type is None:
            continue

        df = pd.read_csv(f)
        df["data_type"] = data_type
        df["original_index"] = df.index
        df["attack"] = 1
        df["origin"] = "heavy-attack" if "heavy" in file_name else "light-attack"
        target_list.append(df)

    # pd.concat would fail with an opaque "No objects to concatenate" message.
    if not stateless_df_list or not stateful_df_list:
        raise ValueError(
            "read_attack_files needs at least one stateless and one stateful CSV file "
            "with a known data type; "
            f"got {len(stateless_df_list)} stateless and {len(stateful_df_list)} stateful."
        )

    attack_stateless_df = pd.concat(stateless_df_list)
    attack_stateful_df = pd.concat(stateful_df_list)

    return attack_stateless_df, attack_stateful_df

In [8]:
# One (stateless, stateful) pair of frames per attack intensity.
light_attack_stateless_df, light_attack_stateful_df = read_attack_files(
    light_attack_csv_files
)
heavy_attack_stateless_df, heavy_attack_stateful_df = read_attack_files(
    heavy_attack_csv_files
)

In [9]:
# (rows, columns) of every stateful frame, as a sanity check before merging.
(
    heavy_benign_stateful_df.shape,
    light_benign_stateful_df.shape,
    benign_stateful_df.shape,
    heavy_attack_stateful_df.shape,
    light_attack_stateful_df.shape,
)

((69016, 31), (22768, 31), (86998, 31), (72028, 31), (11295, 31))

In [10]:
# (rows, columns) of every stateless frame, as a sanity check before merging.
(
    heavy_benign_stateless_df.shape,
    light_benign_stateless_df.shape,
    benign_stateless_df.shape,
    heavy_attack_stateless_df.shape,
    light_attack_stateless_df.shape,
)

((181694, 19), (60091, 19), (221073, 19), (251670, 19), (42683, 19))

# Prepare the combined dataframes

In [11]:
# Stack the frames of each feature family: benign scenarios first, then attacks.
stateless_df = pd.concat(
    [
        benign_stateless_df,
        light_benign_stateless_df,
        heavy_benign_stateless_df,
        light_attack_stateless_df,
        heavy_attack_stateless_df,
    ]
)
stateful_df = pd.concat(
    [
        benign_stateful_df,
        light_benign_stateful_df,
        heavy_benign_stateful_df,
        light_attack_stateful_df,
        heavy_attack_stateful_df,
    ]
)

In [13]:
# to_csv does not create missing folders, so make sure the output folder
# (relative to the current working directory) exists before writing.
os.makedirs("Data/Prepared Data", exist_ok=True)

# index=False: the row index is not written; the source-file position of each
# row is already kept in the "original_index" column.
stateful_df.to_csv("Data/Prepared Data/stateful.csv", index=False)
stateless_df.to_csv("Data/Prepared Data/stateless.csv", index=False)