In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import random
import os
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pickle

In [2]:
def read_all_csv_files(directory_path, features=None):
    """Load every ``.csv`` file in ``directory_path`` into one DataFrame.

    Column names are stripped of leading/trailing whitespace, and each row is
    tagged with a ``CSV_File_Number`` column identifying its source file.

    Parameters
    ----------
    directory_path : str
        Directory whose ``.csv`` files are read (not recursive).
    features : list, optional
        Currently unused: every column is kept because the notebook performs
        feature analysis. Retained so existing call sites keep working.

    Returns
    -------
    pandas.DataFrame
        Row-wise concatenation of all CSV files with a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If the directory contains no ``.csv`` files.
    """
    # os.listdir() returns entries in arbitrary order; sorting makes the
    # CSV_File_Number assignment reproducible across runs and machines.
    file_list = sorted(os.listdir(directory_path))

    dataframes = []
    # file_number is the position in the sorted listing, so non-CSV entries
    # still consume a number (same numbering scheme as before, now stable).
    for file_number, file_name in enumerate(file_list):
        if not file_name.endswith('.csv'):
            continue
        df = pd.read_csv(os.path.join(directory_path, file_name))
        # Remove leading and trailing spaces from column names
        df.columns = df.columns.str.strip()
        # Remember which file each row came from
        df['CSV_File_Number'] = file_number
        dataframes.append(df)

    if not dataframes:
        # pd.concat([]) would raise an opaque "No objects to concatenate"
        raise ValueError(f"No CSV files found in directory: {directory_path!r}")

    # Merge all DataFrames into a single DataFrame
    return pd.concat(dataframes, ignore_index=True)

In [3]:
def sanitize_data_frames_updated(dataframe, remove_infinity=True, remove_null=True):
    """Drop rows that contain infinite or missing values.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to clean. It is not modified; a filtered frame is returned
        (the same object is returned when nothing needs removing).
    remove_infinity : bool, default True
        Drop rows holding +/-inf in any numeric column.
    remove_null : bool, default True
        Drop rows holding a null/NaN value in any column.

    Returns
    -------
    pandas.DataFrame
        The rows that survived the requested filters, original index kept.
    """
    if remove_infinity:
        numeric_cols = dataframe.select_dtypes(include=[np.number]).columns
        if len(numeric_cols) > 0:
            # One vectorised pass instead of the deprecated, element-wise
            # DataFrame.applymap(np.isinf) followed by per-column filtering.
            has_infinite = np.isinf(dataframe[numeric_cols]).any(axis=1)
            if has_infinite.any():
                dataframe = dataframe[~has_infinite]

    if remove_null:
        # Equivalent to dropping nulls column by column: a row survives only
        # if it has no null in any column.
        has_null = dataframe.isnull().any(axis=1)
        if has_null.any():
            dataframe = dataframe[~has_null]

    print("Sanitized Row Count:", dataframe.shape[0])
    return dataframe

In [28]:
# Directories to scan for CSV files, keyed by client and then by traffic type
# ('benign' / 'attack'). Every client follows the same layout:
#     ../row_data/<client>/<traffic type>
# client_1 and client_2 are currently excluded; add them to the client tuple
# below to process them as well.
dataset_sources = {
    client_name: {
        traffic_kind: f'../row_data/{client_name}/{traffic_kind}'
        for traffic_kind in ('benign', 'attack')
    }
    for client_name in ('client_3', 'client_4')
}

In [30]:
#get Uniform Sample
def _sample_label_subset(subset, n_rows, stratified):
    """Return ``n_rows`` rows of ``subset`` (all of it if it has no more than that)."""
    if n_rows >= len(subset):
        # Nothing to sample; also avoids train_test_split rejecting a full-size split
        return subset.copy()
    # An absolute row count is passed as train_size: a fraction is floored
    # by scikit-learn and floating-point error can make it one row short.
    sampled, _ = train_test_split(
        subset,
        train_size=n_rows,
        stratify=subset['CSV_File_Number'] if stratified else None,
        random_state=42,
    )
    return sampled


def get_uniform_sample(df, group_col=None, sample_size=420000):
    """Draw a label-balanced, shuffled sample of roughly ``sample_size`` rows.

    Each label (0 and 1) is given half of ``sample_size``. When one label has
    fewer rows than its half, all of its rows are used and the other label
    fills the remainder, as far as it has rows available.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``Label`` column with values 0/1, and a
        ``CSV_File_Number`` column when stratification is requested.
    group_col : list, optional
        Acts as a flag only: when non-empty, each label's sample is stratified
        by ``CSV_File_Number``. The column names listed are not otherwise used.
    sample_size : int, default 420000
        Target number of rows in the returned sample.

    Returns
    -------
    pandas.DataFrame
        The sampled rows of both labels, shuffled with a fixed seed and
        re-indexed from 0.
    """
    # Separate the labels
    label_0 = df[df['Label'] == 0]
    label_1 = df[df['Label'] == 1]

    # Start from an even split, clamped to what each label can supply ...
    half = sample_size // 2
    count_0 = min(len(label_0), half)
    count_1 = min(len(label_1), half)
    # ... then let the larger label absorb the other label's shortfall.
    if count_0 < half:
        count_1 = min(len(label_1), sample_size - count_0)
    elif count_1 < half:
        count_0 = min(len(label_0), sample_size - count_1)

    print(f"count_0: {count_0}; count_1: {count_1}")
    print(f"label_0: {len(label_0)}; label_1: {len(label_1)}")

    # Fractions are reported for information only; sampling uses the counts.
    train_size_0 = 1.0 if count_0 >= len(label_0) else float(count_0/len(label_0))
    train_size_1 = 1.0 if count_1 >= len(label_1) else float(count_1/len(label_1))

    print(f"train_size_0: {train_size_0}; train_size_1: {train_size_1}")

    stratified = bool(group_col)
    sample_0 = _sample_label_subset(label_0, count_0, stratified)
    sample_1 = _sample_label_subset(label_1, count_1, stratified)

    # Concatenate and shuffle the samples
    combined_sample = pd.concat([sample_0, sample_1])
    shuffled_sample = combined_sample.sample(frac=1, random_state=42).reset_index(drop=True)

    return shuffled_sample

In [31]:
## Read, sanitise, label and balance each client's data, then export one CSV per client.
features = None  # passed through to read_all_csv_files

# Create an empty dict named after each client in the notebook namespace.
# NOTE(review): nothing in this cell reads these placeholders (the line that
# used to fill them is disabled); kept in case a later cell relies on them.
for client in dataset_sources:
    globals()[client] = {}

# to_csv() does not create missing directories, so make sure the target exists.
os.makedirs('./dataset', exist_ok=True)

for client, data in tqdm(dataset_sources.items(), total=len(dataset_sources)):
    client_dataframe = []
    # The loop variable is deliberately not called `type`: at notebook scope
    # that would shadow the builtin for every cell executed afterwards.
    for traffic_type, path in data.items():
        print(f' Loading....Clinet = {client}, type = {traffic_type}')
        dataframe = read_all_csv_files(path, features)
        dataframe = sanitize_data_frames_updated(dataframe)
        # Benign traffic is labelled 0, everything else (attack) is labelled 1.
        dataframe['Label'] = 0 if traffic_type == 'benign' else 1
        client_dataframe.append(dataframe)
    # Balance the two labels (stratified by source file) and write the result.
    client_merged_df = get_uniform_sample(pd.concat(client_dataframe, ignore_index=True), ['Label', 'CSV_File_Number'], 420000 )
    client_merged_df.to_csv(f'./dataset/{client}_dataset.csv', index=False)
    

100%|██████████████████████████████████████████| 2/2 [00:00<00:00, 18517.90it/s]
  0%|                                                     | 0/2 [00:00<?, ?it/s]

 Loading....Clinet = client_3, type = benign


  infinite_counts = dataframe[numeric_cols].applymap(np.isinf).sum()


Sanitized Row Count: 223200
 Loading....Clinet = client_3, type = attack


  infinite_counts = dataframe[numeric_cols].applymap(np.isinf).sum()


Sanitized Row Count: 2348465
count_0: 210000; count_1: 210000
label_0: 223200; label_1: 2348465
train_size_0: 0.9408602150537635; train_size_1: 0.08942011058287008


 50%|██████████████████████                      | 1/2 [03:24<03:24, 204.86s/it]

 Loading....Clinet = client_4, type = benign


  infinite_counts = dataframe[numeric_cols].applymap(np.isinf).sum()


Sanitized Row Count: 246797
 Loading....Clinet = client_4, type = attack


  infinite_counts = dataframe[numeric_cols].applymap(np.isinf).sum()


Sanitized Row Count: 2253978
count_0: 210000; count_1: 210000
label_0: 246797; label_1: 2253978
train_size_0: 0.850901753262803; train_size_1: 0.09316861122868103


100%|████████████████████████████████████████████| 2/2 [06:50<00:00, 205.33s/it]
