In [18]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import random
import os
from tqdm import tqdm
from sklearn.model_selection import train_test_split

In [3]:
def read_all_csv_files(directory_path, features):
    # Initialize an empty list to store dataframes from CSV files
    dataframes = []

    # Get a list of all files in the directory
    file_list = os.listdir(directory_path)

    # Loop through each file and check if it's a CSV file
    for file_number, file_name in enumerate(file_list):
        if file_name.endswith('.csv'):
            # Get the full file path
            file_path = os.path.join(directory_path, file_name)
            # Read the CSV file into a pandas DataFrame
            df = pd.read_csv(file_path)
            # Remove leading and trailing spaces from column names
            df.columns = df.columns.str.strip()
            # Append the DataFrame to the list
            df['CSV_File_Number'] = file_number
            dataframes.append(df[features])

    # Merge all DataFrames into a single DataFrame
    merged_df = pd.concat(dataframes, ignore_index=True)
    return merged_df

In [4]:
def sanitize_data_frames_updated(dataframe, remove_infinity=True, remove_null=True):

    if remove_infinity:
        numeric_cols = dataframe.select_dtypes(include=[np.number]).columns
        infinite_counts = dataframe[numeric_cols].applymap(np.isinf).sum()
        for col, count in infinite_counts.items():
            if count != 0:
                dataframe = dataframe[~np.isinf(dataframe[col])]

    if remove_null:
        null_counts = dataframe.isnull().sum()
        for col, count in null_counts.items():
            if count != 0:
                    dataframe = dataframe.dropna(subset=[col])
    print("Sanitized Row Count:", dataframe.shape[0])    
    return dataframe

In [5]:
#all dataset sources to make iterate to read csv files
dataset_sources = {

    ## Benign Traffic      
    'client_1': {
        'benign': './row_data/client_1/benign',
        'attack': './row_data/client_1/attack',
    },
    'client_2': {
        'benign': './row_data/client_2/benign',
        'attack': './row_data/client_2/attack',
    },
    'client_3': {
        'benign': './row_data/client_3/benign',
        'attack': './row_data/client_3/attack',
    },
    'client_4': {
        'benign': './row_data/client_4/benign',
        'attack': './row_data/client_4/attack',
    },

}

In [13]:
# ##
# def get_uniform_sample(df, group_col, sample_size):
#       # Shuffle the data within each label based on File Number/Index
#     df = df.groupby(group_col).apply(lambda x: x.sample(frac=1)).reset_index(drop=True)

#     # Randomize the whole data again
#     df = df.sample(frac=1).reset_index(drop=True)

#     # Grab a sample ensuring uniform distribution of labels and File Number/Index
#     min_count = min(df['Label'].value_counts().min(), sample_size // 2)
#     sample_df = pd.concat([df[df['Label'] == 0].groupby('CSV_File_Number').apply(lambda x: x.sample(min_count // len(x['CSV_File_Number'].unique()), random_state=42)).reset_index(drop=True),
#                            [df['Label'] == 1].groupby('CSV_File_Number').apply(lambda x: x.sample(min_count // len(x['CSV_File_Number'].unique()), random_state=42)).reset_index(drop=True)])
    
#     return sample_df.sample(frac=1).reset_index(drop=True)

def get_uniform_sample(df, group_col, sample_size):
    # Determine the minimum count available in any label group
    min_count = min(df['Label'].value_counts().min(), sample_size // 2)

    # Sample from each group while respecting the available counts
    sampled_dfs = []
    for label in df['Label'].unique():
        group = df[df['Label'] == label]
        sampled_dfs.append(
            group.groupby('CSV_File_Number').apply(
                lambda x: x.sample(
                    min(min_count // len(group['CSV_File_Number'].unique()), len(x)),
                    random_state=42
                )
            ).reset_index(drop=True)
        )

    sample_df = pd.concat(sampled_dfs, ignore_index=True)
    
    return sample_df.sample(frac=1).reset_index(drop=True)

In [25]:
##Reading all the data
features = ['Bwd IAT Std', 'Bwd IAT Max', 'Fwd Packet Length Std', 'Bwd IAT Total', 'Fwd Packet Length Min', 'PSH Flag Count',
            'Packet Length Variance', 'Fwd Packet Length Max', 'Down/Up Ratio', 'Bwd IAT Mean', 'FIN Flag Count', 'Packet Length Min', 
            'Active Std', 'Bwd Packet Length Min', 'Bwd IAT Min', 'FWD Init Win Bytes', 'URG Flag Count', 'Fwd IAT Total', 'Fwd IAT Std', 
            'ACK Flag Count', 'Flow IAT Mean', 'Flow IAT Min', 'Active Min', 'Bwd Packet Length Std', 'Packet Length Max', 'Active Mean', 
            'Bwd Packet Length Max', 'Idle Std', 'Active Max', 'Flow IAT Max', 'Label', 'CSV_File_Number']


for client, data in tqdm(dataset_sources.items(), total=len(dataset_sources)):
    locals()[client] = {}

for client, data in tqdm(dataset_sources.items(), total=len(dataset_sources)):
    client_dataframe = []
    for type, path in data.items():
        print(f' Loading....Clinet = {client}, type = {type}')
        dataframe = read_all_csv_files(path, features)
        dataframe = sanitize_data_frames_updated(dataframe)
        if type == 'benign':
            dataframe['Label'] = 0
        else:
             dataframe['Label'] = 1
        client_dataframe.append(dataframe)
        #locals()[client][type] = dataframe
    client_merged_df = get_uniform_sample(pd.concat(client_dataframe, ignore_index=True), ['Label', 'CSV_File_Number'], 420000 )
    client_merged_df.to_csv(f'./dataset/{client}/dataset.csv', index=False)

    #Remove unwanted column
    client_merged_df =  client_merged_df.drop(['CSV_File_Number'], axis=1)
    train_size = 0.8  # 80% for training, 20% for testing
    train_df, test_df = train_test_split(client_merged_df, train_size=train_size, random_state=42, stratify=client_merged_df['Label'])
    train_df.to_csv(f'./dataset/{client}/train/{client}_train.csv', index=False)
    test_df.to_csv(f'./dataset/{client}/test/{client}_test.csv', index=False)
    
    


    

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 21426.84it/s]
  0%|                                                                                                                       | 0/4 [00:00<?, ?it/s]

 Loading....Clinet = client_1, type = benign


  infinite_counts = dataframe[numeric_cols].applymap(np.isinf).sum()


Sanitized Row Count: 202821
 Loading....Clinet = client_1, type = attack


  infinite_counts = dataframe[numeric_cols].applymap(np.isinf).sum()


Sanitized Row Count: 2671528


  group.groupby('CSV_File_Number').apply(
  group.groupby('CSV_File_Number').apply(
 25%|███████████████████████████▊                                                                                   | 1/4 [01:33<04:39, 93.13s/it]

 Loading....Clinet = client_2, type = benign


  infinite_counts = dataframe[numeric_cols].applymap(np.isinf).sum()


Sanitized Row Count: 185943
 Loading....Clinet = client_2, type = attack


  infinite_counts = dataframe[numeric_cols].applymap(np.isinf).sum()


Sanitized Row Count: 3346296


  group.groupby('CSV_File_Number').apply(
  group.groupby('CSV_File_Number').apply(
 50%|███████████████████████████████████████████████████████                                                       | 2/4 [03:21<03:24, 102.10s/it]

 Loading....Clinet = client_3, type = benign


  infinite_counts = dataframe[numeric_cols].applymap(np.isinf).sum()


Sanitized Row Count: 223200
 Loading....Clinet = client_3, type = attack


  infinite_counts = dataframe[numeric_cols].applymap(np.isinf).sum()


Sanitized Row Count: 2348465


  group.groupby('CSV_File_Number').apply(
  group.groupby('CSV_File_Number').apply(
 75%|███████████████████████████████████████████████████████████████████████████████████▎                           | 3/4 [04:43<01:32, 92.91s/it]

 Loading....Clinet = client_4, type = benign


  infinite_counts = dataframe[numeric_cols].applymap(np.isinf).sum()


Sanitized Row Count: 246797
 Loading....Clinet = client_4, type = attack


  infinite_counts = dataframe[numeric_cols].applymap(np.isinf).sum()


Sanitized Row Count: 2253978


  group.groupby('CSV_File_Number').apply(
  group.groupby('CSV_File_Number').apply(
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [06:01<00:00, 90.43s/it]
