In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import random
import os
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pickle

In [2]:
def read_all_csv_files(directory_path, features=None):
    # Initialize an empty list to store dataframes from CSV files
    dataframes = []

    # Get a list of all files in the directory
    file_list = os.listdir(directory_path)

    # Loop through each file and check if it's a CSV file
    for file_number, file_name in enumerate(file_list):
        if file_name.endswith('.csv'):
            # Get the full file path
            file_path = os.path.join(directory_path, file_name)
            # Read the CSV file into a pandas DataFrame
            df = pd.read_csv(file_path)
            # Remove leading and trailing spaces from column names
            df.columns = df.columns.str.strip()
            # Append the DataFrame to the list
            df['CSV_File_Number'] = file_number
            #dataframes.append(df[features])
            dataframes.append(df) #as it is feature analysis, we are taking all the features

    # Merge all DataFrames into a single DataFrame
    merged_df = pd.concat(dataframes, ignore_index=True)
    return merged_df

In [3]:
def sanitize_data_frames_updated(dataframe, remove_infinity=True, remove_null=True):

    if remove_infinity:
        numeric_cols = dataframe.select_dtypes(include=[np.number]).columns
        infinite_counts = dataframe[numeric_cols].applymap(np.isinf).sum()
        for col, count in infinite_counts.items():
            if count != 0:
                dataframe = dataframe[~np.isinf(dataframe[col])]

    if remove_null:
        null_counts = dataframe.isnull().sum()
        for col, count in null_counts.items():
            if count != 0:
                    dataframe = dataframe.dropna(subset=[col])
    print("Sanitized Row Count:", dataframe.shape[0])    
    return dataframe

In [4]:
#all dataset sources to make iterate to read csv files
dataset_sources = {

    ## Benign Traffic      
    'client_1': {
        'benign': '../row_data/client_1/benign',
        'attack': '../row_data/client_1/attack',
    },
    'client_2': {
        'benign': '../row_data/client_2/benign',
        'attack': '../row_data/client_2/attack',
    },
    'client_3': {
        'benign': '../row_data/client_3/benign',
        'attack': '../row_data/client_3/attack',
    },
    'client_4': {
        'benign': '../row_data/client_4/benign',
        'attack': '../row_data/client_4/attack',
    },

}

In [5]:
#get Uniform Sample
def get_uniform_sample(df, group_col=[], sample_size=42000):
    
    #Separate the labels
    label_0 = df[df['Label'] == 0]
    label_1 = df[df['Label'] == 1]

    count_0 = count_1 = 0 # Initialize with 0

    if len(label_0) >= sample_size//2 and len(label_1) >= sample_size//2:
        count_0 = count_1 = sample_size//2,
    elif len(label_0) < sample_size//2 and len(label_1) > sample_size//2:
        count_0 = len(label_0)
        count_1 = sample_size - count_0
    elif len(label_0) > sample_size//2 and len(label_1) < sample_size//2:
        count_1 = len(label_1)
        count_0 = sample_size - count_1
    else:
        count_0 = len(label_0)
        count_1 = len(label_1)

    sample_0, _ = train_test_split(
        label_0,
        train_size=count_0,
        stratify=df['CSV_File_Number'],
        random_state=42
    )
    
    sample_1, _ = train_test_split(
        label_1,
        train_size=count_1,
        stratify=df['CSV_File_Number'],
        random_state=42
    )        
    
    # Concatenate and shuffle the samples
    combined_sample = pd.concat([sample_0, sample_1])
    shuffled_sample = combined_sample.sample(frac=1, random_state=42).reset_index(drop=True)

    return shuffled_sample

In [19]:
##Reading all the data
features = None


for client, data in tqdm(dataset_sources.items(), total=len(dataset_sources)):
    locals()[client] = {}

for client, data in tqdm(dataset_sources.items(), total=len(dataset_sources)):
    client_dataframe = []
    for type, path in data.items():
        print(f' Loading....Clinet = {client}, type = {type}')
        dataframe = read_all_csv_files(path, features)
        dataframe = sanitize_data_frames_updated(dataframe)
        if type == 'benign':
            dataframe['Label'] = 0
        else:
             dataframe['Label'] = 1
        client_dataframe.append(dataframe)
        #locals()[client][type] = dataframe
    client_merged_df = get_uniform_sample(pd.concat(client_dataframe, ignore_index=True), ['Label', 'CSV_File_Number'], 420000 )
    client_merged_df.to_csv(f'./dataset/{client}/dataset.csv', index=False)

    #Remove unwanted column
    client_merged_df =  client_merged_df.drop(['CSV_File_Number'], axis=1)
    train_size = 0.8  # 80% for training, 20% for testing
    train_df, test_df = train_test_split(client_merged_df, train_size=train_size, random_state=42, stratify=client_merged_df['Label'])
   


    scaler_path = './scalers/{client}.pkl'

    train_df = scale_dataset(train_df, 'train', scaler_path)    
    train_df.to_csv(f'./dataset/{client}/train/{client}_train.csv', index=False)

    test_df = scale_dataset(test_df, 'test', scaler_path)
    test_df.to_csv(f'./dataset/{client}/test/{client}_test.csv', index=False)    
   
    


    

100%|████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 23464.64it/s]
  0%|                                                                                                   | 0/4 [00:00<?, ?it/s]

 Loading....Clinet = client_1, type = benign


  infinite_counts = dataframe[numeric_cols].applymap(np.isinf).sum()


Sanitized Row Count: 202821
 Loading....Clinet = client_1, type = attack


  infinite_counts = dataframe[numeric_cols].applymap(np.isinf).sum()


Sanitized Row Count: 2671528


  group.groupby('CSV_File_Number').apply(
  group.groupby('CSV_File_Number').apply(
 25%|██████████████████████▊                                                                    | 1/4 [01:35<04:45, 95.06s/it]

 Loading....Clinet = client_2, type = benign


  infinite_counts = dataframe[numeric_cols].applymap(np.isinf).sum()


Sanitized Row Count: 185943
 Loading....Clinet = client_2, type = attack


  infinite_counts = dataframe[numeric_cols].applymap(np.isinf).sum()


Sanitized Row Count: 3346296


  group.groupby('CSV_File_Number').apply(
  group.groupby('CSV_File_Number').apply(
 50%|█████████████████████████████████████████████                                             | 2/4 [03:38<03:43, 111.83s/it]

 Loading....Clinet = client_3, type = benign


  infinite_counts = dataframe[numeric_cols].applymap(np.isinf).sum()


Sanitized Row Count: 223200
 Loading....Clinet = client_3, type = attack


  infinite_counts = dataframe[numeric_cols].applymap(np.isinf).sum()


Sanitized Row Count: 2348465


  group.groupby('CSV_File_Number').apply(
  group.groupby('CSV_File_Number').apply(
 75%|███████████████████████████████████████████████████████████████████▌                      | 3/4 [05:15<01:44, 104.93s/it]

 Loading....Clinet = client_4, type = benign


  infinite_counts = dataframe[numeric_cols].applymap(np.isinf).sum()


Sanitized Row Count: 246797
 Loading....Clinet = client_4, type = attack


  infinite_counts = dataframe[numeric_cols].applymap(np.isinf).sum()


Sanitized Row Count: 2253978


  group.groupby('CSV_File_Number').apply(
  group.groupby('CSV_File_Number').apply(
100%|██████████████████████████████████████████████████████████████████████████████████████████| 4/4 [06:46<00:00, 101.52s/it]


In [39]:
def inspect_dataset(file_path):
    df = pd.read_csv(file_path)

    # Step 2: Create a pivot table
    pivot_table = pd.pivot_table(df, values='PSH Flag Count',  # Replace 'AnyValueColumn' with the column you want to aggregate
                                 index='CSV_File_Number',
                                 columns='Label',
                                 aggfunc='count')  # 'count' or any other function depending on your needs
    
   # Step 3: Calculate total records based on Label
    label_totals = df['Label'].value_counts()
    
    # Step 4: Display the pivot table and the label totals
    print("Pivot Table:")
    print(pivot_table)
    print("\nTotal Records by Label:")
    print(label_totals)
    

In [40]:
#all dataset sources to make iterate to read csv files
dataset_sources = {

    ## Benign Traffic      
    'client_1': './dataset/client_1/dataset.csv',
    'client_2': './dataset/client_2/dataset.csv',
    'client_3': './dataset/client_3/dataset.csv',
    'client_4': './dataset/client_4/dataset.csv',
    

}

for key, data_source in dataset_sources.items():
    print(key)
    inspect_dataset(data_source)
    print("___________________________________________________________________")
    

client_1
Pivot Table:
Label                  0        1
CSV_File_Number                  
0                50705.0      4.0
1                50705.0  20282.0
2                    3.0  20282.0
3                48231.0      NaN
4                    NaN  20282.0
5                    NaN  20282.0
6                    NaN  20282.0
7                    NaN  20282.0
8                    NaN  20282.0
9                    NaN  20282.0
10                   NaN  20282.0

Total Records by Label:
Label
1    182542
0    149644
Name: count, dtype: int64
___________________________________________________________________
client_2
Pivot Table:
Label                  0        1
CSV_File_Number                  
0                46485.0  18594.0
1                    3.0  18594.0
2                46485.0  18594.0
3                23669.0  18594.0
4                    NaN  18594.0
5                    NaN  18594.0
6                    NaN  18594.0
7                    NaN  18594.0
8                    NaN 