In [56]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import random
import os
from tqdm import tqdm
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import StandardScaler
import numpy as np
import pickle

Utilities Functions

In [65]:
FEATURES = ['Flow Duration' ,'Total Fwd Packet' ,'Total Bwd packets' ,'Total Length of Fwd Packet' ,'Total Length of Bwd Packet' ,'Fwd Packet Length Max' ,'Fwd Packet Length Min' ,'Fwd Packet Length Std' ,'Bwd Packet Length Max' ,'Bwd Packet Length Min' ,'Bwd Packet Length Std' ,'Flow IAT Mean' ,'Flow IAT Std' ,'Flow IAT Max' ,'Flow IAT Min' ,'Fwd IAT Total' ,'Fwd IAT Std' ,'Fwd IAT Max' ,'Fwd IAT Min' ,'Bwd IAT Total' ,'Bwd IAT Std' ,'Bwd IAT Max' ,'Bwd IAT Min' ,'Fwd Header Length' ,'Bwd Header Length' ,'Packet Length Min' ,'Packet Length Max' ,'Packet Length Std' ,'Packet Length Variance' ,'FWD Init Win Bytes' ,'Bwd Init Win Bytes' ,'Fwd Act Data Pkts' ,'Fwd Seg Size Min' ,'Active Mean' ,'Active Std' ,'Active Max' ,'Active Min' ,'Idle Mean' ,'Idle Std' ,'Idle Max' ,'Idle Min', 'Label', 'Type']

In [72]:
def sanitize_data_frame(dataframe, remove_infinity=True, remove_null=True):
    """
    Cleans a given DataFrame by removing infinite and/or null values without removing columns.
    """

    original_size = dataframe.shape[0]

    print("Before sanitization")
    print(dataframe.shape)

    # Remove infinite values (keep all columns)
    if remove_infinity:
        numeric_cols = dataframe.select_dtypes(include=[np.number]).columns
        dataframe[numeric_cols] = dataframe[numeric_cols].replace([np.inf, -np.inf], np.nan)

    # Remove null values (drop rows with NaN but keep columns)
    if remove_null:
        dataframe = dataframe.dropna()

    sanitized_size = dataframe.shape[0]

    # print(f"Original Row Count: {original_size}")
    # print(f"Sanitized Row Count: {sanitized_size}")
    # print(f"Rows Removed: {original_size - sanitized_size}")

    # print("After sanitization")
    #print(dataframe.shape)  # 🔹 Now, column count should remain unchanged

    return dataframe

def read_all_csv_files(directory_path, features=None):
    # Initialize an empty list to store dataframes from CSV files
    dataframes = []
    # Get a list of all files in the directory
    file_list = os.listdir(directory_path)

    # Loop through each file and check if it's a CSV file
    for file_number, file_name in enumerate(file_list):
        if file_name.endswith('.csv'):
            file_path = os.path.join(directory_path, file_name)
            df = pd.read_csv(file_path)
            df.columns = df.columns.str.strip()
            df['Type'] = f"benign_{file_number}"
            df['Label'] = 0
            dataframes.append(df[FEATURES]) #as it is feature analysis, we are taking all the features

    # Merge all DataFrames into a single DataFrame
    merged_df = pd.concat(dataframes, ignore_index=True)
    return merged_df


def read_attack_csv_files(attacks, path):
    dataframes = []
    for attack_type, attack in attacks.items():        
        file_path = os.path.join(path, attack)      
        df = pd.read_csv(file_path)
        df.columns = df.columns.str.strip()
        df['Type'] = attack_type
        df['Label'] = 1
        #print(attack_type)
        #print(df.shape)
        dataframes.append(df[FEATURES])

   # Merge all DataFrames into a single DataFrame
    merged_df = pd.concat(dataframes, ignore_index=True)
    #print(merged_df.shape)
    return merged_df

##Sample recrods
def sample_df(df, stratify, train_size, random_state, client, category):
    """
    Samples a dataframe while maintaining stratification and ensuring randomization.
    """

    # Ensure `train_size` does not exceed available data
    train_size = min(train_size, df.shape[0])

    # If train_size == df.shape[0], return shuffled df directly (to ensure randomness)
    if train_size == df.shape[0]:
        sampled_df = df.sample(frac=1, random_state=random_state).reset_index(drop=True)
    else:
        sampled_df, _ = train_test_split(
            df, train_size=int(train_size), stratify=df[stratify], random_state=random_state
        )

    # Save the sampled dataset
    sampled_df.to_csv(ensure_directory_exists(client['save_path'].format(category)), index=False)
    
    split_dataframe(sampled_df, split=5, client=client, category=category)

    return sampled_df


def split_dataframe(df, split, client, category):
  
    # Split into 5 parts
    split_dfs = np.array_split(df, 5)

    # Save each split as a separate CSV file
    for i, split_df in enumerate(split_dfs):
        path = client['split_path'].format(i+1, category)
        split_df.to_csv(ensure_directory_exists(path), index=False)



# Create directory if it doesn't exist
# THis will create subdirectoy as well
def ensure_directory_exists(filepath):
    directory = os.path.dirname(filepath)
    if not os.path.exists(directory):
        os.makedirs(directory, exist_ok=True)
    return filepath


def shuffle_dataframe_stratified(df, stratify_col, random_state=42):
    """
    Shuffles a DataFrame while maintaining stratification on a given column.

    Parameters:
    - df (pd.DataFrame): The DataFrame to shuffle.
    - stratify_col (str): Column name to stratify by.
    - random_state (int): Seed for reproducibility.

    Returns:
    - pd.DataFrame: Shuffled and stratified DataFrame.
    """
    # Use train_test_split with train_size=1.0 to shuffle while keeping stratification
    # shuffled_df, _ = train_test_split(
    #     df, train_size=None, stratify=df[stratify_col], random_state=random_state
    # )

    # Step 1: Shuffle inside each group
    shuffled_df = df.groupby(stratify_col, group_keys=False).apply(
        lambda x: x.sample(frac=1, random_state=random_state)
    )

    # Step 2: Shuffle the entire dataset
    shuffled_df = shuffled_df.sample(frac=1, random_state=random_state).reset_index(drop=True)

    return shuffled_df


def scale_data(train_df, test_df, feature_columns, label_column, extra_column):
    # Initialize the scaler
    scaler = StandardScaler()

    # Fit the scaler on the training data only (excluding label and extra column)
    train_scaled_values = scaler.fit_transform(train_df[feature_columns])

    # Transform the test data using the same scaler
    test_scaled_values = scaler.transform(test_df[feature_columns])

    # Store the scaled values back into DataFrames
    train_scaled = train_df.copy()
    test_scaled = test_df.copy()

    train_scaled[feature_columns] = train_scaled_values
    test_scaled[feature_columns] = test_scaled_values

    # Ensure label and extra column are preserved without modifications
    train_scaled[label_column] = train_df[label_column]
    test_scaled[label_column] = test_df[label_column]

    train_scaled[extra_column] = train_df[extra_column]
    test_scaled[extra_column] = test_df[extra_column]

    # # Save the scaler to a file for future use
    # with open(scaler_path, "wb") as scaler_file:
    #     pickle.dump(scaler, scaler_file)

    # print(f"Scaler saved at: {scaler_path}")

    return train_scaled, test_scaled, scaler


Some Variables for loop

In [59]:
# Raw data extracted from the file structure
file_structure = {
    "client_1": {
        "syn": "tcp_syn.pcap_Flow.csv",
        "fin": "tcp_fin.pcap_Flow.csv",
        "ack": "tcp_ack.pcap_Flow.csv",
        "xmas": "tcp_xmas.pcap_Flow.csv",
        "ymas": "tcp_ymas.pcap_Flow.csv",
        "push": "tcp_push.pcap_Flow.csv",
        "urg": "tcp_urg.pcap_Flow.csv",
        "udp": "udp_flood.pcap_Flow.csv"
    },
    "client_2": {
        "syn": "upf2_tcp_syn.pcap_Flow.csv",
        "fin": "upf2_tcp_fin.pcap_Flow.csv",
        "ack": "upf2_tcp_ack.pcap_Flow.csv",
        "xmas": "upf2_tcp_xmas.pcap_Flow.csv",
        "ymas": "upf2_tcp_ymas.pcap_Flow.csv",
        "push": "upf2_tcp_push.pcap_Flow.csv",
        "urg": "upf2_tcp_urg.pcap_Flow.csv",
        "udp": "upf2_upd_flood.pcap_Flow.csv"
    },
    "client_3": {
        "syn": "syn1.pcap_Flow.csv",
        "fin": "fin1.pcap_Flow.csv",
        "ack": "ack1.pcap_Flow.csv",
        "xmas": "xmas1.pcap_Flow.csv",
        "ymas": "ymas1.pcap_Flow.csv",
        "push": "push1.pcap_Flow.csv",
        "urg": "urg1.pcap_Flow.csv",
        "udp": "12udpfloodupf1.pcap_Flow.csv"
    },
    "client_4": {
        "syn": "syn2.pcap_Flow.csv",
        "fin": "fin2.pcap_Flow.csv",
        "ack": "ack2.pcap_Flow.csv",
        "xmas": "xmas2.pcap_Flow.csv",
        "ymas": "ymas2.pcap_Flow.csv",
        "push": "push2.pcap_Flow.csv",
        "urg": "urg2.pcap_Flow.csv",
        "udp": "12udpfloodupf2.pcap_Flow.csv"
    },
}

#all dataset sources to make iterate to read csv files
dataset_sources = {
 
    'client_1': {
        "path": {
            'benign': '../row_data_IID/{}/benign',
            'attack': '../row_data_IID/{}/attack',
        },
        'known_attacks': ['fin', 'udp'],
        'save_path': "./dataset_nonIID/client_1/client_1_{}.csv",
        'split_path': "./dataset_nonIID/client_1/split_{0}/client_1_{1}_{0}.csv"
    },
    'client_2': {
         "path": {
            'benign': '../row_data_IID/{}/benign',
            'attack': '../row_data_IID/{}/attack',
        },
        'known_attacks': ['ack', 'push'],
        'save_path': "./dataset_nonIID/client_2/client_2_{}.csv",
        'split_path': "./dataset_nonIID/client_2/split_{0}/client_2_{1}_{0}.csv"
    },
    'client_3': {
         "path": {
            'benign': '../row_data_IID/{}/benign',
            'attack': '../row_data_IID/{}/attack',
        },
        'known_attacks': ['syn', 'xmas'],
        'save_path': "./dataset_nonIID/client_3/client_3_{}.csv",
        'split_path': "./dataset_nonIID/client_3/split_{0}/client_3_{1}_{0}.csv"
    },
    'client_4': {
         "path": {
            'benign': '../row_data_IID/{}/benign',
            'attack': '../row_data_IID/{}/attack',
        },
        'known_attacks': ['urg', 'ymas'],
        'save_path': "./dataset_nonIID/client_4/client_4_{}.csv",
        'split_path': "./dataset_nonIID/client_4/split_{0}/client_4_{1}_{0}.csv"
    }

}

SEED = 42
TOTAL_RECORDS = 400000
CLASS_RATIO = 0.5 #50% Benign and 50% Attack
SPLIT = 5
PER_ATTACK_RATIO_IN_TRAINSET = 0.25
PER_ATTACK_RATIO_IN_TESTSET = 0.06

Main Logic

In [60]:
for client, data in tqdm(dataset_sources.items(), total=len(dataset_sources)):
    known_attacks = dataset_sources[client]['known_attacks']

    benign_df = sanitize_data_frame(read_all_csv_files(data.get('path').get('benign').format(client)))
    attack_df = sanitize_data_frame(read_attack_csv_files(file_structure.get(client), path=data.get('path').get('attack').format(client)))
   

    known_attack_df = attack_df[attack_df['Type'].isin(known_attacks)]
    #benign_records = attack_records = TOTAL_RECORDS * CLASS_RATIO

    benign_available = benign_df.shape[0]
    attack_available = known_attack_df.shape[0]

    benign_target = int(TOTAL_RECORDS * CLASS_RATIO)  # Ideally 200,000 from benign
    attack_target = int(TOTAL_RECORDS * CLASS_RATIO)  # Ideally 200,000 from attack

    # Case 1: Both have enough data
    if benign_available >= benign_target and attack_available >= attack_target:
        benign_train_size = benign_target
        attack_train_size = attack_target

    # Case 2: Not enough benign, compensate with attack
    elif benign_available < benign_target and attack_available >= attack_target:
        benign_train_size = benign_available  # Take all available benign
        attack_train_size = TOTAL_RECORDS - benign_train_size  # Fill remaining from attack

    # Case 3: Not enough attack, compensate with benign
    elif benign_available >= benign_target and attack_available < attack_target:
        attack_train_size = attack_available  # Take all available attack
        benign_train_size = TOTAL_RECORDS - attack_train_size  # Fill remaining from benign

    # Case 4: Neither has enough data, take all available
    else:
        benign_train_size = benign_available
        attack_train_size = attack_available
        # Ensure total is still 400,000 (if both are lower, it will be less than 400,000)



    benign_df = sample_df(
        benign_df, stratify="Type", train_size=benign_train_size, random_state=SEED, 
        client = dataset_sources[client],
        category = "benign"
    )
    
    known_attack_df = sample_df(
        attack_df[attack_df['Type'].isin(known_attacks)], stratify="Type",
        train_size=TOTAL_RECORDS * CLASS_RATIO, random_state=SEED,
        client = dataset_sources[client],
        category = "known_attack"
    )
    
    unknow_recoreds = TOTAL_RECORDS * CLASS_RATIO * 6/8
    unknown_attack_df = sample_df(
        attack_df[~attack_df['Type'].isin(known_attacks)], stratify="Type", 
        train_size=unknow_recoreds, random_state=SEED,    
        client = dataset_sources[client],
        category = "unknown_attacks"
    )

  0%|                                                                                                                                   | 0/4 [00:00<?, ?it/s]

Before sanitization
(202821, 43)
Before sanitization
(2409383, 43)


  return bound(*args, **kwds)
  return bound(*args, **kwds)
  return bound(*args, **kwds)
 25%|██████████████████████████████▊                                                                                            | 1/4 [00:36<01:49, 36.61s/it]

Before sanitization
(185943, 43)
Before sanitization
(2408730, 43)


  return bound(*args, **kwds)
  return bound(*args, **kwds)
  return bound(*args, **kwds)
 50%|█████████████████████████████████████████████████████████████▌                                                             | 2/4 [01:10<01:10, 35.02s/it]

Before sanitization
(223200, 43)
Before sanitization
(2335440, 43)


  return bound(*args, **kwds)
  return bound(*args, **kwds)
  return bound(*args, **kwds)
 75%|████████████████████████████████████████████████████████████████████████████████████████████▎                              | 3/4 [01:46<00:35, 35.65s/it]

Before sanitization
(246797, 43)
Before sanitization
(2232941, 43)


  return bound(*args, **kwds)
  return bound(*args, **kwds)
  return bound(*args, **kwds)
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [02:21<00:00, 35.28s/it]


In [79]:
benign_splits = './dataset_nonIID/client_{0}/split_{1}/client_{0}_benign_{1}.csv'
known_attack_splits = './dataset_nonIID/client_{0}/split_{1}/client_{0}_known_attack_{1}.csv'
unknown_attack_splits = './dataset_nonIID/client_{0}/split_{1}/client_{0}_unknown_attacks_{1}.csv'
fold_path = './dataset_nonIID/client_{0}/fold_{1}/client_{0}_{2}_dataset.csv'
scaler_path = './dataset_nonIID/client_{0}/fold_{1}/client_{0}_train_scaler.pkl'

In [82]:
#features = ['Flow Duration' ,'Total Fwd Packet' ,'Total Bwd packets' ,'Total Length of Fwd Packet' ,'Total Length of Bwd Packet' ,'Fwd Packet Length Max' ,'Fwd Packet Length Min' ,'Fwd Packet Length Std' ,'Bwd Packet Length Max' ,'Bwd Packet Length Min' ,'Bwd Packet Length Std' ,'Flow IAT Mean' ,'Flow IAT Std' ,'Flow IAT Max' ,'Flow IAT Min' ,'Fwd IAT Total' ,'Fwd IAT Std' ,'Fwd IAT Max' ,'Fwd IAT Min' ,'Bwd IAT Total' ,'Bwd IAT Std' ,'Bwd IAT Max' ,'Bwd IAT Min' ,'Fwd Header Length' ,'Bwd Header Length' ,'Packet Length Min' ,'Packet Length Max' ,'Packet Length Std' ,'Packet Length Variance' ,'FWD Init Win Bytes' ,'Bwd Init Win Bytes' ,'Fwd Act Data Pkts' ,'Fwd Seg Size Min' ,'Active Mean' ,'Active Std' ,'Active Max' ,'Active Min' ,'Idle Mean' ,'Idle Std' ,'Idle Max' ,'Idle Min', 'Label', 'Type']
numeric_features = FEATURES.copy()
numeric_features.remove('Label')
numeric_features.remove('Type')

for client in range(1, 5):
    benign= []
    known = []
    unknown = []
    for split in range(1, 6):
        benign.append(pd.read_csv(benign_splits.format(client, split)))
        known.append(pd.read_csv(known_attack_splits.format(client, split)))
        unknown.append(pd.read_csv(unknown_attack_splits.format(client, split)))
    
    for i in range(5):

        #prepare Testset
        benign_test, known_test, unknown_test = benign[i], known[i], unknown[i]
        test_df = pd.concat([benign_test, known_test, unknown_test], ignore_index=True)

        #preparing Training set
        benign_train = pd.concat([benign[j] for j in range(5) if j != i], ignore_index=True)
        known_train = pd.concat([known[j] for j in range(5) if j != i], ignore_index=True)
        train_df = pd.concat([benign_train, known_train], ignore_index=True)

        ##Shuffling
        train_df = shuffle_dataframe_stratified(train_df, stratify_col="Type", random_state=SEED)
        test_df = shuffle_dataframe_stratified(test_df, stratify_col="Type", random_state=SEED)

        ##Scaling
        #scale_data(train_df, test_df, feature_columns, label_column, extra_column):
        sclaed_train_df, scaled_test_df, scaler = scale_data(train_df, test_df, numeric_features, 'Label', 'Type')
        #fold_path = './dataset_nonIID/client_{0}/fold_{1}/client_{0}_{2}_dataset.csv'

        sclaed_train_df.to_csv(ensure_directory_exists(fold_path.format(client, i+1, 'scaled_train')), index=False)
        scaled_test_df.to_csv(ensure_directory_exists(fold_path.format(client, i+1, 'scaled_test')), index=False)
        #scaler_path = './dataset_nonIID/client_{0}/fold_{1}/client_{0}_train_scaler.pkl'
        # Ensure the directory exists before saving the file
        path = ensure_directory_exists(scaler_path.format(client, i+1))
        with open(path, "wb") as scaler_file:
            pickle.dump(scaler, scaler_file)


        
        
       
        

    


        


  shuffled_df = df.groupby(stratify_col, group_keys=False).apply(
  shuffled_df = df.groupby(stratify_col, group_keys=False).apply(
  shuffled_df = df.groupby(stratify_col, group_keys=False).apply(
  shuffled_df = df.groupby(stratify_col, group_keys=False).apply(
  shuffled_df = df.groupby(stratify_col, group_keys=False).apply(
  shuffled_df = df.groupby(stratify_col, group_keys=False).apply(
  shuffled_df = df.groupby(stratify_col, group_keys=False).apply(
  shuffled_df = df.groupby(stratify_col, group_keys=False).apply(
  shuffled_df = df.groupby(stratify_col, group_keys=False).apply(
  shuffled_df = df.groupby(stratify_col, group_keys=False).apply(
  shuffled_df = df.groupby(stratify_col, group_keys=False).apply(
  shuffled_df = df.groupby(stratify_col, group_keys=False).apply(
  shuffled_df = df.groupby(stratify_col, group_keys=False).apply(
  shuffled_df = df.groupby(stratify_col, group_keys=False).apply(
  shuffled_df = df.groupby(stratify_col, group_keys=False).apply(
  shuffled