In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import random
import os
from tqdm import tqdm
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import StandardScaler
import pickle
from sklearn.decomposition import PCA

## 1. Overall DataSet Preparation from Row data
- Read all the row feature files from the directory specified
- Sanitize all the inputs
- Take sample count based on attack and benign csv files (technically it's class)
- Save the dataset locally

In [2]:
def read_all_csv_files(directory_path, features=None):
    # Initialize an empty list to store dataframes from CSV files
    dataframes = []

    # Get a list of all files in the directory
    file_list = os.listdir(directory_path)

    # Loop through each file and check if it's a CSV file
    for file_number, file_name in enumerate(file_list):
        if file_name.endswith('.csv'):
            # Get the full file path
            file_path = os.path.join(directory_path, file_name)
            # Read the CSV file into a pandas DataFrame
            df = pd.read_csv(file_path)
            # Remove leading and trailing spaces from column names
            df.columns = df.columns.str.strip()
            # Append the DataFrame to the list
            df['CSV_File_Number'] = file_number
            #dataframes.append(df[features])
            dataframes.append(df) #as it is feature analysis, we are taking all the features
            #print(f"file number: {file_number}; file name: {file_name}")

    # Merge all DataFrames into a single DataFrame
    merged_df = pd.concat(dataframes, ignore_index=True)
    return merged_df

In [3]:
def sanitize_data_frames_updated(dataframe, remove_infinity=True, remove_null=True):

    if remove_infinity:
        numeric_cols = dataframe.select_dtypes(include=[np.number]).columns
        infinite_counts = dataframe[numeric_cols].applymap(np.isinf).sum()
        for col, count in infinite_counts.items():
            if count != 0:
                dataframe = dataframe[~np.isinf(dataframe[col])]

    if remove_null:
        null_counts = dataframe.isnull().sum()
        for col, count in null_counts.items():
            if count != 0:
                    dataframe = dataframe.dropna(subset=[col])
    print("Sanitized Row Count:", dataframe.shape[0])    
    return dataframe

In [4]:
#all dataset sources to make iterate to read csv files
dataset_sources = {
 
    'client_1': {
        'benign': '../row_data_nonIID/client_1/benign',
        'attack': '../row_data_nonIID/client_1/attack',
    },
    'client_2': {
        'benign': '../row_data_nonIID/client_2/benign',
        'attack': '../row_data_nonIID/client_2/attack',
    },
    'client_3': {
        'benign': '../row_data_nonIID/client_3/benign',
        'attack': '../row_data_nonIID/client_3/attack',
    },
    'client_4': {
        'benign': '../row_data_nonIID/client_4/benign',
        'attack': '../row_data_nonIID/client_4/attack',
    }

}

In [5]:
#get Uniform Sample
def get_uniform_sample(df, group_col=[], sample_size=420000):
    
    #Separate the labels
    label_0 = df[df['Label'] == 0]
    label_1 = df[df['Label'] == 1]

    count_0 = count_1 = 0 # Initialize with 0

    if len(label_0) >= sample_size//2 and len(label_1) >= sample_size//2:
        count_0 = count_1 = sample_size//2
    elif len(label_0) < sample_size//2 and len(label_1) > sample_size//2:
        count_0 = len(label_0)
        count_1 = sample_size - count_0
    elif len(label_0) > sample_size//2 and len(label_1) < sample_size//2:
        count_1 = len(label_1)
        count_0 = sample_size - count_1
    else:
        count_0 = len(label_0)
        count_1 = len(label_1)

    print(f"count_0: {count_0}; count_1: {count_1}")
    print(f"label_0: {len(label_0)}; label_1: {len(label_1)}")

    train_size_0 = 1.0 if count_0 >= len(label_0) else float(count_0/len(label_0))
    train_size_1 = 1.0 if count_1 >= len(label_1) else float(count_1/len(label_1))

    print(f"train_size_0: {train_size_0}; train_size_1: {train_size_1}")
    
# Handle edge cases where train_size is 1.0 and stratification might fail
    if train_size_0 == 1.0:
        sample_0 = label_0.copy()  # Use the whole subset without train_test_split
    else:
        sample_0, _ = train_test_split(
            label_0,
            train_size=train_size_0,
            stratify=label_0['CSV_File_Number'] if group_col else None,
            random_state=42
        )

    if train_size_1 == 1.0:
        sample_1 = label_1.copy()  # Use the whole subset without train_test_split
    else:
        sample_1, _ = train_test_split(
            label_1,
            train_size=train_size_1,
            stratify=label_1['CSV_File_Number'] if group_col else None,
            random_state=42
        )   
    
    # Concatenate and shuffle the samples
    combined_sample = pd.concat([sample_0, sample_1])
    shuffled_sample = combined_sample.sample(frac=1, random_state=42).reset_index(drop=True)

    return shuffled_sample

In [6]:
# Create directory if it doesn't exist
# THis will create subdirectoy as well
def ensure_directory_exists(filepath):
    directory = os.path.dirname(filepath)
    if not os.path.exists(directory):
        os.makedirs(directory, exist_ok=True)
    return filepath

In [7]:
##Reading all the data
features = None


for client, data in tqdm(dataset_sources.items(), total=len(dataset_sources)):
    locals()[client] = {}

for client, data in tqdm(dataset_sources.items(), total=len(dataset_sources)):
    client_dataframe = []
    for type, path in data.items():
        print(f' Loading....Clinet = {client}, type = {type}')
        dataframe = read_all_csv_files(path, features)
        #dataframe = sanitize_data_frames_updated(dataframe)
        if type == 'benign':
            dataframe['Label'] = 0
        else:
             dataframe['Label'] = 1
        client_dataframe.append(dataframe)
        #locals()[client][type] = dataframe
    client_merged_df = get_uniform_sample(pd.concat(client_dataframe, ignore_index=True), ['Label', 'CSV_File_Number'], 420000 )
    client_merged_df.to_csv(ensure_directory_exists(f'./dataset/{client}/{client}_original_dataset.csv'), index=False)
    

100%|█████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 67650.06it/s]
  0%|                                                                            | 0/4 [00:00<?, ?it/s]

 Loading....Clinet = client_1, type = benign
 Loading....Clinet = client_1, type = attack
count_0: 202821; count_1: 217179
label_0: 202821; label_1: 1048580
train_size_0: 1.0; train_size_1: 0.20711724427320757


 25%|█████████████████                                                   | 1/4 [00:27<01:21, 27.00s/it]

 Loading....Clinet = client_2, type = benign
 Loading....Clinet = client_2, type = attack
count_0: 185943; count_1: 234057
label_0: 185943; label_1: 1360150
train_size_0: 1.0; train_size_1: 0.1720817556887108


 50%|██████████████████████████████████                                  | 2/4 [00:57<00:58, 29.13s/it]

 Loading....Clinet = client_3, type = benign
 Loading....Clinet = client_3, type = attack
count_0: 210000; count_1: 210000
label_0: 223200; label_1: 1239407
train_size_0: 0.9408602150537635; train_size_1: 0.16943586731396548


 75%|███████████████████████████████████████████████████                 | 3/4 [01:28<00:30, 30.14s/it]

 Loading....Clinet = client_4, type = benign
 Loading....Clinet = client_4, type = attack
count_0: 210000; count_1: 210000
label_0: 246797; label_1: 1048576
train_size_0: 0.850901753262803; train_size_1: 0.2002716064453125


100%|████████████████████████████████████████████████████████████████████| 4/4 [01:57<00:00, 29.34s/it]


## 2.1 Train and Test Set Segregation
- Read datasets saved in Step 1
- Segregate Train and Test set and save locally. As we need to take decission based on csv file number and label, we will combine both column to stratify
- Sacling Training data save the scaled data and save the scalers as well
- Use saved scalers to scale the test data and save the data locally

In [8]:
feature_to_exclude = ['Flow ID', 'Src IP', 'Src Port', 'Dst IP', 'Dst Port', 'Timestamp', 'CSV_File_Number', 'Label', 'Stratify']
output_features = ['Label']

In [9]:
#Iteration Friendly dictionary
clients = {
 
    'client_1': {
        'combined': './dataset/client_1/client_1_original_dataset.csv',
        'train': './dataset/client_1/fold_{}/client_1_train_dataset.csv',
        'test': './dataset/client_1/fold_{}/client_1_test_dataset.csv',
        'scaler': './dataset/client_1/fold_{}/client_1_train_scaler.pkl',
        'scaled_train': './dataset/client_1/fold_{}/client_1_scaled_train_dataset.csv', #unwanted features removed
        'scaled_test': './dataset/client_1/fold_{}/client_1_sclaed_test_dataset.csv', #unwanted features removed
    },
    'client_2': {
        'combined': './dataset/client_2/client_2_original_dataset.csv',
        'train': './dataset/client_2/fold_{}/client_2_train_dataset.csv',
        'test': './dataset/client_2/fold_{}/client_2_test_dataset.csv',
        'scaler': './dataset/client_2/fold_{}/client_2_train_scaler.pkl',
        'scaled_train': './dataset/client_2/fold_{}/client_2_scaled_train_dataset.csv', #unwanted features removed
        'scaled_test': './dataset/client_2/fold_{}/client_2_sclaed_test_dataset.csv', #unwanted features removed
    },
    'client_3': {
        'combined': './dataset/client_3/client_3_original_dataset.csv',
        'train': './dataset/client_3/fold_{}/client_3_train_dataset.csv',
        'test': './dataset/client_3/fold_{}/client_3_test_dataset.csv',
        'scaler': './dataset/client_3/fold_{}/client_3_train_scaler.pkl',
        'scaled_train': './dataset/client_3/fold_{}/client_3_scaled_train_dataset.csv', #unwanted features removed
        'scaled_test': './dataset/client_3/fold_{}/client_3_sclaed_test_dataset.csv', #unwanted features removed
    },
    'client_4': {
        'combined': './dataset/client_4/client_4_original_dataset.csv',
        'train': './dataset/client_4/fold_{}/client_4_train_dataset.csv',
        'test': './dataset/client_4/fold_{}/client_4_test_dataset.csv',
        'scaler': './dataset/client_4/fold_{}/client_4_train_scaler.pkl',
        'scaled_train': './dataset/client_4/fold_{}/client_4_scaled_train_dataset.csv', #unwanted features removed
        'scaled_test': './dataset/client_4/fold_{}/client_4_sclaed_test_dataset.csv', #unwanted features removed
    },

}

In [10]:
def scale_and_remove_unwanted_features(dataframe, feature_to_exclude, out_features, scaler_path, type='train'):

    features = dataframe.columns.values.tolist()
    input_features = [feature for feature in features if feature not in feature_to_exclude] 
    output_df = dataframe[out_features]
    input_df = dataframe[input_features]

    print(f"Input features shape: {input_df.shape}")  # Debugging
    print(f"Output (label) features shape: {output_df.shape}")  # Debugging

    # Reset index for both input and output DataFrames to ensure correct alignment
    input_df = input_df.reset_index(drop=True)
    output_df = output_df.reset_index(drop=True)
    
    if type == 'train':
        scaler = StandardScaler()
        scaled_data = scaler.fit_transform(input_df)
        scaled_df = pd.DataFrame(scaled_data, columns=input_df.columns)
        print(f"Scaled features shape: {scaled_df.shape}")  # Debugging
        
        merged_df = pd.concat([scaled_df, output_df], axis=1)
        print(f"Merged DataFrame shape: {merged_df.shape}")  # Debugging

        # Save the scaler to a file to be used for test set
        with open(scaler_path, "wb") as file:
            pickle.dump(scaler, file)
    else:
        with open(scaler_path, "rb") as file:
            scaler = pickle.load(file)
            scaled_data = scaler.transform(input_df)  # Changed from fit_transform to transform
            scaled_df = pd.DataFrame(scaled_data, columns=input_df.columns)
            print(f"Scaled features shape: {scaled_df.shape}")  # Debugging
            merged_df = pd.concat([scaled_df, output_df], axis=1)
            print(f"Merged DataFrame shape: {merged_df.shape}")  # Debugging
    
    return merged_df

In [11]:
num_splits = 5

In [12]:
for client, info in tqdm(clients.items(), total=len(clients)):
    print('------------------------------------------------------------------------------')
    print(f"Started for Client: {client}")
    df = pd.read_csv(info.get('combined'))
    #combinding label and csv file number to stratify
    df['Stratify'] = df['Label'].astype(str) + '_' + df['CSV_File_Number'].astype(str)

    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    #Loop through the splitted items:
    for fold, (train_index, test_index) in enumerate(skf.split(df, df['Stratify']), 1):
        print(f"Started for Split: {fold}")
        # Create training and test sets for the current fold
        train_df = df.iloc[train_index].reset_index(drop=True)
        test_df = df.iloc[test_index].reset_index(drop=True)    

        #checking if fold path exists. if not it will create one
        #ensure_directory_exists(info.get('train').format(fold))
        
        train_df.to_csv(ensure_directory_exists(info.get('train').format(fold)), index=False) ##It will contain all column with additional column
        test_df.to_csv(ensure_directory_exists(info.get('test').format(fold)), index=False) ##It will contain all column with additional column
    
        scaled_train_df = scale_and_remove_unwanted_features(train_df, feature_to_exclude, output_features, info.get('scaler').format(fold), 'train')
        #scaled_train_df.describe()
        scaled_train_df.to_csv(ensure_directory_exists(info.get('scaled_train')).format(fold), index=False)
    
        scaled_test_df = scale_and_remove_unwanted_features(test_df, feature_to_exclude, output_features, info.get('scaler').format(fold), 'test')
        #scaled_test_df.describe()
        scaled_test_df.to_csv(ensure_directory_exists(info.get('scaled_test').format(fold)), index=False)
        print(f"End for Client: {client}")
    
    
    

  0%|                                                                            | 0/4 [00:00<?, ?it/s]

------------------------------------------------------------------------------
Started for Client: client_1




Started for Split: 1
Input features shape: (336000, 85)
Output (label) features shape: (336000, 1)
Scaled features shape: (336000, 85)
Merged DataFrame shape: (336000, 86)
Input features shape: (84000, 85)
Output (label) features shape: (84000, 1)
Scaled features shape: (84000, 85)
Merged DataFrame shape: (84000, 86)
End for Client: client_1
Started for Split: 2
Input features shape: (336000, 85)
Output (label) features shape: (336000, 1)
Scaled features shape: (336000, 85)
Merged DataFrame shape: (336000, 86)
Input features shape: (84000, 85)
Output (label) features shape: (84000, 1)
Scaled features shape: (84000, 85)
Merged DataFrame shape: (84000, 86)
End for Client: client_1
Started for Split: 3
Input features shape: (336000, 85)
Output (label) features shape: (336000, 1)
Scaled features shape: (336000, 85)
Merged DataFrame shape: (336000, 86)
Input features shape: (84000, 85)
Output (label) features shape: (84000, 1)
Scaled features shape: (84000, 85)
Merged DataFrame shape: (8400

 25%|████████████████▊                                                  | 1/4 [03:59<11:58, 239.59s/it]

End for Client: client_1
------------------------------------------------------------------------------
Started for Client: client_2




Started for Split: 1
Input features shape: (336000, 85)
Output (label) features shape: (336000, 1)
Scaled features shape: (336000, 85)
Merged DataFrame shape: (336000, 86)
Input features shape: (84000, 85)
Output (label) features shape: (84000, 1)
Scaled features shape: (84000, 85)
Merged DataFrame shape: (84000, 86)
End for Client: client_2
Started for Split: 2
Input features shape: (336000, 85)
Output (label) features shape: (336000, 1)
Scaled features shape: (336000, 85)
Merged DataFrame shape: (336000, 86)
Input features shape: (84000, 85)
Output (label) features shape: (84000, 1)
Scaled features shape: (84000, 85)
Merged DataFrame shape: (84000, 86)
End for Client: client_2
Started for Split: 3
Input features shape: (336000, 85)
Output (label) features shape: (336000, 1)
Scaled features shape: (336000, 85)
Merged DataFrame shape: (336000, 86)
Input features shape: (84000, 85)
Output (label) features shape: (84000, 1)
Scaled features shape: (84000, 85)
Merged DataFrame shape: (8400

 50%|█████████████████████████████████▌                                 | 2/4 [08:15<08:18, 249.25s/it]

End for Client: client_2
------------------------------------------------------------------------------
Started for Client: client_3




Started for Split: 1
Input features shape: (336000, 85)
Output (label) features shape: (336000, 1)
Scaled features shape: (336000, 85)
Merged DataFrame shape: (336000, 86)
Input features shape: (84000, 85)
Output (label) features shape: (84000, 1)
Scaled features shape: (84000, 85)
Merged DataFrame shape: (84000, 86)
End for Client: client_3
Started for Split: 2
Input features shape: (336000, 85)
Output (label) features shape: (336000, 1)
Scaled features shape: (336000, 85)
Merged DataFrame shape: (336000, 86)
Input features shape: (84000, 85)
Output (label) features shape: (84000, 1)
Scaled features shape: (84000, 85)
Merged DataFrame shape: (84000, 86)
End for Client: client_3
Started for Split: 3
Input features shape: (336000, 85)
Output (label) features shape: (336000, 1)
Scaled features shape: (336000, 85)
Merged DataFrame shape: (336000, 86)
Input features shape: (84000, 85)
Output (label) features shape: (84000, 1)
Scaled features shape: (84000, 85)
Merged DataFrame shape: (8400

 75%|██████████████████████████████████████████████████▎                | 3/4 [12:18<04:06, 246.52s/it]

End for Client: client_3
------------------------------------------------------------------------------
Started for Client: client_4




Started for Split: 1
Input features shape: (336000, 85)
Output (label) features shape: (336000, 1)
Scaled features shape: (336000, 85)
Merged DataFrame shape: (336000, 86)
Input features shape: (84000, 85)
Output (label) features shape: (84000, 1)
Scaled features shape: (84000, 85)
Merged DataFrame shape: (84000, 86)
End for Client: client_4
Started for Split: 2
Input features shape: (336000, 85)
Output (label) features shape: (336000, 1)
Scaled features shape: (336000, 85)
Merged DataFrame shape: (336000, 86)
Input features shape: (84000, 85)
Output (label) features shape: (84000, 1)
Scaled features shape: (84000, 85)
Merged DataFrame shape: (84000, 86)
End for Client: client_4
Started for Split: 3
Input features shape: (336000, 85)
Output (label) features shape: (336000, 1)
Scaled features shape: (336000, 85)
Merged DataFrame shape: (336000, 86)
Input features shape: (84000, 85)
Output (label) features shape: (84000, 1)
Scaled features shape: (84000, 85)
Merged DataFrame shape: (8400

100%|███████████████████████████████████████████████████████████████████| 4/4 [16:41<00:00, 250.35s/it]

End for Client: client_4





## 2.2 Analysis of Prepared Datasets

In [13]:
#pd.concat(client_dataframe, ignore_index=True)
intersted_features = ['Label', 'CSV_File_Number', 'Stratify']
train_df_all_clients = []
test_df_all_clients = []
for client, info in clients.items():
    print(f'Client: {client}')
    for fold in range(1, 6):
        train_df = pd.read_csv(info.get('train').format(fold))
        test_df = pd.read_csv(info.get('test').format(fold))

        train_df = train_df[intersted_features]
        test_df = test_df[intersted_features]
        
        train_df['client'] = client
        train_df['fold'] = fold
        
        test_df['client'] = client
        test_df['fold'] = fold

        train_df_all_clients.append(train_df)
        test_df_all_clients.append(test_df)
        
merged_train_df = pd.concat(train_df_all_clients, ignore_index=True)
merged_test_df = pd.concat(test_df_all_clients, ignore_index=True)


Client: client_1
Client: client_2
Client: client_3
Client: client_4


In [14]:
#print(merged_train_df.groupby(['client', 'fold', 'Stratify']).count())
grouped_counts = merged_train_df.groupby(['client', 'fold', 'Stratify']).size().reset_index(name='count')
# Pivoting the 'fold' column to make it into a separate column
pivot_df = grouped_counts.pivot_table(index=['client', 'Stratify'], columns='fold', values='count', fill_value=0).reset_index()
pivot_df.columns = ['Client', 'Stratify'] + [f'Fold_{int(col)}' for col in pivot_df.columns if isinstance(col, int)]
pivot_df.to_csv(ensure_directory_exists("./dataset/dataset_summary.csv"), index=False)
print(pivot_df.to_string())

      Client Stratify   Fold_1   Fold_2   Fold_3   Fold_4   Fold_5
0   client_1      0_0  82051.0  82051.0  82051.0  82051.0  82052.0
1   client_1      0_1  41618.0  41618.0  41619.0  41619.0  41618.0
2   client_1      0_2      3.0      3.0      2.0      2.0      2.0
3   client_1      0_3  38585.0  38584.0  38585.0  38585.0  38585.0
4   client_1      1_0  43436.0  43436.0  43436.0  43436.0  43436.0
5   client_1      1_1  43436.0  43436.0  43436.0  43436.0  43436.0
6   client_1      1_2  43436.0  43436.0  43436.0  43436.0  43436.0
7   client_1      1_3  43435.0  43436.0  43435.0  43435.0  43435.0
8   client_2      0_0  46682.0  46682.0  46682.0  46683.0  46683.0
9   client_2      0_1      3.0      3.0      2.0      2.0      2.0
10  client_2      0_2  83134.0  83134.0  83134.0  83135.0  83135.0
11  client_2      0_3  18935.0  18935.0  18936.0  18935.0  18935.0
12  client_2      1_0  36088.0  36088.0  36088.0  36088.0  36088.0
13  client_2      1_1  36088.0  36088.0  36088.0  36088.0  360

In [24]:
#print(merged_train_df.groupby(['client', 'fold', 'Stratify']).count())
grouped_counts = merged_test_df.groupby(['client', 'fold', 'Stratify']).size().reset_index(name='count')
# Pivoting the 'fold' column to make it into a separate column
pivot_df = grouped_counts.pivot_table(index=['client', 'Stratify'], columns='fold', values='count', fill_value=0).reset_index()
pivot_df.columns = ['Client', 'Stratify'] + [f'Fold_{int(col)}' for col in pivot_df.columns if isinstance(col, int)]
pivot_df.to_csv(ensure_directory_exists("./dataset/dataset_summary_test.csv"), index=False)
print(pivot_df.to_string())

      Client Stratify   Fold_1   Fold_2   Fold_3   Fold_4   Fold_5
0   client_1      0_0  20513.0  20513.0  20513.0  20513.0  20512.0
1   client_1      0_1  10405.0  10405.0  10404.0  10404.0  10405.0
2   client_1      0_2      0.0      0.0      1.0      1.0      1.0
3   client_1      0_3   9646.0   9647.0   9646.0   9646.0   9646.0
4   client_1      1_0  10859.0  10859.0  10859.0  10859.0  10859.0
5   client_1      1_1  10859.0  10859.0  10859.0  10859.0  10859.0
6   client_1      1_2  10859.0  10859.0  10859.0  10859.0  10859.0
7   client_1      1_3  10859.0  10858.0  10859.0  10859.0  10859.0
8   client_2      0_0  11671.0  11671.0  11671.0  11670.0  11670.0
9   client_2      0_1      0.0      0.0      1.0      1.0      1.0
10  client_2      0_2  20784.0  20784.0  20784.0  20783.0  20783.0
11  client_2      0_3   4734.0   4734.0   4733.0   4734.0   4734.0
12  client_2      1_0   9022.0   9022.0   9022.0   9022.0   9022.0
13  client_2      1_1   9022.0   9022.0   9022.0   9022.0   90

## 3. PCA dataset Preparation [Not in Use}
- Choose component of 30, 33, 35 and generate datasets accordingly for training dataset and store locally
- Use PCA matrix to convert test dataset and store locally

In [16]:
#Iteration Friendly dictionary
clients = {
 
    'client_1': {
        'scaled_train': './dataset/client_1/fold_{}/client_1_scaled_train_dataset.csv', #unwanted features removed
        'scaled_test': './dataset/client_1/fold_{}/client_1_sclaed_test_dataset.csv', #unwanted features removed
        'pca_path': './dataset/client_1/fold_{}/pca/client_1_pca.pkl', #Saved for later use
        'pca_train': './dataset/client_1/fold_{}/pca/client_1_pca_train_dataset.csv', #unwanted features removed
        'pca_test': './dataset/client_1/fold_{}/pca/client_1_pca_test_dataset.csv', #unwanted features removed
    },
    'client_2': {
        'scaled_train': './dataset/client_2/fold_{}/client_2_scaled_train_dataset.csv', #unwanted features removed
        'scaled_test': './dataset/client_2/fold_{}/client_2_sclaed_test_dataset.csv', #unwanted features removed
        'pca_path': './dataset/client_2/fold_{}/pca/client_2_pca.pkl', #Saved for later use
        'pca_train': './dataset/client_2/fold_{}/pca/client_2_pca_train_dataset.csv', #unwanted features removed
        'pca_test': './dataset/client_2/fold_{}/pca/client_2_pca_test_dataset.csv', #unwanted features removed
    },
    'client_3': {
        'scaled_train': './dataset/client_3/fold_{}/client_3_scaled_train_dataset.csv', #unwanted features removed
        'scaled_test': './dataset/client_3/fold_{}/client_3_sclaed_test_dataset.csv', #unwanted features removed
        'pca_path': './dataset/client_3/fold_{}/pca/client_3_pca.pkl', #Saved for later use
        'pca_train': './dataset/client_3/fold_{}/pca/client_3_pca_train_dataset.csv', #unwanted features removed
        'pca_test': './dataset/client_3/fold_{}/pca/client_3_pca_test_dataset.csv', #unwanted features removed
    },
    'client_4': {
        'scaled_train': './dataset/client_4/fold_{}/client_4_scaled_train_dataset.csv', #unwanted features removed
        'scaled_test': './dataset/client_4/fold_{}/client_4_sclaed_test_dataset.csv', #unwanted features removed
        'pca_path': './dataset/client_4/fold_{}/pca/client_4_pca.pkl', #Saved for later use
        'pca_train': './dataset/client_4/fold_{}/pca/client_4_pca_train_dataset.csv', #unwanted features removed
        'pca_test': './dataset/client_4/fold_{}/pca/client_4_pca_test_dataset.csv', #unwanted features removed
    },
    
}

In [17]:
# Step 1: Function to perform PCA with a maximum number of components and save the PCA object
def perform_pca_and_save_with_max(train_df, test_df, max_components, pca_path):
    # Remove 'Label' column before applying PCA
    train_labels = train_df['Label']
    test_labels = test_df['Label']
    train_features = train_df.drop(columns=['Label'])
    test_features = test_df.drop(columns=['Label'])

    # Initialize PCA with the maximum number of components
    pca = PCA(n_components=max_components)
    
    # Fit PCA on the training set and transform
    train_pca_full = pca.fit_transform(train_features)

    ##Ensure direcoty exists. if not it will create one
    ensure_directory_exists(pca_path)
    
    # Save the PCA model for future use
    with open(pca_path, "wb") as f:
        pickle.dump(pca, f)
    
    #print(f"PCA explained variance ratio: {pca.explained_variance_ratio_}")
    
    # Transform the test set using the same PCA model
    test_pca_full = pca.transform(test_features)
    
    return train_pca_full, test_pca_full, train_labels, test_labels

# Step 2: Function to slice PCA components and add labels back to the dataset
def add_labels_to_pca(train_pca_full, test_pca_full, train_labels, test_labels, num_components):
    # Slice the top 'num_components' from the full PCA results
    train_pca_reduced = train_pca_full[:, :num_components]
    test_pca_reduced = test_pca_full[:, :num_components]
    
    # Convert to DataFrame for easier handling
    train_pca_df = pd.DataFrame(train_pca_reduced, columns=[f'PC{i+1}' for i in range(num_components)])
    test_pca_df = pd.DataFrame(test_pca_reduced, columns=[f'PC{i+1}' for i in range(num_components)])
    
    # Add the 'Label' column back
    train_pca_df['Label'] = train_labels.values
    test_pca_df['Label'] = test_labels.values
    
    return train_pca_df, test_pca_df

In [18]:
# max_pca_components = 40 # This number chosen based on pca analysis. check PCA.ipynb

# for client, info in clients.items():
#     print("----------------------------------------------------------")
#     for fold in range(1, 6):
#         print(f'Client: {client} ... Fold: {fold}')
#         train_df = pd.read_csv(info.get('scaled_train').format(fold))
#         test_df = pd.read_csv(info.get('scaled_test').format(fold))
    
#         # Step 3: Perform PCA with the maximum number of components (e.g., 40)
#         train_pca_full, test_pca_full, train_labels, test_labels = perform_pca_and_save_with_max(train_df, test_df, max_components=max_pca_components, pca_path=info.get("pca_path").format(fold))      
#         train_pca_data, test_pca_data = add_labels_to_pca(train_pca_full, test_pca_full, train_labels, test_labels, num_components=max_pca_components)
        
#         train_pca_data.to_csv(info.get('pca_train').format(fold),  index=False)
#         test_pca_data.to_csv(info.get('pca_test').format(fold),  index=False)

#     print("-----------------------------------------------------------------------")

### Not in USE

In [19]:
# #Iteration Friendly dictionary
# clients = {
 
#     'client_1': {
#         'scaled_train': './dataset/client_1/client_1_scaled_train_dataset.csv', #unwanted features removed
#         'scaled_test': './dataset/client_1/client_1_sclaed_test_dataset.csv', #unwanted features removed
#         'pca_path': './dataset/client_1/pca/client_1_pca.pkl', #Saved for later use
#         'pca_train': './dataset/client_1/pca/client_1_pca_train_dataset.csv', #unwanted features removed
#         'pca_test': './dataset/client_1/pca/client_1_pca_test_dataset.csv', #unwanted features removed
#     },
#     'client_2': {
#         'scaled_train': './dataset/client_2/client_2_scaled_train_dataset.csv', #unwanted features removed
#         'scaled_test': './dataset/client_2/client_2_sclaed_test_dataset.csv', #unwanted features removed
#         'pca_path': './dataset/client_2/pca/client_2_pca.pkl', #Saved for later use
#         'pca_train': './dataset/client_2/pca/client_2_pca_train_dataset.csv', #unwanted features removed
#         'pca_test': './dataset/client_2/pca/client_2_pca_test_dataset.csv', #unwanted features removed
#     },
#     'client_3': {
#         'scaled_train': './dataset/client_3/client_3_scaled_train_dataset.csv', #unwanted features removed
#         'scaled_test': './dataset/client_3/client_3_sclaed_test_dataset.csv', #unwanted features removed
#         'pca_path': './dataset/client_3/pca/client_3_pca.pkl', #Saved for later use
#         'pca_train': './dataset/client_3/pca/client_3_pca_train_dataset.csv', #unwanted features removed
#         'pca_test': './dataset/client_3/pca/client_3_pca_test_dataset.csv', #unwanted features removed
#     },
#     'client_4': {
#         'scaled_train': './dataset/client_4/client_4_scaled_train_dataset.csv', #unwanted features removed
#         'scaled_test': './dataset/client_4/client_4_sclaed_test_dataset.csv', #unwanted features removed
#         'pca_path': './dataset/client_4/pca/client_4_pca.pkl', #Saved for later use
#         'pca_train': './dataset/client_4/pca/client_4_pca_train_dataset.csv', #unwanted features removed
#         'pca_test': './dataset/client_4/pca/client_4_pca_test_dataset.csv', #unwanted features removed
#     },
    
# }

In [20]:
# # Step 1: Function to perform PCA with a maximum number of components and save the PCA object
# def perform_pca_and_save_with_max(train_df, test_df, max_components, pca_path):
#     # Remove 'Label' column before applying PCA
#     train_labels = train_df['Label']
#     test_labels = test_df['Label']
#     train_features = train_df.drop(columns=['Label'])
#     test_features = test_df.drop(columns=['Label'])

#     # Initialize PCA with the maximum number of components
#     pca = PCA(n_components=max_components)
    
#     # Fit PCA on the training set and transform
#     train_pca_full = pca.fit_transform(train_features)
    
#     # Save the PCA model for future use
#     with open(pca_path, "wb") as f:
#         pickle.dump(pca, f)
    
#     print(f"PCA explained variance ratio: {pca.explained_variance_ratio_}")
    
#     # Transform the test set using the same PCA model
#     test_pca_full = pca.transform(test_features)
    
#     return train_pca_full, test_pca_full, train_labels, test_labels

# # Step 2: Function to slice PCA components and add labels back to the dataset
# def add_labels_to_pca(train_pca_full, test_pca_full, train_labels, test_labels, num_components):
#     # Slice the top 'num_components' from the full PCA results
#     train_pca_reduced = train_pca_full[:, :num_components]
#     test_pca_reduced = test_pca_full[:, :num_components]
    
#     # Convert to DataFrame for easier handling
#     train_pca_df = pd.DataFrame(train_pca_reduced, columns=[f'PC{i+1}' for i in range(num_components)])
#     test_pca_df = pd.DataFrame(test_pca_reduced, columns=[f'PC{i+1}' for i in range(num_components)])
    
#     # Add the 'Label' column back
#     train_pca_df['Label'] = train_labels.values
#     test_pca_df['Label'] = test_labels.values
    
#     return train_pca_df, test_pca_df

In [21]:
pca_configurations = [
    {
        'n_components': 40,
        'train_path': "./dataset/{}/pca/components_{}/pca_{}_train.csv",
        'test_path': "./dataset/{}/pca/components_{}/pca_{}_test.csv",
    },
    {
        'n_components': 35,
        'train_path': "./dataset/{}/pca/components_{}/pca_{}_train.csv",
        'test_path': "./dataset/{}/pca/components_{}/pca_{}_test.csv",
    },
    {
        'n_components': 33,
        'train_path': "./dataset/{}/pca/components_{}/pca_{}_train.csv",
        'test_path': "./dataset/{}/pca/components_{}/pca_{}_test.csv",
    },
    {
        'n_components': 30,
        'train_path': "./dataset/{}/pca/components_{}/pca_{}_train.csv",
        'test_path': "./dataset/{}/pca/components_{}/pca_{}_test.csv",
    },
]

In [22]:
# for client, info in clients.items():
#     print("f{client} Starting....")
#     train_df = pd.read_csv(info.get('scaled_train'))
#     test_df = pd.read_csv(info.get('scaled_test'))
#     # Step 3: Perform PCA with the maximum number of components (e.g., 40)
#     train_pca_full, test_pca_full, train_labels, test_labels = perform_pca_and_save_with_max(train_df, test_df, max_components=40, pca_path=info.get("pca_path"))
    
    
#     # Step 4: Slice the PCA components and add labels back (e.g., for 40, 35, 33, 30 components)
#     for p in pca_configurations:        
#         train_pca_data, test_pca_data = add_labels_to_pca(train_pca_full, test_pca_full, train_labels, test_labels, num_components=p.get('n_components'))
#         train_pca_data.to_csv(p.get('train_path').format(client, p.get('n_components'), p.get('n_components')),  index=False)
#         test_pca_data.to_csv(p.get('test_path').format(client, p.get('n_components'), p.get('n_components')),  index=False)
#         print(train_pca_data.shape)
#     print("-----------------------------------------------------------------------")

In [23]:
  # This is just the full version