In [2]:
# Importing required libraries
import pandas as pd
import numpy as np
import time
from sklearn.preprocessing import StandardScaler, LabelEncoder
from warnings import simplefilter
from datetime import datetime

# Silence FutureWarning noise for the rest of the run
simplefilter("ignore", FutureWarning)

# Wall-clock reference used to report the total preprocessing time
start_time = time.time()

# Capture-file stems (no directory, no ".csv"); every stem ends in ".pcap_ISCX"
all_files = [
    f"{capture_name}.pcap_ISCX"
    for capture_name in (
        "Tuesday-WorkingHours",
        "Wednesday-workingHours",
        "Thursday-WorkingHours-Morning-WebAttacks",
        "Thursday-WorkingHours-Afternoon-Infilteration",
        "Friday-WorkingHours-Morning",
        "Friday-WorkingHours-Afternoon-PortScan",
        "Friday-WorkingHours-Afternoon-DDos",
    )
]

# Accumulators filled while the files are loaded:
#   processed_dataframes - one cleaned DataFrame per file
#   categorical_data     - the string-typed columns of each file
processed_dataframes, categorical_data = [], []

# Numeric column index; stays None until the first file has been read
numeric_columns = None

# Read each capture file, clean it, and queue it for concatenation
for file_path in all_files:
    # Parse this capture's CSV from the local data directory
    csv_path = f"C:/Users/PC/{file_path}.csv"
    df = pd.read_csv(csv_path, encoding='iso-8859-2', engine='python')

    # Rows without a Flow Duration are discarded first
    df = df.dropna(subset=[" Flow Duration"])

    # +/-inf becomes NaN so that a single dropna removes every unusable row
    df.replace([np.inf, -np.inf], np.nan, inplace=True)
    df.dropna(inplace=True)

    # First file only: remember which columns are numeric and create the
    # scaler (it is fitted later, on the concatenated data)
    if numeric_columns is None:
        numeric_columns = df.select_dtypes(include='number').columns
        std_scaler = StandardScaler()

    # String-typed columns of this file, leaving out the ' Label' target
    string_columns = [
        col
        for col, col_dtype in df.dtypes.items()
        if col_dtype == "object" and col != ' Label'
    ]

    # Keep the string columns aside, then store numeric columns as float32
    categorical_data.append(df[string_columns])
    df[numeric_columns] = df[numeric_columns].astype(np.float32)

    # Queue the cleaned frame and report progress
    processed_dataframes.append(df)
    print(f"{datetime.now()}: Preprocessing of file {file_path} is complete.")

# Concatenate every cleaned file and standardise the numeric columns in one
# pass, so a single mean/variance is fitted over the whole data set
combined_dataframe = pd.concat(processed_dataframes, ignore_index=True)
combined_dataframe[numeric_columns] = std_scaler.fit_transform(combined_dataframe[numeric_columns])

# Unified encoding of categorical columns.
# The column list is derived from the combined frame: the `string_columns`
# left over from the loading loop only describes the LAST file read, so a
# column that is string-typed in an earlier file would be skipped.
# ' Label' is the target column and is deliberately left unencoded.
string_columns = [
    col
    for col in combined_dataframe.columns
    if combined_dataframe[col].dtype == "object" and col != ' Label'
]
label_encoder = LabelEncoder()
for col in string_columns:
    # The encoder is re-fitted per column; values are cast to str first so
    # mixed-type columns can be encoded
    combined_dataframe[col] = label_encoder.fit_transform(combined_dataframe[col].astype(str))

# Save the final processed data (row index omitted)
combined_dataframe.to_csv("combined_data.csv", index=False)
print(f"{datetime.now()}: Concatenation and saving to CSV is complete.")
print(f"Total preprocessing time: {time.time() - start_time:.2f} seconds")

# Adapted from https://github.com/yasakrami/Threat-Detection-in-Cyber-Security-Using-AI


2024-10-26 00:41:10.021355: Preprocessing of file Tuesday-WorkingHours.pcap_ISCX is complete.
2024-10-26 00:42:22.882226: Preprocessing of file Wednesday-workingHours.pcap_ISCX is complete.
2024-10-26 00:42:38.617067: Preprocessing of file Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX is complete.
2024-10-26 00:43:05.127604: Preprocessing of file Thursday-WorkingHours-Afternoon-Infilteration.pcap_ISCX is complete.
2024-10-26 00:43:22.841571: Preprocessing of file Friday-WorkingHours-Morning.pcap_ISCX is complete.
2024-10-26 00:43:49.867722: Preprocessing of file Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX is complete.
2024-10-26 00:44:11.615518: Preprocessing of file Friday-WorkingHours-Afternoon-DDos.pcap_ISCX is complete.
2024-10-26 00:48:30.780324: Concatenation and saving to CSV is complete.
Total preprocessing time: 484.96 seconds
