In [1]:
from sklearn.feature_selection import VarianceThreshold
from sklearn.impute import SimpleImputer

import pandas as pd
import numpy as np

import subprocess
import glob
import json
import os
import re
import gc

## Read PCAP files, Extract Network Features and Save to CSV Format

In [2]:
class PCAPReader:

    def __init__(self, pcap_path, feature_vector, tool, tshark_path, zeek_path):
        """
        Initializes the feature extraction process.

        Parameters:
            pcap_path (str): Path to the input pcap file.
            features (list): List of features/fields to extract.
            tshark_path (str): Path to the tshark executable (default is 'tshark').
        """
        self.pcap_path = pcap_path
        self.feature_vector = feature_vector
        self.tool = tool
        self.tshark_path = tshark_path
        self.zeek_path = zeek_path
        self.dataframe = None  # Will store the extracted DataFrame
        
    def to_dataframe(self):
        """
        Extracts features from the pcap file and returns them as a DataFrame.
        
        Returns:
            pd.DataFrame: DataFrame containing the extracted features.
        """
        fields = []
        for feature in self.feature_vector:
            fields += ["-e", feature]

        tshark_command = [
            self.tshark_path,
            "-n",                     # No DNS resolution (speeds up processing)
            "-r", self.pcap_path,     # Input pcap file
            "-T", "fields",           # Output in field format
            *fields,                  # Include all requested fields
            '-E', 'header=y',         # Add column headers
            "-E", "separator=\t",     # Use Tab as CSV delimiter
            '-E', 'occurrence=f',     # First occurrence of repeated fields
        ]
        
        try:
            # Run the tshark command and capture the output
            result = subprocess.run(tshark_command, capture_output=True, text=True, check=True)
            
            # Convert the output to a list of lists
            data = [line.split("\t") for line in result.stdout.strip().split("\n")]
            
            # Create a DataFrame from the data
            self.dataframe = pd.DataFrame(data, columns=self.feature_vector)
            return self.dataframe
        
        except subprocess.CalledProcessError as e:
            print(f"Error executing tshark: {e}")
            print("Ensure that tshark is correctly installed and accessible from the specified path.")
            return None

    def to_csv(self, output_file):
        """
        Saves the extracted features DataFrame to a CSV file.

        Parameters:
            output_csv (str): Path to save the CSV file.
        """
        fields = []
        for feature in self.feature_vector:
            fields += ["-e", feature]

        tshark_command = [
            self.tshark_path,
            "-n",                     # No DNS resolution (speeds up processing)
            "-r", self.pcap_path,     # Input pcap file
            "-T", "fields",           # Output in field format
            *fields,                  # Include all requested fields
            '-E', 'header=y',         # Add column headers
            "-E", "separator=\t",     # Use Tab as CSV delimiter
            '-E', 'occurrence=f',     # First occurrence of repeated fields
        ]
        
        try:
            with open(output_file, 'w') as out:
                subprocess.run(tshark_command, stdout=out)

            print(f"tshark parsing complete. File saved as: {output_file}")
        except subprocess.CalledProcessError as e:
            print(f"Error executing tshark: {e}")
            print("Ensure that tshark is correctly installed and accessible from the specified path.")
            return None

In [3]:
# Load features
with open("../protocol_fields_output_missing_values.json", "r") as file:
    feature_config = json.load(file)

features_to_extract = feature_config['features']
features = [feature['field'] for feature in features_to_extract]

In [4]:
def process_pcap_directory(input_dir, output_dir, features, is_malicious=False):
    for root, _, files in os.walk(input_dir):
        for file in files:
            if file.endswith(".pcap"):
                pcap_file = os.path.join(root, file)

                # For malicious traffic, handle subfolder structure
                if is_malicious:
                    # Compute the relative path from the base directory (e.g., "malicious_dir")
                    rel_dir = os.path.relpath(root, input_dir)
    
                    # Correct output directory path by appending the relative directory structure
                    output_subdir = os.path.join(output_dir, rel_dir)
                else:
                    output_subdir = output_dir  # Normal traffic does not have subfolders

                os.makedirs(output_subdir, exist_ok=True)

                # Set output file path in the correct subdirectory
                output_file = os.path.join(output_subdir, file.replace(".pcap", ".csv"))
                
                print(pcap_file)
                pcapreader = PCAPReader(pcap_file, features, None, "tshark", None)
                pcapreader.to_csv(output_file)


# Input directories
normal_dir = "../data/raw/normal/"
malicious_dir = "../data/raw/malicious/"

# Output directories
output_dir_normal = "../data/extracted_features/normal/"
output_dir_malicious = "../data/extracted_features/malicious/"

# # Process normal traffic
# process_pcap_directory(
#     input_dir=normal_dir,
#     output_dir=output_dir_normal,
#     features=features,
#     is_malicious=False
# )

# # Process malicious traffic
# process_pcap_directory(
#     input_dir=malicious_dir,
#     output_dir=output_dir_malicious,
#     features=features,
#     is_malicious=True
# )

## Cleaning Pipeline

In [5]:
def remove_low_variance(df, threshold=0.01):
    """
    Removes features from the DataFrame with variance lower than the specified threshold.

    Parameters:
        df (pd.DataFrame): Input DataFrame.
        threshold (float): The variance threshold. Features with variance below this value will be removed.

    Returns:
        pd.DataFrame: DataFrame with low-variance features dropped.
    """
    # Select numerical columns for variance calculation
    numerical_df = df.select_dtypes(include=["number"])

    # Initialize the VarianceThreshold selector
    selector = VarianceThreshold(threshold=threshold)

    # Fit the selector to the data and identify features to drop
    selector.fit(numerical_df)
    to_drop = numerical_df.columns[~selector.get_support()]

    # Drop the low-variance features from the DataFrame
    return df.drop(columns=to_drop)


def replace_missing_values(df, num_replacement=-1, cat_replacement="unknown"):
    """
    Replaces missing values in a DataFrame.
    - Numerical columns: Replace with a specified value (default -1).
    - Categorical columns: Replace with a specified value (default "unknown").

    Parameters:
        df (pd.DataFrame): Input DataFrame with missing values.
        num_replacement (int/float): Value to replace missing numerical values (default is -1).
        cat_replacement (str): Value to replace missing categorical values (default is "unknown").

    Returns:
        pd.DataFrame: DataFrame with missing values replaced.
    """
    # Replace missing values in numerical columns with the specified replacement value
    num_cols = df.select_dtypes(include=["number"]).columns
    df[num_cols] = df[num_cols].fillna(num_replacement)

    # Replace missing values in categorical columns with the specified replacement value
    cat_cols = df.select_dtypes(include=["object", "category"]).columns
    df[cat_cols] = df[cat_cols].fillna(cat_replacement)

    return df


def remove_high_correlation(df, correlation_threshold=0.9):
    """
    Removes features that have a correlation greater than the specified threshold with any other feature.

    Parameters:
        df (pd.DataFrame): Input DataFrame.
        correlation_threshold (float): The correlation threshold. Features with correlation above this value will be dropped.

    Returns:
        pd.DataFrame: DataFrame with highly correlated features dropped.
    """
    # Select numerical columns to calculate the correlation matrix
    numerical_df = df.select_dtypes(include=["number"])

    # Calculate the absolute correlation matrix
    corr_matrix = numerical_df.corr().abs()

    # Identify the upper triangle of the correlation matrix (excluding the diagonal)
    upper_triangle = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))

    # Identify features to drop based on correlation threshold
    to_drop = [column for column in upper_triangle.columns if any(upper_triangle[column] > correlation_threshold)]

    # Drop the highly correlated features from the DataFrame
    return df.drop(columns=to_drop)


def handle_missing_values(df, missing_threshold=1.0):
    """
    Removes columns from the DataFrame that have missing values above a specified threshold.

    Parameters:
        df (pd.DataFrame): Input DataFrame.
        missing_threshold (float): The missing value threshold. Columns with a percentage of missing values greater than this will be dropped.

    Returns:
        pd.DataFrame: DataFrame with columns containing excessive missing values dropped.
    """
    # Calculate the percentage of missing values for each column
    missing_percentage = df.isnull().mean()

    # Remove columns with missing percentage above the threshold
    return df.loc[:, missing_percentage < missing_threshold]


def clean_features(df):
    """
    Full feature cleaning pipeline: applies multiple cleaning steps to the input DataFrame.
    
    The pipeline includes:
    1. Handling missing values by removing columns with too many missing values.
    2. Replacing missing values with specified replacements.
    3. Removing low variance features.
    4. Removing highly correlated features.

    Parameters:
        df (pd.DataFrame): Input DataFrame.

    Returns:
        pd.DataFrame: Cleaned DataFrame after applying all cleaning steps.
    """
    # Step 1: Remove columns with excessive missing values
    df = handle_missing_values(df)
    
    # Step 2: Replace missing values in the DataFrame
    df = replace_missing_values(df)
    
    # Step 3: Remove low-variance features
    df = remove_low_variance(df)
    
    # Step 4: Remove highly correlated features
    df = remove_high_correlation(df)
    
    return df

## Federated Feature Selection

In [15]:
def process_local_datasets(iot_devices):
    """
    Process and clean datasets for a list of IoT devices by reading feature files,
    cleaning them, and consolidating feature information for each device.

    Parameters:
        iot_devices (list): A list of IoT device identifiers to process.

    Returns:
        list: A list of sets representing the features for each device.
    """
    feature_info = []

    # Iterate over each IoT device
    for iot_device in iot_devices:
        # Get the list of file paths for normal and malicious data for the device
        m_filenames = [f for f in glob.glob(f"../data/extracted_features/malicious/*/{iot_device}_*.csv") if 'mirai-dos' in f]
        b_filenames = glob.glob(f"../data/extracted_features/normal/{iot_device}_*.csv")
        b_filenames = []
    
        # Read and concatenate the chunks from all the files associated with the device
        processed_chunks = []
        for filename in b_filenames + m_filenames:
            # Read each file in chunks to optimize memory usage
            for chunk in pd.read_csv(filename, sep="\t", low_memory=False, chunksize=10000):
                processed_chunks.append(chunk)

        if processed_chunks == []:
            continue
            
        # Concatenate all the chunks into a single DataFrame
        df = pd.concat(processed_chunks)
        
        # Perform local cleaning (e.g., handling missing values, transforming features, etc.)
        df_cleaned = clean_features(df)

        # Track the columns (features) in the cleaned data for this IoT device
        feature_info.append(set(df_cleaned.columns))

        # Free up memory by deleting the DataFrames and forcing garbage collection
        del df
        del df_cleaned
        gc.collect()

        print(f"IoT Device: {iot_device} Done!")

    return feature_info


def federated_feature_consolidation(feature_info):
    """
    Consolidate features across all devices by taking the union of features
    from each device to ensure global consistency of features.

    Parameters:
        feature_info (list): A list of sets representing features for each IoT device.

    Returns:
        list: A consolidated list of global features that are common across all devices.
    """
    # Intersect features across all devices to ensure global consistency
    global_features = set.union(*feature_info)
    return list(global_features)


def apply_global_features(cleaned_data, global_features):
    """
    Apply the global set of features to the cleaned data of each device.

    Parameters:
        cleaned_data (dict): A dictionary where keys are device names and values are DataFrames
                             containing the cleaned feature data for each device.
        global_features (list): A list of globally consistent feature names.

    Returns:
        dict: A dictionary with cleaned data for each device, containing only the global features.
    """
    for device, df_cleaned in cleaned_data.items():
        # Select only the columns that are part of the global feature set
        cleaned_data[device] = df_cleaned[global_features]
    return cleaned_data


def save_cleaned_data(cleaned_data, output_dir):
    """
    Save the cleaned data of each IoT device to CSV files in the specified output directory.

    Parameters:
        cleaned_data (dict): A dictionary where keys are device names and values are DataFrames
                             containing the cleaned feature data for each device.
        output_dir (str): The directory where the cleaned data CSV files will be saved.
    """
    # Ensure the output directory exists
    os.makedirs(output_dir, exist_ok=True)

    # Save each device's cleaned data as a CSV file
    for device, df in cleaned_data.items():
        df.to_csv(os.path.join(output_dir, f"{device}_cleaned.csv"), index=False)

In [16]:
benign_filenames = glob.glob("../data/extracted_features/normal/*.csv")

In [None]:
# Define paths to local datasets
iot_devices = list(set([re.search(r"([a-zA-Z\-]+)-([0-9]+)", f).group(0) for f in benign_filenames]))

# Step 1: Process each device's dataset locally
feature_info = process_local_datasets(iot_devices)

# Step 2: Consolidate features across all devices
m_global_features = federated_feature_consolidation(feature_info)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[num_cols] = df[num_cols].fillna(num_replacement)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[cat_cols] = df[cat_cols].fillna(cat_replacement)


IoT Device: iotsim-ip-camera-street-1 Done!


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[num_cols] = df[num_cols].fillna(num_replacement)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[cat_cols] = df[cat_cols].fillna(cat_replacement)


IoT Device: iotsim-air-quality-1 Done!


In [13]:
# benign_global_features = list(set.union(*feature_info))
# benign_global_features.sort()

print(len(global_features))

252


In [None]:
malicious_global_features = list(set.union(*feature_info))
malicious_global_features.sort()

print(len(malicious_global_features))

In [None]:
len(g_features)

In [None]:
g_features = list(set(malicious_global_features).union(set(benign_global_features)))

In [14]:
res = list(filter(lambda x: x['field'] in global_features, feature_config['features']))
res = {"features": res}
with open("../protocol_fields_output_cleeaned.json", "w") as file:
    json.dump(res, file)