In [1]:
import pandas as pd
import numpy as np

import glob

In [2]:
port_hierarchy_map_iot = [
    ([1883, 8883], "mqttPorts"),
    ([5683, 5684], "coapPorts"),
    ([8554, 8322, 8000, 8001, 8002, 8003, 1935, 8888], "rtspPorts"),
    ([80, 280, 443, 591, 593, 777, 488, 1183, 1184, 2069, 2301, 2381, 8008, 8080], "httpPorts"),
    ([24, 25, 50, 58, 61, 109, 110, 143, 158, 174, 209, 220, 406, 512, 585, 993, 995], "mailPorts"),
    ([42, 53, 81, 101, 105, 261], "dnsPorts"),
    ([20, 21, 47, 69, 115, 152, 189, 349, 574, 662, 989, 990], "ftpPorts"),
    ([22, 23, 59, 87, 89, 107, 211, 221, 222, 513, 614, 759, 992], "shellPorts"),
    ([512, 514], "remoteExecPorts"),
    ([13, 56, 113, 316, 353, 370, 749, 750], "authPorts"),
    ([229, 464, 586, 774], "passwordPorts"),
    ([114, 119, 532, 563], "newsPorts"),
    ([194, 258, 531, 994], "chatPorts"),
    ([35, 92, 170, 515, 631], "printPorts"),
    ([13, 37, 52, 123, 519, 525], "timePorts"),
    ([65, 66, 118, 150, 156, 217], "dbmsPorts"),
    ([546, 547, 647, 847], "dhcpPorts"),
    ([43, 63], "whoisPorts"),
    (range(137, 139 + 1), "netbiosPorts"),
    ([88, 748, 750], "kerberosPorts"),
    ([111, 121, 369, 530, 567, 593, 602], "RPCPorts"),
    ([161, 162, 391], "snmpPorts"),
    (range(0, 1024), "PRIVILEGED_PORTS"),
    (range(1024, 65536), "NONPRIVILEGED_PORTS")
]

def port_to_categories(port_map, port):
    """Convert port number to category according to port_map."""
    for p_range, p_name in port_map:
        if port in p_range:
            return p_name

    return ""

def _unpack_flags():
    df['ip.flags'] = df['ip.flags'].apply(lambda x: int(x, 16))
    ip_flags_array = df['ip.flags'].values.astype(np.uint8)
    ip_flags_array = np.unpackbits(ip_flags_array.reshape((-1, 1)), axis=1, bitorder='little')[:, :3]
    ip_flags_df = pd.DataFrame(ip_flags_array, columns=[f"ip_flag_{x}" for x in range(1, ip_flags_array.shape[1]+1)])

    df['tcp.flags'] = df['tcp.flags'].apply(lambda x: int(x, 16) if x != -1 else 0)
    tcp_flags_array = df['tcp.flags'].values.astype(np.uint8)
    tcp_flags_array = np.unpackbits(tcp_flags_array.reshape((-1, 1)), axis=1, bitorder='little')[:, :9]
    tcp_flags_df = pd.DataFrame(tcp_flags_array, columns=[f"tcp_flag_{x}" for x in range(1, tcp_flags_array.shape[1]+1)])

# Custom function to unpack flags
def unpack_flags(X):
    X = X.copy()
    
    # Unpack IP flags
    ip_flags = X['ip.flags'].apply(lambda x: int(x, 16)).values.astype(np.uint8)
    ip_flags = np.unpackbits(ip_flags.reshape((-1, 1)), axis=1, bitorder='little')[:, :3]
    
    # Unpack TCP flags (handle -1 for missing values)
    tcp_flags = X['tcp.flags'].apply(lambda x: int(x, 16) if x != -1 else 0).values.astype(np.uint8)
    tcp_flags = np.unpackbits(tcp_flags.reshape((-1, 1)), axis=1, bitorder='little')[:, :9]
    
    # Combine flags into a single output array
    return np.hstack([ip_flags, tcp_flags])

global_categorical_values = pd.DataFrame({
    "ip.protocol": ['TCP', 'UDP', 'ICMP', 'TCP', 'UDP', 'ICMP', 'TCP', 'UDP', 'ICMP', 'TCP', 'UDP', 'ICMP', 'TCP', 'UDP', 'ICMP', 'TCP', 'UDP', 'ICMP', 'TCP', 'UDP', 'ICMP', 'TCP', 'UDP', 'ICMP', 'TCP'],
    "src.port": ['mqttPorts', 'coapPorts', 'rtspPorts', 'httpPorts', 'mailPorts', 'dnsPorts', 'ftpPorts', 'shellPorts', 'remoteExecPorts', 'authPorts', 'passwordPorts', 'newsPorts', 'chatPorts', 'printPorts', 'timePorts', 'dbmsPorts', 'dhcpPorts', 'whoisPorts', 'netbiosPorts', 'kerberosPorts', 'RPCPorts', 'snmpPorts', 'PRIVILEGED_PORTS', 'NONPRIVILEGED_PORTS', ''],
    "dst.port": ['mqttPorts', 'coapPorts', 'rtspPorts', 'httpPorts', 'mailPorts', 'dnsPorts', 'ftpPorts', 'shellPorts', 'remoteExecPorts', 'authPorts', 'passwordPorts', 'newsPorts', 'chatPorts', 'printPorts', 'timePorts', 'dbmsPorts', 'dhcpPorts', 'whoisPorts', 'netbiosPorts', 'kerberosPorts', 'RPCPorts', 'snmpPorts', 'PRIVILEGED_PORTS', 'NONPRIVILEGED_PORTS', ''],
})

In [3]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer, StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
import os

def preprocess(df):
    # Drop len columns
    df.drop(columns=['ip.len', 'tcp.len', 'udp.length'], axis=1, inplace=True)

    ################
    src_ports, dst_ports, protocols = [], [], []
    for idx, pkt in df.iterrows():
        if ':tcp' in pkt['frame.protocols']:
            protocol = 'TCP'
            src_port = int(pkt['tcp.srcport'])
            dst_port = int(pkt['tcp.dstport'])
        elif ':udp' in pkt['frame.protocols']:
            protocol = 'UDP'
            src_port = int(pkt['udp.srcport'])
            dst_port = int(pkt['udp.dstport'])
        elif ':icmp' in pkt['frame.protocols']:
            protocol = 'ICMP'
            src_port = np.nan
            dst_port = np.nan
        
        protocols.append(protocol)
        src_ports.append(src_port)
        dst_ports.append(dst_port)

    df['ip.protocol'] = protocols
    df['src.port'] = src_ports
    df['dst.port'] = dst_ports

    df.drop(columns=['ip.proto', 'tcp.srcport', 'tcp.dstport', 'udp.srcport', 'udp.dstport'], axis=1, inplace=True)

    ################ 
    df['frame.time'] = df['frame.time'].str.replace('  ', ' ')
    df['frame.time'] = df['frame.time'].str.replace(' BST', '')
    df['frame.time'] = pd.to_datetime(df['frame.time'], format="%b %d, %Y %H:%M:%S.%f000")
    df['timestamp'] = df['frame.time'].values.astype(np.int64) // 10 ** 9

    ################ 
    df['src.port'] = df['src.port'].apply(lambda port: port_to_categories(port_hierarchy_map_iot, port))
    df['dst.port'] = df['dst.port'].apply(lambda port: port_to_categories(port_hierarchy_map_iot, port))

    ################
    # Convert the checksum to an integer, replacing missing values or invalid entries with a default.
    df['ip.checksum'] = df['ip.checksum'].apply(lambda x: int(str(x), 16) if pd.notna(x) else 0)
    df['tcp.checksum'] = df['tcp.checksum'].apply(lambda x: int(str(x), 16) if pd.notna(x) else 0)
    df['tcp.options'] = df['tcp.options'].apply(lambda x: int(str(x), 16) if pd.notna(x) else 0).astype(float)

    ################
    num_cols = df.select_dtypes(include=["number"]).columns
    df[num_cols] = df[num_cols].fillna(-1)

    cat_cols = df.select_dtypes(exclude=["number"]).columns
    df[cat_cols] = df[cat_cols].fillna(-1)

    ################
    selected_columns = ['timestamp', 'frame.len', 'ip.protocol', 'src.port', 'dst.port', 'ip.flags', 'ip.ttl', 'ip.checksum',
        'tcp.flags', 'tcp.window_size_value', 'tcp.window_size_scalefactor', 'tcp.checksum', 'tcp.options',
        'tcp.pdu.size', 'label']
    
    df['label'] = df['label'].str.replace('^C&C Communication', 'Mirai C&C Communication', regex=True)
    df = df[selected_columns]
    df = df[df['label'] != 'Unknown']

    return df

In [7]:
# filenames = glob.glob("../data/ready/*.csv")
# filenames.index('../data/ready/iotsim-building-monitor-1.csv')
# filenames[19:20]

['../data/ready/iotsim-building-monitor-1.csv']

In [5]:
import gc

filenames = glob.glob("../data/ready/*.csv")

for filename in filenames:
    
    processed_chunks = []
    print(filename)
    for chunk in pd.read_csv(filename, sep=",", low_memory=False, chunksize=10000):
        chunk = preprocess(chunk)
        processed_chunks.append(chunk)

    print("done")
    # Concatenate all the chunks into a single DataFrame
    df = pd.concat(processed_chunks)

    # Custom transformer
    flags_transformer = FunctionTransformer(unpack_flags)

    labels = df['label']
    features = df.drop(labels=['label'], axis=1)

    validation_size, testing_size = 0.2, 0.2
    X_train, X_test, y_train, y_test = train_test_split(
        features,
        labels,
        test_size=(validation_size + testing_size),
        random_state=42,
        stratify=labels
    )
    X_test, X_val, y_test, y_val = train_test_split(
        X_test,
        y_test,
        test_size=testing_size / (validation_size + testing_size),
        random_state=42
    )

    categorical_features = features.select_dtypes(exclude=["number"]).columns
    numeric_features = features.select_dtypes(exclude=[object]).columns

    preprocessor = ColumnTransformer(transformers=[
        ('flags', flags_transformer, ['ip.flags', 'tcp.flags']),
        ('categoricals', OneHotEncoder(drop='first', sparse_output=True, handle_unknown='error'), ['ip.protocol', 'src.port', 'dst.port']),
        ('numericals', StandardScaler(), numeric_features)
    ])

    preprocessor.fit(X_train)
    preprocessor['categoricals'].fit(global_categorical_values)

    # Preprocess the features
    X_train = pd.DataFrame(preprocessor.transform(X_train))
    X_val = pd.DataFrame(preprocessor.transform(X_val))
    X_test = pd.DataFrame(preprocessor.transform(X_test))

    # Preprocess the labels
    le = LabelEncoder()
    le.fit(pd.DataFrame({
        'label': ["Normal", "TCP Scan", "UDP Scan", "Telnet Brute Force", "Reporting", "Ingress Tool Transfer", "File Download", "CoAP Amplification", "Merlin TCP Flooding", "Merlin UDP Flooding", "Merlin ICMP Flooding", "Merlin C&C Communication", "Mirai TCP Flooding", "Mirai UDP Flooding", "Mirai GRE Flooding", "Mirai C&C Communication"]
    }))

    y_train = pd.DataFrame(le.transform(y_train), columns=["label"])
    y_val = pd.DataFrame(le.transform(y_val), columns=["label"])
    y_test = pd.DataFrame(le.transform(y_test), columns=["label"])

    iot_device = filename.rstrip('.csv').split('/')[-1]
    
    # Save the results
    X_train.to_pickle(os.path.join('..', 'data', 'final', f'{iot_device}_train_features.pkl'), compression='gzip')
    X_val.to_pickle(os.path.join('..', 'data', 'final', f'{iot_device}_val_features.pkl'), compression='gzip')
    X_test.to_pickle(os.path.join('..', 'data', 'final', f'{iot_device}_test_features.pkl'), compression='gzip')

    y_train.to_pickle(os.path.join('..', 'data', 'final', f'{iot_device}_train_labels.pkl'), compression='gzip')
    y_val.to_pickle(os.path.join('..', 'data', 'final', f'{iot_device}_val_labels.pkl'), compression='gzip')
    y_test.to_pickle(os.path.join('..', 'data', 'final', f'{iot_device}_test_labels.pkl'), compression='gzip')

    # Free up memory by deleting the DataFrames and forcing garbage collection
    del features
    del labels
    del df
    del X_train
    del y_train
    del X_val
    del y_val
    del X_test
    del y_test
    gc.collect()

../data/ready/iotsim-cooler-motor-15.csv
done


  y = column_or_1d(y, warn=True)


../data/ready/iotsim-cooler-motor-14.csv
done


  y = column_or_1d(y, warn=True)


../data/ready/iotsim-cooler-motor-13.csv
done


  y = column_or_1d(y, warn=True)


../data/ready/iotsim-cooler-motor-8.csv
done


  y = column_or_1d(y, warn=True)


../data/ready/iotsim-cooler-motor-9.csv
done


  y = column_or_1d(y, warn=True)


../data/ready/iotsim-cooler-motor-12.csv
done


  y = column_or_1d(y, warn=True)


../data/ready/iotsim-cooler-motor-10.csv
done


  y = column_or_1d(y, warn=True)


../data/ready/iotsim-hydraulic-system-8.csv
done
../data/ready/iotsim-hydraulic-system-9.csv


  y = column_or_1d(y, warn=True)


done
../data/ready/iotsim-cooler-motor-11.csv


  y = column_or_1d(y, warn=True)


done


  y = column_or_1d(y, warn=True)


../data/ready/iotsim-hydraulic-system-12.csv
done
../data/ready/iotsim-predictive-maintenance-8.csv


  y = column_or_1d(y, warn=True)


done


  y = column_or_1d(y, warn=True)


../data/ready/iotsim-ip-camera-museum-1.csv
done


  y = column_or_1d(y, warn=True)


../data/ready/iotsim-building-monitor-3.csv
done
../data/ready/iotsim-building-monitor-2.csv


  y = column_or_1d(y, warn=True)


done
../data/ready/iotsim-predictive-maintenance-9.csv


  y = column_or_1d(y, warn=True)


done


  y = column_or_1d(y, warn=True)


../data/ready/iotsim-hydraulic-system-13.csv
done
../data/ready/iotsim-hydraulic-system-11.csv


  y = column_or_1d(y, warn=True)


done
../data/ready/iotsim-ip-camera-museum-2.csv


  y = column_or_1d(y, warn=True)


done


  y = column_or_1d(y, warn=True)


../data/ready/iotsim-building-monitor-1.csv
done


  y = column_or_1d(y, warn=True)


../data/ready/iotsim-hydraulic-system-10.csv
done
../data/ready/iotsim-combined-cycle-9.csv


  y = column_or_1d(y, warn=True)


done
../data/ready/iotsim-hydraulic-system-14.csv


  y = column_or_1d(y, warn=True)


done
../data/ready/iotsim-ip-camera-street-2.csv


  y = column_or_1d(y, warn=True)


done


  y = column_or_1d(y, warn=True)


../data/ready/iotsim-building-monitor-5.csv
done
../data/ready/iotsim-combined-cycle-10.csv


  y = column_or_1d(y, warn=True)


done
../data/ready/iotsim-building-monitor-4.csv


  y = column_or_1d(y, warn=True)


done
../data/ready/iotsim-hydraulic-system-15.csv


  y = column_or_1d(y, warn=True)


done
../data/ready/iotsim-combined-cycle-8.csv


  y = column_or_1d(y, warn=True)


done
../data/ready/iotsim-ip-camera-street-1.csv


  y = column_or_1d(y, warn=True)


done


  y = column_or_1d(y, warn=True)


../data/ready/iotsim-combined-cycle-6.csv
done


  y = column_or_1d(y, warn=True)


../data/ready/iotsim-stream-consumer-1.csv
done


  y = column_or_1d(y, warn=True)


../data/ready/iotsim-predictive-maintenance-1.csv
done


  y = column_or_1d(y, warn=True)


../data/ready/iotsim-combined-cycle-7.csv
done


  y = column_or_1d(y, warn=True)


../data/ready/iotsim-combined-cycle-5.csv
done


  y = column_or_1d(y, warn=True)


../data/ready/iotsim-stream-consumer-2.csv
done


  y = column_or_1d(y, warn=True)


../data/ready/iotsim-domotic-monitor-5.csv
done
../data/ready/iotsim-predictive-maintenance-2.csv


  y = column_or_1d(y, warn=True)


done


  y = column_or_1d(y, warn=True)


../data/ready/iotsim-predictive-maintenance-3.csv
done


  y = column_or_1d(y, warn=True)


../data/ready/iotsim-domotic-monitor-4.csv
done
../data/ready/iotsim-combined-cycle-4.csv


  y = column_or_1d(y, warn=True)


done
../data/ready/iotsim-predictive-maintenance-7.csv


  y = column_or_1d(y, warn=True)


done


  y = column_or_1d(y, warn=True)


../data/ready/iotsim-domotic-monitor-1.csv
done


  y = column_or_1d(y, warn=True)


../data/ready/iotsim-predictive-maintenance-6.csv
done


  y = column_or_1d(y, warn=True)


../data/ready/iotsim-combined-cycle-1.csv
done


  y = column_or_1d(y, warn=True)


../data/ready/iotsim-combined-cycle-3.csv
done


  y = column_or_1d(y, warn=True)


../data/ready/iotsim-air-quality-1.csv
done


  y = column_or_1d(y, warn=True)


../data/ready/iotsim-predictive-maintenance-4.csv
done


  y = column_or_1d(y, warn=True)


../data/ready/iotsim-domotic-monitor-3.csv
done
../data/ready/iotsim-domotic-monitor-2.csv


  y = column_or_1d(y, warn=True)


done
../data/ready/iotsim-predictive-maintenance-5.csv


  y = column_or_1d(y, warn=True)


done


  y = column_or_1d(y, warn=True)


../data/ready/iotsim-combined-cycle-2.csv
done
../data/ready/iotsim-combined-cycle-tls-2.csv


  y = column_or_1d(y, warn=True)


done
../data/ready/iotsim-predictive-maintenance-10.csv


  y = column_or_1d(y, warn=True)


done


  y = column_or_1d(y, warn=True)


../data/ready/iotsim-hydraulic-system-4.csv
done
../data/ready/iotsim-cooler-motor-7.csv


  y = column_or_1d(y, warn=True)


done


  y = column_or_1d(y, warn=True)


../data/ready/iotsim-cooler-motor-6.csv
done


  y = column_or_1d(y, warn=True)


../data/ready/iotsim-hydraulic-system-5.csv
done
../data/ready/iotsim-predictive-maintenance-11.csv


  y = column_or_1d(y, warn=True)


done


  y = column_or_1d(y, warn=True)


../data/ready/iotsim-combined-cycle-tls-3.csv
done
../data/ready/iotsim-combined-cycle-tls-1.csv


  y = column_or_1d(y, warn=True)


done
../data/ready/iotsim-predictive-maintenance-13.csv


  y = column_or_1d(y, warn=True)


done


  y = column_or_1d(y, warn=True)


../data/ready/iotsim-hydraulic-system-7.csv
done
../data/ready/iotsim-cooler-motor-4.csv


  y = column_or_1d(y, warn=True)


done


  y = column_or_1d(y, warn=True)


../data/ready/iotsim-cooler-motor-5.csv
done


  y = column_or_1d(y, warn=True)


../data/ready/iotsim-hydraulic-system-6.csv
done
../data/ready/iotsim-predictive-maintenance-12.csv


  y = column_or_1d(y, warn=True)


done


  y = column_or_1d(y, warn=True)


../data/ready/iotsim-combined-cycle-tls-4.csv
done
../data/ready/iotsim-hydraulic-system-2.csv


  y = column_or_1d(y, warn=True)


done
../data/ready/iotsim-cooler-motor-1.csv


  y = column_or_1d(y, warn=True)


done


  y = column_or_1d(y, warn=True)


../data/ready/iotsim-hydraulic-system-3.csv
done
../data/ready/iotsim-combined-cycle-tls-5.csv


  y = column_or_1d(y, warn=True)


done
../data/ready/iotsim-hydraulic-system-1.csv


  y = column_or_1d(y, warn=True)


done


  y = column_or_1d(y, warn=True)


../data/ready/iotsim-predictive-maintenance-15.csv
done


  y = column_or_1d(y, warn=True)


../data/ready/iotsim-cooler-motor-2.csv
done


  y = column_or_1d(y, warn=True)


../data/ready/iotsim-cooler-motor-3.csv
done


  y = column_or_1d(y, warn=True)


../data/ready/iotsim-predictive-maintenance-14.csv
done


  y = column_or_1d(y, warn=True)


../data/ready/iotsim-city-power-1.csv
done


  y = column_or_1d(y, warn=True)


In [6]:
print("done")

done


In [None]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer


class GothamDatasetPreprocessor(object):

    def __init__(self, data, training_size, validation_size, testing_size, num_replacement=-1, cat_replacement = "unknown"):
        self.data = data
        self.training_size = training_size
        self.validation_size = validation_size
        self.testing_size = testing_size
        
        self.features = None
        self.label = None
        self.num_replacement = num_replacement
        self.cat_replacement = cat_replacement
    
    def remove_columns(self):
        """"""
        # Columns to drop
        columns_to_drop = ['frame.protocols', 'eth.src', 'eth.dst', 'ip.dst', 'ip.src', 'ip.len', 'tcp.len', 'udp.length']

        # Remove duplicate rows
        self.data.drop(axis=1, columns=columns_to_drop, inplace=True)

    def remove_duplicate_values(self):
        """"""
        # Remove duplicate rows
        self.data.drop_duplicates(inplace=True, keep=False, ignore_index=True)

    def remove_missing_values(self):
        """"""
        # Remove missing values
        self.data.dropna(axis=0, inplace=True, how="any")

    def remove_infinite_values(self):
        """"""
        # Replace infinite values to NaN
        self.data.replace([-np.inf, np.inf], np.nan, inplace=True)

        # Remove infinte values
        self.data.dropna(axis=0, how='any', inplace=True)

    def clean_protocol_and_ports(self):
        """"""
        src_ports, dst_ports, protocols = [], [], []
        for idx, pkt in self.data.iterrows():
            if ':tcp' in pkt['frame.protocols']:
                protocol = 'TCP'
                src_port = int(pkt['tcp.srcport'])
                dst_port = int(pkt['tcp.dstport'])
            elif ':udp' in pkt['frame.protocols']:
                protocol = 'UDP'
                src_port = int(pkt['udp.srcport'])
                dst_port = int(pkt['udp.dstport'])
            elif ':icmp' in pkt['frame.protocols']:
                protocol = 'ICMP'
                src_port = np.nan
                dst_port = np.nan
            
            protocols.append(protocol)
            src_ports.append(src_port)
            dst_ports.append(dst_port)

        self.data['ip.protocol'] = protocols
        self.data['src.port'] = src_ports
        self.data['dst.port'] = dst_ports

        self.data.drop(columns=['ip.proto', 'tcp.srcport', 'tcp.dstport', 'udp.srcport', 'udp.dstport'], axis=1, inplace=True)


    def clean_time_feature(self):
        """"""
        self.data['frame.time'] = self.data['frame.time'].str.replace('  ', ' ')
        self.data['frame.time'] = self.data['frame.time'].str.replace(' BST', '')
        self.data['frame.time'] = pd.to_datetime(self.data['frame.time'], format="%b %d, %Y %H:%M:%S.%f000")
        self.data['timestamp'] = self.data['frame.time'].values.astype(np.int64) // 10 ** 9


    def convert_port_to_categories(self):
        def port_to_categories(port_map, port):
            """Convert port number to category according to port_map."""
            for p_range, p_name in port_map:
                if port in p_range:
                    return p_name

            return ""

        df['src.port'] = df['src.port'].apply(lambda port: port_to_categories(port_hierarchy_map_iot, port))
        df['dst.port'] = df['dst.port'].apply(lambda port: port_to_categories(port_hierarchy_map_iot, port))
    
    def replace_missing_values(self):
        """
        Replaces missing values in a DataFrame.
        - Numerical columns: Replace with a specified value (default -1).
        - Categorical columns: Replace with a specified value (default "unknown").

        Parameters:
            df (pd.DataFrame): Input DataFrame with missing values.

        Returns:
            pd.DataFrame: DataFrame with missing values replaced.
        """
        num_cols = self.data.select_dtypes(include=["number"]).columns
        self.data[num_cols] = self.data[num_cols].fillna(self.num_replacement)

        cat_cols = self.data.select_dtypes(include=["object", "category"]).columns
        self.data[cat_cols] = self.data[cat_cols].fillna(self.cat_replacement)
    
    def train_valid_test_split(self):
        """."""
        self.labels = self.data['label']
        self.features = self.data.drop(labels=['label'], axis=1)

        X_train, X_test, y_train, y_test = train_test_split(
            self.features,
            self.labels,
            test_size=(self.validation_size + self.testing_size),
            random_state=42,
            stratify=self.labels
        )
        X_test, X_val, y_test, y_val = train_test_split(
            X_test,
            y_test,
            test_size=self.testing_size / (self.validation_size + self.testing_size),
            random_state=42
        )
    
        return (X_train, y_train), (X_val, y_val), (X_test, y_test)
    
    def scale(self, training_set, validation_set, testing_set):
        """"""
        (X_train, y_train), (X_val, y_val), (X_test, y_test) = training_set, validation_set, testing_set
        
        categorical_features = self.features.select_dtypes(exclude=["number"]).columns
        numeric_features = self.features.select_dtypes(exclude=[object]).columns

        preprocessor = ColumnTransformer(transformers=[
            ('categoricals', OneHotEncoder(drop='first', sparse_output=False, handle_unknown='error'), categorical_features),
            ('numericals', StandardScaler(), numeric_features)
        ])

        # Preprocess the features
        columns = numeric_features.tolist()

        X_train = pd.DataFrame(preprocessor.fit_transform(X_train), columns=columns, index=X_train.index)
        X_val = pd.DataFrame(preprocessor.transform(X_val), columns=columns, index=X_val.index)
        X_test = pd.DataFrame(preprocessor.transform(X_test), columns=columns, index=X_test.index)

        # Preprocess the labels
        le = LabelEncoder()

        y_train = pd.DataFrame(le.fit_transform(y_train), columns=["label"])
        y_val = pd.DataFrame(le.transform(y_val), columns=["label"])
        y_test = pd.DataFrame(le.transform(y_test), columns=["label"])

        return (X_train, y_train), (X_val, y_val), (X_test, y_test)

In [152]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer


class GothamDatasetPreprocessor(object):

    def __init__(self, data, training_size, validation_size, testing_size, num_replacement=-1, cat_replacement = "unknown"):
        self.data = data
        self.training_size = training_size
        self.validation_size = validation_size
        self.testing_size = testing_size
        
        self.features = None
        self.label = None
        self.num_replacement = num_replacement
        self.cat_replacement = cat_replacement
    
    def remove_columns(self):
        """"""
        # Columns to drop
        columns_to_drop = ['frame.protocols', 'eth.src', 'eth.dst', 'ip.dst', 'ip.src']

        # Remove duplicate rows
        self.data.drop(axis=1, columns=columns_to_drop, inplace=True)

    def remove_duplicate_values(self):
        """"""
        # Remove duplicate rows
        self.data.drop_duplicates(inplace=True, keep=False, ignore_index=True)

    def remove_missing_values(self):
        """"""
        # Remove missing values
        self.data.dropna(axis=0, inplace=True, how="any")

    def remove_infinite_values(self):
        """"""
        # Replace infinite values to NaN
        self.data.replace([-np.inf, np.inf], np.nan, inplace=True)

        # Remove infinte values
        self.data.dropna(axis=0, how='any', inplace=True)

    def remove_constant_features(self, threshold=0.01):
        """"""
        # Standard deviation denoted by sigma (σ) is the average of the squared root differences from the mean.
        data_std = self.data.std(numeric_only=True)

        # Find Features that meet the threshold
        constant_features = [column for column, std in data_std.items() if std < threshold]

        # Drop the constant features
        self.data.drop(labels=constant_features, axis=1, inplace=True)

    def remove_correlated_features(self, threshold=0.99):
        """"""
        # Correlation matrix
        numerical_df = self.data.select_dtypes(include=["number"])
        data_corr = numerical_df.corr()

        # Create & Apply mask
        mask = np.triu(np.ones_like(data_corr, dtype=bool))
        tri_df = data_corr.mask(mask)

        # Find Features that meet the threshold
        correlated_features = [c for c in tri_df.columns if any(tri_df[c] > threshold)]

        # Drop the highly correlated features
        self.data.drop(labels=correlated_features, axis=1, inplace=True)

    def replace_missing_values(self):
        """
        Replaces missing values in a DataFrame.
        - Numerical columns: Replace with a specified value (default -1).
        - Categorical columns: Replace with a specified value (default "unknown").

        Parameters:
            df (pd.DataFrame): Input DataFrame with missing values.

        Returns:
            pd.DataFrame: DataFrame with missing values replaced.
        """
        num_cols = self.data.select_dtypes(include=["number"]).columns
        self.data[num_cols] = self.data[num_cols].fillna(self.num_replacement)

        cat_cols = self.data.select_dtypes(include=["object", "category"]).columns
        self.data[cat_cols] = self.data[cat_cols].fillna(self.cat_replacement)
    
    def train_valid_test_split(self):
        """."""
        self.labels = self.data['label']
        self.features = self.data.drop(labels=['label'], axis=1)

        X_train, X_test, y_train, y_test = train_test_split(
            self.features,
            self.labels,
            test_size=(self.validation_size + self.testing_size),
            random_state=42,
            stratify=self.labels
        )
        X_test, X_val, y_test, y_val = train_test_split(
            X_test,
            y_test,
            test_size=self.testing_size / (self.validation_size + self.testing_size),
            random_state=42
        )
    
        return (X_train, y_train), (X_val, y_val), (X_test, y_test)
    
    def scale(self, training_set, validation_set, testing_set):
        """"""
        (X_train, y_train), (X_val, y_val), (X_test, y_test) = training_set, validation_set, testing_set
        
        categorical_features = self.features.select_dtypes(exclude=["number"]).columns
        numeric_features = self.features.select_dtypes(exclude=[object]).columns

        preprocessor = ColumnTransformer(transformers=[
            ('categoricals', OneHotEncoder(drop='first', sparse_output=False, handle_unknown='error'), categorical_features),
            ('numericals', StandardScaler(), numeric_features)
        ])

        # Preprocess the features
        columns = numeric_features.tolist()

        X_train = pd.DataFrame(preprocessor.fit_transform(X_train), columns=columns, index=X_train.index)
        X_val = pd.DataFrame(preprocessor.transform(X_val), columns=columns, index=X_val.index)
        X_test = pd.DataFrame(preprocessor.transform(X_test), columns=columns, index=X_test.index)

        # Preprocess the labels
        le = LabelEncoder()

        y_train = pd.DataFrame(le.fit_transform(y_train), columns=["label"])
        y_val = pd.DataFrame(le.transform(y_val), columns=["label"])
        y_test = pd.DataFrame(le.transform(y_test), columns=["label"])

        return (X_train, y_train), (X_val, y_val), (X_test, y_test)