In [2]:
# Import parquet file with pySpark
from pyspark.sql import SparkSession

# Start (or reuse) the Spark session used by this notebook
spark = (
    SparkSession.builder
    .appName("parquet")
    .getOrCreate()
)

# Load the training set from its parquet file
df = spark.read.parquet("ml_data_train.parquet")

# Preview the loaded rows
df.show()


+------------------+--------------+--------------------------+--------------------+--------------------------+-----------------------+-----------------------+------------+
| src_subnet_hashed|dst_subnet_vec|scaled_bytes_pkts_features|scaled_flow_duration|scaled_throughput_features|scaled_pkt_len_features|scaled_tcp_win_features|attack_index|
+------------------+--------------+--------------------------+--------------------+--------------------------+-----------------------+-----------------------+------------+
|(1024,[906],[1.0])| (6,[0],[1.0])|      [-0.3340954290867...|[-0.5099706138072...|      [-0.0734753712719...|   [-0.4919874575015...|   [-0.5363250954674...|         1.0|
|(1024,[906],[1.0])| (6,[0],[1.0])|      [-0.3340954290867...|[-0.5099706138072...|      [-0.0734753712719...|   [-0.4919874575015...|   [-0.5363250954674...|         1.0|
|(1024,[906],[1.0])| (6,[0],[1.0])|      [-0.3375790522617...|[-0.5099706138072...|      [-0.0734771216598...|   [-0.4999859529900...|   [-0

#### Feature Scaling
Feature scaling can play a crucial role in optimizing the performance of machine learning models, especially those sensitive to the scale of input data.
Applying feature scaling to the right variables is essential for improving model accuracy and efficiency. It ensures that each feature contributes equally to the decision-making process, preventing models from misinterpreting the data due to arbitrary feature scales. This leads to better, more reliable predictions in multiclass classification tasks.

##### Numerical Variables with Different Scales:

In [None]:
# Numerical columns that are candidates for scaling, grouped by feature family
columns_of_interest_for_scaling = [
    # Byte and packet counters
    "IN_BYTES",
    "OUT_BYTES",
    "IN_PKTS",
    "OUT_PKTS",
    # Flow duration
    "FLOW_DURATION_MILLISECONDS",
    # Per-direction throughput
    "SRC_TO_DST_SECOND_BYTES",
    "DST_TO_SRC_SECOND_BYTES",
    # Packet lengths
    "LONGEST_FLOW_PKT",
    "SHORTEST_FLOW_PKT",
    "MIN_IP_PKT_LEN",
    "MAX_IP_PKT_LEN",
    # TCP window sizes
    "TCP_WIN_MAX_IN",
    "TCP_WIN_MAX_OUT",
]

In [None]:
from pyspark.ml.feature import VectorAssembler, StandardScaler, FeatureHasher, OneHotEncoder
from pyspark.ml import Pipeline

# Destination subnet: one-hot encode the category index.
# One-hot suits this column because it has few distinct values, so the encoded
# vector stays narrow (width 6 in the saved training output).
dst_subnet_encoder = OneHotEncoder(inputCol="dst_subnet_index", outputCol="dst_subnet_vec")

# Source subnet: hash into a fixed 1024-slot vector, which bounds the feature
# width regardless of how many distinct source subnets exist.
# `categoricalCols` matters here: FeatureHasher treats numeric columns as
# continuous by default, so every row would land in the single slot derived from
# the column name, with the index as its magnitude. Declaring the column
# categorical makes each distinct subnet hash to its own slot with value 1.0.
# NOTE(review): assumes `src_subnet_index` is a numeric index column - confirm
# against the cell that creates it. String columns are always hashed as
# categorical, so the argument should be harmless in that case.
src_subnet_hasher = FeatureHasher(
    inputCols=["src_subnet_index"],
    outputCol="src_subnet_hashed",
    numFeatures=1024,
    categoricalCols=["src_subnet_index"],
)


def _assemble_and_scale(input_cols, assembled_col, scaled_col):
    """Build the assembler/scaler pair for one group of numerical columns.

    Args:
        input_cols: Names of the raw numerical columns in the group.
        assembled_col: Name of the intermediate vector column the assembler writes.
        scaled_col: Name of the standardized vector column the scaler writes.

    Returns:
        A ``(VectorAssembler, StandardScaler)`` tuple. The scaler centers on the
        mean and scales to unit standard deviation.
    """
    assembler = VectorAssembler(inputCols=input_cols, outputCol=assembled_col)
    scaler = StandardScaler(
        inputCol=assembled_col,
        outputCol=scaled_col,
        withStd=True,
        withMean=True,
    )
    return assembler, scaler


# Assemble each family of numerical features into a vector and standardize it.
# Families are scaled separately so each one ends up in its own output column.
bytes_pkts_assembler, bytes_pkts_scaler = _assemble_and_scale(
    ["IN_BYTES", "OUT_BYTES", "IN_PKTS", "OUT_PKTS"],
    "bytes_pkts_features",
    "scaled_bytes_pkts_features",
)

flow_duration_assembler, flow_duration_scaler = _assemble_and_scale(
    ["FLOW_DURATION_MILLISECONDS"],
    "flow_duration_feature",
    "scaled_flow_duration",
)

throughput_assembler, throughput_scaler = _assemble_and_scale(
    ["SRC_TO_DST_SECOND_BYTES", "DST_TO_SRC_SECOND_BYTES"],
    "throughput_features",
    "scaled_throughput_features",
)

pkt_len_assembler, pkt_len_scaler = _assemble_and_scale(
    ["LONGEST_FLOW_PKT", "SHORTEST_FLOW_PKT", "MIN_IP_PKT_LEN", "MAX_IP_PKT_LEN"],
    "pkt_len_features",
    "scaled_pkt_len_features",
)

tcp_win_assembler, tcp_win_scaler = _assemble_and_scale(
    ["TCP_WIN_MAX_IN", "TCP_WIN_MAX_OUT"],
    "tcp_win_features",
    "scaled_tcp_win_features",
)

# Create a pipeline for scaling: subnet encoders first, then each
# assembler immediately followed by the scaler that consumes its output.
scaling_pipeline = Pipeline(stages=[
    dst_subnet_encoder,
    src_subnet_hasher,
    bytes_pkts_assembler,
    bytes_pkts_scaler,
    flow_duration_assembler,
    flow_duration_scaler,
    throughput_assembler,
    throughput_scaler,
    pkt_len_assembler,
    pkt_len_scaler,
    tcp_win_assembler,
    tcp_win_scaler,
])

# Fit the scaling pipeline on the encoded training data only, so the fitted
# stages learn nothing from the test set.
# (The variable keeps its original `scalling_model` spelling so any other cell
# that references it still works.)
scalling_model = scaling_pipeline.fit(encoded_train_data)

# Transform both encoded training and test data with the train-fitted model
processed_train_data = scalling_model.transform(encoded_train_data)
processed_test_data = scalling_model.transform(encoded_test_data)