In [1]:
# check if pyspark works
import pyspark
print(pyspark.__version__)

3.5.1


In [2]:
from pyspark.sql import SparkSession

# Start spark session
spark = SparkSession.builder \
    .appName("Multiclass classification IoT") \
    .getOrCreate()

# Load test data
test_data = spark.read.csv(r"./dataset/NF-ToN-IoT-v2-test.csv", header=True, inferSchema=True)

# Load train data
train_data = spark.read.csv(r"./dataset/NF-ToN-IoT-v2-train.csv", header=True, inferSchema=True)

In [3]:
# Display the first few rows of the training dataset to verify it's loaded correctly
train_data.show(5)

# Display the first few rows of the testing dataset to verify it's loaded correctly
test_data.show(5)

+-------------+-----------+-------------+-----------+--------+--------+--------+-------+---------+--------+---------+----------------+----------------+--------------------------+-----------+------------+-------+-------+----------------+-----------------+--------------+--------------+-----------------------+-----------------------+----------------------+---------------------+-----------------------+----------------------+-------------------------+-------------------------+------------------------+-------------------------+-------------------------+--------------------------+---------------------------+--------------+---------------+---------+--------------+------------+--------------+--------------+--------------------+-----+--------+
|IPV4_SRC_ADDR|L4_SRC_PORT|IPV4_DST_ADDR|L4_DST_PORT|PROTOCOL|L7_PROTO|IN_BYTES|IN_PKTS|OUT_BYTES|OUT_PKTS|TCP_FLAGS|CLIENT_TCP_FLAGS|SERVER_TCP_FLAGS|FLOW_DURATION_MILLISECONDS|DURATION_IN|DURATION_OUT|MIN_TTL|MAX_TTL|LONGEST_FLOW_PKT|SHORTEST_FLOW_PKT|MI

In [4]:
# Print the schema of the loaded data to confirm the data types of each column
train_data.printSchema()
test_data.printSchema()

root
 |-- IPV4_SRC_ADDR: string (nullable = true)
 |-- L4_SRC_PORT: integer (nullable = true)
 |-- IPV4_DST_ADDR: string (nullable = true)
 |-- L4_DST_PORT: integer (nullable = true)
 |-- PROTOCOL: integer (nullable = true)
 |-- L7_PROTO: double (nullable = true)
 |-- IN_BYTES: integer (nullable = true)
 |-- IN_PKTS: integer (nullable = true)
 |-- OUT_BYTES: integer (nullable = true)
 |-- OUT_PKTS: integer (nullable = true)
 |-- TCP_FLAGS: integer (nullable = true)
 |-- CLIENT_TCP_FLAGS: integer (nullable = true)
 |-- SERVER_TCP_FLAGS: integer (nullable = true)
 |-- FLOW_DURATION_MILLISECONDS: integer (nullable = true)
 |-- DURATION_IN: integer (nullable = true)
 |-- DURATION_OUT: integer (nullable = true)
 |-- MIN_TTL: integer (nullable = true)
 |-- MAX_TTL: integer (nullable = true)
 |-- LONGEST_FLOW_PKT: integer (nullable = true)
 |-- SHORTEST_FLOW_PKT: integer (nullable = true)
 |-- MIN_IP_PKT_LEN: integer (nullable = true)
 |-- MAX_IP_PKT_LEN: integer (nullable = true)
 |-- SRC_TO

### Data Preprocessing and Exploratory Data Analysis (EDA)

In [5]:
# Identifying missing values
from pyspark.sql.functions import col, count, when, isnan

# Count nulls and NaNs in each column
def count_missing(data):
    data.select([count(when(col(c).isNull() | isnan(c), c)).alias(c) for c in data.columns]).show()

count_missing(train_data)

+-------------+-----------+-------------+-----------+--------+--------+--------+-------+---------+--------+---------+----------------+----------------+--------------------------+-----------+------------+-------+-------+----------------+-----------------+--------------+--------------+-----------------------+-----------------------+----------------------+---------------------+-----------------------+----------------------+-------------------------+-------------------------+------------------------+-------------------------+-------------------------+--------------------------+---------------------------+--------------+---------------+---------+--------------+------------+--------------+--------------+--------------------+-----+------+
|IPV4_SRC_ADDR|L4_SRC_PORT|IPV4_DST_ADDR|L4_DST_PORT|PROTOCOL|L7_PROTO|IN_BYTES|IN_PKTS|OUT_BYTES|OUT_PKTS|TCP_FLAGS|CLIENT_TCP_FLAGS|SERVER_TCP_FLAGS|FLOW_DURATION_MILLISECONDS|DURATION_IN|DURATION_OUT|MIN_TTL|MAX_TTL|LONGEST_FLOW_PKT|SHORTEST_FLOW_PKT|MIN_

In [6]:
# Identifying missing values
from pyspark.sql.functions import col, count, when, isnan

# Count nulls and NaNs in each column
def count_missing(data):
    data.select([count(when(col(c).isNull() | isnan(c), c)).alias(c) for c in data.columns]).show()

count_missing(test_data)

+-------------+-----------+-------------+-----------+--------+--------+--------+-------+---------+--------+---------+----------------+----------------+--------------------------+-----------+------------+-------+-------+----------------+-----------------+--------------+--------------+-----------------------+-----------------------+----------------------+---------------------+-----------------------+----------------------+-------------------------+-------------------------+------------------------+-------------------------+-------------------------+--------------------------+---------------------------+--------------+---------------+---------+--------------+------------+--------------+--------------+--------------------+-----+------+
|IPV4_SRC_ADDR|L4_SRC_PORT|IPV4_DST_ADDR|L4_DST_PORT|PROTOCOL|L7_PROTO|IN_BYTES|IN_PKTS|OUT_BYTES|OUT_PKTS|TCP_FLAGS|CLIENT_TCP_FLAGS|SERVER_TCP_FLAGS|FLOW_DURATION_MILLISECONDS|DURATION_IN|DURATION_OUT|MIN_TTL|MAX_TTL|LONGEST_FLOW_PKT|SHORTEST_FLOW_PKT|MIN_

The dataset is clean and doesn't have any missing values, so there's no need for further cleaning.

#### Encoding
On the dataset, which consists of network traffic data, several categorical variables could potentially benefit from encoding. The decision to apply encoding techniques depends on whether the variables are nominal (without an inherent order) or ordinal (with a specific order) and whether they are used as features in the model. 

After an understanding of data and the variables meaning, we considered some variables for encoding:
- **IPV4_SRC_ADDR and IPV4_DST_ADDR** (Categorical nominal): These are IP addresses and typically should be treated as categorical.
- **Attack** (categorical Nominal): In machine learning projects involving classification tasks, the target variable (also known as the label or response variable) is crucial as it's the outcome the model is trying to predict. On the project, the target variable consists of categorical data (e.g., text labels representing different classes), it needs to be converted into a numeric format. This conversion is essential because most machine learning algorithms require numeric input to perform calculations during model training.


In [17]:
from pyspark.sql.functions import countDistinct

# Count distinct values in IPV4_SRC_ADDR
distinct_src_subnet_count = train_data.select(countDistinct("IPV4_SRC_ADDR").alias("Distinct_SRC_Count"))

# Count distinct values in IPV4_DST_ADDR
distinct_dst_subnet_count = train_data.select(countDistinct("IPV4_DST_ADDR").alias("Distinct_DST_Subnet"))

# Show the results
distinct_src_subnet_count.show(truncate=False)
distinct_dst_subnet_count.show(truncate=False)

+------------------+
|Distinct_SRC_Count|
+------------------+
|18538             |
+------------------+

+-------------------+
|Distinct_DST_Subnet|
+-------------------+
|6465               |
+-------------------+



Given the high number of distinct values in the IPV4_ADDR column, subnet segmentation was performed to facilitate more effective encoding. This approach reduces the granularity of the data, thereby simplifying the feature space without significantly compromising the informational value of the IP addresses. Segmentation enables us to manage the high cardinality of the IP addresses, which is critical for applying machine learning techniques efficiently and effectively. By categorizing the IP addresses into their respective subnets, we can capture essential network-level behaviors while avoiding the computational complexity associated with the vast number of unique full IP addresses.

In [11]:
from pyspark.sql.functions import concat_ws, split

# Add a new column for the subnet (first two octets) for source and destination IP addresses
train_data = train_data.withColumn(
    "IPV4_SRC_ADDR_Subnet",
    concat_ws(".", split(col("IPV4_SRC_ADDR"), "\\.")[0], split(col("IPV4_SRC_ADDR"), "\\.")[1])
)

train_data = train_data.withColumn(
    "IPV4_DST_ADDR_Subnet",
    concat_ws(".", split(col("IPV4_DST_ADDR"), "\\.")[0], split(col("IPV4_DST_ADDR"), "\\.")[1])
)

# Show the new columns along with the original IP addresses
train_data.select("IPV4_SRC_ADDR", "IPV4_SRC_ADDR_Subnet", "IPV4_DST_ADDR", "IPV4_DST_ADDR_Subnet").show(truncate=False)


+-------------+--------------------+--------------+--------------------+
|IPV4_SRC_ADDR|IPV4_SRC_ADDR_Subnet|IPV4_DST_ADDR |IPV4_DST_ADDR_Subnet|
+-------------+--------------------+--------------+--------------------+
|192.168.1.32 |192.168             |192.168.1.180 |192.168             |
|192.168.1.30 |192.168             |192.168.1.194 |192.168             |
|192.168.1.31 |192.168             |192.168.1.190 |192.168             |
|192.168.1.32 |192.168             |192.168.1.195 |192.168             |
|192.168.1.38 |192.168             |192.168.1.194 |192.168             |
|192.168.1.31 |192.168             |192.168.1.193 |192.168             |
|192.168.1.32 |192.168             |192.168.1.195 |192.168             |
|192.168.1.32 |192.168             |192.168.1.190 |192.168             |
|192.168.1.35 |192.168             |192.168.1.1   |192.168             |
|192.168.1.30 |192.168             |192.168.1.195 |192.168             |
|192.168.1.35 |192.168             |192.168.1.1   |

In [16]:
from pyspark.sql.functions import countDistinct

# Count distinct values in IPV4_SRC_ADDR_Subnet
distinct_src_subnet_count = train_data.select(countDistinct("IPV4_SRC_ADDR_Subnet").alias("Distinct_SRC_Subnet_Count"))

# Count distinct values in IPV4_DST_ADDR_Subnet
distinct_dst_subnet_count = train_data.select(countDistinct("IPV4_DST_ADDR_Subnet").alias("Distinct_DST_Subnet_Count"))

# Show the results
distinct_src_subnet_count.show(truncate=False)
distinct_dst_subnet_count.show(truncate=False)

+-------------------------+
|Distinct_SRC_Subnet_Count|
+-------------------------+
|15933                    |
+-------------------------+

+-------------------------+
|Distinct_DST_Subnet_Count|
+-------------------------+
|2832                     |
+-------------------------+



- **Source IP Dimensionality:** The modest reduction in unique source subnets compared to the full source IP count suggests that the network traffic originates from a diverse set of locations or devices, with slightly clustered but still quite spread out origins. This might imply that any predictive modeling using source subnets as a feature would still need to handle a relatively high number of categories, potentially requiring further techniques to manage dimensionality or enhance interpretability.

- **Destination IP Dimensionality:** The more significant reduction in unique destination subnets points to a higher level of concentration of network traffic towards certain destination networks or servers. For modeling purposes, this could mean that destination subnet could be a more impactful feature, providing stronger predictive signals with fewer categories, thus improving model performance and simplicity.

Given the results, for destination subnets, straightforward categorical encoding methods like one-hot encoding might be feasible given the reduced number of unique values. For source subnets, considering the still high number of unique categories, methods like feature hashing or embedding might be more appropriate to prevent models from becoming too complex.

In [18]:
# Encoding IPV4_DST_ADDR_Subnet
from pyspark.ml.feature import StringIndexer, OneHotEncoder
from pyspark.ml import Pipeline

# Index the destination subnet column
dst_indexer = StringIndexer(inputCol="IPV4_DST_ADDR_Subnet", outputCol="DST_Subnet_Index")

# Apply OneHotEncoder to the indexed column
dst_encoder = OneHotEncoder(inputCols=["DST_Subnet_Index"], outputCols=["DST_Subnet_Encoded"])

# Build the pipeline
dst_pipeline = Pipeline(stages=[dst_indexer, dst_encoder])

# Fit and transform the data
dst_model = dst_pipeline.fit(train_data)
encoded_dst_data = dst_model.transform(train_data)

# Show some of the results to verify
encoded_dst_data.select("IPV4_DST_ADDR_Subnet", "DST_Subnet_Encoded").show(truncate=False)


+--------------------+------------------+
|IPV4_DST_ADDR_Subnet|DST_Subnet_Encoded|
+--------------------+------------------+
|192.168             |(2831,[0],[1.0])  |
|192.168             |(2831,[0],[1.0])  |
|192.168             |(2831,[0],[1.0])  |
|192.168             |(2831,[0],[1.0])  |
|192.168             |(2831,[0],[1.0])  |
|192.168             |(2831,[0],[1.0])  |
|192.168             |(2831,[0],[1.0])  |
|192.168             |(2831,[0],[1.0])  |
|192.168             |(2831,[0],[1.0])  |
|192.168             |(2831,[0],[1.0])  |
|192.168             |(2831,[0],[1.0])  |
|192.168             |(2831,[0],[1.0])  |
|192.168             |(2831,[0],[1.0])  |
|192.168             |(2831,[0],[1.0])  |
|18.184              |(2831,[5],[1.0])  |
|192.168             |(2831,[0],[1.0])  |
|192.168             |(2831,[0],[1.0])  |
|192.168             |(2831,[0],[1.0])  |
|192.168             |(2831,[0],[1.0])  |
|192.168             |(2831,[0],[1.0])  |
+--------------------+------------

In [19]:
# Encoding IPV4_SRC_ADDR_Subnet
from pyspark.ml.feature import FeatureHasher

# Apply FeatureHasher to the source subnet column
src_hasher = FeatureHasher(inputCols=["IPV4_SRC_ADDR_Subnet"], outputCol="SRC_Subnet_Hashed", numFeatures=1024)  # Number of features can be tuned

# Transform the data
hashed_src_data = src_hasher.transform(train_data)

# Show some of the results to verify
hashed_src_data.select("IPV4_SRC_ADDR_Subnet", "SRC_Subnet_Hashed").show(truncate=False)

+--------------------+------------------+
|IPV4_SRC_ADDR_Subnet|SRC_Subnet_Hashed |
+--------------------+------------------+
|192.168             |(1024,[906],[1.0])|
|192.168             |(1024,[906],[1.0])|
|192.168             |(1024,[906],[1.0])|
|192.168             |(1024,[906],[1.0])|
|192.168             |(1024,[906],[1.0])|
|192.168             |(1024,[906],[1.0])|
|192.168             |(1024,[906],[1.0])|
|192.168             |(1024,[906],[1.0])|
|192.168             |(1024,[906],[1.0])|
|192.168             |(1024,[906],[1.0])|
|192.168             |(1024,[906],[1.0])|
|192.168             |(1024,[906],[1.0])|
|192.168             |(1024,[906],[1.0])|
|192.168             |(1024,[906],[1.0])|
|192.168             |(1024,[906],[1.0])|
|192.168             |(1024,[906],[1.0])|
|192.168             |(1024,[906],[1.0])|
|192.168             |(1024,[906],[1.0])|
|192.168             |(1024,[906],[1.0])|
|192.168             |(1024,[906],[1.0])|
+--------------------+------------

In [20]:
#Encoding target variable

# Initialize the StringIndexer
indexer = StringIndexer(inputCol="Attack", outputCol="label")

# Fit the indexer to the data
model = indexer.fit(train_data)

# Transform the data to include the numeric labels
train_data = model.transform(train_data)

# Show some of the transformed data
train_data.select("Attack", "label").show(truncate=False)

+---------+-----+
|Attack   |label|
+---------+-----+
|scanning |1.0  |
|scanning |1.0  |
|Benign   |0.0  |
|scanning |1.0  |
|ddos     |3.0  |
|Benign   |0.0  |
|Benign   |0.0  |
|password |4.0  |
|xss      |2.0  |
|Benign   |0.0  |
|xss      |2.0  |
|Benign   |0.0  |
|dos      |5.0  |
|scanning |1.0  |
|injection|6.0  |
|password |4.0  |
|Benign   |0.0  |
|scanning |1.0  |
|Benign   |0.0  |
|dos      |5.0  |
+---------+-----+
only showing top 20 rows



To ensure the integrity of our model's evaluation, transformations such as the encodings for Attack, IPV4_SRC_ADDR_Subnet, and IPV4_DST_ADDR_Subnet are meticulously applied to the test set using a specific and consistent process. This process involves applying only the transformation rules and parameters learned from the training set to the test set, without re-fitting or adjusting these based on the test data. Such an approach prevents data leakage by ensuring that no information from the test set influences the transformation process. This standardized application of transformations guarantees that our model assessments are both reliable and replicable.

In [21]:
# Add a new column for the subnet (first two octets) for source and destination IP addresses in the test data
test_data = test_data.withColumn(
    "IPV4_SRC_ADDR_Subnet",
    concat_ws(".", split(col("IPV4_SRC_ADDR"), "\\.")[0], split(col("IPV4_SRC_ADDR"), "\\.")[1])
)

test_data = test_data.withColumn(
    "IPV4_DST_ADDR_Subnet",
    concat_ws(".", split(col("IPV4_DST_ADDR"), "\\.")[0], split(col("IPV4_DST_ADDR"), "\\.")[1])
)

# Show the new columns along with the original IP addresses to verify the transformation
test_data.select("IPV4_SRC_ADDR", "IPV4_SRC_ADDR_Subnet", "IPV4_DST_ADDR", "IPV4_DST_ADDR_Subnet").show(truncate=False)

+-------------+--------------------+--------------+--------------------+
|IPV4_SRC_ADDR|IPV4_SRC_ADDR_Subnet|IPV4_DST_ADDR |IPV4_DST_ADDR_Subnet|
+-------------+--------------------+--------------+--------------------+
|192.168.1.32 |192.168             |192.168.1.195 |192.168             |
|192.168.1.30 |192.168             |192.168.1.184 |192.168             |
|192.168.1.30 |192.168             |192.168.1.180 |192.168             |
|192.168.1.35 |192.168             |52.28.231.150 |52.28               |
|192.168.1.30 |192.168             |192.168.1.49  |192.168             |
|192.168.1.31 |192.168             |192.168.1.49  |192.168             |
|192.168.1.31 |192.168             |192.168.1.184 |192.168             |
|192.168.1.30 |192.168             |192.168.1.180 |192.168             |
|192.168.1.32 |192.168             |192.168.1.190 |192.168             |
|192.168.1.30 |192.168             |192.168.1.184 |192.168             |
|192.168.1.38 |192.168             |192.168.1.79  |

In [22]:
# Apply the destination subnet encoding to the test data
encoded_dst_test_data = dst_model.transform(test_data)

# Show some of the transformed test data to verify
encoded_dst_test_data.select("IPV4_DST_ADDR_Subnet", "DST_Subnet_Encoded").show(truncate=False)

+--------------------+------------------+
|IPV4_DST_ADDR_Subnet|DST_Subnet_Encoded|
+--------------------+------------------+
|192.168             |(2831,[0],[1.0])  |
|192.168             |(2831,[0],[1.0])  |
|192.168             |(2831,[0],[1.0])  |
|52.28               |(2831,[2],[1.0])  |
|192.168             |(2831,[0],[1.0])  |
|192.168             |(2831,[0],[1.0])  |
|192.168             |(2831,[0],[1.0])  |
|192.168             |(2831,[0],[1.0])  |
|192.168             |(2831,[0],[1.0])  |
|192.168             |(2831,[0],[1.0])  |
|192.168             |(2831,[0],[1.0])  |
|192.168             |(2831,[0],[1.0])  |
|192.168             |(2831,[0],[1.0])  |
|192.168             |(2831,[0],[1.0])  |
|192.168             |(2831,[0],[1.0])  |
|192.168             |(2831,[0],[1.0])  |
|192.168             |(2831,[0],[1.0])  |
|192.168             |(2831,[0],[1.0])  |
|192.168             |(2831,[0],[1.0])  |
|192.168             |(2831,[0],[1.0])  |
+--------------------+------------

In [23]:
# Apply the FeatureHasher to the test data for source subnet
hashed_src_test_data = src_hasher.transform(test_data)

# Show some of the transformed test data to verify
hashed_src_test_data.select("IPV4_SRC_ADDR_Subnet", "SRC_Subnet_Hashed").show(truncate=False)

+--------------------+------------------+
|IPV4_SRC_ADDR_Subnet|SRC_Subnet_Hashed |
+--------------------+------------------+
|192.168             |(1024,[906],[1.0])|
|192.168             |(1024,[906],[1.0])|
|192.168             |(1024,[906],[1.0])|
|192.168             |(1024,[906],[1.0])|
|192.168             |(1024,[906],[1.0])|
|192.168             |(1024,[906],[1.0])|
|192.168             |(1024,[906],[1.0])|
|192.168             |(1024,[906],[1.0])|
|192.168             |(1024,[906],[1.0])|
|192.168             |(1024,[906],[1.0])|
|192.168             |(1024,[906],[1.0])|
|192.168             |(1024,[906],[1.0])|
|192.168             |(1024,[906],[1.0])|
|192.168             |(1024,[906],[1.0])|
|192.168             |(1024,[906],[1.0])|
|192.168             |(1024,[906],[1.0])|
|192.168             |(1024,[906],[1.0])|
|192.168             |(1024,[906],[1.0])|
|192.168             |(1024,[906],[1.0])|
|192.168             |(1024,[906],[1.0])|
+--------------------+------------

In [24]:
# Apply the StringIndexer transformation to the test data
test_data = model.transform(test_data)

# Show some of the transformed test data to verify
test_data.select("Attack", "label").show(truncate=False)

+--------+-----+
|Attack  |label|
+--------+-----+
|password|4.0  |
|ddos    |3.0  |
|scanning|1.0  |
|xss     |2.0  |
|Benign  |0.0  |
|Benign  |0.0  |
|ddos    |3.0  |
|scanning|1.0  |
|password|4.0  |
|ddos    |3.0  |
|Benign  |0.0  |
|Benign  |0.0  |
|ddos    |3.0  |
|Benign  |0.0  |
|scanning|1.0  |
|password|4.0  |
|ddos    |3.0  |
|Benign  |0.0  |
|xss     |2.0  |
|ddos    |3.0  |
+--------+-----+
only showing top 20 rows

