This implementation performs the following changes:

1. Drops instances where any of the features are null or NaN.
2. Uses a StringIndexer on the following string features to convert them to numerical data. Each indexed feature is written to a new column named after the original with the suffix '_indexed' (e.g. 'service' becomes 'service_indexed').
  a. 'service'
  b. 'conn_state'
  c. 'history'
  d. 'proto'
  e. 'dest_ip_zeek'
  f. 'community_id'
  g. 'uid'
  h. 'src_ip_zeek'
3. The original feature columns are removed.
4. Drops any instances with null or NaN values.
5. Uses a StringIndexer on the class labels, 'label_tactic', to convert them to numerical data.
6. Uses a VectorAssembler on the new features.
7. Trains a linear SVM classifier using a One-vs-Rest (OVR) strategy.

In [None]:
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.0.tar.gz (316.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m316.9/316.9 MB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.0-py2.py3-none-any.whl size=317425345 sha256=cc525979228b5477955f842890aa66117b7f51ed42a44a2e49a2f00ab40b331a
  Stored in directory: /root/.cache/pip/wheels/41/4e/10/c2cf2467f71c678cfc8a6b9ac9241e5e44a01940da8fbb17fc
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.0


In [None]:
# PySpark Imports
import pyspark
from pyspark.sql import SparkSession

# ML Classifier Imports
from pyspark.ml.classification import LinearSVC
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import VectorAssembler, StringIndexer
from pyspark.ml.classification import OneVsRest
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, OneHotEncoder

ModuleNotFoundError: No module named 'pyspark'

In [None]:
# Spark settings for this notebook, applied to the builder in the order listed.
# NOTE(review): the master is "local"; confirm whether the executor and
# dynamic-allocation settings below have any effect in that mode.
spark_settings = {
    "spark.driver.cores": "5",
    "spark.driver.memory": "10g",
    "spark.executor.memory": "5g",
    "spark.executor.cores": "4",
    "spark.dynamicAllocation.shuffleTracking.enabled": "true",
    "spark.dynamicAllocation.enabled": "true",
    "spark.dynamicAllocation.minExecutors": "2",
    "spark.dynamicAllocation.maxExecutors": "4",
    "spark.executor.instances": "2",
}

# Build up the session configuration one setting at a time
session_builder = SparkSession.builder.appName("ce53").master("local")
for setting_name, setting_value in spark_settings.items():
    session_builder = session_builder.config(setting_name, setting_value)

# Get the existing session or create a new one with the settings above
spark = session_builder.getOrCreate()

In [None]:
# Parquet part files to load (current example is 2 from the website)
parquet_files = [
    "/content/part-00000-1da06990-329c-4e38-913a-0f0aa39b388d-c000.snappy.parquet",
    "/content/part-00000-df678a79-4a73-452b-8e72-d624b2732f17-c000.snappy.parquet",
]

# Read all of the parquet files into a single dataframe.
# No `inferSchema` option is passed: Parquet files carry their own schema, so
# the column types come from the files themselves and the option had no purpose.
df = spark.read.parquet(*parquet_files)

In [None]:
# Print the name, type and nullability of every column in the loaded dataframe
df.printSchema()

root
 |-- resp_pkts: integer (nullable = true)
 |-- service: string (nullable = true)
 |-- orig_ip_bytes: integer (nullable = true)
 |-- local_resp: boolean (nullable = true)
 |-- missed_bytes: integer (nullable = true)
 |-- proto: string (nullable = true)
 |-- duration: double (nullable = true)
 |-- conn_state: string (nullable = true)
 |-- dest_ip_zeek: string (nullable = true)
 |-- orig_pkts: integer (nullable = true)
 |-- community_id: string (nullable = true)
 |-- resp_ip_bytes: integer (nullable = true)
 |-- dest_port_zeek: integer (nullable = true)
 |-- orig_bytes: integer (nullable = true)
 |-- local_orig: boolean (nullable = true)
 |-- datetime: timestamp (nullable = true)
 |-- history: string (nullable = true)
 |-- resp_bytes: integer (nullable = true)
 |-- uid: string (nullable = true)
 |-- src_port_zeek: integer (nullable = true)
 |-- ts: double (nullable = true)
 |-- src_ip_zeek: string (nullable = true)
 |-- label_tactic: string (nullable = true)



In [None]:
# Examine the unique values in the df.
# Only a bounded sample of distinct values is collected per column: collecting
# every distinct value pulls them all into driver memory and floods the cell
# output, which is a problem for columns that may have a distinct value on
# nearly every row (e.g. identifiers or timestamps).
MAX_UNIQUE_SHOWN = 20

for column_name in df.columns:
    sample_rows = df.select(column_name).distinct().limit(MAX_UNIQUE_SHOWN).collect()
    # Each Row holds a single field; unwrap it for a more compact printout
    sample_values = [row[0] for row in sample_rows]
    print(f"Unique values for column '{column_name}' (showing up to {MAX_UNIQUE_SHOWN}): {sample_values}")

NameError: name 'df' is not defined

In [None]:
# Remove instances (rows) that contain a null or NaN value in any column.
# Note: `df` is rebound here, so the unfiltered dataframe is no longer
# reachable under this name after the cell runs.
df = df.dropna()

In [None]:
# Drop the datetime column
df = df.drop("datetime")

# String-typed columns to convert to numeric indices (the features plus the
# class label 'label_tactic').
# NOTE(review): 'uid' and 'community_id' look like per-connection identifiers;
# confirm they are meaningful features and do not leak the label.
columns_to_index = ['service', 'conn_state', 'history', 'proto', 'dest_ip_zeek', 'community_id', 'uid', 'src_ip_zeek', 'label_tactic']

# Each indexed column is written next to the original as '<name>_indexed'
indexed_columns = [column + "_indexed" for column in columns_to_index]

# One multi-column StringIndexer handles every column. It is deliberately left
# unfitted here: the pipeline below fits it once, instead of fitting a separate
# indexer per column up front and then fitting the pipeline on top of that.
indexers = [StringIndexer(inputCols=columns_to_index, outputCols=indexed_columns)]

# Wrap the indexing stage in a pipeline
pipeline = Pipeline(stages=indexers)

# Fit and transform the data
df_indexed = pipeline.fit(df).transform(df)

# Drop original (string) columns now that their indexed versions exist
df_indexed = df_indexed.drop(*columns_to_index)

# Drop rows with any null values
df_indexed = df_indexed.dropna()

# Show the schema of the DataFrame
df_indexed.printSchema()

root
 |-- resp_pkts: integer (nullable = true)
 |-- orig_ip_bytes: integer (nullable = true)
 |-- local_resp: boolean (nullable = true)
 |-- missed_bytes: integer (nullable = true)
 |-- duration: double (nullable = true)
 |-- orig_pkts: integer (nullable = true)
 |-- resp_ip_bytes: integer (nullable = true)
 |-- dest_port_zeek: integer (nullable = true)
 |-- orig_bytes: integer (nullable = true)
 |-- local_orig: boolean (nullable = true)
 |-- resp_bytes: integer (nullable = true)
 |-- src_port_zeek: integer (nullable = true)
 |-- ts: double (nullable = true)
 |-- service_indexed: double (nullable = false)
 |-- conn_state_indexed: double (nullable = false)
 |-- history_indexed: double (nullable = false)
 |-- proto_indexed: double (nullable = false)
 |-- dest_ip_zeek_indexed: double (nullable = false)
 |-- community_id_indexed: double (nullable = false)
 |-- uid_indexed: double (nullable = false)
 |-- src_ip_zeek_indexed: double (nullable = false)
 |-- label_tactic_indexed: double (nulla

In [None]:
# Examine the unique values in the indexed dataframe.
# Only a bounded sample of distinct values is collected per column: collecting
# every distinct value pulls them all into driver memory and floods the cell
# output for columns with many distinct values.
MAX_UNIQUE_SHOWN = 20

for column_name in df_indexed.columns:
    sample_rows = df_indexed.select(column_name).distinct().limit(MAX_UNIQUE_SHOWN).collect()
    # Each Row holds a single field; unwrap it for a more compact printout
    sample_values = [row[0] for row in sample_rows]
    print(f"Unique values for column '{column_name}' (showing up to {MAX_UNIQUE_SHOWN}): {sample_values}")

Unique values for column 'resp_pkts': [Row(resp_pkts=76), Row(resp_pkts=27), Row(resp_pkts=12), Row(resp_pkts=1), Row(resp_pkts=52), Row(resp_pkts=16), Row(resp_pkts=20), Row(resp_pkts=40), Row(resp_pkts=54), Row(resp_pkts=48), Row(resp_pkts=64), Row(resp_pkts=154), Row(resp_pkts=15), Row(resp_pkts=2620), Row(resp_pkts=88), Row(resp_pkts=4), Row(resp_pkts=51), Row(resp_pkts=24), Row(resp_pkts=264604), Row(resp_pkts=60), Row(resp_pkts=32), Row(resp_pkts=56), Row(resp_pkts=228), Row(resp_pkts=33), Row(resp_pkts=2), Row(resp_pkts=0), Row(resp_pkts=18), Row(resp_pkts=36)]
Unique values for column 'orig_ip_bytes': [Row(orig_ip_bytes=6620), Row(orig_ip_bytes=858), Row(orig_ip_bytes=1896), Row(orig_ip_bytes=1352), Row(orig_ip_bytes=13504), Row(orig_ip_bytes=1160), Row(orig_ip_bytes=1068), Row(orig_ip_bytes=4683), Row(orig_ip_bytes=3936), Row(orig_ip_bytes=1892), Row(orig_ip_bytes=5691), Row(orig_ip_bytes=1708), Row(orig_ip_bytes=1056), Row(orig_ip_bytes=3456), Row(orig_ip_bytes=1290), Row(ori

In [None]:
# Start from every column of the indexed dataframe...
columns_to_assemble = df_indexed.columns

# ...and take out the target (label) column so it is not used as a feature.
# Like list.remove, this raises ValueError if the label column is missing.
del columns_to_assemble[columns_to_assemble.index('label_tactic_indexed')]

# Combine the remaining columns into a single "features" vector column
assembler = VectorAssembler(outputCol="features", inputCols=columns_to_assemble)

# Assemble the vector and keep only the features and label columns
df_assembled = (
    assembler
    .transform(df_indexed)
    .select("features", "label_tactic_indexed")
)

# Show the resulting two-column schema
df_assembled.printSchema()

root
 |-- features: vector (nullable = true)
 |-- label_tactic_indexed: double (nullable = false)



In [None]:
# Base binary classifier: a linear SVM limited to 10 iterations
svm = LinearSVC(
    featuresCol="features",
    labelCol="label_tactic_indexed",
    maxIter=10,
)

# Wrap the binary SVM in One-vs-Rest so it can handle the multi-class label
ovr = OneVsRest(classifier=svm, labelCol='label_tactic_indexed')

# Accuracy evaluator comparing predictions against the indexed label
evaluator = MulticlassClassificationEvaluator(
    metricName="accuracy",
    labelCol="label_tactic_indexed",
)

# 80/20 train/test split; the fixed seed is passed so the split is repeatable
train_data, test_data = df_assembled.randomSplit([0.8, 0.2], seed=1)

# Train on the training portion
svm_model = ovr.fit(train_data)

# Score the held-out test portion
predictions = svm_model.transform(test_data)

# Report accuracy on the test set
accuracy = evaluator.evaluate(predictions)
print("Accuracy:", accuracy)

Accuracy: 0.9999774098086611


Changes made:

Used a StringIndexer on all string columns (including the class label) to convert strings into numerical values.

Dropped the datetime column.

Ran OVR with SVM.

In [None]:
# Shut down Spark through the session-level API rather than reaching into the
# SparkContext, so the session itself is stopped and not only its context.
spark.stop()