This implementation performs the following changes:

1. Reciprocal Transformation to normalize variables (w/nulls Dropped)



In [None]:
!pip install numpy
!pip install pyspark
!pip install pandas

In [4]:
#Basic Imports
import pyspark
from pyspark.sql import SparkSession

#ML Classifier Imports
from pyspark.ml.classification import LinearSVC
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import VectorAssembler, StringIndexer
from pyspark.ml.classification import OneVsRest
from pyspark.ml import Pipeline
from pyspark.sql.functions import when
import numpy as np
import pandas as pd
from pyspark.sql.functions import col
from pyspark.sql.functions import sqrt

In [5]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("rka7") \
    .master("local") \
    .config("spark.driver.cores", "5") \
    .config("spark.driver.memory", "10g") \
    .config("spark.executor.memory", "5g") \
    .config("spark.executor.cores", "4") \
    .config("spark.dynamicAllocation.shuffleTracking.enabled", "true") \
    .config("spark.dynamicAllocation.enabled", "true") \
    .config("spark.dynamicAllocation.minExecutors", "2") \
    .config("spark.dynamicAllocation.maxExecutors", "4") \
    .config("spark.executor.instances", "2") \
.getOrCreate()

In [48]:
# Get the parquet files (current example is 2 from the website)
parquet_files = ["/content/part-00000-23fdcfa3-9dd3-4c72-886c-e945bfcf92e1-c000.snappy.parquet", \
                 "/content/part-00000-9aeb279c-81c6-4481-9b30-d35d4d194fea-c000.snappy.parquet"]
# Read the parquet files into a dataframe
df = spark.read.parquet(*parquet_files, inferSchema=True)

In [None]:
# All service column is NULL
df = df.drop("service")

# Remove instances with null values
df = df.dropna()

df.printSchema()

In [50]:
# Convert timestamp column to string
df = df.withColumn("datetime_str", col("datetime").cast("string"))

#Drop the datetime column
df = df.drop("datetime")

# Define columns to index
columns_to_index = ['conn_state', 'history', 'proto', 'dest_ip_zeek', 'community_id', 'uid', 'src_ip_zeek', 'label_tactic', \
                    'label_binary', 'label_technique', 'datetime_str']

# Apply StringIndexer to each column
indexers = [StringIndexer(inputCol=column, outputCol=column+"_indexed").fit(df) for column in columns_to_index]

# Chain indexers together
pipeline = Pipeline(stages=indexers)

# Fit and transform the data
df_indexed = pipeline.fit(df).transform(df)

# Drop original columns
df_indexed = df_indexed.drop(*columns_to_index)

# Drop rows with any null values
df_indexed = df_indexed.dropna()

# Show the schema of the DataFrame
#df_indexed.printSchema()
#df.select("label_tactic").distinct().collect()

In [51]:
df_assembled = df_assembled.drop("datetime_str")

In [52]:
# List of columns to assemble
columns_to_assemble = df_indexed.columns

# Defining Numerical columns to Root Transform
transform_columns = ['resp_pkts', 'orig_ip_bytes', 'missed_bytes', 'duration', 'orig_pkts', \
               'dest_port_zeek', 'orig_bytes', 'resp_bytes', 'src_port_zeek']

# Perform Reciprocal Transformation to normalize Numerical columns
for column in transform_columns:
    df = df.withColumn(column, 1 / col(column))

# Numerical columns to add to Assembler
num_columns = ['resp_pkts', 'orig_ip_bytes', 'missed_bytes', 'duration', 'orig_pkts', \
               'dest_port_zeek', 'orig_bytes', 'local_orig', 'resp_bytes', 'src_port_zeek', 'local_resp', 'resp_ip_bytes', 'ts']

# Include Numerical columns to be assembled (Problems: 'local_resp', 'resp_ip_bytes', 'ts' is causing evaluations to overfit?)
columns_to_assemble.extend(num_columns)

# Remove the target column (label) if it's in the list
columns_to_assemble.remove('label_tactic_indexed')

# Create the VectorAssembler
assembler = VectorAssembler(inputCols=columns_to_assemble, outputCol="features")

# Transform the DataFrame
df_assembled = assembler.transform(df_indexed)

# Select only the features and label columns
df_assembled = df_assembled.select("features", "label_tactic_indexed")

# Show the schema of the DataFrame
#df_assembled.printSchema()

In [53]:
# Split the data into training and test sets
train_data, test_data = df_assembled.randomSplit([0.8, 0.2], seed=1)

# Create the SVM model
svm = LinearSVC(labelCol="label_tactic_indexed", featuresCol="features", maxIter=10)

# One Vs. Rest
ovr = OneVsRest(classifier=svm, labelCol='label_tactic_indexed')

# Fit the model
svm_model = ovr.fit(train_data)

# Make predictions
predictions = svm_model.transform(test_data)

In [54]:
#Creating Mulitple Evaluators for weighted precision, weighted recall, accuracy, weighted FPR
#eval_precision = MulticlassClassificationEvaluator(labelCol="label_tactic_indexed", metricName="weightedPrecision")
#eval_recall = MulticlassClassificationEvaluator(labelCol="label_tactic_indexed", metricName="weightedRecall")
eval_accuracy = MulticlassClassificationEvaluator(labelCol="label_tactic_indexed", metricName="accuracy")
#eval_fpr = MulticlassClassificationEvaluator(labelCol="label_tactic_indexed", metricName="weightedFalsePositiveRate")

# Evaluate the model
evaluator = eval_accuracy.evaluate(predictions)
print("Accuracy:", evaluator)

Accuracy: 0.4


Changes made:

Includes the Reciprocal Transformation on numerical variables.

Changes Noticed:

Including indexed datetime into features lowers accuracy by .2

In [17]:
spark.sparkContext.stop()