In [1]:
# PySpark Imports
import pyspark
from pyspark.sql import SparkSession

# ML Classifier Imports
from pyspark.ml.classification import LinearSVC
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import VectorAssembler, StringIndexer, PCA
from pyspark.ml.classification import OneVsRest
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, OneHotEncoder
from pyspark.sql.functions import mean, col
import time
import os
import sys

In [2]:
os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable

In [3]:
# Initialize Spark session
spark = SparkSession.builder.appName("ce53") \
    .master("local[*]") \
    .config("spark.driver.cores", "2") \
    .config("spark.driver.memory", "6g") \
    .config("spark.executor.memory", "6g") \
    .config("spark.executor.cores", "2") \
    .config("spark.dynamicAllocation.shuffleTracking.enabled", "true") \
    .config("spark.dynamicAllocation.enabled", "true") \
    .config("spark.dynamicAllocation.minExecutors", "2") \
    .config("spark.dynamicAllocation.maxExecutors", "4") \
    .config("spark.executor.instances", "2") \
    .config("spark.kryoserializer.buffer.max", "2047m") \
    .config("spark.sql.execution.pythonUDF.arrow.enabled", "false") \
    .config("spark.executor.heartbeatInterval","1199s") \
    .config("spark.network.timeout","1200s") \
.getOrCreate()

In [4]:
parquet_files = ["Documents/School/Capstone Project/Local Running/Parquet/part-00000-df678a79-4a73-452b-8e72-d624b2732f17-c000.snappy.parquet"]

In [5]:
# Read the parquet files into a dataframe
df = spark.read.parquet(*parquet_files, inferSchema=True)

In [6]:
# Get unique labels and their counts
label_counts = df.groupBy("label_tactic").count().orderBy("label_tactic")

# Show the results
label_counts.show()

+--------------+-------+
|  label_tactic|  count|
+--------------+-------+
|     Discovery|   2086|
|Reconnaissance|9278720|
+--------------+-------+



In [7]:
# List of labels to drop
labels_to_drop = ["Defense Evasion", "Exfiltration", "Initial Access", "Lateral Movement", "Persistence", "Privilege Escalation", "Resource Development", "Credential Access"]

# Filter out the rows with labels to drop
df = df.filter(~col("label_tactic").isin(labels_to_drop))

# Get unique labels and their counts after filtering
filtered_label_counts = df.groupBy("label_tactic").count().orderBy("label_tactic")

# Show the filtered results
filtered_label_counts.show()


+--------------+-------+
|  label_tactic|  count|
+--------------+-------+
|     Discovery|   2086|
|Reconnaissance|9278720|
+--------------+-------+



In [8]:
# List of numeric column names
numeric_columns = ['resp_pkts', 'orig_ip_bytes', 'missed_bytes', 'duration', 'orig_pkts',
                   'resp_ip_bytes', 'dest_port_zeek', 'orig_bytes', 'resp_bytes',
                   'src_port_zeek', 'ts']

# Calculate mean for each numeric column
mean_values = df.select([mean(col(column)).alias(column) for column in numeric_columns]).collect()[0].asDict()

# Replace null or NaN values with mean
for column in numeric_columns:
    mean_value = mean_values[column]
    df = df.fillna({column: mean_value}, subset=[column])

# Show updated DataFrame
#df.show()

In [9]:
# Drop the datetime column
#df = df.drop("datetime")
df = df.withColumn("datetime", col("datetime").cast("string"))

# Define columns to index
columns_to_index = ['service', 'conn_state', 'history', 'proto', 'dest_ip_zeek', 'community_id', 'uid', 'src_ip_zeek', 'label_tactic', 'datetime']

# Impute null values with 'null' string
for column in columns_to_index:
    df = df.fillna('null', subset=[column])

# Apply StringIndexer to each column
indexers = [StringIndexer(inputCol=column, outputCol=column+"_indexed").fit(df) for column in columns_to_index]

# Chain indexers together
pipeline = Pipeline(stages=indexers)

# Fit and transform the data
df_indexed = pipeline.fit(df).transform(df)

# Drop original columns
df_indexed = df_indexed.drop(*columns_to_index)

# Show the schema of the DataFrame
df_indexed.show()

+---------+-------------+----------+------------+--------------------+---------+-------------+--------------+----------+----------+----------+-------------+-------------------+---------------+------------------+---------------+-------------+--------------------+--------------------+-----------+-------------------+--------------------+----------------+
|resp_pkts|orig_ip_bytes|local_resp|missed_bytes|            duration|orig_pkts|resp_ip_bytes|dest_port_zeek|orig_bytes|local_orig|resp_bytes|src_port_zeek|                 ts|service_indexed|conn_state_indexed|history_indexed|proto_indexed|dest_ip_zeek_indexed|community_id_indexed|uid_indexed|src_ip_zeek_indexed|label_tactic_indexed|datetime_indexed|
+---------+-------------+----------+------------+--------------------+---------+-------------+--------------+----------+----------+----------+-------------+-------------------+---------------+------------------+---------------+-------------+--------------------+--------------------+---------

In [10]:
# List of columns to assemble
columns_to_assemble = df_indexed.columns

# Remove the target column (label) if it's in the list
columns_to_assemble.remove('label_tactic_indexed')

# Create the VectorAssembler
assembler = VectorAssembler(inputCols=columns_to_assemble, outputCol="features")

# Transform the DataFrame
df_assembled = assembler.transform(df_indexed)

# Select only the features and label columns
df_assembled = df_assembled.select("features", "label_tactic_indexed")

# Show the schema of the DataFrame
df_assembled.printSchema()

root
 |-- features: vector (nullable = true)
 |-- label_tactic_indexed: double (nullable = false)



In [11]:
# Get the distinct values of the 'label_tactic_indexed' column
distinct_classes = df_assembled.select("label_tactic_indexed").distinct().collect()

# Convert the result to a list of unique classes
unique_classes = [row['label_tactic_indexed'] for row in distinct_classes]

# Print the unique classes
print("Unique classes:", unique_classes)

# Group by the 'label_tactic_indexed' column and count the occurrences of each class
class_counts = df_assembled.groupBy("label_tactic_indexed").count()

# Show the class counts
class_counts.show()

Unique classes: [0.0, 1.0]
+--------------------+-------+
|label_tactic_indexed|  count|
+--------------------+-------+
|                 0.0|9278720|
|                 1.0|   2086|
+--------------------+-------+



In [12]:
#Standardize data
from pyspark.ml.feature import StandardScaler

scaler = StandardScaler(inputCol="features", outputCol="features_normalized", withMean=True, withStd=True)
scaler_model = scaler.fit(df_assembled)
df_normalized = scaler_model.transform(df_assembled)
df_normalized = df_normalized.select("features_normalized", "label_tactic_indexed")
df_normalized.printSchema()

root
 |-- features_normalized: vector (nullable = true)
 |-- label_tactic_indexed: double (nullable = false)



In [13]:
# Split the data into training and test sets
train_data, test_data = df_normalized.randomSplit([0.7, 0.3], seed=42)

# Define the PCA model
pca = PCA(k=3, inputCol="features_normalized", outputCol="pca_features")

# Fit the PCA model
start_time = time.time()

pca_model = pca.fit(train_data)

end_time = time.time()
execution_time = end_time - start_time
print("Execution time:", execution_time, "seconds")

# Apply PCA transformation to the assembled DataFrame
train_pca = pca_model.transform(train_data)
test_pca = pca_model.transform(test_data)

Execution time: 240.96038269996643 seconds


In [14]:
# Drop the normalized column
train_pca = train_pca.drop("features_normalized")

# Rename the pca_features column to features
train_pca = train_pca.withColumnRenamed("pca_features", "features")

# Verify the changes
train_pca.show()

+--------------------+--------------------+
|label_tactic_indexed|            features|
+--------------------+--------------------+
|                 0.0|[-3.9801512862646...|
|                 0.0|[-3.9801512862646...|
|                 0.0|[-3.9801512862646...|
|                 0.0|[-3.9801512862646...|
|                 0.0|[-3.9801512862646...|
|                 0.0|[-3.9801512862646...|
|                 0.0|[-3.9801512862646...|
|                 0.0|[-3.9801512862646...|
|                 0.0|[-3.9801512862646...|
|                 0.0|[-3.9801512862646...|
|                 0.0|[-3.9801512862646...|
|                 0.0|[-3.9801512862646...|
|                 0.0|[-3.9801512862646...|
|                 0.0|[-3.9801512862646...|
|                 0.0|[-3.9801512862646...|
|                 0.0|[-3.9801512862646...|
|                 0.0|[-3.9801512862646...|
|                 0.0|[-3.9801512862646...|
|                 0.0|[-3.9801512862646...|
|                 0.0|[-3.980151

In [15]:
# Drop the normalized column
test_pca = test_pca.drop("features_normalized")

# Rename the pca_features column to features
test_pca = test_pca.withColumnRenamed("pca_features", "features")

# Verify the changes
test_pca.show()

+--------------------+--------------------+
|label_tactic_indexed|            features|
+--------------------+--------------------+
|                 0.0|[-3.9801512862646...|
|                 0.0|[-3.9801512862646...|
|                 0.0|[-3.9801512862646...|
|                 0.0|[-3.9801512862646...|
|                 0.0|[-3.9801512862646...|
|                 0.0|[-3.9801512862646...|
|                 0.0|[-3.9801512862646...|
|                 0.0|[-3.9801512862646...|
|                 0.0|[-3.9801512862646...|
|                 0.0|[-3.9801512862646...|
|                 0.0|[-3.9801512862646...|
|                 0.0|[-3.9801512862646...|
|                 0.0|[-3.9801512862646...|
|                 0.0|[-3.9801512862646...|
|                 0.0|[-3.9801512862646...|
|                 0.0|[-3.9801512862646...|
|                 0.0|[-3.9801512862646...|
|                 0.0|[-3.9801512862646...|
|                 0.0|[-3.9801512862646...|
|                 0.0|[-3.980151

In [16]:
# Create the SVM model

#featuresCol: str = ‘features’, labelCol: str = ‘label_bin’,
#predictionCol: str = ‘prediction’, maxIter: int = 100, regParam:
#float = 0.0, tol: float = 1 10􀀀6, rawPredictionCol: str =
#‘rawPrediction’, fitIntercept: bool = True, standardization: bool
#= True, threshold: float = 0.0, weightCol: Optional[str] = None,
#aggregationDepth: int = 2, maxBlockSizeInMB: float = 0.0
#svm = LinearSVC(labelCol="label_tactic_indexed", featuresCol="features", predictionCol='prediction', maxIter=10, regParam=0.0, tol=.000001, fitIntercept=True, standardization=True, threshold=0.0, weightCol=None, aggregationDepth=2, maxBlockSizeinMB=0.0)
svm = LinearSVC(labelCol="label_tactic_indexed", featuresCol="features", maxIter=1)

# One Vs. Rest
ovr = OneVsRest(classifier=svm, labelCol='label_tactic_indexed')

# Fit the model
start_time = time.time()

svm_model = ovr.fit(train_pca)

end_time = time.time()
execution_time = end_time - start_time
print("Execution time:", execution_time, "seconds")

Execution time: 263.77560234069824 seconds


In [17]:
# Get the param map
param_map = svm_model.extractParamMap()

# Print the param map
for param, value in param_map.items():
    print(param.name, ":", value)

predictionCol : prediction
featuresCol : features
labelCol : label_tactic_indexed
rawPredictionCol : rawPrediction
classifier : LinearSVC_6d51a083cdf2


In [18]:
# Make predictions
start_time = time.time()

predictions = svm_model.transform(test_pca)

end_time = time.time()
execution_time = end_time - start_time
print("Execution time:", execution_time, "seconds")

Execution time: 0.23307490348815918 seconds


In [19]:
# Evaluate the model
# Calculate accuracy
evaluator_accuracy = MulticlassClassificationEvaluator(labelCol="label_tactic_indexed", metricName="accuracy")
accuracy = evaluator_accuracy.evaluate(predictions)

# Calculate precision
evaluator_precision = MulticlassClassificationEvaluator(labelCol="label_tactic_indexed", metricName="weightedPrecision")
precision = evaluator_precision.evaluate(predictions)

# Calculate recall
evaluator_recall = MulticlassClassificationEvaluator(labelCol="label_tactic_indexed", metricName="weightedRecall")
recall = evaluator_recall.evaluate(predictions)

# Calculate F1-score
evaluator_f1 = MulticlassClassificationEvaluator(labelCol="label_tactic_indexed", metricName="f1")
f1_score = evaluator_f1.evaluate(predictions)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-Score:", f1_score)

PythonException: 
  An exception was thrown from the Python worker. Please see the stack trace below.
Traceback (most recent call last):
  File "c:\users\eller\appdata\local\programs\python\python39\lib\socket.py", line 707, in readinto
    raise
socket.timeout: timed out


In [32]:
# Check Python version
import sys
print("Python Version:", sys.version)

# Check PySpark version
import pyspark
print("PySpark Version:", pyspark.__version__)

import subprocess

def get_java_version():
    try:
        output = subprocess.check_output(['java', '-version'], stderr=subprocess.STDOUT)
        output = output.decode().strip().split('\n')[0]  # Extracting the first line
        version = output.split()[2].replace('"', '')
        return version
    except Exception as e:
        print("Error:", e)
        return None

java_version = get_java_version()
print("Java version:", java_version)

Python Version: 3.9.0 (tags/v3.9.0:9cf6752, Oct  5 2020, 15:34:40) [MSC v.1927 64 bit (AMD64)]
PySpark Version: 3.3.1
Java version: 1.8.0_401


In [23]:
spark.sparkContext.stop()