In [275]:
#Basic Imports
import pyspark
from pyspark.sql import SparkSession

#ML Classifier Imports
from pyspark.ml.classification import LinearSVC
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import VectorAssembler, StringIndexer
from pyspark.ml.classification import OneVsRest
from pyspark.ml import Pipeline

In [276]:
from pyspark.sql import SparkSession
# Resource and dynamic-allocation settings applied to the session builder below.
spark_settings = {
    "spark.driver.cores": "5",
    "spark.driver.memory": "10g",
    "spark.executor.memory": "5g",
    "spark.executor.cores": "4",
    "spark.dynamicAllocation.shuffleTracking.enabled": "true",
    "spark.dynamicAllocation.enabled": "true",
    "spark.dynamicAllocation.minExecutors": "2",
    "spark.dynamicAllocation.maxExecutors": "4",
    "spark.executor.instances": "2",
}

# Name the application and point it at the standalone cluster master.
session_builder = SparkSession.builder.appName("rka7").master("spark://192.168.1.2:7077")
for setting_name, setting_value in spark_settings.items():
    session_builder = session_builder.config(setting_name, setting_value)

# Reuses an already-running session if one exists, otherwise starts a new one.
spark = session_builder.getOrCreate()

In [277]:
#spark.sparkContext.stop()
#sc.stop()

In [278]:
#sc = spark.sparkContext
#sc.getConf().getAll()

In [279]:
#Reading Parquet File
# Parquet files carry their own schema, so no schema-inference option is needed
# (inferSchema is a CSV/JSON reader option and is not a Parquet option).
df = spark.read.parquet(
    "hdfs://192.168.1.2:9000/rplenkers/benign_data/2023-10-06/part-00000-535a7dcb-d865-4577-b2a0-9c620a67d250-c000.snappy.parquet"
)
#df.printSchema()

In [289]:
#NOTE: This step still needs binning/preprocessing to work, since many values are nullable and many columns are strings
#Transforming String type to numerical types

#stringCols = ['community_id', 'conn_state', 'history', 'src_ip_zeek', 'dest_ip_zeek', 'proto', 'service', 'uid', 'datetime', 'label_technique']
#indexer = StringIndexer(inputCol=stringCols, outputCol="indexed_column")
#df_transformed = indexer.fit(df).transform(df)

#df = df.drop("assembled_features")

# Numeric/boolean input columns for the classifier.
# 'label_binary' is deliberately NOT listed: it is a label column, and feeding a
# label into the feature vector would leak the target into the model.
# NOTE(review): presumably label_binary is derived from the same ground truth as
# label_tactic -- confirm before ever re-adding it.
featureCols = ['src_port_zeek', 'dest_port_zeek', 'local_orig', 'local_resp', 'missed_bytes', 'orig_bytes', 'orig_ip_bytes', 'orig_pkts', 'resp_ip_bytes', \
               'resp_pkts', 'ts']

# Packs featureCols into a single vector column named "assembled_features".
# The assembler is only configured here; it is executed as the first stage of the
# Pipeline, so df is not transformed eagerly. Transforming here as well would add
# the output column twice (once now, once inside the pipeline) and makes this cell
# fail when it is re-run on an already-transformed df.
vectAssembler = VectorAssembler(inputCols=featureCols, outputCol="assembled_features")
#df.printSchema()

In [281]:
# Hyperparameters for the binary linear SVM used as the base classifier.
svm_params = {"maxIter": 10, "regParam": 0.1}
svm = LinearSVC(**svm_params)

In [282]:
# One-vs-rest wrapper that turns the binary SVM into a multiclass classifier on
# 'label_tactic'. featuresCol must be set explicitly: OneVsRest defaults to a
# column named "features", while the VectorAssembler in this notebook writes its
# output to "assembled_features".
# NOTE(review): OneVsRest needs a numeric label column -- confirm label_tactic is
# numeric, or index it with StringIndexer first.
ovr = OneVsRest(classifier=svm, labelCol='label_tactic', featuresCol='assembled_features')

# Remove any previously assembled vector column so the pipeline's assembler stage
# can create it; drop() is a no-op when the column is not present.
df = df.drop("assembled_features")

In [283]:
#For now due to errors, dropping rows w/null values
# Only the columns the pipeline actually reads (features + label) are checked.
# An unscoped dropna() would also discard rows whose only nulls are in columns
# the model never uses, shrinking and biasing the dataset.
df = df.dropna(subset=featureCols + ['label_tactic'])

In [284]:
#Splitting data into an 80-20 train/test split (fixed seed for reproducibility)
split_weights = [.8, .2]
training_data, test_data = df.randomSplit(split_weights, seed=1)

In [285]:
#Creating multiple evaluators for weighted precision, weighted recall, accuracy, weighted FPR
# labelCol must name the column the model is trained on; the evaluator otherwise
# defaults to a column called "label", which is not the label used in this notebook.
eval_precision = MulticlassClassificationEvaluator(labelCol="label_tactic", metricName="weightedPrecision")
#eval_recall = MulticlassClassificationEvaluator(labelCol="label_tactic", metricName="weightedRecall")
#eval_accuracy = MulticlassClassificationEvaluator(labelCol="label_tactic", metricName="accuracy")
#eval_fpr = MulticlassClassificationEvaluator(labelCol="label_tactic", metricName="weightedFalsePositiveRate")


In [286]:
#Creating Pipeline
# Stage order matters: the assembler builds the feature vector, then the
# one-vs-rest classifier consumes it.
pipeline_stages = [vectAssembler, ovr]
pipeline = Pipeline(stages=pipeline_stages)

In [288]:
#Fitting model on training data
# Runs every pipeline stage on the training split and returns the fitted PipelineModel.
model = pipeline.fit(dataset=training_data)

In [None]:
#Making the model predictions on the held-out test data
# Scoring the rows the model was trained on overstates its quality; the test
# split produced by randomSplit is what the metrics should be computed on.
predictions = model.transform(test_data)

In [None]:
#Evaluations
# Compute the metric configured on eval_precision over the predictions and report it.
precision = eval_precision.evaluate(predictions)
print("Precision: {}".format(precision))