In [6]:
# Import parquet file with pySpark
from pyspark.sql import SparkSession

# Create (or reuse) a Spark session running locally on all available cores.
# NOTE(review): with a "local[*]" master the work presumably runs inside the driver
# process, so "spark.executor.memory" may have no effect here — confirm before tuning.
spark = (
    SparkSession.builder
    .appName("ML-part")
    .master("local[*]")
    .config("spark.driver.memory", "16g")
    .config("spark.executor.memory", "16g")
    .getOrCreate()
)

# Read the training and test Parquet files.
df = spark.read.parquet(r"./simplified_train_data.parquet")
df_test = spark.read.parquet(r"./simplified_test_data.parquet")

# Inspect the training data: column schema and row count.
df.printSchema()
print(df.count())


root
 |-- attack_index: double (nullable = true)
 |-- scaled_ttl_features: vector (nullable = true)
 |-- scaled_tcp_flags_features: vector (nullable = true)
 |-- scaled_flow_features: vector (nullable = true)
 |-- scaled_duration_features: vector (nullable = true)
 |-- scaled_pkt_size_features: vector (nullable = true)
 |-- scaled_tcp_win_features: vector (nullable = true)
 |-- scaled_l4_dst_port_features: vector (nullable = true)

1000000


In [7]:
from pyspark.ml.feature import VectorAssembler

# Prepare the data with VectorAssembler.
# Every column except the label ("attack_index") is used as an input feature.
feature_columns = list(df.columns)
feature_columns.remove("attack_index")  # raises ValueError if the label column is missing
print(feature_columns)

# Concatenate the per-group feature vectors into a single "features" vector column.
assembler = VectorAssembler(outputCol="features", inputCols=feature_columns)
df_assembled = assembler.transform(df)

['scaled_ttl_features', 'scaled_tcp_flags_features', 'scaled_flow_features', 'scaled_duration_features', 'scaled_pkt_size_features', 'scaled_tcp_win_features', 'scaled_l4_dst_port_features']


In [None]:
from pyspark.ml.classification import RandomForestClassifier, NaiveBayes, LogisticRegression, MultilayerPerceptronClassifier

# Define the Random Forest classifier.
# An explicit seed makes the bootstrap/feature sampling reproducible across kernel
# restarts; without it PySpark derives the default seed from Python's hash() of the
# class name, which is randomised per interpreter process.
rf = RandomForestClassifier(
    labelCol="attack_index",
    featuresCol="features",
    numTrees=10,
    seed=42,
)

# Naive Bayes is currently disabled.
#nb = NaiveBayes(labelCol="attack_index", featuresCol="features")

# Define the Logistic Regression classifier (default hyper-parameters).
lr = LogisticRegression(labelCol="attack_index", featuresCol="features")

# Multilayer Perceptron configuration
# Helper to obtain the number of features in the assembled vector column.
def get_feature_count(df, feature_col="features"):
    """Return the number of features in an ML vector column, read from schema metadata.

    The count is taken from the column's ``ml_attr`` metadata. When the metadata
    records the total size under ``num_attrs`` that value is used directly;
    otherwise the per-type attribute lists under ``attrs`` are summed.

    Args:
        df: Object exposing ``schema[feature_col].metadata`` (e.g. a Spark DataFrame).
        feature_col: Name of the vector column to inspect.

    Returns:
        int: Number of features described by the column metadata.

    Raises:
        KeyError: If the column has no ``ml_attr`` metadata, or the metadata holds
            neither ``num_attrs`` nor ``attrs``.
    """
    ml_attr = df.schema[feature_col].metadata["ml_attr"]

    # Prefer the explicit total: it is present even when no per-attribute
    # descriptions were recorded for the column.
    if "num_attrs" in ml_attr:
        return int(ml_attr["num_attrs"])

    # Fall back to counting the attributes listed under each attribute type.
    return sum(len(attrs) for attrs in ml_attr["attrs"].values())


# Input layer size: number of features in the assembled "features" vector.
input_layers = get_feature_count(df_assembled, "features")

# Output layer size: number of classes. The MLP needs one output unit per label index
# in [0, numClasses), so the size is derived from the largest label index rather than
# from the number of distinct labels; the two agree whenever labels are exactly 0..k-1,
# and the max-based size stays valid if some class is absent from the training data.
max_label = df_assembled.agg({"attack_index": "max"}).first()[0]
if max_label is None:
    raise ValueError("Cannot size the MLP output layer: no 'attack_index' labels found.")
output_layers = int(max_label) + 1

mlp = MultilayerPerceptronClassifier(
    labelCol="attack_index",
    featuresCol="features",
    # Three hidden layers (9, 13 and 11 units) between the input and output layers.
    layers=[input_layers, 9, 13, 11, output_layers],
    maxIter=100,
    # Fixed seed so the weight initialisation is reproducible across kernel restarts.
    seed=42,
)

In [9]:
# Train each classifier on the assembled training set (df_assembled).
# Each fit is a separate statement so an already-trained model stays available
# if a later fit fails.
# Random Forest
model_rf = rf.fit(df_assembled)
# Naive Bayes training is disabled together with its estimator definition.
#model_nb = nb.fit(df_assembled)
# Logistic Regression
model_lr = lr.fit(df_assembled)
# Multilayer Perceptron
model_mlp = mlp.fit(df_assembled)

In [10]:
# NOTE(review): `result` is not used in the visible cells; this looks like an
# accidental IDE auto-import — confirm and remove.
from unittest import result


# Assemble the test-set features once, then score that frame with each trained model.
df_test_assembled = assembler.transform(df_test)
result_rf = model_rf.transform(df_test_assembled)
result_lr = model_lr.transform(df_test_assembled)
result_mlp = model_mlp.transform(df_test_assembled)

In [11]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# All evaluators compare the "attack_index" label column with the "prediction" column;
# they differ only in the metric they report.
_shared_evaluator_args = dict(labelCol="attack_index", predictionCol="prediction")

# Evaluator for accuracy
accuracy_evaluator = MulticlassClassificationEvaluator(
    metricName="accuracy", **_shared_evaluator_args
)

# Evaluator for (weighted) precision
precision_evaluator = MulticlassClassificationEvaluator(
    metricName="weightedPrecision", **_shared_evaluator_args
)

# Evaluator for (weighted) recall
recall_evaluator = MulticlassClassificationEvaluator(
    metricName="weightedRecall", **_shared_evaluator_args
)

# Evaluator for the F1 score
f1_evaluator = MulticlassClassificationEvaluator(
    metricName="f1", **_shared_evaluator_args
)

In [12]:
# Score the Random Forest predictions with each evaluator, in the order
# accuracy, weighted precision, weighted recall, F1.
accuracy_rf, precision_rf, recall_rf, f1_rf = [
    evaluator.evaluate(result_rf)
    for evaluator in (accuracy_evaluator, precision_evaluator, recall_evaluator, f1_evaluator)
]

print(f"Random Forest: Accuracy = {accuracy_rf}, Precision = {precision_rf}, Recall = {recall_rf}, F1 = {f1_rf}")

Random Forest: Accuracy = 0.887, Precision = 0.8724523986707202, Recall = 0.8870000000000001, F1 = 0.871351427985349


In [13]:
# Score the Logistic Regression predictions with each evaluator, in the order
# accuracy, weighted precision, weighted recall, F1.
accuracy_lr, precision_lr, recall_lr, f1_lr = [
    evaluator.evaluate(result_lr)
    for evaluator in (accuracy_evaluator, precision_evaluator, recall_evaluator, f1_evaluator)
]

print(f"Logistic Regression: Accuracy = {accuracy_lr}, Precision = {precision_lr}, Recall = {recall_lr}, F1 = {f1_lr}")

Logistic Regression: Accuracy = 0.701, Precision = 0.6845894484890769, Recall = 0.7010000000000001, F1 = 0.6780689236669066


In [14]:
# Score the Multilayer Perceptron predictions with each evaluator, in the order
# accuracy, weighted precision, weighted recall, F1.
accuracy_mlp, precision_mlp, recall_mlp, f1_mlp = [
    evaluator.evaluate(result_mlp)
    for evaluator in (accuracy_evaluator, precision_evaluator, recall_evaluator, f1_evaluator)
]

print(f"Multilayer Perceptron: Accuracy = {accuracy_mlp}, Precision = {precision_mlp}, Recall = {recall_mlp}, F1 = {f1_mlp}")

Multilayer Perceptron: Accuracy = 0.821, Precision = 0.778676984951985, Recall = 0.8210000000000001, F1 = 0.7939024417470336
