In [6]:
# Import parquet file with pySpark
from pyspark.sql import SparkSession

# Create a spark session
spark = SparkSession.builder.appName("parquet").getOrCreate()

# Read parquet file
df = spark.read.parquet("ml_data_train.parquet")

# Show the data
df.show()
df.printSchema()
print("Number of rows: ", df.count())


+------------------+--------------+--------------------------+--------------------+--------------------------+-----------------------+-----------------------+------------+
| src_subnet_hashed|dst_subnet_vec|scaled_bytes_pkts_features|scaled_flow_duration|scaled_throughput_features|scaled_pkt_len_features|scaled_tcp_win_features|attack_index|
+------------------+--------------+--------------------------+--------------------+--------------------------+-----------------------+-----------------------+------------+
|(1024,[906],[1.0])|(94,[0],[1.0])|      [-0.0513064471342...|[-0.4789529772200...|      [-0.0125662332574...|   [-0.4592399504915...|   [-0.4787111328395...|         1.0|
|(1024,[906],[1.0])|(94,[0],[1.0])|      [-0.0513064471342...|[-0.4789529772200...|      [-0.0125662332574...|   [-0.4592399504915...|   [-0.4787111328395...|         1.0|
|(1024,[906],[1.0])|(94,[0],[1.0])|      [-0.0517375603737...|[-0.4789529772200...|      [-0.0125662862983...|   [-0.4676085019167...|   [-0

#### Feature Scalling
Feature scaling can play a crucial role in optimizing the performance of machine learning models, especially those sensitive to the scale of input data.
Applying feature scaling to the right variables essential for improving model accuracy and efficiency. It ensures that each feature contributes equally to the decision-making process, preventing models from misinterpreting the data due to arbitrary feature scales. This leads to better, more reliable predictions in multiclass classification tasks.

##### Numerical Variables with Different Scales:

In [7]:
from pyspark.ml.feature import VectorAssembler

# Preparar os dados com VectorAssembler
feature_columns = ['scaled_bytes_pkts_features', 'scaled_flow_duration', 'scaled_throughput_features', 'scaled_pkt_len_features', 'scaled_tcp_win_features']
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")
df_assembled = assembler.transform(df)

In [8]:
from pyspark.ml.classification import RandomForestClassifier, NaiveBayes, LogisticRegression, MultilayerPerceptronClassifier

# Definir o Random Forest
rf = RandomForestClassifier(labelCol="attack_index", featuresCol="features", numTrees=10)

# Definir o Naive Bayes
#nb = NaiveBayes(labelCol="attack_index", featuresCol="features")

# Definir o Logistic Regression
lr = LogisticRegression(labelCol="attack_index", featuresCol="features")

# Configuração do Multilayer Perceptron
# Obter o número de características
def get_feature_count(df, feature_col="features"):
    # Extrai os metadados da coluna de características e calcula a soma dos tamanhos dos atributos
    attributes = df.schema[feature_col].metadata["ml_attr"]["attrs"]
    feature_count = sum(len(attrs) for attrs in attributes.values())
    return feature_count
input_layers = get_feature_count(df_assembled, "features")
output_layers = df_assembled.select("attack_index").distinct().count()
hidden_layers = [input_layers, (input_layers + output_layers) // 2, output_layers]
mlp = MultilayerPerceptronClassifier(labelCol="attack_index", featuresCol="features", layers=hidden_layers, maxIter=100)

In [9]:
# Treinamento dos modelos
model_rf = rf.fit(df_assembled)
#model_nb = nb.fit(df_assembled)
model_lr = lr.fit(df_assembled)
model_mlp = mlp.fit(df_assembled)

In [10]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Evaluator per l'accuratezza
accuracy_evaluator = MulticlassClassificationEvaluator(
    labelCol="attack_index", 
    predictionCol="prediction", 
    metricName="accuracy"
)

# Evaluator per precision
precision_evaluator = MulticlassClassificationEvaluator(
    labelCol="attack_index", 
    predictionCol="prediction", 
    metricName="weightedPrecision"
)

# Evaluator per recall
recall_evaluator = MulticlassClassificationEvaluator(
    labelCol="attack_index", 
    predictionCol="prediction", 
    metricName="weightedRecall"
)

# Evaluator per f1 score
f1_evaluator = MulticlassClassificationEvaluator(
    labelCol="attack_index", 
    predictionCol="prediction", 
    metricName="f1"
)

In [11]:
accuracy_rf = accuracy_evaluator.evaluate(model_rf.transform(df_assembled))
print(f"Random Forest Accuracy: {accuracy_rf}")
precision_rf = precision_evaluator.evaluate(model_rf.transform(df_assembled))
print(f"Random Forest Precision: {precision_rf}")
recall_rf = recall_evaluator.evaluate(model_rf.transform(df_assembled))
print(f"Random Forest Recall: {recall_rf}")
f1_rf = f1_evaluator.evaluate(model_rf.transform(df_assembled))
print(f"Random Forest F1 Score: {f1_rf}")

Random Forest Accuracy: 0.8434
Random Forest Precision: 0.8243855536746714
Random Forest Recall: 0.8433999999999999
Random Forest F1 Score: 0.8250611399869014


In [12]:
accuracy_lr = accuracy_evaluator.evaluate(model_lr.transform(df_assembled))
print(f"Logistic Regression Accuracy: {accuracy_lr}")
precision_lr = precision_evaluator.evaluate(model_lr.transform(df_assembled))
print(f"Logistic Regression Precision: {precision_lr}")
recall_lr = recall_evaluator.evaluate(model_lr.transform(df_assembled))
print(f"Logistic Regression Recall: {recall_lr}")
f1_lr = f1_evaluator.evaluate(model_lr.transform(df_assembled))
print(f"Logistic Regression F1 Score: {f1_lr}")

Logistic Regression Accuracy: 0.6209
Logistic Regression Precision: 0.6591741624901554
Logistic Regression Recall: 0.6209
Logistic Regression F1 Score: 0.5584675281195629


In [13]:
accuracy_mlp = accuracy_evaluator.evaluate(model_mlp.transform(df_assembled))
print(f"Multilayer Perceptron Accuracy: {accuracy_mlp}")
precision_mlp = precision_evaluator.evaluate(model_mlp.transform(df_assembled))
print(f"Multilayer Perceptron Precision: {precision_mlp}")
recall_mlp = recall_evaluator.evaluate(model_mlp.transform(df_assembled))
print(f"Multilayer Perceptron Recall: {recall_mlp}")
f1_mlp = f1_evaluator.evaluate(model_mlp.transform(df_assembled))
print(f"Multilayer Perceptron F1 Score: {f1_mlp}")

Multilayer Perceptron Accuracy: 0.8147
Multilayer Perceptron Precision: 0.8152272248573532
Multilayer Perceptron Recall: 0.8147000000000001
Multilayer Perceptron F1 Score: 0.8039290252069263
