In [1]:
# Import parquet file with pySpark
from pyspark.sql import SparkSession

# Start a Spark session named "parquet", or reuse one that is already running
spark = (
    SparkSession.builder
    .appName("parquet")
    .getOrCreate()
)

# Load the training set from the parquet file into a Spark DataFrame
df = spark.read.parquet("ml_data_train.parquet")

# Print the first rows as a text table (show() with its default arguments)
df.show()


+------------------+--------------+--------------------------+--------------------+--------------------------+-----------------------+-----------------------+------------+
| src_subnet_hashed|dst_subnet_vec|scaled_bytes_pkts_features|scaled_flow_duration|scaled_throughput_features|scaled_pkt_len_features|scaled_tcp_win_features|attack_index|
+------------------+--------------+--------------------------+--------------------+--------------------------+-----------------------+-----------------------+------------+
|(1024,[906],[1.0])|(94,[0],[1.0])|      [-0.0513064471342...|[-0.4789529772200...|      [-0.0125662332574...|   [-0.4592399504915...|   [-0.4787111328395...|         1.0|
|(1024,[906],[1.0])|(94,[0],[1.0])|      [-0.0513064471342...|[-0.4789529772200...|      [-0.0125662332574...|   [-0.4592399504915...|   [-0.4787111328395...|         1.0|
|(1024,[906],[1.0])|(94,[0],[1.0])|      [-0.0517375603737...|[-0.4789529772200...|      [-0.0125662862983...|   [-0.4676085019167...|   [-0

#### Feature Scaling
Feature scaling can play a crucial role in optimizing the performance of machine learning models, especially those sensitive to the scale of input data.
Applying feature scaling to the right variables is essential for improving model accuracy and efficiency. It ensures that each feature contributes equally to the decision-making process, preventing models from misinterpreting the data due to arbitrary feature scales. This leads to better, more reliable predictions in multiclass classification tasks.

##### Numerical Variables with Different Scales:

In [2]:
from pyspark.ml.feature import VectorAssembler

# Columns that are concatenated into the single model input vector.
# NOTE(review): src_subnet_hashed and dst_subnet_vec (present in the loaded
# DataFrame) are not part of this list -- confirm that is intentional.
feature_columns = [
    "scaled_bytes_pkts_features",
    "scaled_flow_duration",
    "scaled_throughput_features",
    "scaled_pkt_len_features",
    "scaled_tcp_win_features",
]

# Prepare the data: merge the listed columns into one "features" column.
# transform() returns a new DataFrame; df itself is left untouched.
assembler = VectorAssembler(outputCol="features", inputCols=feature_columns)
df_assembled = assembler.transform(df)

In [3]:
from pyspark.ml.classification import RandomForestClassifier, NaiveBayes, LogisticRegression, MultilayerPerceptronClassifier

# Random forest with 10 trees.
# The seed is pinned explicitly: PySpark's default seed is derived from hash()
# of the class name, which is not stable across Python processes, so without
# it the trained forest can differ after every kernel restart.
rf = RandomForestClassifier(
    labelCol="attack_index",
    featuresCol="features",
    numTrees=10,
    seed=42,
)

# Naive Bayes (disabled).
# NOTE(review): presumably disabled because the scaled features contain
# negative values, which Spark's NaiveBayes does not accept -- confirm.
#nb = NaiveBayes(labelCol="attack_index", featuresCol="features")

# Multinomial-capable logistic regression with Spark's default settings
lr = LogisticRegression(labelCol="attack_index", featuresCol="features")

# Multilayer perceptron configuration
# Work out how many features the assembled vector column holds
def get_feature_count(df, feature_col="features"):
    """Return the number of features in an ML vector column, read from schema metadata.

    The count is taken from the ``ml_attr`` entry of the column's schema
    metadata, so no Spark job is run.

    Args:
        df: DataFrame-like object exposing ``schema[feature_col].metadata``.
        feature_col: Name of the vector column to inspect.

    Returns:
        int: The summed length of every attribute group under
        ``ml_attr["attrs"]`` when that key exists; otherwise the value of
        ``ml_attr["num_attrs"]``.

    Raises:
        KeyError: If ``feature_col`` is not in the schema, or its metadata has
            neither ``attrs`` nor ``num_attrs`` under ``ml_attr``.
    """
    metadata = df.schema[feature_col].metadata or {}
    ml_attr = metadata.get("ml_attr") or {}

    # Preferred source: per-attribute metadata grouped by attribute type
    # (each group is a list with one entry per feature).
    if "attrs" in ml_attr:
        return sum(len(group) for group in ml_attr["attrs"].values())

    # Fallback: some columns only record the total size, not each attribute.
    if "num_attrs" in ml_attr:
        return int(ml_attr["num_attrs"])

    raise KeyError(
        f"column {feature_col!r} has no 'ml_attr' size metadata "
        "(expected 'attrs' or 'num_attrs')"
    )
# Input layer size = length of the assembled feature vector.
input_layers = get_feature_count(df_assembled, "features")

# Output layer size = largest label index + 1. The perceptron needs one output
# node for every index in [0, max_label]; counting distinct labels would make
# the layer too small whenever some index is absent from this DataFrame.
# Assumes attack_index holds non-negative integer class indices -- TODO confirm.
max_label = df_assembled.agg({"attack_index": "max"}).first()[0]
output_layers = int(max_label) + 1

# Despite its name, hidden_layers is the full layer specification:
# [input, one hidden layer sized halfway between input and output, output].
hidden_layers = [input_layers, (input_layers + output_layers) // 2, output_layers]

# Seed pinned so weight initialisation is repeatable across kernel restarts.
mlp = MultilayerPerceptronClassifier(
    labelCol="attack_index",
    featuresCol="features",
    layers=hidden_layers,
    maxIter=100,
    seed=42,
)

In [4]:
# Train the models: fit each estimator, in order, on the assembled DataFrame
model_rf, model_lr, model_mlp = (
    estimator.fit(df_assembled) for estimator in (rf, lr, mlp)
)
# Naive Bayes training stays disabled
#model_nb = nb.fit(df_assembled)

In [5]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Evaluator: accuracy of the "prediction" column against the "attack_index" label
evaluator = MulticlassClassificationEvaluator(
    metricName="accuracy",
    predictionCol="prediction",
    labelCol="attack_index",
)

# NOTE(review): every model below is scored on df_assembled, the same DataFrame
# passed to fit() in the training cell, so these are training-set accuracies.

# Evaluate Random Forest
predictions_rf = model_rf.transform(df_assembled)
accuracy_rf = evaluator.evaluate(predictions_rf)
print(f"Random Forest Accuracy: {accuracy_rf}")

# Evaluate Naive Bayes (disabled)
#accuracy_nb = evaluator.evaluate(model_nb.transform(df_assembled))
#print(f"Naive Bayes Accuracy: {accuracy_nb}")

# Evaluate Logistic Regression
predictions_lr = model_lr.transform(df_assembled)
accuracy_lr = evaluator.evaluate(predictions_lr)
print(f"Logistic Regression Accuracy: {accuracy_lr}")

# Evaluate Multilayer Perceptron
predictions_mlp = model_mlp.transform(df_assembled)
accuracy_mlp = evaluator.evaluate(predictions_mlp)
print(f"Multilayer Perceptron Accuracy: {accuracy_mlp}")

Random Forest Accuracy: 0.8129
Logistic Regression Accuracy: 0.6209
Multilayer Perceptron Accuracy: 0.8125
