In [5]:
# Import parquet file with pySpark
from pyspark.sql import SparkSession

# Build the Spark session for this notebook, or reuse one that already exists.
session_builder = SparkSession.builder.appName("parquet")
spark = session_builder.getOrCreate()

# Load the training data from the Parquet file located next to the notebook.
parquet_reader = spark.read
df = parquet_reader.parquet(r"./simplified_train_data.parquet")

# Print the column names and types of the loaded DataFrame.
df.printSchema()


root
 |-- attack_index: double (nullable = true)
 |-- scaled_ttl_features: vector (nullable = true)
 |-- scaled_tcp_flags_features: vector (nullable = true)
 |-- scaled_flow_features: vector (nullable = true)
 |-- scaled_duration_features: vector (nullable = true)
 |-- scaled_pkt_size_features: vector (nullable = true)
 |-- scaled_tcp_win_features: vector (nullable = true)
 |-- scaled_protocol_features: vector (nullable = true)
 |-- scaled_l4_dst_port_features: vector (nullable = true)



In [6]:
from pyspark.ml.feature import VectorAssembler

# Prepare the data: every column except the label "attack_index" is a model input.
feature_columns = list(df.columns)
feature_columns.remove("attack_index")
print(feature_columns)

# Concatenate the input columns into one vector column named "features".
assembler = VectorAssembler(
    outputCol="features",
    inputCols=feature_columns,
)
df_assembled = assembler.transform(df)

['scaled_ttl_features', 'scaled_tcp_flags_features', 'scaled_flow_features', 'scaled_duration_features', 'scaled_pkt_size_features', 'scaled_tcp_win_features', 'scaled_protocol_features', 'scaled_l4_dst_port_features']


In [7]:
from pyspark.ml.classification import RandomForestClassifier, NaiveBayes, LogisticRegression, MultilayerPerceptronClassifier

# Random Forest: a small ensemble of 5 trees.
# The seed is pinned explicitly so that repeated runs of the notebook build the
# same trees and the reported accuracy stays comparable between runs.
rf = RandomForestClassifier(
    labelCol="attack_index",
    featuresCol="features",
    numTrees=5,
    seed=42,
)

# Naive Bayes is currently disabled.
#nb = NaiveBayes(labelCol="attack_index", featuresCol="features")

# Logistic Regression, using the library defaults for every other parameter.
lr = LogisticRegression(labelCol="attack_index", featuresCol="features")

# Multilayer Perceptron configuration
# Helper used to obtain the number of features in the assembled vector column.
def get_feature_count(df, feature_col="features"):
    """Return the number of features in an ML vector column, read from schema metadata.

    Only the column's metadata is inspected; no rows are read.

    Args:
        df: Object exposing ``df.schema[feature_col].metadata`` as a mapping
            (a Spark DataFrame in this notebook).
        feature_col: Name of the vector column to inspect.

    Returns:
        The total number of attributes listed under ``ml_attr["attrs"]`` when
        that entry exists; otherwise the value of ``ml_attr["num_attrs"]``.

    Raises:
        KeyError: If the column has no ``"ml_attr"`` metadata, or the metadata
            has neither ``"attrs"`` nor ``"num_attrs"``.
    """
    ml_attr = df.schema[feature_col].metadata["ml_attr"]

    # "attrs" holds one list of attribute descriptors per entry; the feature
    # count is the combined length of those lists.
    attrs_by_group = ml_attr.get("attrs")
    if attrs_by_group is not None:
        return sum(len(attrs) for attrs in attrs_by_group.values())

    # Metadata may carry only the total size without per-attribute details;
    # fall back to that instead of failing on the missing "attrs" key.
    return int(ml_attr["num_attrs"])
# Size of the input layer: it must equal the length of the "features" vector.
input_layers = get_feature_count(df_assembled, "features")

# Size of the output layer: labels are used as 0-based class indices, so the
# layer must be larger than the largest label. max + 1 stays correct even if a
# class is absent from this data set or the label column contains nulls, both
# of which would skew a distinct-value count.
output_layers = int(df_assembled.agg({"attack_index": "max"}).first()[0]) + 1

# Full layer specification, from input to output. The list has to begin with
# the input size; the single hidden layer has half as many units as the input.
# (The name `hidden_layers` is kept for compatibility although the list holds
# every layer size.)
hidden_layers = [input_layers, input_layers // 2, output_layers]

mlp = MultilayerPerceptronClassifier(
    labelCol="attack_index",
    featuresCol="features",
    layers=hidden_layers,
    maxIter=50,
    seed=42,  # pinned so weight initialisation is repeatable between runs
)

In [9]:
# Random Forest: fit the classifier on the assembled DataFrame.
# NOTE(review): the output recorded for this cell is
# "ConnectionRefusedError: [Errno 111] Connection refused", so the saved run did
# not produce `model_rf` — TODO confirm the Spark session is reachable and re-run.
model_rf = rf.fit(df_assembled)
# Naive Bayes training is disabled.
#model_nb = nb.fit(df_assembled)


ConnectionRefusedError: [Errno 111] Connection refused

In [None]:
# Logistic Regression: fit the classifier on the assembled DataFrame.
model_lr = lr.fit(df_assembled)


In [None]:
# Multilayer Perceptron: fit the classifier on the assembled DataFrame.
model_mlp = mlp.fit(df_assembled)

In [None]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Evaluator: fraction of rows whose "prediction" equals the "attack_index" label.
evaluator = MulticlassClassificationEvaluator(
    metricName="accuracy",
    labelCol="attack_index",
    predictionCol="prediction",
)

# Every model below is scored on df_assembled, the same DataFrame it was fit
# on, so the printed figures are training-set accuracies.

# Random Forest
predictions_rf = model_rf.transform(df_assembled)
accuracy_rf = evaluator.evaluate(predictions_rf)
print(f"Random Forest Accuracy: {accuracy_rf}")

# Naive Bayes (disabled)
#accuracy_nb = evaluator.evaluate(model_nb.transform(df_assembled))
#print(f"Naive Bayes Accuracy: {accuracy_nb}")

# Logistic Regression
predictions_lr = model_lr.transform(df_assembled)
accuracy_lr = evaluator.evaluate(predictions_lr)
print(f"Logistic Regression Accuracy: {accuracy_lr}")

# Multilayer Perceptron
predictions_mlp = model_mlp.transform(df_assembled)
accuracy_mlp = evaluator.evaluate(predictions_mlp)
print(f"Multilayer Perceptron Accuracy: {accuracy_mlp}")