In [25]:
# Función para cargar datos
from utils import load_data

import numpy as np
import pandas as pd

from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation \
    import MulticlassClassificationEvaluator
from pyspark.ml.stat import Correlation
from pyspark.ml.clustering import BisectingKMeans

In [5]:
# Cargar el conjunto de datos
df = load_data()

./data/datasetMta33featues.csv ya existe.
./data/datasetLeg33featues.csv ya existe.


In [16]:
feature_columns = df.columns[:-1]
assembler = VectorAssembler(inputCols=feature_columns,
                            outputCol='features')
df_features = assembler.transform(df)

In [10]:
# Dividir en datos de entrenamiento y prueba
train_data, test_data = df_features.randomSplit([0.7, 0.3], seed=42)

In [11]:
# Crear un model Random Forest
rf = RandomForestClassifier(featuresCol='features', \
                            labelCol='label', numTrees=100)
rf_model = rf.fit(train_data)

24/11/07 13:42:37 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


In [12]:
# Hacer predicciones
predictions = rf_model.transform(test_data)

In [13]:
# Evaluar el modelo
evaluator = MulticlassClassificationEvaluator(
    labelCol='label', predictionCol='prediction',
    metricName='accuracy')
accuracy = evaluator.evaluate(predictions)
print(f"Exactitud del modelo: {accuracy:.2f}")

Exactitud del modelo: 1.00


In [17]:
# Mostrar la importancia de las características
importances = rf_model.featureImportances
for feature, importance in zip(feature_columns, importances):
    print(f"Característica: {feature}, Importancia: {importance}")

Característica: FinFlagDist, Importancia: 0.05689752091597814
Característica: SynFlagDist, Importancia: 0.0022897447782107426
Característica: RstFlagDist, Importancia: 0.004936931927888532
Característica: PshFlagDist, Importancia: 0.0011662681362519362
Característica: AckFlagDist, Importancia: 0.016141176518975384
Característica: DNSoverIP, Importancia: 0.0015809086472585985
Característica: TCPoverIP, Importancia: 0.0019168731558581718
Característica: UDPoverIP, Importancia: 0.00333098488829091
Característica: MaxLen, Importancia: 0.03664544957070368
Característica: MinLen, Importancia: 0.1333850121927852
Característica: StdDevLen, Importancia: 0.01602198528610949
Característica: AvgLen, Importancia: 0.012806688632856233
Característica: MaxIAT, Importancia: 0.04602456631385576
Característica: MinIAT, Importancia: 0.01086150963580441
Característica: AvgIAT, Importancia: 0.016727659330627857
Característica: AvgWinFlow, Importancia: 0.0012970512543227944
Característica: PktsIOratio, Impor

In [20]:
# Calcular la matriz de correlación de Pearson
matrix = Correlation.corr(df_features, "features", method="pearson").head()[0]

# Convertir la matriz a NumPy
correlation_matrix = matrix.toArray()

# Mostrar la matriz
correlation_df = pd.DataFrame(correlation_matrix,
                              columns=feature_columns,
                              index=feature_columns)
print(correlation_df)

[Stage 36:>                                                       (0 + 11) / 11]

                     FinFlagDist  SynFlagDist  RstFlagDist  PshFlagDist  \
FinFlagDist             1.000000     0.819528    -0.208695     0.635939   
SynFlagDist             0.819528     1.000000    -0.274743     0.774576   
RstFlagDist            -0.208695    -0.274743     1.000000    -0.313686   
PshFlagDist             0.635939     0.774576    -0.313686     1.000000   
AckFlagDist             0.697103     0.812633    -0.286266     0.931966   
DNSoverIP               0.164405     0.223863     0.158831     0.127329   
TCPoverIP              -0.118135    -0.174354    -0.141458    -0.090858   
UDPoverIP               0.127132     0.186744     0.146963     0.100309   
MaxLen                  0.523069     0.666485    -0.443542     0.798156   
MinLen                 -0.273353    -0.249195    -0.010274    -0.206378   
StdDevLen               0.483613     0.597880    -0.459849     0.693197   
AvgLen                  0.367549     0.518749    -0.453969     0.778456   
MaxIAT                  0

                                                                                

In [22]:
# Aplicar Bisecting K-Means
bkm = BisectingKMeans(k=3, featuresCol="features", maxIter=10)
bkm_model = bkm.fit(df_features)

In [23]:
# Obtener los clústeres predichos
predictions = bkm_model.transform(df_features)

In [24]:
# Mostrar los centroides
centroids = bkm_model.clusterCenters()
print("Centroides de Bisecting K-Means:")
for centroid in centroids:
    print(centroid)

Centroides de Bisecting K-Means:
[-0.92501888 -1.16518197  0.78483797 -1.23505881 -1.21864178 -0.14365254
  0.1481422  -0.15659998 -1.36697886  0.39830942 -1.24333437 -1.1845914
 -0.90671267  0.26010852 -0.55550061  1.11386582  0.56563037 -0.278611
 -1.28665161  0.61123431 -0.80024711 -0.78801167  0.38613036 -0.40671604
 -0.67923379 -1.28340398 -0.10818519  1.07463359 -0.18912304 -0.18823397
  0.48674221 -0.96608398 -0.72420006]
[-0.04150767  0.06253622 -0.55758637  0.23028954  0.16481764 -0.15046359
  0.15148392 -0.15444425  0.48383887  0.02889848  0.48568317  0.50047628
  0.06887954 -0.07753451  0.14237785 -0.48538148 -0.19440119  0.16318841
  0.35895299 -0.11888823  0.21519664  0.17186402 -0.12502051  0.09405166
 -0.24140569  0.28064391 -0.03915467 -0.48433791 -0.18132896 -0.19259215
 -0.00748343  0.04250504  0.14430466]
[ 1.12086908  1.18930372  0.19802466  0.94269315  1.05114002  0.45321899
 -0.46024651  0.47549751  0.59962452 -0.5040787   0.45698775  0.36222846
  0.88623551 -0.14