In [1]:
import os
import sys
import subprocess

#Ubicaci√≥n de Spark OJO no pyspark
os.environ['SPARK_HOME'] = "/home/hadoop/spark"

#jars nativos
sys.path.insert(0, "/home/hadoop/spark/python")
sys.path.insert(0, "/home/hadoop/spark/python/lib/py4j-0.10.9.7-src.zip") 
sys.path.insert(0, "/home/hadoop/spark/python/lib/pyspark.zip")

#Importar Spark con jars
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when
from pyspark.sql.types import DoubleType

#Usar yarn
spark = SparkSession.builder \
    .appName("Limpieza_Manual_Jars") \
    .master("yarn") \
    .config("spark.executor.memory", "1g") \
    .config("spark.executor.cores", "2") \
    .config("spark.yarn.executor.memoryOverhead", "512m") \
    .getOrCreate()

print("‚úÖ Spark iniciado correctamente usando los JARs del sistema.")

#Definir rutas
CARPETA_ORIGEN = "/trafico"
CARPETA_DESTINO = "/trafico_clean"

#columnas de archivo de 3.7gb
cols_drop_base = [
    "Flow ID", "Src IP", "Dst IP", "Src Port", "Timestamp", 
    "Bwd PSH Flags", "Bwd URG Flags", "Fwd URG Flags",
    "CWE Flag Count", "Fwd Byts/b Avg", "Bwd Byts/b Avg",
    "Fwd Pkts/b Avg", "Bwd Pkts/b Avg", "Fwd Blk Rate Avg", "Bwd Blk Rate Avg"
]

#ubicar archivo
cmd = f"hdfs dfs -ls {CARPETA_ORIGEN} | grep .csv | awk '{{print $8}}'"
try:
    archivos_b = subprocess.check_output(cmd, shell=True)
    lista_archivos = archivos_b.decode("utf-8").strip().split("\n")
    # Filtramos l√≠neas vac√≠as
    lista_archivos = [x for x in lista_archivos if x]
except:
    lista_archivos = []

if not lista_archivos:
    print("‚ö†Ô∏è No se encontraron archivos .csv en /trafico")
else:
    archivo_actual = lista_archivos[0] # Tomamos el √∫nico archivo
    filename = archivo_actual.split("/")[-1]
    
    print(f"--> Procesando archivo √∫nico: {filename}")
    
    try:
        #memoria para que no colapse 
        df_temp = spark.read.option("header", "true").option("inferSchema", "false").csv(archivo_actual)
        
        #limpiar
        df_temp = df_temp.filter(col("Label") != "Label")
        
        cols_existentes = df_temp.columns
        cols_a_borrar = [c for c in cols_drop_base if c in cols_existentes]
        df_temp = df_temp.drop(*cols_a_borrar)
        
        #pasar cols a num√©ricas
        numeric_cols = [c for c in df_temp.columns if c != "Label"]
        for c in numeric_cols:
            df_temp = df_temp.withColumn(c, col(c).cast(DoubleType()))
            
        #clases binarias
        df_temp = df_temp.withColumn("Label_Binary", when(col("Label") == "Benign", 0.0).otherwise(1.0))
        
        #guardar en formato parquet
        print("    Guardando en Parquet (esto puede tardar unos minutos)...")
        df_temp.write.mode("append").parquet(CARPETA_DESTINO)
        print("    ‚úÖ Guardado exitoso.")
        
        #borrar csv original
        cmd_rm = f"hdfs dfs -rm -skipTrash {archivo_actual}"
        subprocess.check_call(cmd_rm, shell=True)
        print(f"    üóëÔ∏è Archivo CSV eliminado para liberar espacio.")
        
    except Exception as e:
        print(f"    ‚ùå ERROR: {str(e)}")
        spark.stop()

print("\n--- Proceso finalizado ---")

25/11/20 17:05:53 WARN SparkConf: The configuration key 'spark.yarn.executor.memoryOverhead' has been deprecated as of Spark 2.3 and may be removed in the future. Please use the new key 'spark.executor.memoryOverhead' instead.
25/11/20 17:05:53 WARN SparkConf: The configuration key 'spark.yarn.executor.memoryOverhead' has been deprecated as of Spark 2.3 and may be removed in the future. Please use the new key 'spark.executor.memoryOverhead' instead.
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/11/20 17:05:54 WARN SparkConf: The configuration key 'spark.yarn.executor.memoryOverhead' has been deprecated as of Spark 2.3 and may be removed in the future. Please use the new key 'spark.executor.memoryOverhead' instead.
25/11/20 17:05:54 WARN SparkConf: The configuration key 'spark.yarn.executor.memoryOverhead' has been deprecated as of Spark 2.3 and may be removed in the future. Please use the new key 'sp

‚úÖ Spark iniciado correctamente usando los JARs del sistema.
--> Procesando archivo √∫nico: Thuesday-20-02-2018_TrafficForML_CICFlowMeter.csv


                                                                                

    Guardando en Parquet (esto puede tardar unos minutos)...


25/11/20 17:07:03 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
25/11/20 17:07:18 WARN YarnSchedulerBackend$YarnSchedulerEndpoint: Requesting driver to remove executor 2 for reason Container marked as failed: container_1763647897811_0017_01_000003 on host: ED3. Exit status: -100. Diagnostics: Container released on a *lost* node.
25/11/20 17:07:18 ERROR YarnScheduler: Lost executor 2 on ED3: Container marked as failed: container_1763647897811_0017_01_000003 on host: ED3. Exit status: -100. Diagnostics: Container released on a *lost* node.
                                                                                

    ‚úÖ Guardado exitoso.
Deleted /trafico/Thuesday-20-02-2018_TrafficForML_CICFlowMeter.csv
    üóëÔ∏è Archivo CSV eliminado para liberar espacio.

--- Proceso finalizado ---


In [2]:
# --- VERIFICACI√ìN FINAL DEL DATASET COMPLETO ---
print("Leyendo dataset completo desde Parquet...")
df_final = spark.read.parquet("/trafico_clean")

# 1. Verificar Tipos de Dato
print("\n--- 1. Muestra del Esquema (Primeras 5 columnas + Labels) ---")
# Imprimimos solo una parte para no llenar la pantalla
df_final.select(df_final.columns[:5] + ["Label", "Label_Binary"]).printSchema()

# 2. Verificar Transformaci√≥n de Labels
print("\n--- 2. Verificaci√≥n de Etiquetas ---")
df_final.groupBy("Label", "Label_Binary").count().show(truncate=False)
# Deber√≠as ver:
# Benign   | 0.0 | XXXXX
# FTP-B... | 1.0 | XXXXX
# SSH-B... | 1.0 | XXXXX
# etc...

# 3. Chequeo R√°pido de Nulos
print("\n--- 3. Buscando valores Nulos en columnas num√©ricas ---")
# Hacemos un conteo de nulos por columna. 
# (Esto puede tardar un minuto porque revisa todo el dataset)
from pyspark.sql.functions import isnan, when, count, col

# Seleccionamos algunas columnas clave para probar (o todas si tienes paciencia)
columnas_a_probar = df_final.columns[:10] # Probamos las primeras 10 columnas

df_final.select([count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in columnas_a_probar]).show()

print(f"Total de registros en el dataset final: {df_final.count():,}")

Leyendo dataset completo desde Parquet...


                                                                                


--- 1. Muestra del Esquema (Primeras 5 columnas + Labels) ---
root
 |-- Dst Port: double (nullable = true)
 |-- Protocol: double (nullable = true)
 |-- Flow Duration: double (nullable = true)
 |-- Tot Fwd Pkts: double (nullable = true)
 |-- Tot Bwd Pkts: double (nullable = true)
 |-- Label: string (nullable = true)
 |-- Label_Binary: double (nullable = true)


--- 2. Verificaci√≥n de Etiquetas ---


                                                                                

+----------------------+------------+-------+
|Label                 |Label_Binary|count  |
+----------------------+------------+-------+
|Benign                |0.0         |7372557|
|DDoS attacks-LOIC-HTTP|1.0         |576191 |
+----------------------+------------+-------+


--- 3. Buscando valores Nulos en columnas num√©ricas ---


                                                                                

+--------+--------+-------------+------------+------------+---------------+---------------+---------------+---------------+----------------+
|Dst Port|Protocol|Flow Duration|Tot Fwd Pkts|Tot Bwd Pkts|TotLen Fwd Pkts|TotLen Bwd Pkts|Fwd Pkt Len Max|Fwd Pkt Len Min|Fwd Pkt Len Mean|
+--------+--------+-------------+------------+------------+---------------+---------------+---------------+---------------+----------------+
|       0|       0|            0|           0|           0|              0|              0|              0|              0|               0|
+--------+--------+-------------+------------+------------+---------------+---------------+---------------+---------------+----------------+





Total de registros en el dataset final: 7,948,748


                                                                                

In [2]:
import os
import sys
import subprocess

#rutas ahora resto de archivos
os.environ['SPARK_HOME'] = "/home/hadoop/spark"

# rutas
sys.path.insert(0, "/home/hadoop/spark/python")
sys.path.insert(0, "/home/hadoop/spark/python/lib/py4j-0.10.9.7-src.zip") 
sys.path.insert(0, "/home/hadoop/spark/python/lib/pyspark.zip")



In [3]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when, count, isnan
from pyspark.sql.types import DoubleType

spark = SparkSession.builder \
    .appName("Procesamiento_Masivo_Seguro") \
    .master("yarn") \
    .config("spark.executor.memory", "1g") \
    .config("spark.executor.cores", "2") \
    .config("spark.yarn.executor.memoryOverhead", "512m") \
    .getOrCreate()

# Rutas
CARPETA_ORIGEN = "/trafico"
CARPETA_DESTINO = "/trafico_clean"

#cols eliminar 
cols_drop_base = [
    "Bwd PSH Flags", "Bwd URG Flags", "Fwd Pkts/b Avg", "Bwd Pkts/b Avg",
    "Fwd Byts/b Avg", "Bwd Byts/b Avg", "Fwd Blk Rate Avg", "Bwd Blk Rate Avg",
    "Fwd URG Flags", "CWE Flag Count", "FIN Flag Cnt", "Timestamp",
    "Flow ID", "Src IP", "Dst IP", "Src Port", "Protocol"
]

#buscar archivos
print("Buscando archivos en /trafico...")
try:
    cmd = f"hdfs dfs -ls {CARPETA_ORIGEN} | grep .csv | awk '{{print $8}}'"
    lista_archivos = subprocess.check_output(cmd, shell=True).decode("utf-8").strip().split("\n")
    lista_archivos = [x for x in lista_archivos if x]
except:
    lista_archivos = []

if not lista_archivos:
    print("‚ö†Ô∏è No se encontraron archivos nuevos.")
else:
    print(f"Se encontraron {len(lista_archivos)} archivos.")

    for archivo in lista_archivos:
        filename = archivo.split("/")[-1]
        print(f"\n--> Procesando: {filename}")
        
        try:
            #leer
            df_temp = spark.read.option("header", "true").option("inferSchema", "false").csv(archivo)
            
            #buscar columnas
            cant_cols = len(df_temp.columns)
            print(f"    üìä Columnas iniciales: {cant_cols}")
            
            #limpiar
            df_temp = df_temp.filter(col("Label") != "Label")
            cols_existentes = df_temp.columns
            cols_a_borrar = [c for c in cols_drop_base if c in cols_existentes]
            df_temp = df_temp.drop(*cols_a_borrar)
            
            #conteo
            for c in df_temp.columns:
                if c != "Label":
                    df_temp = df_temp.withColumn(c, col(c).cast(DoubleType()))
            
            #label binario
            df_temp = df_temp.withColumn("Label_Binary", when(col("Label") == "Benign", 0.0).otherwise(1.0))
            
            #guardar y borrar
            df_temp.write.mode("append").parquet(CARPETA_DESTINO)
            subprocess.check_call(f"hdfs dfs -rm -skipTrash {archivo}", shell=True)
            print(f"    ‚úÖ Procesado y eliminado.")
            
        except Exception as e:
            print(f"    ‚ùå Error: {str(e)}")

print("\n--- Procesamiento Terminado ---")

25/11/20 18:20:36 WARN SparkConf: The configuration key 'spark.yarn.executor.memoryOverhead' has been deprecated as of Spark 2.3 and may be removed in the future. Please use the new key 'spark.executor.memoryOverhead' instead.
25/11/20 18:20:36 WARN SparkConf: The configuration key 'spark.yarn.executor.memoryOverhead' has been deprecated as of Spark 2.3 and may be removed in the future. Please use the new key 'spark.executor.memoryOverhead' instead.
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/11/20 18:20:37 WARN SparkConf: The configuration key 'spark.yarn.executor.memoryOverhead' has been deprecated as of Spark 2.3 and may be removed in the future. Please use the new key 'spark.executor.memoryOverhead' instead.
25/11/20 18:20:37 WARN SparkConf: The configuration key 'spark.yarn.executor.memoryOverhead' has been deprecated as of Spark 2.3 and may be removed in the future. Please use the new key 'sp

Buscando archivos en /trafico...
Se encontraron 9 archivos.

--> Procesando: Friday-02-03-2018_TrafficForML_CICFlowMeter.csv


                                                                                

    üìä Columnas iniciales: 80


25/11/20 18:21:38 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

Deleted /trafico/Friday-02-03-2018_TrafficForML_CICFlowMeter.csv
    ‚úÖ Procesado y eliminado.

--> Procesando: Friday-16-02-2018_TrafficForML_CICFlowMeter.csv


                                                                                

    üìä Columnas iniciales: 80


                                                                                

Deleted /trafico/Friday-16-02-2018_TrafficForML_CICFlowMeter.csv
    ‚úÖ Procesado y eliminado.

--> Procesando: Friday-23-02-2018_TrafficForML_CICFlowMeter.csv
    üìä Columnas iniciales: 80


25/11/20 18:23:29 ERROR TransportClient: Failed to send RPC RPC 6486607694425462776 to /10.6.101.125:40190: io.netty.channel.StacklessClosedChannelException
io.netty.channel.StacklessClosedChannelException
	at io.netty.channel.AbstractChannel$AbstractUnsafe.write(Object, ChannelPromise)(Unknown Source)
25/11/20 18:23:29 WARN YarnSchedulerBackend$YarnSchedulerEndpoint: Attempted to get executor loss reason for executor id 1 at RPC address 10.6.101.125:52224, but got no response. Marking as agent lost.
java.io.IOException: Failed to send RPC RPC 6486607694425462776 to /10.6.101.125:40190: io.netty.channel.StacklessClosedChannelException
	at org.apache.spark.network.client.TransportClient$RpcChannelListener.handleFailure(TransportClient.java:395)
	at org.apache.spark.network.client.TransportClient$StdChannelListener.operationComplete(TransportClient.java:372)
	at io.netty.util.concurrent.DefaultPromise.notifyListener0(DefaultPromise.java:590)
	at io.netty.util.concurrent.DefaultPromise.no

Deleted /trafico/Friday-23-02-2018_TrafficForML_CICFlowMeter.csv
    ‚úÖ Procesado y eliminado.

--> Procesando: Thursday-01-03-2018_TrafficForML_CICFlowMeter.csv


                                                                                

    üìä Columnas iniciales: 80


                                                                                

Deleted /trafico/Thursday-01-03-2018_TrafficForML_CICFlowMeter.csv
    ‚úÖ Procesado y eliminado.

--> Procesando: Thursday-15-02-2018_TrafficForML_CICFlowMeter.csv
    üìä Columnas iniciales: 80


                                                                                

Deleted /trafico/Thursday-15-02-2018_TrafficForML_CICFlowMeter.csv
    ‚úÖ Procesado y eliminado.

--> Procesando: Thursday-22-02-2018_TrafficForML_CICFlowMeter.csv
    üìä Columnas iniciales: 80


                                                                                

Deleted /trafico/Thursday-22-02-2018_TrafficForML_CICFlowMeter.csv
    ‚úÖ Procesado y eliminado.

--> Procesando: Wednesday-14-02-2018_TrafficForML_CICFlowMeter.csv
    üìä Columnas iniciales: 80


                                                                                

Deleted /trafico/Wednesday-14-02-2018_TrafficForML_CICFlowMeter.csv
    ‚úÖ Procesado y eliminado.

--> Procesando: Wednesday-21-02-2018_TrafficForML_CICFlowMeter.csv
    üìä Columnas iniciales: 80


                                                                                

Deleted /trafico/Wednesday-21-02-2018_TrafficForML_CICFlowMeter.csv
    ‚úÖ Procesado y eliminado.

--> Procesando: Wednesday-28-02-2018_TrafficForML_CICFlowMeter.csv
    üìä Columnas iniciales: 80


                                                                                

Deleted /trafico/Wednesday-28-02-2018_TrafficForML_CICFlowMeter.csv
    ‚úÖ Procesado y eliminado.

--- Procesamiento Terminado ---


In [4]:
# Cargar el dataset unido
df_final = spark.read.parquet("/trafico_clean")

#registros totales
total_filas = df_final.count()
print(f"‚úÖ TOTAL DE REGISTROS RECUPERADOS: {total_filas:,}")

#verificar clases
print("\nüìä Distribuci√≥n de Etiquetas (Debe haber Benignos y Ataques):")
df_final.groupBy("Label_Binary").count().show()

#Verificar contenido 
print("\nüëÄ Primeras 5 filas:")
df_final.select("Label", "Label_Binary", df_final.columns[0], df_final.columns[1]).show(5)

#conteo final columnas
print(f"\nüìè Cantidad de columnas: {len(df_final.columns)}")

                                                                                

‚úÖ TOTAL DE REGISTROS RECUPERADOS: 16,232,943

üìä Distribuci√≥n de Etiquetas (Debe haber Benignos y Ataques):


                                                                                

+------------+--------+
|Label_Binary|   count|
+------------+--------+
|         0.0|13484708|
|         1.0| 2748235|
+------------+--------+


üëÄ Primeras 5 filas:
+------+------------+--------+-------------+
| Label|Label_Binary|Dst Port|Flow Duration|
+------+------------+--------+-------------+
|Benign|         0.0|    80.0|  8.1828496E7|
|Benign|         0.0|    80.0|  8.1758552E7|
|Benign|         0.0|    80.0|  8.1688858E7|
|Benign|         0.0|    80.0|  8.1618526E7|
|Benign|         0.0|    80.0|  8.1548276E7|
+------+------------+--------+-------------+
only showing top 5 rows


üìè Cantidad de columnas: 68
