In [1]:
import os
import sys
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pyspark.sql import SparkSession
from pyspark.ml import PipelineModel
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

os.environ['SPARK_HOME'] = "/home/hadoop/spark"
sys.path.insert(0, "/home/hadoop/spark/python")
sys.path.insert(0, "/home/hadoop/spark/python/lib/py4j-0.10.9.7-src.zip") 
sys.path.insert(0, "/home/hadoop/spark/python/lib/pyspark.zip")

spark = SparkSession.builder \
    .appName("Analisis_Completo_Modelo_Guardado") \
    .master("yarn") \
    .config("spark.executor.memory", "4g") \
    .config("spark.executor.cores", "2") \
    .config("spark.yarn.executor.memoryOverhead", "1024m") \
    .config("spark.network.timeout", "600s") \
    .getOrCreate()

print("‚úÖ Spark iniciado.")

25/11/25 16:01:51 WARN SparkConf: The configuration key 'spark.yarn.executor.memoryOverhead' has been deprecated as of Spark 2.3 and may be removed in the future. Please use the new key 'spark.executor.memoryOverhead' instead.
25/11/25 16:01:51 WARN SparkConf: The configuration key 'spark.yarn.executor.memoryOverhead' has been deprecated as of Spark 2.3 and may be removed in the future. Please use the new key 'spark.executor.memoryOverhead' instead.
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/11/25 16:01:51 WARN SparkConf: The configuration key 'spark.yarn.executor.memoryOverhead' has been deprecated as of Spark 2.3 and may be removed in the future. Please use the new key 'spark.executor.memoryOverhead' instead.
25/11/25 16:01:51 WARN SparkConf: The configuration key 'spark.yarn.executor.memoryOverhead' has been deprecated as of Spark 2.3 and may be removed in the future. Please use the new key 'sp

‚úÖ Spark iniciado.


In [None]:
ruta_modelo = "/modelos/rf_all_features_v1"
print(f"üìÇ Cargando modelo desde: {ruta_modelo} ...")

try:
    model_cargado = PipelineModel.load(ruta_modelo)
    print("‚úÖ Modelo cargado exitosamente.")
except Exception as e:
    print(f"‚ùå Error cargando el modelo: {str(e)}")
    spark.stop()
    sys.exit(1)

print("\nüîç Extrayendo conocimientos del modelo...")

assembler_stage = model_cargado.stages[1] 
rf_stage = model_cargado.stages[-1]

# Extraer nombres y valores
nombres_columnas = assembler_stage.getInputCols()
importancias = rf_stage.featureImportances.toArray()

# Crear DataFrame de Pandas para visualizar
df_importancia = pd.DataFrame({
    'Columna': nombres_columnas,
    'Importancia': importancias
}).sort_values(by='Importancia', ascending=False)

# Mostrar Tabla Top 20
print("\nüèÜ TOP 20 VARIABLES M√ÅS IMPORTANTES (Lo que aprendi√≥ el modelo):")
print(df_importancia.head(20))

# Graficar
plt.figure(figsize=(12, 8))
sns.barplot(x='Importancia', y='Columna', data=df_importancia.head(20), palette='viridis')
plt.title('Importancia de Caracter√≠sticas - Modelo RF Recuperado')
plt.xlabel('Importancia (Gini)')
plt.tight_layout()
plt.show()

In [None]:

ruta_test = "/trafico_train_mini"
print(f"\nüß™ Evaluando rendimiento con datos de prueba: {ruta_test}")

# Cargar datos de prueba
df_test = spark.read.parquet(ruta_test)

# Generar Predicciones
# (El modelo ya tiene el StringIndexer dentro, as√≠ que acepta la columna 'Label' cruda)
print("    Generando predicciones...")
predictions = model_cargado.transform(df_test)

# Optimizaci√≥n de memoria (Seleccionar solo lo necesario para evaluar)
results = predictions.select("label_index", "prediction")
results.cache()

# Calcular M√©tricas
acc_eval = MulticlassClassificationEvaluator(labelCol="label_index", metricName="accuracy")
f1_eval = MulticlassClassificationEvaluator(labelCol="label_index", metricName="f1")
prec_eval = MulticlassClassificationEvaluator(labelCol="label_index", metricName="weightedPrecision")
rec_eval = MulticlassClassificationEvaluator(labelCol="label_index", metricName="weightedRecall")

print("\nüìä RESULTADOS FINALES DE EVALUACI√ìN:")
print("-" * 40)
print(f"üéØ Accuracy:  {acc_eval.evaluate(results):.2%}")
print(f"‚öñÔ∏è F1-Score:  {f1_eval.evaluate(results):.2%}")
print(f"‚úÖ Precision: {prec_eval.evaluate(results):.2%}")
print(f"üîç Recall:    {rec_eval.evaluate(results):.2%}")
print("-" * 40)

# Limpieza final
results.unpersist()
print("\nüèÅ Proceso finalizado.")