### Configuración inicial

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from delta import configure_spark_with_delta_pip

# 1. Spark session configured for Delta Lake.
#
# `spark.jars.packages` is deliberately NOT set here:
# configure_spark_with_delta_pip() sets that key itself, using the Maven
# coordinate that matches the installed `delta-spark` pip package. A
# hard-coded coordinate in this builder would be silently overwritten and
# could drift from the installed version.
builder = (
    SparkSession.builder
    .appName("Lab_SECOP_Gold")
    .master("local[*]")  # local mode, one worker thread per available core
    # Enable Delta's SQL extensions and make Delta the session catalog.
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    # Lab-sized settings: few shuffle partitions and a small driver heap.
    .config("spark.sql.shuffle.partitions", "2")
    .config("spark.driver.memory", "1g")
)

# Add the Delta Lake package to the builder, then start (or reuse) the session.
spark = configure_spark_with_delta_pip(builder).getOrCreate()

:: loading settings :: url = jar:file:/opt/spark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /root/.ivy2/cache
The jars for the packages stored in: /root/.ivy2/jars
io.delta#delta-spark_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-ceb9a0f3-cfb8-4eec-97be-f8039a1d86ec;1.0
	confs: [default]
	found io.delta#delta-spark_2.12;3.0.0 in central
	found io.delta#delta-storage;3.0.0 in central
	found org.antlr#antlr4-runtime;4.9.3 in central
:: resolution report :: resolve 369ms :: artifacts dl 11ms
	:: modules in use:
	io.delta#delta-spark_2.12;3.0.0 from central in [default]
	io.delta#delta-storage;3.0.0 from central in [default]
	org.antlr#antlr4-runtime;4.9.3 from central in [default]
	---------------------------------------------------------------------
	|                  |            modules            ||   artifacts   |
	|       conf       | number| search|dwnlded|evicted|| number|dwnlded|
	---------------------------------------------------------------------
	|      default     |   3   |   0   |   0   |  

### Rutas

In [2]:
# Lakehouse locations: Silver is this notebook's input, Gold is its output.
silver_path = "/app/data/lakehouse/silver/secop"
gold_path = "/app/data/lakehouse/gold/top_deptos"

# Load the Silver table; the Delta format is passed to load() directly.
print(" Leyendo capa Silver...")
df_silver = spark.read.load(silver_path, format="delta")

 Leyendo capa Silver...


                                                                                

### Tabla agregada

In [3]:
# Gold layer: total investment and number of contracts per department.

print(" Generando agregaciones para capa Oro...")
# Spark DataFrames are lazy and this cell triggers two actions (the Delta
# write and the top-10 preview). Caching the aggregate lets the second action
# reuse it instead of scanning and re-aggregating the Silver table again.
df_agg_deptos = (
    df_silver
    .groupBy("departamento")
    .agg(
        F.sum("valor_del_contrato").alias("total_inversion"),
        # NOTE(review): count(column) skips NULLs, so contracts without a
        # value are not counted — confirm that is the intended definition.
        F.count("valor_del_contrato").alias("numero_contratos"),
    )
    .cache()
)
df_gold = df_agg_deptos.orderBy(F.desc("total_inversion"))


def _formatear_inversion(valor):
    """Format an amount as ``$1,234.56``; missing values (None/NaN) become ``N/D``."""
    if valor is None or valor != valor:  # NaN is the only value not equal to itself
        return "N/D"
    return f"${valor:,.2f}"


try:
    # Persist the aggregated data.
    print(f" Guardando en {gold_path}...")
    (
        df_gold
        .coalesce(1)  # the result is small: write it as a single file
        .write
        .format("delta")
        .mode("overwrite")
        .save(gold_path)
    )

    # Preview: collect only the ten largest rows to the driver.
    print("\n TOP 10 DEPARTAMENTOS POR INVERSIÓN:")
    top_10_deptos = df_gold.limit(10).toPandas()
finally:
    # Release the cached aggregate even if the write or the collect fails.
    # df_gold stays usable afterwards; it is simply recomputed on demand.
    df_agg_deptos.unpersist()

# Make the amounts readable. assign() builds a new frame instead of mutating
# the collected one in place, and the formatter tolerates missing totals.
top_10_deptos = top_10_deptos.assign(
    total_inversion=top_10_deptos["total_inversion"].map(_formatear_inversion)
)
print(top_10_deptos)


 Generando agregaciones para capa Oro...
 Guardando en /app/data/lakehouse/gold/top_deptos...


26/01/31 01:37:01 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
26/01/31 01:37:03 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors
                                                                                


 TOP 10 DEPARTAMENTOS POR INVERSIÓN:
                 departamento         total_inversion  numero_contratos
0  Distrito Capital de Bogotá  $44,780,789,950,972.00            105490
1                   Antioquia  $14,851,995,599,214.00             43720
2             Valle del Cauca   $5,832,967,490,462.00             55254
3                Cundinamarca   $3,317,744,170,675.00             18362
4                   Santander   $2,630,923,507,968.00             23424
5                     Bolívar   $2,442,177,166,944.00             13445
6                   Atlántico   $2,348,784,928,907.00             14030
7                      Boyacá   $2,246,674,308,667.00             13949
8                   Risaralda   $1,774,562,069,626.00              6500
9                   Magdalena   $1,709,946,708,349.00             17342
