In [0]:
# Databricks notebook source
# Notebook parameter: the catalog to work in (widget default is "ptd_dev").
dbutils.widgets.text("catalog", "ptd_dev")
catalog = dbutils.widgets.get("catalog")

# Schema names used to build the fully qualified table identifiers below.
schema_bronze, schema_silver = "bronze", "silver"

# Make the chosen catalog and the silver schema the current ones for this session.
for _use_stmt in (f"USE CATALOG {catalog}", f"USE SCHEMA {schema_silver}"):
    spark.sql(_use_stmt)

from pyspark.sql import functions as F

# Input tables
gkg = spark.table(f"{catalog}.{schema_bronze}.gdelt_gkg_raw")

# Alias -> ticker lookup, normalised the same way as the extracted entity
# names (trimmed and upper-cased) so the equality join further down can match.
aliases = (
    spark.table(f"{catalog}.{schema_bronze}.ticker_aliases")
    .select(
        F.upper(F.trim(F.col("alias"))).alias("alias"),
        F.upper(F.trim(F.col("ticker"))).alias("canonical_ticker"),
    )
    # Normalisation can collapse rows that differed only by case or padding;
    # duplicate (alias, ticker) pairs would multiply every inner-join match
    # and inflate the downstream news counts.
    .dropDuplicates(["alias", "canonical_ticker"])
)

# Expected fields and normalisations
# DATE is cast to string and parsed with the yyyyMMddHHmmss pattern; only the
# calendar date is kept.
_event_ts = F.to_timestamp(F.col("DATE").cast("string"), "yyyyMMddHHmmss")
# tone_val is the first comma-separated component of V2Tone.
_first_tone_part = F.split(F.col("V2Tone"), ",").getItem(0)

g = gkg.withColumn("date", F.to_date(_event_ts))
g = g.withColumn("source", F.col("SourceCommonName"))
g = g.withColumn("tone_val", _first_tone_part.cast("double"))

# Upper-case the three name fields in place so later matching ignores case.
for _name_field in ("V2Organizations", "V2Persons", "V2AllNames"):
    g = g.withColumn(_name_field, F.upper(F.col(_name_field)))

# Extract entity names (aliases) from Persons, Organizations and AllNames
# (entries are delimited by ';').
def split_nonempty(col):
    """Split a ';'-delimited string column into an array of clean names.

    Args:
        col: String Column of ';'-separated entries. NULL is treated as an
            empty string, so the result is an empty array rather than NULL.

    Returns:
        An array<string> Column in which every entry is trimmed, has any
        trailing ``,<digits>`` suffix removed, and empty entries are dropped.
    """
    entries = F.split(F.coalesce(col, F.lit("")), r";")
    # GDELT's enhanced V2 fields encode each entry as "Name,CharOffset"
    # (per the GKG codebook -- TODO confirm against the bronze table). With
    # the offset left in place, "APPLE INC,1234" can never equal the alias
    # "APPLE INC". Entries that carry no offset pass through unchanged.
    names = F.transform(
        entries,
        lambda x: F.trim(F.regexp_replace(x, r",\s*\d+\s*$", "")),
    )
    return F.filter(names, lambda x: x != "")

# One output row per input row and distinct entity name found in it.
_passthrough = ["date", "source", "tone_val"]
_name_arrays = [
    split_nonempty(F.col(field)).alias(label)
    for field, label in (
        ("V2Organizations", "orgs"),
        ("V2Persons", "persons"),
        ("V2AllNames", "allnames"),
    )
]

# array_union merges the three arrays without duplicates, so a name that
# appears in several fields yields a single exploded row.
_orgs_and_persons = F.array_union(F.col("orgs"), F.col("persons"))
_all_names = F.array_union(_orgs_and_persons, F.col("allnames"))

_exploded = g.select(*_passthrough, *_name_arrays).withColumn(
    "entity", F.explode(_all_names)
)
_labelled = _exploded.select(
    *_passthrough, F.upper(F.col("entity")).alias("alias")
)
# Keep only rows that carry a usable alias.
ents = _labelled.filter(F.col("alias").isNotNull()).filter(F.col("alias") != "")

# Match extracted names against the alias table (exact equality on "alias").
_matched = ents.join(aliases, on="alias", how="inner")
news_entities = _matched.select(
    "date", "alias", "canonical_ticker", "tone_val", "source"
).withColumn("ingestion_ts", F.current_timestamp())

# Replace the silver table, allowing the stored schema to be overwritten.
_entities_writer = news_entities.write.mode("overwrite").option("overwriteSchema", "true")
_entities_writer.saveAsTable(f"{catalog}.{schema_silver}.news_entities")

# Aggregation per ticker and date.
# Aggregate from the table written by the previous step rather than from the
# lazy `news_entities` DataFrame: reusing the DataFrame would re-run the
# bronze scan, explode and alias join a second time, and the result could
# diverge from the persisted rows if the bronze data changed in between.
_persisted_entities = spark.table(f"{catalog}.{schema_silver}.news_entities")

news_daily_agg = (
    _persisted_entities.groupBy("canonical_ticker", "date")
    .agg(
        F.count("*").alias("news_count"),           # matched rows per ticker/day
        F.avg("tone_val").alias("tone_avg"),         # mean of tone_val
        F.stddev_samp("tone_val").alias("tone_std"), # sample standard deviation
    )
    .withColumnRenamed("canonical_ticker", "ticker")
    .withColumn("ingestion_ts", F.current_timestamp())
)

# Replace the silver aggregate table, allowing the stored schema to be overwritten.
(
    news_daily_agg.write.mode("overwrite")
    .option("overwriteSchema", "true")
    .saveAsTable(f"{catalog}.{schema_silver}.news_daily_agg")
)