In [0]:
# Databricks notebook source
# Auto Loader para GDELT GKG v2.1 desde un drop folder (tú subes CSV/CSV.GZ)
# Notebook parameters: widget name -> default value.
# `source_path` is the drop folder holding the uploaded .csv / .csv.gz files.
_WIDGET_DEFAULTS = {
    "catalog": "ptd_dev",
    "schema_bronze": "bronze",
    "source_path": "/Volumes/ptd_dev/bronze/raw/gdelt_gkg/",
    "checkpoint_path": "/Volumes/ptd_dev/bronze/checkpoints/gdelt_gkg_raw",
}
for _widget_name, _widget_default in _WIDGET_DEFAULTS.items():
    dbutils.widgets.text(_widget_name, _widget_default)

# Read the effective values back (dict order matches the unpacking order).
catalog, schema_bronze, source_path, ckpt_path = (
    dbutils.widgets.get(_widget_name) for _widget_name in _WIDGET_DEFAULTS
)

# Make the target catalog and schema the session defaults.
for _use_stmt in (f"USE CATALOG {catalog}", f"USE SCHEMA {schema_bronze}"):
    spark.sql(_use_stmt)

from pyspark.sql import functions as F

# Incremental ingestion of the drop folder with Auto Loader. No explicit schema
# is supplied: column names come from the CSV header row and column types are
# inferred, so the stream tolerates variations between uploaded files.
# NOTE(review): raw GDELT GKG v2.1 exports are tab-delimited with no header row;
# this reader assumes the uploaded files are comma-separated with a header --
# confirm, or set `sep` / `header` to match what is actually uploaded.
df = (spark.readStream
      .format("cloudFiles")
      .option("cloudFiles.format", "csv")
      # Store the inferred schema alongside the checkpoint so that it follows the
      # `checkpoint_path` widget (and therefore the catalog/environment) instead
      # of a hard-coded DBFS mount shared by every run.
      .option("cloudFiles.schemaLocation", ckpt_path)
      .option("cloudFiles.inferColumnTypes", "true")
      .option("header", "true")
      # The previous `cleanSource=archive` option was removed: it belongs to the
      # plain Spark file source (and also needs `sourceArchiveDir`), not to Auto
      # Loader. NOTE(review): to clean up ingested files, use the Auto Loader
      # `cloudFiles.cleanSource` options on a runtime that supports them.
      .option("cloudFiles.includeExistingFiles", "true")
      .load(source_path))

# Trim stray whitespace from the header-derived column names.
# Typical fields: "DATE", "DocumentIdentifier", "SourceCommonName", "V2Persons",
# "V2Organizations", "V2Tone", "V2Themes", "V2Locations", "V2AllNames".
for _raw_name in df.columns:
    _trimmed_name = _raw_name.strip()
    if _trimmed_name != _raw_name:
        df = df.withColumnRenamed(_raw_name, _trimmed_name)

# DATE is parsed as a yyyyMMddHHmmss timestamp and truncated to a calendar date.
_gkg_timestamp = F.to_timestamp(F.col("DATE").cast("string"), "yyyyMMddHHmmss")

# Add the ingestion audit timestamp and the derived record date.
df = df.withColumn("ingestion_ts", F.current_timestamp())
df = df.withColumn("date_gkg", F.to_date(_gkg_timestamp))

# Fully qualified target table. The schema comes from the `schema_bronze`
# widget rather than a hard-coded "bronze", so it matches the USE SCHEMA above.
target_table = f"{catalog}.{schema_bronze}.gdelt_gkg_raw"

# Create the partitioned Delta table up front, using the stream's own schema.
# The previous CTAS (`SELECT NULL as DATE ... PARTITIONED BY (date_gkg)`) could
# not succeed: the partition column was not part of the SELECT output and the
# only column it defined was an untyped NULL.
columns_ddl = ",\n  ".join(
    "`{}` {}".format(field.name.replace("`", "``"), field.dataType.simpleString())
    for field in df.schema.fields
)
spark.sql(f"""
CREATE TABLE IF NOT EXISTS {target_table} (
  {columns_ddl}
)
USING DELTA
PARTITIONED BY (date_gkg)
""")

# Start the append-only stream into the table. `mergeSchema` lets columns that
# Auto Loader adds through schema evolution be appended to the Delta table.
query = (df.writeStream
    .format("delta")
    .option("checkpointLocation", ckpt_path)
    .option("mergeSchema", "true")
    .outputMode("append")
    .toTable(target_table)
)

# The stream keeps running in the background; `query` is kept as a handle for
# `query.status`, `query.stop()` or a blocking `query.awaitTermination()`.
# The former `awaitTermination(False)` raised a ValueError, because the timeout
# must be a positive number and `False` evaluates to 0.