In [0]:
# Databricks notebook source
# Descarga GDELT GKG v2.1 (.gkg.csv.zip), descomprime y guarda como .csv.gz en landing.
# Luego, tu Auto Loader (bronze_gdelt_autoloader.py) toma esos .csv.gz y los carga a bronze.gdelt_gkg_raw.

import datetime as dt
from datetime import timedelta
# Default ingestion window: yesterday's UTC date through today's UTC date.
# datetime.utcnow() is deprecated; take an aware "now" and drop the tzinfo so
# the value stays a naive UTC datetime, exactly as before.
now_utc = dt.datetime.now(dt.timezone.utc).replace(tzinfo=None)
default_start = (now_utc - timedelta(days=1)).date().isoformat()
end_date_s = now_utc.date().isoformat()

# Resume from the newest record already loaded into bronze, when it can be read.
try:
    last_ts = spark.sql("select max(date_gkg) from ptd_dev.bronze.gdelt_gkg_raw").collect()[0][0]
except Exception as e:
    # Best-effort lookup (e.g. the table may not exist yet): fall back to the
    # default window, but say why instead of hiding the failure.
    print(f"No se pudo leer max(date_gkg) de bronze; se usa la ventana por defecto: {e}")
    last_ts = None

if last_ts is not None:
    # Back off one hour before truncating to a date so a batch right after
    # midnight still re-scans the previous day.
    # NOTE(review): assumes date_gkg comes back as a datetime/timestamp — confirm
    # the column type; a string or integer value would fail on the subtraction.
    start_date_s = (last_ts - timedelta(hours=1)).date().isoformat()
else:
    start_date_s = default_start

dbutils.widgets.text("only_missing", "true")      # when "true", skip files that already exist

import datetime as dt
from datetime import timezone, timedelta
import io, gzip, os
import requests
from zipfile import ZipFile

# Parse the ISO date strings computed earlier into naive datetimes; a date-only
# string parses to midnight of that day.
start_date = dt.datetime.fromisoformat(start_date_s)
end_date_s = end_date_s.strip()
if end_date_s:
    end_date = dt.datetime.fromisoformat(end_date_s)
else:
    # Empty end date: run up to the current UTC time.
    end_date = dt.datetime.now(dt.timezone.utc)
# Keep both bounds naive so they can be compared with each other.
end_date = end_date.replace(tzinfo=None)

# Tolerate surrounding whitespace and any letter case in the widget value.
only_missing = dbutils.widgets.get("only_missing").strip().lower() == "true"

# Landing directory, written with a "/dbfs" prefix as used by the rest of the
# notebook. dbutils.fs does NOT take that prefix: it resolves paths against
# dbfs:/, so passing this string as-is would address dbfs:/dbfs/Volumes/...
dbfs_landing = "/dbfs/Volumes/ptd_dev/bronze/raw/gdelt_gkg"

# Create the directory that the download loop moves files into
# (dbfs:/Volumes/ptd_dev/bronze/raw/gdelt_gkg) by swapping "/dbfs" for "dbfs:".
dbutils.fs.mkdirs("dbfs:" + dbfs_landing[len("/dbfs"):])

BASE = "http://data.gdeltproject.org/gdeltv2"

def iter_times_15min(start: dt.datetime, end: dt.datetime):
    """Yield each quarter-hour timestamp from ``start`` through ``end``.

    ``start`` is first rounded down to the previous :00/:15/:30/:45 mark
    (seconds and microseconds cleared). ``end`` is inclusive. Nothing is
    yielded when the rounded start is already past ``end``.
    """
    step = timedelta(minutes=15)
    floored_minute = start.minute - (start.minute % 15)
    slot = start.replace(minute=floored_minute, second=0, microsecond=0)
    while slot <= end:
        yield slot
        slot = slot + step

def gdelt_gkg_url(ts: dt.datetime, base: "str | None" = None) -> "tuple[str, str]":
    """Build the download URL of the GKG zip file for one timestamp.

    Args:
        ts: Timestamp of the file; formatted as ``YYYYMMDDHHMMSS``.
        base: Optional URL prefix. Defaults to the module-level ``BASE``.

    Returns:
        A ``(url, tsstr)`` pair: the full ``.gkg.csv.zip`` URL and the
        formatted timestamp string used in the file name.
    """
    root = BASE if base is None else base
    tsstr = ts.strftime("%Y%m%d%H%M%S")
    return f"{root}/{tsstr}.gkg.csv.zip", tsstr

# Run counters plus a list of (timestamp, reason) pairs for failed files.
downloaded, skipped, errors = 0, 0, []

# dbfs_landing is written with a "/dbfs" prefix, but Unity Catalog volumes are
# not reachable under /dbfs/Volumes: local file APIs see them at /Volumes/...
# and dbutils at dbfs:/Volumes/... Derive both forms from the one configured
# value so the "already landed?" check and the final move address the same file.
if dbfs_landing.startswith("/dbfs/"):
    volume_dir = dbfs_landing[len("/dbfs"):]
else:
    volume_dir = dbfs_landing

for ts in iter_times_15min(start_date, end_date):
    url, tsstr = gdelt_gkg_url(ts)
    gz_name = f"{tsstr}.gkg.csv.gz"
    out_gz = f"{volume_dir}/{gz_name}"

    # Skip files landed by a previous run when only_missing is enabled.
    if only_missing and os.path.exists(out_gz):
        skipped += 1
        continue

    try:
        r = requests.get(url, timeout=60)
        if r.status_code != 200:
            # Not every 15-minute slot has a file: a 404 is ignored, any
            # other status is recorded as an error.
            if r.status_code != 404:
                errors.append((tsstr, f"HTTP {r.status_code}"))
            continue

        with ZipFile(io.BytesIO(r.content)) as zf:
            # The archive is expected to hold one CSV; take the first .csv member.
            members = [n for n in zf.namelist() if n.lower().endswith(".csv")]
            if not members:
                errors.append((tsstr, "ZIP sin CSV"))
                continue

            with zf.open(members[0]) as csv_in:
                raw = csv_in.read()

        # Stage the gzip under /dbfs/tmp first, then move it into the landing
        # directory, so a half-written file never appears there.
        out_gz_tmp = f"/dbfs/tmp/{gz_name}"
        with gzip.open(out_gz_tmp, "wb") as gz_out:
            gz_out.write(raw)

        dbutils.fs.mv(f"dbfs:/tmp/{gz_name}", f"dbfs:{out_gz}")
        downloaded += 1
    except Exception as e:
        # Best-effort batch: record the failure and carry on with the next slot.
        errors.append((tsstr, str(e)))

print(f"Descargados: {downloaded} | Saltados: {skipped} | Errores: {len(errors)}")
if errors:
    display(spark.createDataFrame(errors, schema="ts string, error string"))