In [0]:
import json
import re
from datetime import datetime
from urllib.parse import quote

import requests

# Catalog prefix used when building the three-part table names in this notebook.
CATALOG = "ptd_dev"
# GDELT DOC API endpoint that the request URLs are built on.
BASE = "https://api.gdeltproject.org/api/v2/doc/doc"

# DDL for the bronze target table: one row per (company_name, date, bin) with its count.
# IF NOT EXISTS keeps this cell safe to re-run.
create_tonechart_table_sql = f"""
CREATE TABLE IF NOT EXISTS {CATALOG}.bronze.gdelt_tonechart (
    company_name STRING,
    date STRING,
    bin LONG,
    count LONG
)
USING DELTA
"""
spark.sql(create_tonechart_table_sql)

In [0]:
def gdelt_gkg_url(company_alias: list[str], ts: str) -> str:
    """Build the GDELT DOC API URL that requests a tone chart for a company.

    Each alias is wrapped in double quotes (exact-phrase search). Several
    aliases are combined as ``("a" OR "b")``; a single alias is sent as ``"a"``.
    The query and ``ts`` values are percent-encoded so that characters such as
    ``&``, ``#`` or ``+`` inside an alias (e.g. "AT&T") cannot break or
    truncate the query string.

    Args:
        company_alias: Non-empty list of search phrases for the company.
        ts: Value sent as the ``ts`` query parameter.

    Returns:
        The full request URL with ``mode=tonechart`` and ``format=json``.

    Raises:
        ValueError: If ``company_alias`` is empty.
    """
    if not company_alias:
        raise ValueError("company_alias must contain at least one alias")

    phrases = [f'"{alias}"' for alias in company_alias]
    # Parentheses are only needed when several phrases are OR-ed together.
    query = f"({' OR '.join(phrases)})" if len(phrases) > 1 else phrases[0]

    # NOTE(review): the GDELT DOC 2.0 documentation appears to name the time
    # window parameter `timespan` (e.g. `24h`) — confirm that `ts` is honoured.
    return (
        f"{BASE}?query={quote(query, safe='')}"
        f"&ts={quote(ts, safe='')}&mode=tonechart&format=json"
    )

def _clean_tonechart_json(raw_text: str) -> str:
    """Strip the toparts/url/title fragments from a tonechart response body.

    The substitutions run in a fixed order and each one works on the output of
    the previous one; the result is what gets handed to ``json.loads``.
    Presumably this is needed because the article samples can make the raw
    payload unparseable — TODO confirm against real responses.
    """
    # Drop every `"toparts": [...]` array (non-greedy, up to the first `]`).
    text = re.sub(r',?\s*"toparts"\s*:\s*\[.*?\]', '', raw_text, flags=re.DOTALL)
    # Drop leftover `{"url": "...", "title": "..."}` objects.
    text = re.sub(r',?\s*\{\s*"url"\s*:\s*"[^"]*"\s*,\s*"title"\s*:\s*"[^"]*"\s*\}', '', text)
    # Collapse doubled commas left behind by the removals above.
    text = re.sub(r',\s*,', ',', text)
    # Remove a trailing comma in front of a closing bracket or brace.
    text = re.sub(r',\s*([\]\}])', r'\1', text)
    # After `"count": <digits>` discard everything up to the next `}` or `,`.
    text = re.sub(r'("count"\s*:\s*\d+)[^},]*', r'\1', text)
    # Where a count is directly followed by `bin`, re-insert the `},{"` separator.
    text = re.sub(r'("count"\s*:\s*\d+)\s*bin', r'\1},{"bin', text)
    # Remove any remaining stand-alone "title" / "url" pairs.
    text = re.sub(r',?\s*"title"\s*:\s*"[^"]*"', '', text)
    text = re.sub(r',?\s*"url"\s*:\s*"[^"]*"', '', text)
    return text


# rows accumulates the records of ALL companies; it must not be reset per company.
rows, downloaded, skipped, errors = [], 0, 0, []

today = datetime.now().strftime("%Y-%m-%d")
target_table = f"{CATALOG}.bronze.gdelt_tonechart"

# Companies that already have rows for today are skipped so that re-running
# the notebook does not append duplicates. The lookup is done once, with a SQL
# string condition: DataFrame.filter only accepts a Column or a SQL string,
# not a Python bool.
already_loaded = set()
if spark.catalog.tableExists(target_table):
    already_loaded = {
        loaded.company_name
        for loaded in (
            spark.table(target_table)
            .filter(f"`date` = '{today}'")
            .select("company_name")
            .distinct()
            .collect()
        )
    }

companies = spark.table(f"{CATALOG}.bronze.universe_sp100_snapshot").filter("active_flag = 1").collect()

for c in companies:
    if c.company_name in already_loaded:
        skipped += 1
        continue

    try:
        # Aliases are stored as one string separated by " - "; split() returns
        # a one-element list when the separator is absent.
        aliases = c.company_alias.split(" - ")
        url = gdelt_gkg_url(aliases, "24hs")

        # A timeout keeps one unresponsive request from hanging the whole run.
        response = requests.get(url, timeout=30)
        response.raise_for_status()

        # Kept as a top-level variable so it can be inspected after the loop.
        cleaned_text = _clean_tonechart_json(response.text)
        data = json.loads(cleaned_text)

        # Build the company's records first so that a failure part-way through
        # never leaves a partial set in `rows`.
        company_rows = [
            {
                "company_name": c.company_name,
                "date": today,
                "bin": bin_info.get('bin'),
                "count": bin_info.get('count'),
            }
            for bin_info in data.get('tonechart', [])
        ]
        rows.extend(company_rows)
        downloaded += 1

    except Exception as e:
        # Best effort: record the failure and continue with the next company.
        # Failures are counted through `errors`; `skipped` only counts
        # companies that were already loaded for today.
        errors.append((c.company_name, str(e)))


In [0]:
# Inspection cell: shows the (company_name, error message) tuples collected by
# the download loop.
# NOTE(review): this cell and the `cleaned_text` cell below are repeated several
# times with identical content; the repeats look like leftover debugging and can
# probably be deleted.
errors

In [0]:
# Inspection cell: shows the cleaned response text left in `cleaned_text` by the
# last loop iteration that reached the cleaning step. The name is undefined if
# no iteration got that far.
cleaned_text

In [0]:
cleaned_text

In [0]:
cleaned_text

In [0]:
cleaned_text

In [0]:
cleaned_text

In [0]:
errors

In [0]:
errors

In [0]:
errors

In [0]:
errors

In [0]:
# Append the collected tone-chart records to the bronze table; nothing is
# written when the loop produced no rows.
if rows:
    df = spark.createDataFrame(rows)
    df.display()
    df.write.mode("append").saveAsTable(f"{CATALOG}.bronze.gdelt_tonechart")

print(f"Descargados: {downloaded} | Saltados: {skipped} | Errores: {len(errors)}")
if errors:
    # `errors` holds (company_name, message) tuples, so the first column is the
    # company name rather than a timestamp.
    display(spark.createDataFrame(errors, schema="company_name string, error string"))