In [0]:
import requests
import re
import json
from datetime import datetime
import pytz
from pyspark.sql import functions as F
import urllib.parse

# Catalog prefix used in every table name this notebook reads or writes.
CATALOG = "ptd_dev"
# Base endpoint of the GDELT API; the query string is appended to it.
BASE = "https://api.gdeltproject.org/api/v2/doc/doc"

# Create the destination Delta table up front so later cells can always
# read from it and append to it.
tonechart_ddl = f"""
CREATE TABLE IF NOT EXISTS {CATALOG}.bronze.gdelt_tonechart (
    company_name STRING,
    date STRING,
    bin LONG,
    count LONG,
    ingestion_ts TIMESTAMP
)
USING DELTA
"""
spark.sql(tonechart_ddl)

# The "date" widget defaults to the current calendar day in US/Eastern.
eastern = pytz.timezone("US/Eastern")
today_eastern = datetime.now(eastern).strftime("%Y-%m-%d")
dbutils.widgets.text("date", today_eastern)

In [0]:
def cleaned_gdelt_json_response(response):
    """Extract the (bin, count) pairs from a GDELT tonechart response.

    The response body is scanned with a regular expression instead of being
    parsed as JSON, so only the ``"bin"`` / ``"count"`` pairs are picked up
    and everything else in the payload is ignored.

    Args:
        response: Object exposing the response body as a ``text`` attribute.

    Returns:
        A JSON string of the form
        ``{"tonechart": [{"bin": <int>, "count": <int>}, ...]}``; the list is
        empty when no pair is found.
    """
    # Matches the opening of each entry: {"bin": <signed int>, "count": <int>
    bin_count_pattern = r'\{\s*"bin"\s*:\s*(-?\d+),\s*"count"\s*:\s*(\d+)'
    histogram = [
        {'bin': int(raw_bin), 'count': int(raw_count)}
        for raw_bin, raw_count in re.findall(bin_count_pattern, response.text)
    ]
    return json.dumps({'tonechart': histogram})

def gdelt_api_url(company_alias: list[str], date: str) -> str:
    """Build the GDELT tonechart request URL for one company and one day.

    Args:
        company_alias: Search terms for the company. A single alias is sent
            as one quoted phrase; several aliases are quoted individually,
            joined with ``OR`` and wrapped in parentheses.
        date: Day to query, formatted ``YYYY-MM-DD``.

    Returns:
        The request URL covering 00:00:00 through 21:45:00 of ``date``, with
        ``mode=tonechart`` and ``format=json``.

    Raises:
        ValueError: If ``date`` does not match ``YYYY-MM-DD``.
        IndexError: If ``company_alias`` is empty.
    """
    day = datetime.strptime(date, "%Y-%m-%d")
    stamp_format = "%Y%m%d%H%M%S"
    window_start = day.replace(hour=0, minute=0, second=0).strftime(stamp_format)
    window_end = day.replace(hour=21, minute=45, second=0).strftime(stamp_format)

    if len(company_alias) <= 1:
        search_terms = f'"{company_alias[0]}"'
    else:
        quoted_aliases = [f'"{alias}"' for alias in company_alias]
        search_terms = "(" + " OR ".join(quoted_aliases) + ")"

    encoded_terms = urllib.parse.quote_plus(search_terms)
    return (
        f"{BASE}?query={encoded_terms}"
        f"&STARTDATETIME={window_start}&ENDDATETIME={window_end}"
        "&mode=tonechart&format=json"
    )

In [0]:
# Seconds to wait on GDELT for a single company. Without a timeout a stalled
# connection would block the whole run indefinitely.
REQUEST_TIMEOUT_SECONDS = 30

date = dbutils.widgets.get("date")

# rows:       tonechart records collected for the write cell
# downloaded: companies for which at least one record was fetched
# skipped:    companies already ingested, empty, or failed
# errors:     (company_name, message) pairs for empty or failed companies
rows, downloaded, skipped, errors = [], 0, 0, []

companies = spark.table(f"{CATALOG}.bronze.universe_sp100_snapshot").filter("active_flag = 1").collect()

# Find out once which companies already have rows for this date, instead of
# launching one Spark count job per company inside the loop. The table is not
# written to until a later cell, so the answer cannot change mid-loop.
already_ingested = {
    existing.company_name
    for existing in (
        spark.table(f"{CATALOG}.bronze.gdelt_tonechart")
        .filter(F.col("date") == date)
        .select("company_name")
        .distinct()
        .collect()
    )
}

for c in companies:
    if c.company_name in already_ingested:
        skipped += 1
        continue

    try:
        # Multiple aliases are stored as "A - B"; split() returns a
        # one-element list when the separator is absent.
        aliases = c.company_alias.split(" - ")
        url = gdelt_api_url(aliases, date)

        response = requests.get(url, timeout=REQUEST_TIMEOUT_SECONDS)
        response.raise_for_status()

        data = json.loads(cleaned_gdelt_json_response(response))
        company_rows = [
            {
                "company_name": c.company_name,
                "date": date,
                "bin": bin_info.get("bin"),
                "count": bin_info.get("count"),
            }
            for bin_info in data.get("tonechart", [])
        ]
    except Exception as e:
        # Best effort: record the failure and move on to the next company.
        errors.append((c.company_name, str(e)))
        skipped += 1
        continue

    # Decide on THIS company's rows, not on the shared accumulator; otherwise
    # every empty response after the first success is counted as a download.
    if company_rows:
        rows.extend(company_rows)
        downloaded += 1
    else:
        errors.append((c.company_name, "No data"))
        skipped += 1

In [0]:
# Append the collected records to the bronze table, then report the outcome.
if rows:
    # Pin column types to the table's DDL rather than relying on schema
    # inference from the dict values.
    df = spark.createDataFrame(
        rows, schema="company_name STRING, date STRING, bin LONG, count LONG"
    )
    df = df.withColumn("ingestion_ts", F.current_timestamp())
    df.write.mode("append").saveAsTable(f"{CATALOG}.bronze.gdelt_tonechart")

print(f"Descargados: {downloaded} | Saltados: {skipped} | Errores: {len(errors)}")
if errors:
    # Name the columns so the error table does not display as _1 / _2.
    display(spark.createDataFrame(errors, ["company_name", "error"]))