In [0]:
# Databricks notebook source
# Requires: yfinance installed on the cluster
# Widgets: each (name, default) pair below is registered as a text widget.
for _widget_name, _widget_default in (
    ("catalog", "ptd_dev"),
    ("schema_bronze", "bronze"),
    ("extra_tickers", "SPY,^VIX"),  # comma-separated
    ("vendor_map_table", ""),       # optional, e.g. ptd_dev.bronze.ticker_vendor_map
):
    dbutils.widgets.text(_widget_name, _widget_default)

from datetime import timedelta, date
import datetime as dt
# Read the widget values first so that the incremental lookup below targets the
# configured catalog rather than a hard-coded one.
catalog         = dbutils.widgets.get("catalog")
schema_bronze   = dbutils.widgets.get("schema_bronze")
extra_tickers   = [t.strip().upper() for t in dbutils.widgets.get("extra_tickers").split(",") if t.strip()]
vendor_map_tbl  = dbutils.widgets.get("vendor_map_table").strip()

spark.sql(f"USE CATALOG {catalog}")
spark.sql(f"USE SCHEMA {schema_bronze}")

now_utc = dt.datetime.utcnow()
yesterday_utc = (now_utc - timedelta(days=1)).date()

# Incremental start: latest date already stored in prices_raw, if any.
# The lookup is best-effort: on the first run the table does not exist yet, so
# any failure falls back to the default one-day window instead of aborting.
row = None
maxd = None
try:
    row = spark.sql(f"select max(date) as maxd from {catalog}.bronze.prices_raw").collect()[0]
    maxd = row["maxd"]  # None when the table is empty
    # Normalise to a plain date so it can be compared with yesterday_utc.
    if isinstance(maxd, dt.datetime):
        maxd = maxd.date()
    elif isinstance(maxd, str):
        maxd = date.fromisoformat(maxd[:10])
except Exception as e:
    maxd = None
    print(f"No se pudo leer max(date) de {catalog}.bronze.prices_raw ({e}); se usa la ventana por defecto.")

if maxd is not None:
    # Re-download from the last stored day (the MERGE downstream makes this
    # idempotent), but never start later than yesterday: the window must stay
    # non-empty because end_date below is today and yfinance treats `end` as
    # exclusive.
    start_date = min(maxd, yesterday_utc).isoformat()
else:
    start_date = yesterday_utc.isoformat()

# end_date: if the job runs after market close use today; if it runs before,
# use yesterday instead.
end_date = now_utc.date().isoformat()  # or (now_utc.date() - timedelta(days=1)).isoformat()

In [0]:
%pip install yfinance pandas

In [0]:
from pyspark.sql import functions as F, types as T
import pandas as pd
import time

try:
    import yfinance as yf
except Exception as e:
    raise RuntimeError("Instalá yfinance en el cluster (Libraries o %pip install yfinance).") from e

# 1) Ticker list: the universe snapshot first, followed by any extra tickers
#    that are not already present (first occurrence wins, order preserved).
tickers_df = spark.sql(f"SELECT DISTINCT UPPER(ticker) AS ticker FROM {catalog}.bronze.universe_sp100_snapshot")
tickers = [universe_row["ticker"] for universe_row in tickers_df.collect()]
already_listed = set(tickers)
tickers += [extra for extra in dict.fromkeys(extra_tickers) if extra not in already_listed]

import sys
import os
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "../common")))
from vendor_mapping import to_yfinance_symbol
# Lookup table: internal ticker -> symbol returned by to_yfinance_symbol.
# SPY and ^VIX are always included, whether or not they are in `tickers`.
yf_symbols = dict((t, to_yfinance_symbol(t)) for t in tickers + ["SPY","^VIX"])

def to_yf(sym: str) -> str:
    """Return the mapped yfinance symbol for `sym`, or `sym` itself when it has no entry in `yf_symbols`."""
    if sym in yf_symbols:
        return yf_symbols[sym]
    return sym



In [0]:
# Expected output columns, in the order they are handed to Spark.
cols = ["ticker", "date", "open", "high", "low", "close", "adj_close", "volume", "source"]

# yfinance column labels -> output column names.
_YF_RENAMES = {"Date": "date", "Open": "open", "High": "high", "Low": "low",
               "Close": "close", "Adj Close": "adj_close", "Volume": "volume"}


def _normalize_history(hist, tk, yf_sym):
    """Reshape one yfinance download into a new frame with exactly the columns in `cols`.

    Args:
        hist: non-empty DataFrame returned by `yf.download` for a single symbol.
        tk: internal ticker stored in the `ticker` column.
        yf_sym: symbol that was requested; used to select its slice when the
            columns come back as a MultiIndex.

    Returns:
        A new DataFrame (the input is not mutated) with columns `cols`; any
        column yfinance did not provide is filled with NaN by the reindex.
    """
    if isinstance(hist.columns, pd.MultiIndex):
        hist = hist.xs(yf_sym, axis=1, level=1)
    # Chained, copy-returning calls instead of inplace edits: `xs` may return
    # a slice of the original frame, and mutating it in place is unsafe.
    return (
        hist.reset_index()
        .rename(columns=_YF_RENAMES)
        .assign(ticker=tk, source="yfinance")
        .reindex(columns=cols)
    )


rows = []
failures = []
for tk in tickers:
    yf_sym = to_yf(tk)
    try:
        hist = yf.download(yf_sym, start=start_date, end=end_date, progress=False, auto_adjust=False, interval="1d")
        if hist is None or hist.empty:
            failures.append((tk, "empty"))
        else:
            rows.append(_normalize_history(hist, tk, yf_sym))
    except Exception as e:
        # Best-effort per ticker: record the failure and keep going.
        failures.append((tk, str(e)))
    # Throttle after every request, including the ones that came back empty.
    time.sleep(0.2)

if not rows:
    raise RuntimeError("No se descargaron precios. Revisá conexión o yfinance.")

pdf = pd.concat(rows, ignore_index=True)
# Defensive: flatten any tuple column labels that survived the per-ticker step.
pdf.columns = [col[0] if isinstance(col, tuple) else col for col in pdf.columns]
# Drop rows without a date; .copy() so the assignment below writes to an
# independent frame rather than to a filtered view.
pdf = pdf[pdf["date"].notna()].copy()
if pdf.empty:
    raise RuntimeError("Las descargas no contienen filas con fecha válida.")
# Explicit ISO format (YYYY-MM-DD) instead of relying on how astype(str)
# happens to render datetimes; the column stays a string as before.
pdf["date"] = pd.to_datetime(pdf["date"]).dt.strftime("%Y-%m-%d")
# Timestamp.now(tz="UTC") is the non-deprecated equivalent of Timestamp.utcnow().
sdf = spark.createDataFrame(pdf.assign(ingestion_ts=pd.Timestamp.now(tz="UTC")))

In [0]:
# Make sure the target Delta table exists before merging into it.
_create_prices_raw_sql = f"""
CREATE TABLE IF NOT EXISTS {catalog}.bronze.prices_raw (
  ticker STRING,
  date   DATE,
  open   DOUBLE,
  high   DOUBLE,
  low    DOUBLE,
  close  DOUBLE,
  adj_close DOUBLE,
  volume   DOUBLE,
  source   STRING,
  ingestion_ts TIMESTAMP
)
USING DELTA
PARTITIONED BY (date)
"""
spark.sql(_create_prices_raw_sql)

# Expose the downloaded batch to SQL under a temporary view name.
sdf.createOrReplaceTempView("tmp_prices_in")

# Upsert keyed on (ticker, date): matching rows are overwritten, new rows are
# inserted. The string date from the batch is cast to DATE in the subquery.
_merge_prices_raw_sql = f"""
MERGE INTO {catalog}.bronze.prices_raw AS t
USING (
  SELECT DISTINCT ticker, CAST(date AS DATE) AS date, open, high, low, close, adj_close, volume, source, ingestion_ts
  FROM tmp_prices_in
) s
ON t.ticker = s.ticker AND t.date = s.date
WHEN MATCHED THEN UPDATE SET *
WHEN NOT MATCHED THEN INSERT *
"""
spark.sql(_merge_prices_raw_sql)

# Report how many tickers failed and, if any, show them with their error text.
failed_count = len(failures)
print(f"Descargas fallidas: {failed_count}")
if failed_count:
    display(spark.createDataFrame(failures, schema="ticker string, error string"))