In [2]:
import pandas as pd
import yfinance as yf
from datetime import datetime, timedelta
from pathlib import Path
from pandas.api.types import DatetimeTZDtype
import unicodedata, re

# ========================
# Path configuration
# ========================
# Input: the yearly index table. Output: one CSV per country under `out_dir`.
INDICES_PATH = "/Users/marcomendieta/Documents/TFM/Data_stock/stockindex.csv"
out_dir = Path("/Users/marcomendieta/Documents/TFM/Data_stock/StockData_csv")
out_dir.mkdir(exist_ok=True, parents=True)

# Date window as ISO "YYYY-MM-DD" strings; the window ends yesterday.
start_date = "2020-01-01"
end_date = (datetime.today() - timedelta(days=1)).date().isoformat()

# When True, "Adj Close" is added to the output columns.
INCLUDE_ADJ_CLOSE = False

# ========================
# Utilidades
# ========================
def _norm(s: str) -> str:
    s = "".join(c for c in unicodedata.normalize("NFKD", str(s)) if not unicodedata.combining(c))
    s = re.sub(r"[^a-z0-9]+", "_", s.lower()).strip("_")
    return s

def _pick_col(df: pd.DataFrame, candidates) -> str:
    for c in candidates:
        if c in df.columns:
            return c
    raise KeyError(f"No se encontró ninguna de: {candidates}. Disponibles: {list(df.columns)}")

def sanitize_index_symbol(sym: str) -> str:
    """Strip an exchange suffix from a caret-prefixed symbol.

    A symbol that starts with ``^`` and contains a ``.`` (e.g. ``^XXX.PA``)
    is cut at the first dot. Any other non-empty string is returned with
    surrounding whitespace removed. Non-string or empty input is returned
    unchanged.
    """
    if isinstance(sym, str) and sym:
        cleaned = sym.strip()
        has_suffix = cleaned.startswith("^") and "." in cleaned
        return cleaned.partition(".")[0] if has_suffix else cleaned
    return sym

# ========================
# Load and normalize the yearly index table
# ========================
df = pd.read_csv(INDICES_PATH, low_memory=False)
# Normalize headers with _norm so the lookups below do not depend on the
# original casing, spacing or accents.
df = df.rename(columns={c: _norm(c) for c in df.columns})

# Expected columns (accented and unaccented header variants are accepted).
ticker_col = _pick_col(df, ["ticker"])
# _norm("Año") yields "ano", so it is listed next to the "anio" spelling.
year_col = _pick_col(df, ["year", "anio", "ano"])
country_col = _pick_col(df, ["country", "pais"])
# Optional column holding the symbol used for the download; None when the
# table has no such column (the base ticker is used instead).
ticket_col = next(
    (
        cand
        for cand in ["ticket", "ticker_yahoo", "yahoo_ticker", "ticker_en_yahoo_finance"]
        if cand in df.columns
    ),
    None,
)

# Basic cleaning
def _strip_keep_missing(col: pd.Series) -> pd.Series:
    """Return *col* as stripped text, leaving missing or blank cells as NaN.

    A plain ``astype(str)`` can turn NaN into the literal string "nan",
    which then survives ``dropna`` and is truthy, so missing values are
    masked back to NaN after the conversion.
    """
    text = col.astype(str).str.strip()
    return text.mask(col.isna() | (text == ""))


df[ticker_col] = _strip_keep_missing(df[ticker_col])
df[year_col] = pd.to_numeric(df[year_col], errors="coerce").astype("Int64")
df[country_col] = df[country_col].astype(str).str.strip()
if ticket_col:
    # A missing download symbol is stored as "" (falsy) so that the symbol
    # mapping built further down falls back to the base ticker.
    df[ticket_col] = _strip_keep_missing(df[ticket_col]).fillna("")

# Year window: keep rows with a ticker and a year inside [start_y, end_y].
start_y, end_y = pd.to_datetime(start_date).year, pd.to_datetime(end_date).year
df = df.dropna(subset=[ticker_col, year_col]).copy()
df = df[(df[year_col] >= start_y) & (df[year_col] <= end_y)]

# Build the lookups: base ticker -> sorted list of years, and
# (base ticker, year) -> symbol to download.
years_by_ticker = {
    ticker: sorted({int(value) for value in year_values.dropna()})
    for ticker, year_values in df.groupby(ticker_col)[year_col]
}

symbol_by_ticker_year = {}
country_by_ticker = {}

# Without a dedicated symbol column the base ticker doubles as the raw symbol.
raw_symbols = df[ticket_col] if ticket_col else df[ticker_col]
for base, year_value, raw, country in zip(
    df[ticker_col], df[year_col], raw_symbols, df[country_col]
):
    yr = int(year_value)
    # A falsy raw symbol falls back to the base ticker.
    symbol_by_ticker_year[(base, yr)] = sanitize_index_symbol(raw or base)
    # Last country seen for this ticker wins.
    country_by_ticker[base] = country

# Output columns; "Adj Close" sits between "Close" and "Volume" when enabled.
OUT_COLS = ["Ticker", "Year", "date", "Open", "High", "Low", "Close"]
if INCLUDE_ADJ_CLOSE:
    OUT_COLS.append("Adj Close")
OUT_COLS.append("Volume")

# ========================
# Download per country
# ========================
def _download_year(base: str, dl_symbol: str, year: int):
    """Fetch one calendar year of price history for *dl_symbol*.

    Args:
        base: ticker written to the "Ticker" column of the result.
        dl_symbol: symbol passed to ``yf.Ticker``.
        year: calendar year to request; also written to the "Year" column.

    Returns:
        A DataFrame restricted to the OUT_COLS columns that are available,
        sorted and de-duplicated by ("Ticker", "date"), or None when no
        usable rows came back.
    """
    hist = yf.Ticker(dl_symbol).history(
        start=f"{year}-01-01",
        end=f"{year + 1}-01-01",  # exclusive
        auto_adjust=False,
    )
    if hist is None or hist.empty:
        return None

    hist = hist.reset_index()
    date_col = next((c for c in ("Date", "Datetime") if c in hist.columns), None)
    if date_col is None:
        return None

    hist = hist.rename(columns={date_col: "date"})
    if isinstance(hist["date"].dtype, DatetimeTZDtype):
        # Drop the timezone but keep the local wall-clock time.
        # tz_convert(None) would convert to UTC first, which can move a
        # timestamp onto a different calendar date.
        hist["date"] = hist["date"].dt.tz_localize(None)
    hist["date"] = pd.to_datetime(hist["date"], errors="coerce")
    hist = hist.dropna(subset=["date"])
    if hist.empty:
        return None

    # Label the rows with the base ticker from the index table, then keep
    # only the output columns that exist, in OUT_COLS order.
    part = hist.assign(Ticker=base, Year=year)
    keep = [c for c in OUT_COLS if c in part.columns]
    return (
        part[keep]
        .sort_values(["Ticker", "Year", "date"])
        .drop_duplicates(subset=["Ticker", "date"], keep="last")
    )


for pais, sub in df.groupby(country_col):
    all_rows = []

    # Base tickers listed for this country
    tickers_base = sorted(set(sub[ticker_col].tolist()))

    for base in tickers_base:
        for y in years_by_ticker.get(base, []):
            dl_symbol = symbol_by_ticker_year.get((base, y), base)
            try:
                part = _download_year(base, dl_symbol, y)
            except Exception as exc:
                # Best effort: report the failure and move on to the next
                # ticker/year instead of silently dropping it.
                print(f"[warn] {base} ({dl_symbol}) {y}: {exc!r}")
                continue
            if part is not None:
                all_rows.append(part)

    # A CSV is written even when nothing was downloaded (header only).
    out = pd.concat(all_rows, ignore_index=True) if all_rows else pd.DataFrame(columns=OUT_COLS)
    if not out.empty:
        out = out.sort_values(["Ticker", "Year", "date"]).drop_duplicates(
            subset=["Ticker", "date"], keep="last"
        )

    out.to_csv(out_dir / f"stockindex_{pais.replace(' ', '_')}.csv", index=False)

print("ok")


$^OMXS30: possibly delisted; no price data found  (1d 2020-01-01 -> 2021-01-01)
$^OMXS30: possibly delisted; no price data found  (1d 2021-01-01 -> 2022-01-01)
$^OMXS30: possibly delisted; no price data found  (1d 2022-01-01 -> 2023-01-01)
$^OMXS30: possibly delisted; no price data found  (1d 2023-01-01 -> 2024-01-01)
$^OMXS30: possibly delisted; no price data found  (1d 2024-01-01 -> 2025-01-01)


ok
