# In [1]:
# pip install yfinance pandas
import pandas as pd
import yfinance as yf
from pathlib import Path
import time

# --- CONFIG ---
# Input CSV with the ticker symbols to process (read by _read_tickers).
CSV_IN = "/Users/marcomendieta/Documents/TFM/Data_stock/Tickets_componentes_indices.csv"
# Destination CSV written by main(); its parent directory is created if missing.
OUTPUT_CSV = "/Users/marcomendieta/Documents/TFM/Data_stock/estadodecuenta.csv"
FREQ = "annual"   # "annual" or "quarterly"
PAUSE_S = 0.5     # pause (seconds, passed to time.sleep) between requests to avoid rate limits


# Income-statement line items exported by this script.
# The list order is the order of the value columns in the output CSV (main()
# places them after "ticker" and "period_end"), and the same names are used as
# the keys of the all-None placeholder rows emitted when a ticker has no data
# or its download fails.
ES_LIST = [
    # Top line to operating result
    "Total Revenue",
    "Cost Of Revenue",
    "Gross Profit",
    "Research Development",
    "Selling General Administrative",
    "Operating Expense",
    "Other Operating Expenses",
    "Operating Income",
    "Total Other Income Expense Net",
    "Ebit",

    # Interest, tax, minority, net income
    "Interest Expense",
    "Income Before Tax",
    "Tax Provision",
    "Minority Interest",
    "Net Income From Continuing Ops",
    "Discontinued Operations",
    "Extraordinary Items",
    "Effect Of Accounting Charges",
    "Other Items",
    "Net Income",
    "Net Income Applicable To Common Shares",

    # Per-share and share counts
    "Basic EPS",
    "Diluted EPS",
    "Basic Average Shares",
    "Diluted Average Shares",

    # Common standardized extras
    "Normalized Income",
    "Depreciation",
    "Depreciation & Amortization",
    "Depreciation Depletion & Amortization",
    "Preferred Dividends",

    # Banking/insurance lines that Yahoo shows for those sectors
    "Net Interest Income",
    "Provision for Loan Losses",
    "Total Premiums Earned",
]

# Small synonym table: alternate row labels for some canonical ES_LIST names.
# _value_from_index tries the canonical name first and then each alternative
# in list order, so the previously supported aliases stay first and the newer
# labels are appended after them.
# NOTE(review): the appended labels ("... And ...", "Pretax Income", etc.) are
# the spellings believed to be used by recent yfinance income statements;
# confirm against `yf.Ticker(...).income_stmt.index` for the installed version.
SYNONYMS: dict[str, list[str]] = {
    "Selling General Administrative": [
        "Selling General & Administrative",
        "Selling General And Administration",
    ],
    "Net Income From Continuing Ops": [
        "Net Income From Continuing Operations",
        "Net Income Continuous Operations",
    ],
    "Research Development": [
        "Research & Development",
        "Research And Development",
    ],
    "Ebit": ["EBIT"],
    "Tax Provision": ["Provision for Income Taxes"],
    "Income Before Tax": ["Pretax Income"],
    "Minority Interest": ["Minority Interests"],
    "Net Income Applicable To Common Shares": ["Net Income Common Stockholders"],
}

def _read_tickers(csv_path: str) -> list[str]:
    """Load the unique ticker symbols from a CSV file.

    The symbol column is the first header found among the known spellings
    (``ticker``/``TICKER``/``Ticker``/``symbol``/``SYMBOL``/``Symbol``, in
    that priority); when none is present the first column is used.

    Args:
        csv_path: Path of the CSV file to read.

    Returns:
        Symbols as stripped strings, de-duplicated in order of first
        appearance, without empty entries or the literal "nan".
    """
    frame = pd.read_csv(csv_path)

    known_headers = ("ticker", "TICKER", "Ticker", "symbol", "SYMBOL", "Symbol")
    header = next((h for h in known_headers if h in frame.columns), None)
    if header is None:
        # No recognised header: fall back to the first column.
        header = frame.columns[0]

    cleaned = frame[header].astype(str).str.strip()
    symbols = cleaned.dropna().unique().tolist()

    # Discard blanks and stringified missing values.
    return [s for s in symbols if s and s.lower() != "nan"]

def _pick_latest_column(df: pd.DataFrame):
    """Return the column label that corresponds to the most recent date.

    Column labels are parsed as datetimes (unparseable ones are coerced to
    NaT). If at least one label parses, the label at the position of the
    maximum datetime is returned; otherwise, or if parsing fails, the first
    column label is returned.
    """
    labels = df.columns
    try:
        as_dates = pd.to_datetime(labels, errors="coerce")
        position = as_dates.argmax() if as_dates.notna().any() else None
        if position is not None:
            return labels[position]
    except Exception:
        # Best effort only: any parsing problem falls through to the default.
        pass
    # Fallback: first column.
    return labels[0]

def _value_from_index(df: pd.DataFrame, key: str, col) -> float | None:
    if key in df.index:
        return df.loc[key, col]
    # prueba sinónimos
    for alt in SYNONYMS.get(key, []):
        if alt in df.index:
            return df.loc[alt, col]
    return None

def fetch_income_for_ticker(ticker: str, freq: str = "annual") -> dict:
    """Fetch the most recent income-statement figures for one ticker.

    Args:
        ticker: Symbol passed to ``yf.Ticker``.
        freq: ``"quarterly"`` reads ``quarterly_income_stmt``; any other
            value reads the ``income_stmt`` attribute.

    Returns:
        A dict with ``"ticker"``, ``"period_end"`` (string form of the
        selected column label, or None when no statement is available) and
        one entry per name in ES_LIST. An entry is None when the line item
        is not found; otherwise it is the value converted with
        ``pd.to_numeric(..., errors="coerce")``.
    """
    t = yf.Ticker(ticker)
    df = t.quarterly_income_stmt if freq == "quarterly" else t.income_stmt

    if df is None or df.empty:
        return {"ticker": ticker, "period_end": None, **{k: None for k in ES_LIST}}

    # Work on a copy so the object held by the Ticker is not modified.
    df = df.copy()
    # Sometimes the frame arrives transposed; make sure line items are in the
    # index and periods are in the columns (date-like strings such as
    # "2023-12-31" in the index are taken as the sign of a transposed frame).
    if df.shape[0] < df.shape[1] and any(isinstance(c, str) and "-" in c for c in df.index):
        df = df.T
    # Pick the most recent period.
    latest_col = _pick_latest_column(df)

    out = {"ticker": ticker, "period_end": str(latest_col)}
    # Iterate ES_LIST so the keys match the placeholder rows and the CSV
    # columns built in main(); the previously referenced YF_KEYS is not
    # defined anywhere and raised NameError for every ticker with data.
    for key in ES_LIST:
        val = _value_from_index(df, key, latest_col)
        out[key] = pd.to_numeric(val, errors="coerce") if val is not None else None
    return out

def main():
    """Download the income statement of every ticker and save them as one CSV.

    Tickers come from CSV_IN. Each ticker yields one row; if fetching raises,
    the row holds None for every ES_LIST column and the exception text in an
    ``_error`` column. The script sleeps PAUSE_S seconds after each ticker.
    The table is written to OUTPUT_CSV (creating its parent directory when
    needed) and also returned.
    """
    records = []
    for symbol in _read_tickers(CSV_IN):
        try:
            record = fetch_income_for_ticker(symbol, FREQ)
        except Exception as exc:
            # Keep going: store a placeholder row together with the failure reason.
            record = {
                "ticker": symbol,
                "period_end": None,
                **dict.fromkeys(ES_LIST),
                "_error": str(exc),
            }
        records.append(record)
        time.sleep(PAUSE_S)

    table = pd.DataFrame(records)

    # Fixed column order; the error column is appended only if some row has it.
    ordered = ["ticker", "period_end", *ES_LIST]
    if "_error" in table.columns:
        ordered.append("_error")
    table = table.reindex(columns=ordered)

    Path(OUTPUT_CSV).parent.mkdir(parents=True, exist_ok=True)
    table.to_csv(OUTPUT_CSV, index=False)
    print(f"Guardado: {OUTPUT_CSV}")
    return table

# Run the download only when executed as a script (or notebook cell); the
# resulting DataFrame is kept in df_res for later inspection.
if __name__ == "__main__":
    df_res = main()


# Output: Guardado: /Users/marcomendieta/Documents/TFM/Data_stock/estadodecuenta.csv
