### Balance general

In [None]:
import re
import time
import pandas as pd
import yfinance as yf
from datetime import datetime
from difflib import get_close_matches

# First and last year to export; the upper bound is the current calendar year.
MIN_YEAR = 2020
MAX_YEAR = datetime.now().year
# Ticker list. sep=None with the python engine lets pandas sniff the delimiter.
df_tickers = pd.read_csv("/Users/marcomendieta/Documents/TFM/Data_stock/Tickets_componentes_indices.csv", sep=None, engine="python")
# Destination CSV for the assembled balance-sheet table.
outfile = "/Users/marcomendieta/Documents/TFM/Data_stock/Balance/balances_generales_prueba.csv"

# Balance-sheet line items to extract. Each name becomes one output column and
# is matched (after normalisation) against the row labels of the downloaded
# statement. The grouping comments below are the author's own classification.
balance_general_keys = [
# ASSETS
# short-term
    "CashAndCashEquivalents",
    "CashFinancial",
    "CashCashEquivalentsAndShortTermInvestments",
    "TradingSecurities",
    "OtherShortTermInvestments",
    "AccountsReceivable",
    "OtherReceivables",
    "LoansHeldForSale",
    "OtherCurrentAssets",
# long-term
    "LongTermInvestments",
    "InvestmentsAndAdvances",
    "Properties",
    "MachineryFurnitureEquipment",
    "ConstructionInProgress",
    "OtherIntangibleAssets",
    "Goodwill",
    "GoodwillAndOtherIntangibleAssets",
    "OtherNonCurrentAssets",
    "DeferredTaxAssets",
    "NetTangibleAssets",
    "TotalAssets",

# LIABILITIES
# short-term
    "AccountsPayable",
    "CurrentNotesPayable",
    "CurrentDebt",
    "CurrentDebtAndCapitalLeaseObligation",
    "CurrentAccruedExpenses",
    "PayablesAndAccruedExpenses",
    "IncomeTaxPayable",
    "TaxesPayable",
    "SecuritiesSoldUnderRepurchaseAgreements",
    "FederalFundsPurchasedAndSecuritiesSoldUnderRepurchaseAgreements",
    "LiabilitiesOfDiscontinuedOperations",
# long-term
    "LongTermDebt",
    "LongTermDebtAndCapitalLeaseObligation",
    "NonCurrentDeferredLiabilities",
    "NonCurrentDeferredTaxesLiabilities",
# liability totals
    "TotalDebt",
    "NetDebt",
    "TotalLiabilitiesNetMinorityInterest",

    # EQUITY
    "StockholdersEquity",
    "ShareIssued",
    "CommonStock",
    "CapitalStock",
    "CommonStockEquity",
    "OrdinarySharesNumber",
    "TreasuryStock",
    "AdditionalPaidInCapital",
    "InvestedCapital",
    "TotalCapitalization",
    "RetainedEarnings",
    "UnrealizedGainLoss",
    "GainsLossesNotAffectingRetainedEarnings",
    "ForeignCurrencyTranslationAdjustments",
    "OtherEquityAdjustments",
    "TotalEquityGrossMinorityInterest",
    "RevaluationReserve",
    "TangibleBookValue",
]

def norm(s: str) -> str:
    """Return *s* lower-cased with every character outside ``a-z0-9`` removed.

    The argument is passed through ``str()`` first, so non-string labels are
    accepted as well.
    """
    lowered = str(s).lower()
    return re.sub(r'[^a-z0-9]', '', lowered)

def best_match(target_norm: str, candidates_norm: dict) -> str | None:
    if target_norm in candidates_norm:
        return target_norm
    m = get_close_matches(target_norm, list(candidates_norm.keys()), n=1, cutoff=0.84)
    if m:
        return m[0]
    for c in candidates_norm:
        if target_norm in c or c in target_norm:
            return c
    return None

def to_millions(x):
    """Convert *x* to a float expressed in millions, rounded to two decimals.

    Returns ``None`` when *x* cannot be converted with ``float()``.
    """
    try:
        amount = float(x)
    except (TypeError, ValueError):
        return None
    return round(amount / 1_000_000, 2)

def col_year(col) -> int | None:
    try:
        if hasattr(col, "year"):
            return int(col.year)
        s = str(col)
        m = re.match(r'^\D*(\d{4})', s)
        if m:
            return int(m.group(1))
    except Exception:
        pass
    return None

def pick_col_for_year(df: pd.DataFrame, year: int):
    """Pick the column of *df* that belongs to *year*.

    Columns are filtered with ``col_year``. When several remain they are
    ordered newest-first by ``pd.to_datetime`` and the first one is returned;
    if that ordering fails, the first matching column in frame order is used.
    Returns ``None`` for a missing or empty frame, or when no column matches.
    """
    if df is None or df.empty:
        return None

    same_year = [column for column in df.columns if col_year(column) == year]
    if not same_year:
        return None

    def as_timestamp(column):
        return pd.to_datetime(column, errors="coerce")

    try:
        newest_first = sorted(same_year, key=as_timestamp, reverse=True)
    except Exception:
        return same_year[0]
    return newest_first[0]

# Ticker symbols from the "ticker_market" column: missing entries dropped,
# values stringified and stripped of surrounding whitespace, duplicates removed.
ticker_symbols = df_tickers["ticker_market"].dropna().astype(str)
tickers = ticker_symbols.str.strip().unique().tolist()

target_years = list(range(MIN_YEAR, MAX_YEAR + 1))
rows = []

_MAX_ATTEMPTS = 3        # download attempts per ticker when rate-limited
_RATE_LIMIT_PAUSE_S = 8  # pause after a rate-limit error, in seconds


def _empty_rows(ticker: str) -> list[dict]:
    """Return one placeholder row per target year with every key set to None."""
    return [
        {"Ticker": ticker, "Year": yr, **dict.fromkeys(balance_general_keys)}
        for yr in target_years
    ]


def _labels_by_key(frame) -> dict:
    """Map every requested key to its row label in *frame* (None if absent).

    The fuzzy matching does not depend on the year, so it is resolved once
    per statement instead of once per year.
    """
    if frame is None or frame.empty:
        return dict.fromkeys(balance_general_keys)
    norm_index_map = {norm(idx): idx for idx in frame.index}
    resolved = {}
    for key in balance_general_keys:
        match_norm = best_match(norm(key), norm_index_map)
        resolved[key] = None if match_norm is None else norm_index_map[match_norm]
    return resolved


def _rows_for_ticker(ticker: str) -> list[dict]:
    """Download both statements for *ticker* and build one row per target year.

    For each year the annual statement is preferred and the quarterly one is
    the fallback. Labels are resolved against the statement actually used, so
    a quarterly look-up never relies on labels taken from the annual index.
    """
    t = yf.Ticker(ticker)
    bs_annual = t.balance_sheet
    bs_quarter = t.quarterly_balance_sheet
    sources = (
        (bs_annual, _labels_by_key(bs_annual)),
        (bs_quarter, _labels_by_key(bs_quarter)),
    )

    ticker_rows = []
    for yr in target_years:
        frame = labels = col = None
        for candidate, candidate_labels in sources:
            col = pick_col_for_year(candidate, yr)
            if col is not None:
                frame, labels = candidate, candidate_labels
                break

        data_balance = {"Ticker": ticker, "Year": yr}
        for key in balance_general_keys:
            value = None
            label = labels[key] if labels is not None else None
            if label is not None:
                try:
                    value = to_millions(frame.loc[label, col])
                except (KeyError, TypeError, ValueError):
                    value = None
            data_balance[key] = value
        ticker_rows.append(data_balance)
    return ticker_rows


for ti in tickers:
    # Rows are collected per ticker and only added to `rows` once, so a
    # failure or retry can never leave duplicate (Ticker, Year) entries.
    ticker_rows = None
    for _attempt in range(_MAX_ATTEMPTS):
        try:
            ticker_rows = _rows_for_ticker(ti)
            break
        except Exception as e:  # boundary around the network call: report and continue
            msg = str(e).lower()
            if "too many requests" not in msg and "rate limit" not in msg:
                print(f"[warn] {ti}: {e!r}")
                break
            time.sleep(_RATE_LIMIT_PAUSE_S)
    if ticker_rows is None:
        print(f"[warn] {ti}: no data retrieved, writing empty rows")
        ticker_rows = _empty_rows(ti)
    rows.extend(ticker_rows)
    time.sleep(0.5)

# Passing `columns=` fixes the column order, creates any key missing from the
# rows, and keeps the frame well-formed even when `rows` is empty (selecting
# the columns afterwards raised KeyError in that case).
cols = ["Ticker", "Year"] + balance_general_keys
df_balance_general = pd.DataFrame(rows, columns=cols).sort_values(["Ticker", "Year"])

df_balance_general.to_csv(outfile, index=False)
print('ok')

In [1]:
import re
import time
import pandas as pd
import yfinance as yf
from datetime import datetime
from difflib import get_close_matches

# First and last year to export; the upper bound is the current calendar year.
MIN_YEAR = 2020
MAX_YEAR = datetime.now().year
# Ticker list. sep=None with the python engine lets pandas sniff the delimiter.
df_tickers = pd.read_csv("/Users/marcomendieta/Documents/TFM/Data_stock/Tickets.csv", sep=None, engine="python")
# Destination CSV for the assembled balance-sheet table.
outfile = "/Users/marcomendieta/Documents/TFM/Data_stock/Balance/balances_generales.csv"

def norm(s: str) -> str:
    """Return *s* lower-cased with every character outside ``a-z0-9`` removed.

    The argument is passed through ``str()`` first, so non-string labels are
    accepted as well.
    """
    lowered = str(s).lower()
    return re.sub(r'[^a-z0-9]', '', lowered)

def best_match(target_norm: str, candidates_norm: dict) -> str | None:
    if target_norm in candidates_norm:
        return target_norm
    m = get_close_matches(target_norm, list(candidates_norm.keys()), n=1, cutoff=0.84)
    if m:
        return m[0]
    for c in candidates_norm:
        if target_norm in c or c in target_norm:
            return c
    return None

def to_millions(x):
    """Convert *x* to a float expressed in millions, rounded to two decimals.

    Returns ``None`` when *x* cannot be converted with ``float()``.
    """
    try:
        amount = float(x)
    except (TypeError, ValueError):
        return None
    return round(amount / 1_000_000, 2)

def col_year(col) -> int | None:
    try:
        if hasattr(col, "year"):
            return int(col.year)
        s = str(col)
        m = re.match(r'^\D*(\d{4})', s)
        if m:
            return int(m.group(1))
    except Exception:
        pass
    return None

def pick_col_for_year(df: pd.DataFrame, year: int):
    """Pick the column of *df* that belongs to *year*.

    Columns are filtered with ``col_year``. When several remain they are
    ordered newest-first by ``pd.to_datetime`` and the first one is returned;
    if that ordering fails, the first matching column in frame order is used.
    Returns ``None`` for a missing or empty frame, or when no column matches.
    """
    if df is None or df.empty:
        return None

    same_year = [column for column in df.columns if col_year(column) == year]
    if not same_year:
        return None

    def as_timestamp(column):
        return pd.to_datetime(column, errors="coerce")

    try:
        newest_first = sorted(same_year, key=as_timestamp, reverse=True)
    except Exception:
        return same_year[0]
    return newest_first[0]

# Ticker symbols from the "ticker_market" column: missing entries dropped,
# values stringified and stripped of surrounding whitespace, duplicates removed.
ticker_symbols = df_tickers["ticker_market"].dropna().astype(str)
tickers = ticker_symbols.str.strip().unique().tolist()

# Balance-sheet line items to extract. Each name becomes one output column and
# is matched (after normalisation) against the row labels of the downloaded
# statement. The grouping comments below are the author's own classification.
balance_general_keys = [
# ASSETS
# short-term
    "CashAndCashEquivalents",
    "CashFinancial",
    "CashCashEquivalentsAndShortTermInvestments",
    "TradingSecurities",
    "OtherShortTermInvestments",
    "AccountsReceivable",
    "OtherReceivables",
    "LoansHeldForSale",
    "OtherCurrentAssets",
# long-term
    "LongTermInvestments",
    "InvestmentsAndAdvances",
    "Properties",
    "MachineryFurnitureEquipment",
    "ConstructionInProgress",
    "OtherIntangibleAssets",
    "Goodwill",
    "GoodwillAndOtherIntangibleAssets",
    "OtherNonCurrentAssets",
    "DeferredTaxAssets",
    "NetTangibleAssets",
    "TotalAssets",

# LIABILITIES
# short-term
    "AccountsPayable",
    "CurrentNotesPayable",
    "CurrentDebt",
    "CurrentDebtAndCapitalLeaseObligation",
    "CurrentAccruedExpenses",
    "PayablesAndAccruedExpenses",
    "IncomeTaxPayable",
    "TaxesPayable",
    "SecuritiesSoldUnderRepurchaseAgreements",
    "FederalFundsPurchasedAndSecuritiesSoldUnderRepurchaseAgreements",
    "LiabilitiesOfDiscontinuedOperations",
# long-term
    "LongTermDebt",
    "LongTermDebtAndCapitalLeaseObligation",
    "NonCurrentDeferredLiabilities",
    "NonCurrentDeferredTaxesLiabilities",
# liability totals
    "TotalDebt",
    "NetDebt",
    "TotalLiabilitiesNetMinorityInterest",

    # EQUITY
    "StockholdersEquity",
    "ShareIssued",
    "CommonStock",
    "CapitalStock",
    "CommonStockEquity",
    "OrdinarySharesNumber",
    "TreasuryStock",
    "AdditionalPaidInCapital",
    "InvestedCapital",
    "TotalCapitalization",
    "RetainedEarnings",
    "UnrealizedGainLoss",
    "GainsLossesNotAffectingRetainedEarnings",
    "ForeignCurrencyTranslationAdjustments",
    "OtherEquityAdjustments",
    "TotalEquityGrossMinorityInterest",
    "RevaluationReserve",
    "TangibleBookValue",
]

target_years = list(range(MIN_YEAR, MAX_YEAR + 1))
rows = []

_MAX_ATTEMPTS = 3        # download attempts per ticker when rate-limited
_RATE_LIMIT_PAUSE_S = 8  # pause after a rate-limit error, in seconds


def _empty_rows(ticker: str) -> list[dict]:
    """Return one placeholder row per target year with every key set to None."""
    return [
        {"Ticker": ticker, "Year": yr, **dict.fromkeys(balance_general_keys)}
        for yr in target_years
    ]


def _labels_by_key(frame) -> dict:
    """Map every requested key to its row label in *frame* (None if absent).

    The fuzzy matching does not depend on the year, so it is resolved once
    per statement instead of once per year.
    """
    if frame is None or frame.empty:
        return dict.fromkeys(balance_general_keys)
    norm_index_map = {norm(idx): idx for idx in frame.index}
    resolved = {}
    for key in balance_general_keys:
        match_norm = best_match(norm(key), norm_index_map)
        resolved[key] = None if match_norm is None else norm_index_map[match_norm]
    return resolved


def _rows_for_ticker(ticker: str) -> list[dict]:
    """Download both statements for *ticker* and build one row per target year.

    For each year the annual statement is preferred and the quarterly one is
    the fallback. Labels are resolved against the statement actually used, so
    a quarterly look-up never relies on labels taken from the annual index.
    """
    t = yf.Ticker(ticker)
    bs_annual = t.balance_sheet
    bs_quarter = t.quarterly_balance_sheet
    sources = (
        (bs_annual, _labels_by_key(bs_annual)),
        (bs_quarter, _labels_by_key(bs_quarter)),
    )

    ticker_rows = []
    for yr in target_years:
        frame = labels = col = None
        for candidate, candidate_labels in sources:
            col = pick_col_for_year(candidate, yr)
            if col is not None:
                frame, labels = candidate, candidate_labels
                break

        data_balance = {"Ticker": ticker, "Year": yr}
        for key in balance_general_keys:
            value = None
            label = labels[key] if labels is not None else None
            if label is not None:
                try:
                    value = to_millions(frame.loc[label, col])
                except (KeyError, TypeError, ValueError):
                    value = None
            data_balance[key] = value
        ticker_rows.append(data_balance)
    return ticker_rows


for ti in tickers:
    # Rows are collected per ticker and only added to `rows` once, so a
    # failure or retry can never leave duplicate (Ticker, Year) entries.
    ticker_rows = None
    for _attempt in range(_MAX_ATTEMPTS):
        try:
            ticker_rows = _rows_for_ticker(ti)
            break
        except Exception as e:  # boundary around the network call: report and continue
            msg = str(e).lower()
            if "too many requests" not in msg and "rate limit" not in msg:
                print(f"[warn] {ti}: {e!r}")
                break
            time.sleep(_RATE_LIMIT_PAUSE_S)
    if ticker_rows is None:
        print(f"[warn] {ti}: no data retrieved, writing empty rows")
        ticker_rows = _empty_rows(ti)
    rows.extend(ticker_rows)
    time.sleep(0.5)

# Passing `columns=` fixes the column order, creates any key missing from the
# rows, and keeps the frame well-formed even when `rows` is empty (selecting
# the columns afterwards raised KeyError in that case).
cols = ["Ticker", "Year"] + balance_general_keys
df_balance_general = pd.DataFrame(rows, columns=cols).sort_values(["Ticker", "Year"])

df_balance_general.to_csv(outfile, index=False)
print('ok')

ok
