In [0]:
%pip install yfinance

In [0]:
import yfinance as yf
from pyspark.sql import Row
from pyspark.sql.functions import current_date

In [0]:
# Companies to ingest, one tuple per company:
#   (company name, company number, ticker symbol passed to yf.Ticker)
# The cells below unpack each tuple as (name, number, ticker).
# NOTE(review): the company numbers look like UK Companies House registration
# numbers — confirm against the source of this list.
# NOTE(review): "ARM LIMITED" and "ARM HOLDINGS PLC" both use the ticker "ARM",
# so the same yfinance data is fetched twice and stored under two different
# company numbers — confirm this is intended.
companies = [
    ("ARM LIMITED","02557590","ARM"),
    ("HSBC HOLDINGS PLC","00617987","HSBA.L"),
    ("ASTRAZENECA PLC","02723534","AZN.L"),
    ("SHELL PLC","04366849","SHEL.L"),
    ("LINDE","BR025083","LIN"),
    ("RIO TINTO PLC","00719885","RIO.L"),
    ("UNILEVER PLC","00041424","ULVR.L"),
    ("ROLLS-ROYCE HOLDINGS PLC","07524813","RR.L"),
    ("BRITISH AMERICAN TOBACCO P.L.C.","03407696","BATS.L"),
    ("ARM HOLDINGS PLC","11299879","ARM"),
    ("GSK PLC","03888792","GSK.L"),
    ("BP P.L.C.","00102498","BP.L"),
    ("BARCLAYS PLC","00048839","BARC.L"),
    ("LLOYDS BANKING GROUP PLC","SC095000","LLOY.L"),
    ("NATIONAL GRID PLC","04031152","NG.L"),
    ("BAE SYSTEMS PLC","01470151","BA.L"),
    ("AON GLOBAL LIMITED","07876075","AON"),
    ("NATWEST GROUP PLC","SC045551","NWG.L"),
    ("RELX PLC","00077536","REL.L"),
    ("STANDARD CHARTERED PLC","00966425","STAN.L"),
    ("LONDON STOCK EXCHANGE GROUP PLC","05369106","LSEG.L")
]

In [0]:
# Output column name -> key looked up in the yfinance `info` mapping.
# Insertion order here is the column order of the resulting Row.
info_key_by_column = {
    "symbol": "symbol",
    "short_name": "shortName",
    "long_name": "longName",
    "industry": "industry",
    "sector": "sector",
    "country": "country",
    "exchange": "exchange",
    "market_cap": "marketCap",
    "website": "website",
}

# One Row per company: the identifiers from `companies` followed by the
# descriptive fields read from yfinance (None when `info` lacks the key).
metadata_rows = []

for name, number, ticker in companies:
    stock = yf.Ticker(ticker)
    info = stock.info

    yf_fields = {
        column: info.get(info_key) for column, info_key in info_key_by_column.items()
    }
    metadata_rows.append(
        Row(company_name=name, company_number=number, ticker=ticker, **yf_fields)
    )

# Let Spark infer the schema from the rows, then stamp each row with the
# date of this ingestion run.
df_company_metadata = (
    spark.createDataFrame(metadata_rows)
    .withColumn("ingestion_date", current_date())
)

df_company_metadata.display()

In [0]:
# Write the company metadata as CSV, replacing whatever is already at this path.
(
    df_company_metadata.write
    .mode("overwrite")
    .csv("/Volumes/companies-data/bronze/ingestion_raw/yfinance/company_details/")
)

In [0]:
# Accumulates one Row for every row of each company's 1-year price history.
price_rows = []

for _, number, ticker in companies:
    stock = yf.Ticker(ticker)
    # reset_index() turns the date index into a regular "Date" column.
    history = stock.history(period="1y").reset_index()

    price_rows.extend(
        Row(
            company_number=number,
            ticker=ticker,
            # Stored as an ISO date string rather than a date object.
            date=str(bar["Date"].date()),
            open=float(bar["Open"]),
            high=float(bar["High"]),
            low=float(bar["Low"]),
            close=float(bar["Close"]),
            # Falls back to the close price when there is no "Adj Close" column.
            adj_close=float(bar.get("Adj Close", bar["Close"])),
            volume=int(bar["Volume"]),
        )
        for _, bar in history.iterrows()
    )

# Let Spark infer the schema, then stamp each row with the ingestion date.
df_prices = (
    spark.createDataFrame(price_rows)
    .withColumn("ingestion_date", current_date())
)

df_prices.display()

In [0]:
# Write the price history as CSV, replacing whatever is already at this path.
(
    df_prices.write
    .mode("overwrite")
    .csv("/Volumes/companies-data/bronze/ingestion_raw/yfinance/trading_data/")
)

In [0]:
import yfinance as yf
import pandas as pd
from pyspark.sql import Row
from pyspark.sql.functions import current_date
from pyspark.sql.types import (
    StructType, StructField, StringType, DoubleType, DateType
)

# -----------------------------
# Helper functions
# -----------------------------

def safe_get(df, field, q):
    """Read one cell of a statement frame as a float, or None if unavailable.

    Args:
        df: pandas DataFrame indexed by row label, or None.
        field: Row label to look up.
        q: Column label to look up.

    Returns:
        ``float(df.loc[field, q])``, or None when ``df`` is None or empty,
        the cell is NA, or the lookup/conversion raises any exception.
    """
    has_data = df is not None and not df.empty
    if not has_data:
        return None

    try:
        cell = df.loc[field, q]
        return None if pd.isna(cell) else float(cell)
    except Exception:
        # Best-effort lookup: a missing label or unconvertible value yields None.
        return None


def safe_get_any(df, fields, q):
    """Return the first non-None ``safe_get`` result among several row labels.

    Args:
        df: Frame passed through to ``safe_get``.
        fields: Iterable of candidate row labels, tried in order.
        q: Column label passed through to ``safe_get``.

    Returns:
        The value for the first label that yields a non-None result, or None
        when no label does.
    """
    # Lazy generator: lookups stop at the first label that produces a value.
    lookups = (safe_get(df, label, q) for label in fields)
    return next((value for value in lookups if value is not None), None)


# -----------------------------
# Build rows
# -----------------------------

# One Row per (company, reported quarter) holding selected income-statement,
# balance-sheet and cash-flow figures. A figure is None when none of its
# candidate row labels is present (or the value is NA) for that quarter.
fundamental_rows = []

for name, number, ticker in companies:
    stock = yf.Ticker(ticker)

    financials = stock.quarterly_financials
    balance = stock.quarterly_balance_sheet
    cashflow = stock.quarterly_cashflow

    # Union of the column labels across the three statements, so a quarter
    # reported in only one statement still produces a row.
    quarters = set()
    for df in [financials, balance, cashflow]:
        if df is not None and not df.empty:
            quarters.update(df.columns)

    for q in sorted(quarters):
        quarter_date = pd.to_datetime(q).date()

        # Cash flow components (used twice: stored directly and in free cash flow).
        # The first label is the legacy yfinance name; the second is the name
        # used by current yfinance releases. Without the second, these fields
        # are always None on an up-to-date install.
        ocf = safe_get_any(
            cashflow,
            ["Total Cash From Operating Activities", "Operating Cash Flow"],
            q
        )
        capex = safe_get_any(
            cashflow,
            ["Capital Expenditures", "Capital Expenditure"],
            q
        )

        row = {
            "company_name": str(name),
            "company_number": str(number),
            "ticker": str(ticker),
            "quarter_end_date": quarter_date,

            # ---- Income statement ----
            "total_revenue": safe_get_any(
                financials, ["Total Revenue"], q
            ),
            "gross_profit": safe_get_any(
                financials, ["Gross Profit"], q
            ),
            "operating_income": safe_get_any(
                financials, ["Operating Income", "Operating Income or Loss"], q
            ),
            "net_income": safe_get_any(
                financials, ["Net Income"], q
            ),
            "ebitda": safe_get_any(
                financials, ["EBITDA"], q
            ),

            # ---- Balance sheet ----
            "total_assets": safe_get_any(
                balance, ["Total Assets"], q
            ),
            # Legacy label first, then the current yfinance label.
            "total_liabilities": safe_get_any(
                balance,
                ["Total Liab", "Total Liabilities Net Minority Interest"],
                q
            ),
            "cash": safe_get_any(
                balance, ["Cash And Cash Equivalents", "Cash"], q
            ),
            "long_term_debt": safe_get_any(
                balance,
                ["Long Term Debt", "Long Term Debt And Capital Lease Obligation"],
                q
            ),

            # ---- Cash flow ----
            "operating_cash_flow": ocf,
            "capital_expenditure": capex,
            # abs() makes the subtraction independent of the sign capex is
            # reported with; None if either component is missing.
            "free_cash_flow": (
                ocf - abs(capex) if ocf is not None and capex is not None else None
            ),
        }

        fundamental_rows.append(Row(**row))


# -----------------------------
# Spark schema (EXPLICIT)
# -----------------------------

# Explicit schema for the quarterly fundamentals rows. Every field is nullable.
# Field order: string identifiers, the quarter end date, then the double metrics.
_identifier_columns = ["company_name", "company_number", "ticker"]

_metric_columns = [
    # Income statement
    "total_revenue",
    "gross_profit",
    "operating_income",
    "net_income",
    "ebitda",
    # Balance sheet
    "total_assets",
    "total_liabilities",
    "cash",
    "long_term_debt",
    # Cash flow
    "operating_cash_flow",
    "capital_expenditure",
    "free_cash_flow",
]

schema = StructType(
    [StructField(column, StringType(), True) for column in _identifier_columns]
    + [StructField("quarter_end_date", DateType(), True)]
    + [StructField(column, DoubleType(), True) for column in _metric_columns]
)


# -----------------------------
# Create DataFrame
# -----------------------------

# Build the Spark DataFrame using the explicit schema (no type inference) and
# stamp every row with the date of this ingestion run.
df_quarterly_fundamentals = (
    spark.createDataFrame(fundamental_rows, schema=schema)
         .withColumn("ingestion_date", current_date())
)

# Preview the first 10 rows. limit() has to come before display(): display()
# renders the frame and returns None, so chaining .limit() after it raises
# AttributeError and stops the notebook before the write cell runs.
df_quarterly_fundamentals.limit(10).display()


In [0]:
# Write the quarterly fundamentals as CSV, replacing whatever is already at this path.
(
    df_quarterly_fundamentals.write
    .mode("overwrite")
    .csv("/Volumes/companies-data/bronze/ingestion_raw/yfinance/fundamentals_data/")
)

In [0]:
# Render a 10-row preview of the quarterly fundamentals.
fundamentals_preview = df_quarterly_fundamentals.limit(10)
display(fundamentals_preview)