In [2]:
# Default value of the deployment stage parameter: an empty string.
# NOTE(review): presumably replaced by a JSON string when the notebook is
# run with parameters — confirm against the calling pipeline.
deployment_stage_parameters: str = ''

StatementMeta(, ad7d4642-6efa-4dc5-8e20-b7070f7e6107, 5, Finished, Available, Finished)

In [None]:
# Use json file saved in DEV lakehouse as workaround to get deployment stage as parameter.
# This cell parses the JSON text held in `deployment_stage_parameters` and
# exposes the deployment stage as `stage` for the cells below.

import json

# The notebook default for the parameter is an empty string, which is not
# valid JSON (json.loads("") raises JSONDecodeError). Treat an empty,
# whitespace-only or None parameter as "no configuration supplied".
raw_params = (deployment_stage_parameters or "").strip()
config_dict = json.loads(raw_params) if raw_params else {}

# `or {}` also covers an explicit `"environment": null` in the JSON.
env = config_dict.get("environment") or {}
default_params = {
    # Extract individual values safely; a missing "Stage" yields "".
    "stage": env.get("Stage", "")
}
stage = default_params["stage"]

StatementMeta(, , -1, Cancelled, , Cancelled)

In [5]:
import yfinance as yf 
import pandas as pd 
from pyspark.sql.functions import col

# Full list of tickers to load in test/prod
all_tickers = ["AAPL", "MSFT", "GOOGL", "AMZN", "TSLA"]

# Compare the stage case-insensitively: the previous exact checks
# ("DEV" vs "test") sent values such as "dev" or "TEST" to the prod branch.
stage_key = str(stage).strip().lower()

# Stage-specific configuration for ticker selection and date range
if stage_key == "dev":
    # Small subset for fast development
    tickers = ["AAPL", "MSFT"]
    start_date = "2023-01-01"   # fixed start date: shortest history
elif stage_key == "test":
    # Representative dataset for validation
    tickers = all_tickers
    start_date = "2021-01-01"   # fixed start date: medium history
else:  # prod (also the fallback for any other value)
    if stage_key != "prod":
        # Make the fallback visible: an empty or misspelled stage would
        # otherwise trigger the full production download without notice.
        print(f"⚠️ Unrecognised stage {stage!r}; using the prod configuration.")
    # Full dataset for production workloads
    tickers = all_tickers
    start_date = "2011-01-01"   # full history

# Upper bound of the download window: now minus one day (time of day kept).
# NOTE(review): yfinance documents `end` as exclusive — confirm the last
# bar returned is the one intended.
end_date = pd.Timestamp.today() - pd.Timedelta(days=1)

# One cleaned DataFrame per ticker that returned data
rows = []

for ticker in tickers:

    # Download daily historical data for the selected ticker
    df_t = yf.download(
        ticker,
        interval="1d",
        start=start_date,
        end=end_date,
        auto_adjust=True,
        group_by="column"
    )

    # Skip tickers with no available data
    if df_t.empty:
        print(f"⚠️ No data for {ticker}, skipping.")
        continue

    # Reset index so "Date" becomes a column
    df_t = df_t.reset_index()

    # Remove multi-index columns if present (keep only the first level)
    if isinstance(df_t.columns, pd.MultiIndex):
        df_t.columns = [c[0] for c in df_t.columns]

    # Rename date column for consistency
    df_t = df_t.rename(columns={"Date": "Datetime"})

    # Add ticker column for identification
    df_t["ticker"] = ticker

    rows.append(df_t)

# pd.concat([]) fails with an opaque "No objects to concatenate"; raise the
# same exception type with a message that names the actual problem.
if not rows:
    raise ValueError(
        f"No data was downloaded for any of the tickers {tickers} "
        f"between {start_date} and {end_date}."
    )

# Combine all tickers into a single flat DataFrame
df_flat = pd.concat(rows, ignore_index=True)

print("Columns in df_flat:", df_flat.columns)
df_flat.sort_values(by="Datetime", ascending=False).head()



StatementMeta(, ad7d4642-6efa-4dc5-8e20-b7070f7e6107, 8, Finished, Available, Finished)

[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed


Unnamed: 0,Datetime,Close,High,Low,Open,Volume,ticker
18904,2026-01-14,439.200012,443.910004,434.220001,442.809998,57259500,TSLA
3780,2026-01-14,259.959991,261.820007,256.709991,259.48999,40019400,AAPL
11342,2026-01-14,335.839996,336.519989,330.480011,335.059998,28525600,GOOGL
15123,2026-01-14,236.649994,241.279999,236.220001,241.149994,41410600,AMZN
7561,2026-01-14,459.380005,468.200012,457.170013,466.459991,28184300,MSFT


In [6]:
# Convert the combined pandas frame to a Spark DataFrame and make sure the
# "Datetime" column is stored as a Spark timestamp.
df_spark = (
    spark.createDataFrame(df_flat)
    .withColumn("Datetime", col("Datetime").cast("timestamp"))
)

# Replace the bronze table with a single Delta overwrite instead of
# DROP TABLE + create: if the write fails, the previous table contents are
# still there, and readers never see a window where the table is missing.
# `overwriteSchema` keeps the old behaviour of accepting a changed schema.
(
    df_spark.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .saveAsTable("bronze_stock_daily")
)


StatementMeta(, ad7d4642-6efa-4dc5-8e20-b7070f7e6107, 9, Finished, Available, Finished)