In [None]:
# Parameter cell: fallback deployment stage, used as-is unless a caller
# supplies its own value for `deployment_stage_parameters`.
deployment_stage_parameters = "DEV"

In [None]:
# Use json file saved in DEV lakehouse as workaround to get deployment stage as parameter

import json

# Resolve the deployment stage from `deployment_stage_parameters`.
#
# Two input shapes are accepted:
#   * a JSON document shaped like {"environment": {"Stage": "<stage>"}}
#   * a bare stage name such as the notebook default "DEV"
# The bare form is not valid JSON, so json.loads used to raise
# JSONDecodeError whenever the default value was used.
try:
    config_dict = json.loads(deployment_stage_parameters)
except (TypeError, ValueError):
    # json.JSONDecodeError is a ValueError subclass; TypeError covers a value
    # that is not str/bytes (for example an already-parsed dict or None).
    config_dict = deployment_stage_parameters

if isinstance(config_dict, dict):
    # `or {}` also covers an explicit null "environment" entry.
    env = config_dict.get("environment") or {}
    raw_stage = env.get("Stage", "") if isinstance(env, dict) else ""
else:
    # Bare stage name (or a JSON scalar): there is no environment section.
    env = {}
    raw_stage = config_dict

default_params = {
    # Extract individual values safely: anything that is not a string
    # (null, number, list, ...) resolves to the empty stage.
    "stage": raw_stage.strip() if isinstance(raw_stage, str) else ""
}
stage = default_params["stage"]

# Double any single quote so the value cannot terminate the SQL string literal.
escaped_stage = stage.replace("'", "''")
sql = f"SELECT '{escaped_stage}' AS Stage"

# NOTE(review): per the Fabric notebook utilities documentation, exit() stops
# execution of the cells that follow it - confirm the cells below are meant to
# run in a separate context.
mssparkutils.notebook.exit(sql)

In [2]:
import yfinance as yf 
import pandas as pd 
from pyspark.sql.functions import col

# Full list of tickers to load in test/prod
all_tickers = ["AAPL", "MSFT", "GOOGL", "AMZN", "TSLA"]

# Compare the stage case-insensitively. The notebook's default parameter is
# "DEV", which an exact match against "dev" silently routed to the production
# branch (full history for every ticker).
stage_key = str(stage).strip().lower()

# Stage-specific configuration for ticker selection and date range.
# The start dates are fixed calendar dates, not rolling windows.
if stage_key == "dev":
    # Small subset for fast development
    tickers = ["AAPL", "MSFT"]
    start_date = "2023-01-01"
elif stage_key == "test":
    # Representative dataset for validation
    tickers = all_tickers
    start_date = "2021-01-01"
else:
    # Production is also the fallback, as before - but say so when the stage
    # is not literally "prod" so a typo does not go unnoticed.
    if stage_key != "prod":
        print(f"⚠️ Unrecognised stage {stage!r}; using prod configuration.")
    tickers = all_tickers
    start_date = "2011-01-01"   # full history

# End date is yesterday's calendar date.
# NOTE(review): yfinance documents `end` as exclusive, so the last bar returned
# would be the day before this date - confirm that is the intended cut-off.
end_date = (pd.Timestamp.today() - pd.Timedelta(days=1)).strftime("%Y-%m-%d")

rows = []

for ticker in tickers:

    # Download daily historical data for the selected ticker
    df_t = yf.download(
        ticker,
        interval="1d",
        start=start_date,
        end=end_date,
        auto_adjust=True,
        group_by="column"
    )

    # Skip tickers with no available data
    if df_t.empty:
        print(f"⚠️ No data for {ticker}, skipping.")
        continue

    # Reset index so "Date" becomes a column
    df_t = df_t.reset_index()

    # Keep only the first level of multi-index column labels if present
    if isinstance(df_t.columns, pd.MultiIndex):
        df_t.columns = [c[0] for c in df_t.columns]

    # Rename date column for consistency
    df_t = df_t.rename(columns={"Date": "Datetime"})

    # Add ticker column for identification
    df_t["ticker"] = ticker

    rows.append(df_t)

# pd.concat raises an opaque "No objects to concatenate" on an empty list;
# fail with the same exception type but a message that names the cause.
if not rows:
    raise ValueError(
        f"No price data downloaded for any of {tickers} "
        f"between {start_date} and {end_date}."
    )

# Combine all tickers into a single flat DataFrame
df_flat = pd.concat(rows, ignore_index=True)

print("Columns in df_flat:", df_flat.columns)
df_flat.head()


StatementMeta(, 9b30bfdf-daba-4c56-9311-ab99ff1f00c0, 6, Finished, Available, Finished)

[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed



Unnamed: 0,Datetime,Close,High,Low,Open,Volume,ticker
0,2011-01-03,9.884148,9.904842,9.74229,9.766283,445138400,AAPL
1,2011-01-04,9.93573,9.972019,9.841558,9.970219,309080800,AAPL
2,2011-01-05,10.017008,10.027205,9.882048,9.883548,255519600,AAPL
3,2011-01-06,10.00891,10.054496,9.984017,10.038601,300428800,AAPL
4,2011-01-07,10.080592,10.08749,9.954029,10.016711,311931200,AAPL


In [3]:
# Convert the combined pandas frame to a Spark DataFrame and cast the
# "Datetime" column to a Spark timestamp.
df_spark = spark.createDataFrame(df_flat)
df_spark = df_spark.withColumn("Datetime", col("Datetime").cast("timestamp"))

# Replace the bronze table with a single overwrite instead of DROP TABLE plus
# a separate write: with the two-step form a failed write left no table at
# all. `overwriteSchema` lets the new frame's schema replace the stored one,
# which the drop-and-recreate approach used to allow implicitly.
(
    df_spark.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .saveAsTable("bronze_stock_daily")
)


StatementMeta(, 9b30bfdf-daba-4c56-9311-ab99ff1f00c0, 7, Finished, Available, Finished)