# Stock Project - Real-Time Data Pipeline


In [None]:
# Set default parameters.
# `deployment_stage_parameters` is expected to hold a JSON string: the following
# cell parses it with json.loads() and reads the "Stage" value under "environment".
# NOTE(review): presumably this default is overridden at run time by the calling
# pipeline / deployment tooling — confirm.
deployment_stage_parameters = ""

In [None]:
# Use json file saved in DEV lakehouse as workaround to get deployment stage as parameter
import json

# The parameter defaults to an empty string, which json.loads() rejects with
# JSONDecodeError. Treat a missing/blank value as "no configuration" so the
# notebook still runs top to bottom when no parameters are injected.
raw_parameters = (deployment_stage_parameters or "").strip()
config_dict = json.loads(raw_parameters) if raw_parameters else {}

# `or {}` also covers an explicit JSON null for "environment", where
# dict.get() would return None instead of the fallback.
env = config_dict.get("environment") or {}
default_params = {
    # Extract individual values safely; an absent stage becomes "".
    "stage": env.get("Stage", "")
}
stage = default_params["stage"]

In [2]:
import yfinance as yf 
import pandas as pd 
from pyspark.sql.functions import col

# Full universe of ticker symbols handled by this pipeline
all_tickers = ["AAPL", "MSFT", "GOOGL", "AMZN", "TSLA"]

# In the "dev" stage only a two-ticker subset is downloaded; every other
# stage value falls through to the complete list.
tickers = ["AAPL", "MSFT"] if stage == "dev" else all_tickers

# Request 1-minute bars covering a one-day period for the selected tickers.
# With group_by="ticker" the resulting columns are grouped per ticker symbol
# (see the two-level column header in the cell output below).
df = yf.download(
    tickers=tickers,
    period="1d",
    interval="1m",
    group_by="ticker",
)

# Preview the first rows
df.head()


StatementMeta(, 4f766846-e2a6-4f80-ba1e-6545083802bd, 19, Finished, Available, Finished)

[                       0%                       ][*******************   40%                       ]  2 of 5 completed[**********************60%****                   ]  3 of 5 completed[**********************80%*************          ]  4 of 5 completed[*********************100%***********************]  5 of 5 completed


Ticker,AAPL,AAPL,AAPL,AAPL,AAPL,TSLA,TSLA,TSLA,TSLA,TSLA,...,MSFT,MSFT,MSFT,MSFT,MSFT,AMZN,AMZN,AMZN,AMZN,AMZN
Price,Open,High,Low,Close,Volume,Open,High,Low,Close,Volume,...,Open,High,Low,Close,Volume,Open,High,Low,Close,Volume
Datetime,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
2026-01-15 14:30:00+00:00,260.649994,260.820007,260.380005,260.430115,977295,444.2099,444.350891,443.0,443.345001,1169293,...,462.644989,462.842499,460.709991,460.785004,1035959,238.429993,238.585007,237.860001,238.164993,1593819
2026-01-15 14:31:00+00:00,260.434998,261.019989,260.149994,260.820007,205976,443.279999,443.910004,441.399994,441.61499,252729,...,460.785004,461.953308,460.600006,461.649994,161783,238.179993,238.359894,237.679993,237.990997,177434
2026-01-15 14:32:00+00:00,260.779999,260.827789,260.169006,260.552094,75709,441.575195,443.48999,441.520111,443.345001,225842,...,461.649994,462.799988,461.429993,462.315002,90652,237.960007,238.149994,237.664993,238.039993,133794
2026-01-15 14:33:00+00:00,260.559998,260.660004,260.119995,260.348511,83546,443.462585,444.649994,443.170105,444.649994,322281,...,462.269989,462.549988,461.485107,461.670013,52708,238.059998,238.770004,238.059998,238.440002,173085
2026-01-15 14:34:00+00:00,260.345001,260.674988,260.179993,260.619995,78648,444.649994,444.709991,443.109985,443.25,213918,...,461.725006,461.799988,460.720001,461.150085,83932,238.520004,238.539993,237.820007,237.919998,128760


#### Flatten the multi-index DataFrame returned by yfinance

In [3]:
# Reshape the per-ticker column groups into one long table.
# For every ticker: take its sub-frame, move the Datetime index into a regular
# column, and tag each row with the ticker symbol.
rows = [
    df[ticker].reset_index().assign(ticker=ticker)
    for ticker in tickers
]

# Stack the per-ticker frames on top of each other with a fresh 0..n-1 index
df_flat = pd.concat(rows, ignore_index=True)

# Preview the first rows
df_flat.head()


StatementMeta(, 4f766846-e2a6-4f80-ba1e-6545083802bd, 20, Finished, Available, Finished)

Price,Datetime,Open,High,Low,Close,Volume,ticker
0,2026-01-15 14:30:00+00:00,260.649994,260.820007,260.380005,260.430115,977295,AAPL
1,2026-01-15 14:31:00+00:00,260.434998,261.019989,260.149994,260.820007,205976,AAPL
2,2026-01-15 14:32:00+00:00,260.779999,260.827789,260.169006,260.552094,75709,AAPL
3,2026-01-15 14:33:00+00:00,260.559998,260.660004,260.119995,260.348511,83546,AAPL
4,2026-01-15 14:34:00+00:00,260.345001,260.674988,260.179993,260.619995,78648,AAPL


#### Write the Spark DataFrame to a Delta table in the connected Lakehouse

In [4]:
# Convert the flattened pandas DataFrame to a Spark DataFrame
df_spark = spark.createDataFrame(df_flat)

from delta.tables import DeltaTable
from pyspark.sql.functions import col

# Ensure correct data types for the two columns used as merge keys
df_spark = df_spark.withColumn("Datetime", col("Datetime").cast("timestamp"))
df_spark = df_spark.withColumn("ticker", col("ticker").cast("string"))

target_table = "bronze_stock_minutes"

# Check if the table already exists.
# The filter is built from `target_table` so the check cannot drift from the
# table that is actually written to when the name is changed.
table_exists = (
    spark.sql("SHOW TABLES")
    .filter(col("tableName") == target_table)
    .count()
)

if table_exists == 0:
    # First run → create the table
    print(f"Creating new Delta table '{target_table}'...")
    df_spark.write.format("delta").saveAsTable(target_table)
    print(f"Delta table '{target_table}' created successfully.")

else:
    # Table exists → perform MERGE: insert only rows whose (ticker, Datetime)
    # key is not yet present; rows that already exist are left untouched.
    print("Merging new data into existing Delta table...")

    delta_table = DeltaTable.forName(spark, target_table)

    (
        delta_table.alias("t")
        .merge(
            df_spark.alias("s"),
            "t.ticker = s.ticker AND t.Datetime = s.Datetime"
        )
        .whenNotMatchedInsertAll()
        .execute()
    )

    # Reported only on this branch: no merge runs when the table is first created.
    print("MERGE completed successfully.")


StatementMeta(, 4f766846-e2a6-4f80-ba1e-6545083802bd, 21, Finished, Available, Finished)

Merging new data into existing Delta table...


MERGE completed successfully.
