In [2]:
# Import necessary libraries
import yfinance as yf
import pandas as pd
from datetime import datetime, timedelta
from dotenv import load_dotenv
import argparse, os, sys, json, hashlib
from pathlib import Path

print("Step 1: Loading environment variables...")
load_dotenv()
data_path = os.getenv('DATA_PATH', '../data/raw/')
stock_symbol = os.getenv('STOCK_SYMBOL', 'AAPL')
print(f"DEBUG: Data path is set to: {data_path}")
print(f"DEBUG: Script is running from: {os.getcwd()}")

print("Step 2: Creating data directory...")
os.makedirs(data_path, exist_ok=True)
print("SUCCESS: Directory created or already exists.")

top_stocks = ['AAPL', 'GOOGL', 'TSLA', 'MSFT', 'AMZN', 'NVDA', 'META', 'BRK-B', 'JPM', 'V']
end_date = datetime.now().date()
start_date = end_date - timedelta(days=5*365)

print("Step 3: Downloading data...")
historical_data_multi = yf.download(tickers=top_stocks, start=start_date, end=end_date)
print("SUCCESS: Data downloaded.")

if historical_data_multi.empty:
    raise ValueError(f"No historical data found. Check symbol or dates.")
    
print("Step 4: Stacking and cleaning data...")
stacked_df = historical_data_multi.stack(level=1).reset_index().rename(columns={'level_1': 'Symbol'})
print("SUCCESS: Data stacked.")

print("Step 5: Saving data to CSV...")
timestamp = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
combined_filename = os.path.join(data_path, f"top10_historical_{timestamp}.csv")
print(f"DEBUG: Saving file to: {combined_filename}")
stacked_df.to_csv(combined_filename, index=False)
print("SUCCESS: File saved!")

  historical_data_multi = yf.download(tickers=top_stocks, start=start_date, end=end_date)
[*********************100%***********************]  10 of 10 completed

Step 1: Loading environment variables...
DEBUG: Data path is set to: ../data/raw/
DEBUG: Script is running from: /Users/syonamehra/Documents/NYU Classes/MFE Bootcamp/Machine Learning Bootcamp In Person/bootcamp_syona_mehra/project/src
Step 2: Creating data directory...
SUCCESS: Directory created or already exists.
Step 3: Downloading data...
SUCCESS: Data downloaded.
Step 4: Stacking and cleaning data...
SUCCESS: Data stacked.
Step 5: Saving data to CSV...
DEBUG: Saving file to: ../data/raw/top10_historical_2025-08-24_14-18-17.csv
SUCCESS: File saved!



  stacked_df = historical_data_multi.stack(level=1).reset_index().rename(columns={'level_1': 'Symbol'})


In [3]:
# Save SEPARATE historical files
for symbol in top_stocks:
    single_hist = historical_data_multi.xs(symbol, level=1, axis=1)  # Extract per symbol (xs: cross-section)
    # The .xs() method stands for cross-section. It's a powerful tool in pandas for selecting a specific slice of data from a DataFrame with a MultiIndex.
    # symbol: This is the value you want to select. In the first loop, this will be 'AAPL'.
    # level=1: This tells xs() to look for the value ('AAPL') in the second level of your MultiIndex columns. In your case, this is the 'Ticker' level.
    # axis=1: This specifies that you are slicing across the columns, not the rows.
    # The Result: This line extracts all the data for a single stock (e.g., all the 'Open', 'High', 'Low', 'Close', and 'Volume' columns) for a specific ticker and puts it into a new, smaller DataFrame named single_hist.
    if single_hist.empty:
        print(f"Warning: Empty data for {symbol}")
        continue
    hist_filename = os.path.join(data_path, f"{symbol.lower()}_historical_{timestamp}.csv")
    single_hist.to_csv(hist_filename)

# Print success
print("Separate files saved for each stock")

Separate files saved for each stock


Original Data format 
Columns on top of columns 

                 Open                          Close                        
Symbol          AAPL    GOOGL    TSLA    ...     AAPL    GOOGL    TSLA  ...
Date                                                                        
2020-08-01    110.0    1470.0   1450.0          112.0    1480.0   1460.0
2020-08-02    111.0    1485.0   1465.0          113.0    1490.0   1470.0
...            ...       ...     ...            ...       ...     ...



✅ Now, what .stack(level=1) does:
stacked_df = historical_data_multi.stack(level=1)
📌 level=1 means squash the second header row (tickers) down into a new column.
So instead of having “ticker names” as part of the column headers, we push them down as values inside the DataFrame.
Result looks like this (simplified):
Date        Symbol   Open    High    Low    Close    Adj Close   Volume
2020-08-01   AAPL    110.0   ...     ...    112.0     112.0      100000
2020-08-01   GOOGL  1470.0   ...     ...   1480.0    1480.0       50000
2020-08-01   TSLA   1450.0   ...     ...   1460.0    1460.0       80000
2020-08-02   AAPL    111.0   ...     ...    113.0     113.0      120000
2020-08-02   GOOGL  1485.0   ...     ...   1490.0    1490.0       60000
...
👉 See what happened? Instead of one row with many tickers side by side, you now have multiple rows, one per ticker per day.
That’s what we mean by “stacking.”