In [1]:
# Import necessary libraries
import yfinance as yf  # yfinance library to fetch stock data from Yahoo Finance
import pandas as pd   # Pandas for data manipulation (e.g., DataFrames)
from datetime import datetime, timedelta  # For handling dates (e.g., calculating time ranges)
from dotenv import load_dotenv  # To load environment variables from .env
import argparse, os, sys, json, hashlib # For file path operations (e.g., creating directories)
from pathlib import Path

In [4]:
# Load environment variables from .env file
load_dotenv()  # This function reads .env and makes vars available via os.getenv()

# Get variables from .env (with defaults if not set)
data_path = os.getenv('DATA_PATH', 'data/raw/')  # Path to save data; default if not in .env
stock_symbol = os.getenv('STOCK_SYMBOL', 'AAPL')  # Stock ticker; default to AAPL

# Create the data/raw/ directory if it doesn't exist
os.makedirs(data_path, exist_ok=True)  # makedirs creates folders recursively; exist_ok prevents errors if it exists

In [5]:
top_stocks = ['AAPL', 'GOOGL', 'TSLA', 'MSFT', 'AMZN', 'NVDA', 'META', 'BRK-B', 'JPM', 'V']  # top 10 stocks

In [8]:
# Define time range for historical data (e.g., last 5 years)
end_date = datetime.now().date()  # Current date as end
start_date = end_date - timedelta(days=5*365)  # 5 years ago; timedelta subtracts days
print("CHECK: end_date ", end_date)
print("CHECK: start_date ", start_date)

CHECK: end_date  2025-08-24
CHECK: start_date  2020-08-25


In [14]:
historical_data_multi = yf.download(tickers=top_stocks, start=start_date, end=end_date)  # Multi-index DF

# Validation: Check if data is not empty
if historical_data_multi.empty:  # .empty is a Pandas attribute to check if DataFrame has no rows
    raise ValueError(f"No historical data found. Check symbol or dates.")  # Raise error to stop script

    
print("CHECK: historical_data_multi", historical_data_multi)

  historical_data_multi = yf.download(tickers=top_stocks, start=start_date, end=end_date)  # Multi-index DF
[*********************100%***********************]  10 of 10 completed

CHECK: historical_data_multi Price            Close                                                  \
Ticker            AAPL        AMZN       BRK-B       GOOGL         JPM   
Date                                                                     
2020-08-25  121.419502  167.324493  213.350006   79.813835   87.880180   
2020-08-26  123.070694  172.092499  214.660004   81.716415   86.647224   
2020-08-27  121.599457  170.000000  216.869995   80.940575   89.497871   
2020-08-28  121.402473  170.089996  218.550003   81.482819   89.865128   
2020-08-31  125.519501  172.548004  218.039993   80.990776   87.609093   
...                ...         ...         ...         ...         ...   
2025-08-18  230.889999  231.490005  478.519989  203.500000  291.529999   
2025-08-19  230.559998  228.009995  485.309998  201.570007  290.660004   
2025-08-20  226.009995  223.809998  488.679993  199.320007  292.239990   
2025-08-21  224.899994  221.949997  488.589996  199.750000  291.470001   
2025-08-2




In [20]:
# UNDERSATNDING THE DATA
print("Column Names:", historical_data_multi.columns)
historical_data_multi.info()

print("\nFirst 5 rows of the DataFrame:")
print(historical_data_multi.head())

'''
WE HAVE MULTIINDEX DATA

Understanding MultiIndex: 

MultiIndex columns have two distinct levels, each with a name:

Level 1: 'Price'
This is the top-level header that groups the data by the type of financial metric. For each stock, you have a set of columns for 'Close', 'High', 'Low', 'Open', and 'Volume'. This level allows you to easily select all columns of a specific type across all stocks.

Level 2: 'Ticker'
This is the second-level header that groups the data by the specific stock ticker symbol. It organizes all the price information for each company. This level allows you to easily select all data for a specific stock (e.g., just 'AAPL').

By using a MultiIndex, pandas avoids creating redundant column names like AAPL_Close, AMZN_Close, etc. Instead, it creates a cleaner, more organized structure.

********************************************

How to Use MultiIndex Columns

This structure makes it very easy to select specific subsets of your data.
For example, to get all the 'Close' prices for every stock, you can use the .loc accessor with a tuple:

Python
# Select all the 'Close' price columns
close_prices = historical_data_multi.loc[:, 'Close']
Or, to get all the data (all prices and volume) for a single stock like 'AAPL', you can use the same technique:

Python
# Select all columns for a specific ticker
aapl_data = historical_data_multi.loc[:, 'AAPL']
'''

Column Names: MultiIndex([( 'Close',  'AAPL'),
            ( 'Close',  'AMZN'),
            ( 'Close', 'BRK-B'),
            ( 'Close', 'GOOGL'),
            ( 'Close',   'JPM'),
            ( 'Close',  'META'),
            ( 'Close',  'MSFT'),
            ( 'Close',  'NVDA'),
            ( 'Close',  'TSLA'),
            ( 'Close',     'V'),
            (  'High',  'AAPL'),
            (  'High',  'AMZN'),
            (  'High', 'BRK-B'),
            (  'High', 'GOOGL'),
            (  'High',   'JPM'),
            (  'High',  'META'),
            (  'High',  'MSFT'),
            (  'High',  'NVDA'),
            (  'High',  'TSLA'),
            (  'High',     'V'),
            (   'Low',  'AAPL'),
            (   'Low',  'AMZN'),
            (   'Low', 'BRK-B'),
            (   'Low', 'GOOGL'),
            (   'Low',   'JPM'),
            (   'Low',  'META'),
            (   'Low',  'MSFT'),
            (   'Low',  'NVDA'),
            (   'Low',  'TSLA'),
            (   'Low',     'V

"\nWE HAVE MULTIINDEX DATA\n\nUnderstanding MultiIndex: \n\nMultiIndex columns have two distinct levels, each with a name:\n\nLevel 1: 'Price'\nThis is the top-level header that groups the data by the type of financial metric. For each stock, you have a set of columns for 'Close', 'High', 'Low', 'Open', and 'Volume'. This level allows you to easily select all columns of a specific type across all stocks.\n\nLevel 2: 'Ticker'\nThis is the second-level header that groups the data by the specific stock ticker symbol. It organizes all the price information for each company. This level allows you to easily select all data for a specific stock (e.g., just 'AAPL').\n\nBy using a MultiIndex, pandas avoids creating redundant column names like AAPL_Close, AMZN_Close, etc. Instead, it creates a cleaner, more organized structure.\n\n********************************************\n\nHow to Use MultiIndex Columns\n\nThis structure makes it very easy to select specific subsets of your data.\nFor exampl

In [29]:
# STORING RAW DATA 
# Prepare filenames with timestamps for reproducibility (e.g., aapl_historical_2025-08-24.csv)

# SAVING ALL THE DATA IN ONE FILE 
# To save ALL in ONE file: Stack into single DF with 'Symbol' column
# (historical_data_multi is multi-index: columns like ('Open', 'AAPL'), so unstack)
stacked_df = historical_data_multi.stack(level=1).reset_index().rename(columns={'level_1': 'Symbol'})  # Stack tickers to rows
'''
stacked_df = historical_data_multi.stack(level=1).reset_index().rename(columns={'level_1': 'Symbol'})

This single line of code is a powerful chain of three pandas operations. Here's what each part does:

historical_data_multi.stack(level=1): This is the core operation. The .stack() method pivots the DataFrame from a wide format (where each ticker is a separate column)
to a long format (where tickers become a new row-level index). By specifying level=1, you're telling pandas to move the second level of the column MultiIndex (which is your 
Ticker level) into a new row index. After this step, your DataFrame will have a two-level row index (Date and Symbol) and a single-level column index (Open, Close, etc.).

Why do this? This is a standard data manipulation technique. A "long" format is often required for data visualization libraries (like seaborn or matplotlib) and for 
machine learning models, as it makes each row a unique observation (e.g., the data for 'AAPL' on a specific date).

.reset_index(): After stacking, your Date and Symbol are now a part of the index, not regular columns. The .reset_index() method converts these index levels back into
standard columns, giving them default names like level_0 and level_1.

Why do this? You need Date and Symbol to be regular columns so they can be easily filtered, sorted, or used as features in a model later.

.rename(columns={'level_1': 'Symbol'}): This is the final cleanup step. As mentioned above, .reset_index() gives the new columns generic names. This method renames 
the column that holds your stock tickers ('level_1') to a more meaningful name: 'Symbol'. The other index ('level_0') will be automatically named 'Date'.

Why do this? This improves the readability and usability of your DataFrame. Using descriptive column names is a best practice for clean code and effective data analysis.

'''
print("CHECK: Stacked Data \n", stacked_df)
stacked_df = stacked_df.rename(columns={'level_0': 'Date', 'level_1': 'Symbol'})
print(stacked_df.columns)
# dataframe operations: df.columns, df.to_csv, df.info(), df.head(_number_oprtional_)
timestamp = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')  # Format current time as string
combined_filename = os.path.join(data_path, f"top10_historical_{timestamp}.csv")
stacked_df.to_csv(combined_filename, index=False)  # Save without index

CHECK: Stacked Data 
 Price       Date Ticker       Close        High         Low        Open  \
0     2020-08-25   AAPL  121.419502  121.764820  119.695361  121.295486   
1     2020-08-25   AMZN  167.324493  167.869995  163.350006  164.749496   
2     2020-08-25  BRK-B  213.350006  214.000000  212.850006  213.559998   
3     2020-08-25  GOOGL   79.813835   79.964429   78.423666   78.534005   
4     2020-08-25    JPM   87.880180   89.174339   87.268082   88.553488   
...          ...    ...         ...         ...         ...         ...   
12545 2025-08-22   META  754.789978  756.900024  734.390015  739.229980   
12546 2025-08-22   MSFT  507.230011  510.730011  502.410004  504.250000   
12547 2025-08-22   NVDA  177.990005  178.589996  171.199997  172.610001   
12548 2025-08-22   TSLA  340.010010  340.250000  319.690002  321.660004   
12549 2025-08-22      V  350.040009  351.200012  345.220001  345.220001   

Price     Volume  
0      211495600  
1       79856000  
2        3675900  
3

  stacked_df = historical_data_multi.stack(level=1).reset_index().rename(columns={'level_1': 'Symbol'})  # Stack tickers to rows
