In [2]:
import pandas as pd
import numpy as np
import yfinance as yf
import matplotlib.pyplot as plt
from datetime import datetime, timedelta
import time

def get_sp500_tickers():
    """
    Fetches the S&P 500 constituent symbols from the Wikipedia listing page.

    The first HTML table found on the page is taken to be the constituents
    table, and its 'Symbol' column is read. Dots in symbols are replaced with
    hyphens (e.g. BRK.B becomes BRK-B).

    Returns:
        list: Ticker symbols with dots replaced by hyphens. An empty list is
        returned (after printing the error) if anything goes wrong while
        fetching or parsing the page.
    """
    wiki_url = 'https://en.wikipedia.org/wiki/List_of_S%26P_500_companies'

    try:
        # read_html returns every table on the page; only the first is used.
        constituents = pd.read_html(wiki_url)[0]
        raw_symbols = constituents['Symbol'].tolist()

        # Swap the dot separator for a hyphen (BRK.B -> BRK-B, etc.)
        return [symbol.replace('.', '-') for symbol in raw_symbols]
    except Exception as e:
        # Best-effort: report the problem and hand back an empty list.
        print(f"Error retrieving S&P 500 tickers: {e}")
        return []

def get_daily_data(tickers, start_date='2020-01-01', end_date=None, batch_size=100):
    """
    Retrieves daily closing prices for a list of tickers, downloading in batches
    to avoid API limitations.

    A batch whose download or processing raises is reported on stdout and
    skipped; the remaining batches are still attempted.

    Args:
        tickers (list): List of ticker symbols
        start_date (str): Start date for data retrieval in YYYY-MM-DD format
        end_date (str): End date for data retrieval in YYYY-MM-DD format.
            Defaults to today's date when None.
        batch_size (int): Number of tickers to download at once (must be >= 1)

    Returns:
        pd.DataFrame: DataFrame with daily closing prices, one column per
        ticker. Empty if `tickers` is empty or every batch failed.

    Raises:
        ValueError: If batch_size is less than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    if end_date is None:
        end_date = datetime.today().strftime('%Y-%m-%d')

    # None means "no batch merged yet". Testing DataFrame.empty instead would
    # throw away an earlier batch that came back with columns but no rows.
    all_data = None
    total = len(tickers)

    # Process tickers in batches to avoid API limitations
    for i in range(0, total, batch_size):
        batch_tickers = tickers[i:i+batch_size]
        batch_end = min(i + batch_size, total)
        print(f"Downloading data for tickers {i+1} to {batch_end}...")

        try:
            # Download daily data for the batch
            batch_data = yf.download(batch_tickers, start=start_date, end=end_date, progress=False)

            # Depending on the yfinance version/options, 'Close' is a DataFrame
            # (MultiIndex columns) or, for a single ticker with flat columns,
            # a Series. Check the actual type rather than the batch length so
            # a one-ticker batch works in both cases.
            batch_close = batch_data['Close']
            if isinstance(batch_close, pd.Series):
                batch_close = batch_close.to_frame(name=batch_tickers[0])

            if all_data is None:
                all_data = batch_close
            else:
                # Outer join keeps dates that appear in only some batches
                all_data = all_data.join(batch_close, how='outer')

        except Exception as e:
            print(f"Error downloading data for batch starting at index {i}: {e}")
            continue

        # Small delay between successful batches to avoid hitting API limits;
        # pointless after the final batch.
        if batch_end < total:
            time.sleep(1)

    return all_data if all_data is not None else pd.DataFrame()

def calculate_daily_advances_declines(daily_data):
    """
    Calculate the number of advancing, declining and unchanged stocks each day.

    A stock is counted on a given day only if both that day's price and the
    previous row's price are present. Missing prices are NOT forward-filled,
    so a ticker with a gap in its data is left out of that day's counts
    instead of being reported as "unchanged".

    The first row has no previous row to compare against, so all of its
    counts are 0.

    Args:
        daily_data (pd.DataFrame): DataFrame with daily closing prices for each ticker
            (one column per ticker, one row per day)

    Returns:
        pd.DataFrame: DataFrame indexed like `daily_data` with the columns
        'Advances', 'Declines', 'Unchanged', 'Total' (sum of the three) and
        'Advance_Decline_Diff' (Advances minus Declines)
    """
    # fill_method=None disables the legacy implicit forward-fill of NaNs, which
    # would turn every missing price into a spurious 0% change.
    daily_changes = daily_data.pct_change(fill_method=None)

    # Comparisons against NaN are False, so missing changes fall in no bucket
    advances = (daily_changes > 0).sum(axis=1)
    declines = (daily_changes < 0).sum(axis=1)
    unchanged = (daily_changes == 0).sum(axis=1)

    # Create DataFrame with results
    breadth_data = pd.DataFrame({
        'Advances': advances,
        'Declines': declines,
        'Unchanged': unchanged,
        'Total': advances + declines + unchanged,
        'Advance_Decline_Diff': advances - declines
    })

    return breadth_data

# Example usage: download ~90 days of S&P 500 closes, compute daily market
# breadth, and save both tables as CSV files in the working directory.
if __name__ == "__main__":
    # Get S&P 500 tickers
    sp500_tickers = get_sp500_tickers()
    print(f"Retrieved {len(sp500_tickers)} S&P 500 tickers.")

    # get_sp500_tickers() returns [] on failure; stop here rather than
    # overwrite previously saved CSV files with empty data.
    if not sp500_tickers:
        raise SystemExit("No tickers retrieved; nothing to download.")

    # Get daily data for the last 90 days (adjust timeframe as needed).
    # Take "today" once so both ends of the window share the same reference.
    today = datetime.today()
    end_date = today.strftime('%Y-%m-%d')
    start_date = (today - timedelta(days=90)).strftime('%Y-%m-%d')

    daily_data = get_daily_data(sp500_tickers, start_date, end_date)
    print(f"Retrieved daily data from {start_date} to {end_date}.")
    print(f"Data shape: {daily_data.shape}")

    # Every batch may have failed; do not write empty CSV files in that case.
    if daily_data.empty:
        raise SystemExit("No price data downloaded; CSV files were not written.")

    # Calculate daily advances and declines
    breadth_data = calculate_daily_advances_declines(daily_data)
    print("Calculated daily advances and declines.")
    print(breadth_data.head())

    # Save the data to CSV files for future use
    daily_data.to_csv('sp500_daily_prices.csv')
    breadth_data.to_csv('sp500_breadth_data.csv')
    print("Data saved to CSV files")

Retrieved 503 S&P 500 tickers.
Downloading data for tickers 1 to 100...
YF.download() has changed argument auto_adjust default to True
Downloading data for tickers 101 to 200...
Downloading data for tickers 201 to 300...
Downloading data for tickers 301 to 400...
Downloading data for tickers 401 to 500...
Downloading data for tickers 501 to 503...
Retrieved daily data from 2025-01-22 to 2025-04-22.
Data shape: (62, 503)
Calculated daily advances and declines.
            Advances  Declines  Unchanged  Total  Advance_Decline_Diff
Date                                                                  
2025-01-22         0         0          0      0                     0
2025-01-23       326       173          4    503                   153
2025-01-24       251       246          6    503                     5
2025-01-27       348       155          0    503                   193
2025-01-28       154       346          3    503                  -192
Data saved to CSV files
