In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Scrape the first table on the Wikipedia S&P 500 constituents page and save
# it to sp500_companies.csv.
url = 'https://en.wikipedia.org/wiki/List_of_S&P_500_companies'

# Send a GET request to the URL; the timeout stops the cell from hanging
# indefinitely if the server never answers.
response = requests.get(url, timeout=30)

# If the request was successful
if response.status_code == 200:
    # Parse the content of the request with BeautifulSoup
    soup = BeautifulSoup(response.content, 'html.parser')

    # The first <tbody> in the document is taken to be the constituents table
    table_body = soup.find('tbody')

    # One list of cell texts per data row
    data = []
    for row in table_body.find_all('tr'):
        cols = row.find_all('td')
        # Header rows contain only <th> cells; without this guard they would
        # be stored as an all-empty row in the DataFrame and the CSV.
        if not cols:
            continue
        data.append([ele.text.strip() for ele in cols])

    # Create a DataFrame from the data list
    df = pd.DataFrame(
        data,
        columns=['Ticker', 'Company', 'Sector', 'Sub-Industry', 'Headquarters',
                 'Date First Added', 'CIK', 'Founded'],
    )

    # Save the DataFrame to a CSV file
    df.to_csv('sp500_companies.csv', index=False)

    print('Data scraped and saved to sp500_companies.csv')
else:
    print('Failed to retrieve the webpage. Status code:', response.status_code)


Data scraped and saved to sp500_companies.csv


In [65]:
#for ticker in df['Ticker']:
#    print(ticker)


In [47]:
# Hard-coded S&P 500 ticker symbols, in the order they were copied.
# Class-share symbols are kept in dotted form (e.g. 'BRK.B', 'BF.B').
sp500_tickers = [
    'MMM', 'AOS', 'ABT', 'ABBV', 'ACN', 'ADM', 'ADBE', 'ADP', 'AES', 'AFL', 'A', 'ABNB',
    'APD', 'AKAM', 'ALK', 'ALB', 'ARE', 'ALGN', 'ALLE', 'LNT', 'ALL', 'GOOGL', 'GOOG', 'MO',
    'AMZN', 'AMCR', 'AMD', 'AEE', 'AAL', 'AEP', 'AXP', 'AIG', 'AMT', 'AWK', 'AMP', 'AME',
    'AMGN', 'APH', 'ADI', 'ANSS', 'AON', 'APA', 'AAPL', 'AMAT', 'APTV', 'ACGL', 'ANET', 'AJG',
    'AIZ', 'T', 'ATO', 'ADSK', 'AZO', 'AVB', 'AVY', 'AXON', 'BKR', 'BALL', 'BAC', 'BBWI',
    'BAX', 'BDX', 'WRB', 'BRK.B', 'BBY', 'BIO', 'TECH', 'BIIB', 'BLK', 'BX', 'BK', 'BA',
    'BKNG', 'BWA', 'BXP', 'BSX', 'BMY', 'AVGO', 'BR', 'BRO', 'BF.B', 'BG', 'CHRW', 'CDNS',
    'CZR', 'CPT', 'CPB', 'COF', 'CAH', 'KMX', 'CCL', 'CARR', 'CTLT', 'CAT', 'CBOE', 'CBRE',
    'CDW', 'CE', 'COR', 'CNC', 'CNP', 'CDAY', 'CF', 'CRL', 'SCHW', 'CHTR', 'CVX', 'CMG',
    'CB', 'CHD', 'CI', 'CINF', 'CTAS', 'CSCO', 'C', 'CFG', 'CLX', 'CME', 'CMS', 'KO',
    'CTSH', 'CL', 'CMCSA', 'CMA', 'CAG', 'COP', 'ED', 'STZ', 'CEG', 'COO', 'CPRT', 'GLW',
    'CTVA', 'CSGP', 'COST', 'CTRA', 'CCI', 'CSX', 'CMI', 'CVS', 'DHI', 'DHR', 'DRI', 'DVA',
    'DE', 'DAL', 'XRAY', 'DVN', 'DXCM', 'FANG', 'DLR', 'DFS', 'DIS', 'DG', 'DLTR', 'D',
    'DPZ', 'DOV', 'DOW', 'DTE', 'DUK', 'DD', 'EMN', 'ETN', 'EBAY', 'ECL', 'EIX', 'EW',
    'EA', 'ELV', 'LLY', 'EMR', 'ENPH', 'ETR', 'EOG', 'EPAM', 'EQT', 'EFX', 'EQIX', 'EQR',
    'ESS', 'EL', 'ETSY', 'EG', 'EVRG', 'ES', 'EXC', 'EXPE', 'EXPD', 'EXR', 'XOM', 'FFIV',
    'FDS', 'FICO', 'FAST', 'FRT', 'FDX', 'FITB', 'FSLR', 'FE', 'FIS', 'FI', 'FLT', 'FMC',
    'F', 'FTNT', 'FTV', 'FOXA', 'FOX', 'BEN', 'FCX', 'GRMN', 'IT', 'GEHC', 'GEN', 'GNRC',
    'GD', 'GE', 'GIS', 'GM', 'GPC', 'GILD', 'GL', 'GPN', 'GS', 'HAL', 'HIG', 'HAS',
    'HCA', 'PEAK', 'HSIC', 'HSY', 'HES', 'HPE', 'HLT', 'HOLX', 'HD', 'HON', 'HRL', 'HST',
    'HWM', 'HPQ', 'HUBB', 'HUM', 'HBAN', 'HII', 'IBM', 'IEX', 'IDXX', 'ITW', 'ILMN', 'INCY',
    'IR', 'PODD', 'INTC', 'ICE', 'IFF', 'IP', 'IPG', 'INTU', 'ISRG', 'IVZ', 'INVH', 'IQV',
    'IRM', 'JBHT', 'JKHY', 'J', 'JNJ', 'JCI', 'JPM', 'JNPR', 'K', 'KVUE', 'KDP', 'KEY',
    'KEYS', 'KMB', 'KIM', 'KMI', 'KLAC', 'KHC', 'KR', 'LHX', 'LH', 'LRCX', 'LW', 'LVS',
    'LDOS', 'LEN', 'LIN', 'LYV', 'LKQ', 'LMT', 'L', 'LOW', 'LULU', 'LYB', 'MTB', 'MRO',
    'MPC', 'MKTX', 'MAR', 'MMC', 'MLM', 'MAS', 'MA', 'MTCH', 'MKC', 'MCD', 'MCK', 'MDT',
    'MRK', 'META', 'MET', 'MTD', 'MGM', 'MCHP', 'MU', 'MSFT', 'MAA', 'MRNA', 'MHK', 'MOH',
    'TAP', 'MDLZ', 'MPWR', 'MNST', 'MCO', 'MS', 'MOS', 'MSI', 'MSCI', 'NDAQ', 'NTAP', 'NFLX',
    'NEM', 'NWSA', 'NWS', 'NEE', 'NKE', 'NI', 'NDSN', 'NSC', 'NTRS', 'NOC', 'NCLH', 'NRG',
    'NUE', 'NVDA', 'NVR', 'NXPI', 'ORLY', 'OXY', 'ODFL', 'OMC', 'ON', 'OKE', 'ORCL', 'OTIS',
    'PCAR', 'PKG', 'PANW', 'PARA', 'PH', 'PAYX', 'PAYC', 'PYPL', 'PNR', 'PEP', 'PFE', 'PCG',
    'PM', 'PSX', 'PNW', 'PXD', 'PNC', 'POOL', 'PPG', 'PPL', 'PFG', 'PG', 'PGR', 'PLD',
    'PRU', 'PEG', 'PTC', 'PSA', 'PHM', 'QRVO', 'PWR', 'QCOM', 'DGX', 'RL', 'RJF', 'RTX',
    'O', 'REG', 'REGN', 'RF', 'RSG', 'RMD', 'RVTY', 'RHI', 'ROK', 'ROL', 'ROP', 'ROST',
    'RCL', 'SPGI', 'CRM', 'SBAC', 'SLB', 'STX', 'SEE', 'SRE', 'NOW', 'SHW', 'SPG', 'SWKS',
    'SJM', 'SNA', 'SEDG', 'SO', 'LUV', 'SWK', 'SBUX', 'STT', 'STLD', 'STE', 'SYK', 'SYF',
    'SNPS', 'SYY', 'TMUS', 'TROW', 'TTWO', 'TPR', 'TRGP', 'TGT', 'TEL', 'TDY', 'TFX', 'TER',
    'TSLA', 'TXN', 'TXT', 'TMO', 'TJX', 'TSCO', 'TT', 'TDG', 'TRV', 'TRMB', 'TFC', 'TYL',
    'TSN', 'USB', 'UDR', 'ULTA', 'UNP', 'UAL', 'UPS', 'URI', 'UNH', 'UHS', 'VLO', 'VTR',
    'VLTO', 'VRSN', 'VRSK', 'VZ', 'VRTX', 'VFC', 'VTRS', 'VICI', 'V', 'VMC', 'WAB', 'WBA',
    'WMT', 'WBD', 'WM', 'WAT', 'WEC', 'WFC', 'WELL', 'WST', 'WDC', 'WRK', 'WY', 'WHR',
    'WMB', 'WTW', 'GWW', 'WYNN', 'XEL', 'XYL', 'YUM', 'ZBRA', 'ZBH', 'ZION', 'ZTS',
]

In [66]:
#sp500_tickers = ['UAL','UPS','URI','UNH','UHS','VLO','VTR','VLTO','VRSN','VRSK','VZ','VRTX','VFC','VTRS','VICI','V','VMC','WAB','WBA','WMT','WBD','WM','WAT','WEC','WFC','WELL','WST','WDC','WRK','WY','WHR','WMB','WTW','GWW','WYNN','XEL','XYL','YUM','ZBRA','ZBH','ZION','ZT']

In [67]:
import time
import csv
import os
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.firefox.options import Options as FirefoxOptions
from selenium.webdriver.common.action_chains import ActionChains
from selenium.common.exceptions import StaleElementReferenceException
import logging
from selenium.common.exceptions import TimeoutException

import time
from selenium.common.exceptions import NoSuchElementException, StaleElementReferenceException

# Root logger: INFO and above, each record prefixed with timestamp and level.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# --- Constants ---------------------------------------------------------------

# Yahoo Finance history page; format with the ticker symbol.
BASE_URL_TEMPLATE = "https://finance.yahoo.com/quote/{}/history"

# Output file names: a fixed one, and a template keyed by time_period / show / frequency.
OUTPUT_FILE = "aapl_historical_data.csv"
OUTPUT_FILE_TEMPLATE = "aapl_historical_data_{time_period}_{show}_{frequency}.csv"

# Page-element identifiers. Both still need to be updated with the actual
# ID or selector used by the page.
DATE_RANGE_ID = 'date-range-selector'  # date range element
FREQUENCY_SELECTOR = 'frequency-selector'  # frequency dropdown

def setup_driver():
    """Create a Firefox WebDriver from a default ``FirefoxOptions`` and return it."""
    return webdriver.Firefox(options=FirefoxOptions())

# Use logging in your functions
def navigate_to_page(driver, url):
    """Load ``url`` in ``driver``.

    Any exception raised while logging or navigating is logged at ERROR level
    (a fixed message, then the exception itself) and swallowed, so the
    function always returns ``None``.
    """
    try:
        logging.info(f"Navigating to {url}")
        driver.get(url)
    except Exception as exc:
        # Two separate records: the human-readable summary, then the cause.
        for detail in ("An error occurred while trying to navigate to the page.", exc):
            logging.error(detail)

def select_time_period(driver, period):
    """Open the date-range dropdown and click its 'Max' entry.

    ``period`` is only used in log messages: the option clicked is always the
    button whose ``data-value`` attribute is ``MAX``. Each element is waited
    for (up to 20 seconds) until it is clickable. Timeouts, missing elements
    and any other exception are logged at ERROR level and swallowed, so the
    function returns ``None`` whether or not the selection succeeded.
    """
    try:
        logging.info(f"Selecting time period: {period}")
        wait = WebDriverWait(driver, 20)

        # Step 1: open the dropdown through its arrow icon so the options appear.
        arrow_locator = (By.CSS_SELECTOR, "svg[data-icon='CoreArrowDown']")
        dropdown_arrow = wait.until(EC.element_to_be_clickable(arrow_locator))
        logging.info("Date range dropdown found and clickable.")
        dropdown_arrow.click()
        logging.info("Date range dropdown clicked.")

        # Step 2: locate the 'Max' button inside the opened dropdown.
        max_locator = (By.XPATH, "//button[@data-value='MAX']")
        max_button = wait.until(EC.element_to_be_clickable(max_locator))
        logging.info("Max option found and clickable.")

        # Sanity check on the attribute the XPath already matched on.
        if max_button.get_attribute('data-value') != 'MAX':
            logging.warning("The 'Max' option does not have the expected 'data-value' attribute.")
        else:
            logging.info("Correct 'Max' option is present.")

        max_button.click()
        logging.info("Max option clicked.")

        # The selected range is not verified after the click; doing so would
        # need a selector for whatever element reflects the current range.

    except TimeoutException as exc:
        logging.error(f"Timed out waiting for time period option: {period}")
        logging.error(exc)
    except NoSuchElementException as exc:
        logging.error(f"Could not find the time period option: {period}")
        logging.error(exc)
    except Exception as exc:
        logging.error(f"An error occurred while selecting time period: {period}")
        logging.error(exc)

def select_frequency(driver, frequency):
    """Log the requested frequency and wait for the frequency dropdown.

    No selection is performed yet: the function logs ``frequency`` and the
    driver's current URL, then waits up to 10 seconds for the element with ID
    ``the-id-of-the-frequency-dropdown`` (apparently a placeholder) to become
    clickable. All exceptions are logged and swallowed; only the generic
    ``Exception`` branch also saves a screenshot to ``error_screenshot.png``.
    """
    try:
        logging.info(f"Selecting frequency: {frequency}")
        logging.info(f"Current URL: {driver.current_url}")

        # The actual frequency selection still has to be implemented here.

        # Explicit wait for the dropdown element to be clickable.
        dropdown_locator = (By.ID, 'the-id-of-the-frequency-dropdown')
        WebDriverWait(driver, 10).until(EC.element_to_be_clickable(dropdown_locator))

    except TimeoutException as exc:
        logging.error(f"Timed out waiting for frequency option: {frequency}")
        logging.error(exc)
    except NoSuchElementException as exc:
        logging.error(f"Could not find the frequency option: {frequency}")
        logging.error(exc)
    except Exception as exc:
        logging.error(f"An error occurred while selecting frequency: {frequency}")
        logging.error(exc)
        # Keep a screenshot of the page for unexpected failures.
        driver.get_screenshot_as_file("error_screenshot.png")


import urllib.request

def download_csv(download_url, filename, download_directory='./yahoo_/yahoo_sp500/'):
    """Download ``download_url`` and save it as ``filename`` in ``download_directory``.

    Args:
        download_url: URL to fetch; passed straight to ``urllib.request.urlretrieve``.
        filename: Name of the file to create inside ``download_directory``.
        download_directory: Target directory; created (with parents) if missing.

    Returns:
        The path of the saved file (``download_directory`` joined with ``filename``).

    Raises:
        Whatever ``os.makedirs`` or ``urllib.request.urlretrieve`` raise; nothing
        is caught here, so callers decide how to handle failed downloads.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(download_directory, exist_ok=True)
    # Define the full file path
    filepath = os.path.join(download_directory, filename)
    # Use urllib to download the file
    urllib.request.urlretrieve(download_url, filepath)
    logging.info(f"File downloaded: {filepath}")
    return filepath

def main(tickers=None, period1=-252374400, period2=1699142400, delay_seconds=2):
    """Download a daily price-history CSV from Yahoo Finance for each ticker.

    The files are fetched directly over HTTP via ``download_csv``, so no
    browser/WebDriver session is started.

    Args:
        tickers: Iterable of ticker symbols; defaults to ``sp500_tickers``.
        period1: Start of the requested range, as passed in the ``period1``
            query parameter.
        period2: End of the requested range, as passed in the ``period2``
            query parameter.
        delay_seconds: Pause after each successful download, to space out
            requests.

    Returns:
        List of tickers whose download raised an exception (empty if all
        succeeded). Failures are logged and do not stop the loop.
    """
    if tickers is None:
        tickers = sp500_tickers

    failed = []
    for ticker in tickers:
        try:
            # Yahoo Finance writes share classes with a hyphen (BRK-B) whereas
            # the ticker list uses a dot (BRK.B); only the URL is normalised,
            # the output file keeps the original ticker.
            yahoo_symbol = ticker.replace('.', '-')
            # Prepare the CSV download URL
            csv_download_url = (
                f"https://query1.finance.yahoo.com/v7/finance/download/{yahoo_symbol}"
                f"?period1={period1}&period2={period2}"
                "&interval=1d&events=history&includeAdjustedClose=true"
            )
            filename = f"{ticker}.csv"
            # Download CSV file
            download_csv(csv_download_url, filename)

            # Wait before making the next request
            time.sleep(delay_seconds)
        except Exception as e:
            logging.error(f"An error occurred while processing {ticker}.")
            logging.error(e)
            # Remember the ticker so the caller can retry or report it.
            failed.append(ticker)

    if failed:
        logging.warning(f"Failed to download {len(failed)} ticker(s): {failed}")
    return failed

#if __name__ == "__main__":
#    main()
