In [14]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# URL of the page to scrape
url = 'https://en.wikipedia.org/wiki/List_of_S&P_500_companies'

# Send a GET request to the URL
response = requests.get(url)

# If the request was successful
if response.status_code == 200:
    # Parse the content of the request with BeautifulSoup
    soup = BeautifulSoup(response.content, 'html.parser')
    
    # Find the <tbody> element in the HTML where the data resides
    table_body = soup.find('tbody')
    
    # Initialize a list to store your data
    data = []
    
    # Find all <tr> elements
    rows = table_body.find_all('tr')
    for row in rows:
        # Find all <td> elements in this row
        cols = row.find_all('td')
        
        # Get the text from all the <td> elements and add to the list
        data.append([ele.text.strip() for ele in cols])
    
    # Create a DataFrame from the data list
    df = pd.DataFrame(data, columns=['Ticker', 'Company', 'Sector', 'Sub-Industry', 'Headquarters', 'Date First Added', 'CIK', 'Founded'])

    # Save the DataFrame to a CSV file
    df.to_csv('sp500_companies.csv', index=False)

    print('Data scraped and saved to sp500_companies.csv')
else:
    print('Failed to retrieve the webpage. Status code:', response.status_code)


2024-05-04 17:12:25,229 - INFO - NumExpr defaulting to 8 threads.


Data scraped and saved to sp500_companies.csv


In [65]:
#for ticker in df['Ticker']:
#    print(ticker)


In [15]:
df

Unnamed: 0,Ticker,Company,Sector,Sub-Industry,Headquarters,Date First Added,CIK,Founded
0,,,,,,,,
1,MMM,3M,Industrials,Industrial Conglomerates,"Saint Paul, Minnesota",1957-03-04,0000066740,1902
2,AOS,A. O. Smith,Industrials,Building Products,"Milwaukee, Wisconsin",2017-07-26,0000091142,1916
3,ABT,Abbott,Health Care,Health Care Equipment,"North Chicago, Illinois",1957-03-04,0000001800,1888
4,ABBV,AbbVie,Health Care,Biotechnology,"North Chicago, Illinois",2012-12-31,0001551152,2013 (1888)
...,...,...,...,...,...,...,...,...
498,XYL,Xylem Inc.,Industrials,Industrial Machinery & Supplies & Components,"White Plains, New York",2011-11-01,0001524472,2011
499,YUM,Yum! Brands,Consumer Discretionary,Restaurants,"Louisville, Kentucky",1997-10-06,0001041061,1997
500,ZBRA,Zebra Technologies,Information Technology,Electronic Equipment & Instruments,"Lincolnshire, Illinois",2019-12-23,0000877212,1969
501,ZBH,Zimmer Biomet,Health Care,Health Care Equipment,"Warsaw, Indiana",2001-08-07,0001136869,1927


In [1]:
sp500_tickers = ['MMM','AOS','ABT','ABBV','ACN','ADM','ADBE','ADP','AES','AFL','A','ABNB','APD','AKAM','ALK','ALB','ARE','ALGN','ALLE','LNT','ALL','GOOGL','GOOG','MO','AMZN','AMCR','AMD','AEE','AAL','AEP','AXP','AIG','AMT','AWK','AMP','AME','AMGN','APH','ADI','ANSS','AON','APA','AAPL','AMAT','APTV','ACGL','ANET','AJG','AIZ','T','ATO','ADSK','AZO','AVB','AVY','AXON','BKR','BALL','BAC','BBWI','BAX','BDX','WRB','BRK.B','BBY','BIO','TECH','BIIB','BLK','BX','BK','BA','BKNG','BWA','BXP','BSX','BMY','AVGO','BR','BRO','BF.B','BG','CHRW','CDNS','CZR','CPT','CPB','COF','CAH','KMX','CCL','CARR','CTLT','CAT','CBOE','CBRE','CDW','CE','COR','CNC','CNP','CDAY','CF','CRL','SCHW','CHTR','CVX','CMG','CB','CHD','CI','CINF','CTAS','CSCO','C','CFG','CLX','CME','CMS','KO','CTSH','CL','CMCSA','CMA','CAG','COP','ED','STZ','CEG','COO','CPRT','GLW','CTVA','CSGP','COST','CTRA','CCI','CSX','CMI','CVS','DHI','DHR','DRI','DVA','DE','DAL','XRAY','DVN','DXCM','FANG','DLR','DFS','DIS','DG','DLTR','D','DPZ','DOV','DOW','DTE','DUK','DD','EMN','ETN','EBAY','ECL','EIX','EW','EA','ELV','LLY','EMR','ENPH','ETR','EOG','EPAM','EQT','EFX','EQIX','EQR','ESS','EL','ETSY','EG','EVRG','ES','EXC','EXPE','EXPD','EXR','XOM','FFIV','FDS','FICO','FAST','FRT','FDX','FITB','FSLR','FE','FIS','FI','FLT','FMC','F','FTNT','FTV','FOXA','FOX','BEN','FCX','GRMN','IT','GEHC','GEN','GNRC','GD','GE','GIS','GM','GPC','GILD','GL','GPN','GS','HAL','HIG','HAS','HCA','PEAK','HSIC','HSY','HES','HPE','HLT','HOLX','HD','HON','HRL','HST','HWM','HPQ','HUBB','HUM','HBAN','HII','IBM','IEX','IDXX','ITW','ILMN','INCY','IR','PODD','INTC','ICE','IFF','IP','IPG','INTU','ISRG','IVZ','INVH','IQV','IRM','JBHT','JKHY','J','JNJ','JCI','JPM','JNPR','K','KVUE','KDP','KEY','KEYS','KMB','KIM','KMI','KLAC','KHC','KR','LHX','LH','LRCX','LW','LVS','LDOS','LEN','LIN','LYV','LKQ','LMT','L','LOW','LULU','LYB','MTB','MRO','MPC','MKTX','MAR','MMC','MLM','MAS','MA','MTCH','MKC','MCD','MCK','MDT','MRK','META','MET','MTD','MGM','MCHP','MU','MSFT','MAA','MRNA','MHK','MOH','TAP','MDLZ','MPWR','MNST','MCO','MS','MOS','MSI','MSCI','NDAQ','NTAP','NFLX','NEM','NWSA','NWS','NEE','NKE','NI','NDSN','NSC','NTRS','NOC','NCLH','NRG','NUE','NVDA','NVR','NXPI','ORLY','OXY','ODFL','OMC','ON','OKE','ORCL','OTIS','PCAR','PKG','PANW','PARA','PH','PAYX','PAYC','PYPL','PNR','PEP','PFE','PCG','PM','PSX','PNW','PXD','PNC','POOL','PPG','PPL','PFG','PG','PGR','PLD','PRU','PEG','PTC','PSA','PHM','QRVO','PWR','QCOM','DGX','RL','RJF','RTX','O','REG','REGN','RF','RSG','RMD','RVTY','RHI','ROK','ROL','ROP','ROST','RCL','SPGI','CRM','SBAC','SLB','STX','SEE','SRE','NOW','SHW','SPG','SWKS','SJM','SNA','SEDG','SO','LUV','SWK','SBUX','STT','STLD','STE','SYK','SYF','SNPS','SYY','TMUS','TROW','TTWO','TPR','TRGP','TGT','TEL','TDY','TFX','TER','TSLA','TXN','TXT','TMO','TJX','TSCO','TT','TDG','TRV','TRMB','TFC','TYL','TSN','USB','UDR','ULTA','UNP','UAL','UPS','URI','UNH','UHS','VLO','VTR','VLTO','VRSN','VRSK','VZ','VRTX','VFC','VTRS','VICI','V','VMC','WAB','WBA','WMT','WBD','WM','WAT','WEC','WFC','WELL','WST','WDC','WRK','WY','WHR','WMB','WTW','GWW','WYNN','XEL','XYL','YUM','ZBRA','ZBH','ZION','ZT']

In [66]:
#sp500_tickers = ['UAL','UPS','URI','UNH','UHS','VLO','VTR','VLTO','VRSN','VRSK','VZ','VRTX','VFC','VTRS','VICI','V','VMC','WAB','WBA','WMT','WBD','WM','WAT','WEC','WFC','WELL','WST','WDC','WRK','WY','WHR','WMB','WTW','GWW','WYNN','XEL','XYL','YUM','ZBRA','ZBH','ZION','ZT']

In [6]:
pip install selenium --upgrade


Note: you may need to restart the kernel to use updated packages.


In [8]:
import time
import csv
import os
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.firefox.options import Options as FirefoxOptions
from selenium.webdriver.common.action_chains import ActionChains
from selenium.common.exceptions import StaleElementReferenceException
from datetime import datetime, timedelta
import logging
from selenium.common.exceptions import TimeoutException

import time
from selenium.common.exceptions import NoSuchElementException, StaleElementReferenceException

OUTPUT_FILE_TEMPLATE = "aapl_historical_data_{time_period}_{show}_{frequency}.csv"

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Constants
BASE_URL_TEMPLATE = "https://finance.yahoo.com/quote/{}/history"  # URL template to be formatted with ticker symbol
OUTPUT_FILE = "aapl_historical_data.csv"
DATE_RANGE_ID = 'date-range-selector'  # Update this with the actual ID or selector for the date range element
FREQUENCY_SELECTOR = 'frequency-selector'  # Update this with the actual ID or selector for the frequency dropdown

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Constants
CSV_FILE_PATH = "./sp500_companies.csv"
DOWNLOAD_DIRECTORY = './yahoo_/yahoo_sp500/'

def setup_driver():
    options = FirefoxOptions()
    driver = webdriver.Firefox(options=options)
    return driver

def is_file_outdated(file_path):
    """Check if the existing file is outdated based on its date in the filename."""
    if not os.path.exists(file_path):
        return True  # File does not exist, so it is considered outdated

    # Extract the date part from the filename, assuming it's formatted as 'Ticker_YYYYMMDD.csv'
    filename = os.path.basename(file_path)
    try:
        # Attempt to parse the date from the filename, considering filename format 'Ticker_YYYYMMDD.csv'
        date_str = filename.split('_')[-1].split('.')[0]  # Split on underscore, then remove '.csv'
        file_date = datetime.strptime(date_str, '%Y%m%d')
        
        # Compare file date to today's date
        today = datetime.now()
        return file_date.date() < today.date()
    except ValueError as e:
        # Log an error if date parsing fails
        logging.error(f"Error parsing date from filename {filename}: {str(e)}")
        return False  # Assume file is not outdated if we cannot parse the date

def is_today_file_exists(filepath):
    """Check if today's file exists."""
    return os.path.exists(filepath)

def remove_old_file(download_directory, ticker):
    """Remove the file from the previous day if it exists."""
    yesterday = datetime.now() - timedelta(days=1)
    old_filename = f"{ticker}_{yesterday.strftime('%Y%m%d')}.csv"
    old_filepath = os.path.join(download_directory, old_filename)
    if os.path.exists(old_filepath):
        os.remove(old_filepath)
        logging.info(f"Removed old file: {old_filepath}")

def check_and_download_csv(ticker, download_directory):
    today_str = datetime.now().strftime('%Y%m%d')
    filename = f"{ticker}_{today_str}.csv"
    file_path = os.path.join(download_directory, filename)
    
    if not is_today_file_exists(file_path):
        logging.info(f"Data for {ticker} is outdated or missing. Downloading the latest data.")
        csv_download_url = f"https://query1.finance.yahoo.com/v7/finance/download/{ticker}?period1=-252374400&period2=1699142400&interval=1d&events=history&includeAdjustedClose=true"
        download_csv(csv_download_url, filename, download_directory)
        remove_old_file(download_directory, ticker)
    else:
        logging.info(f"Data for {ticker} is up-to-date. No download needed.")

def download_file(file_path):
    download_url = "http://example.com/download.csv"  # This URL should be dynamically constructed if needed
    urllib.request.urlretrieve(download_url, file_path)
    logging.info(f"Downloaded {file_path}")

def read_tickers(file_path):
    with open(file_path, mode='r', newline='') as file:
        reader = csv.DictReader(file)
        return [row['Ticker'] for row in reader]  # Ensure column name matches your CSV

# Use logging in your functions
def navigate_to_page(driver, url):
    try:
        logging.info(f"Navigating to {url}")
        driver.get(url)
    except Exception as e:
        logging.error("An error occurred while trying to navigate to the page.")
        logging.error(e)

def select_time_period(driver, period):
    try:
        logging.info(f"Selecting time period: {period}")
        
        # Click the date range dropdown to reveal the options
        date_range_dropdown = WebDriverWait(driver, 20).until(
            EC.element_to_be_clickable((By.CSS_SELECTOR, "svg[data-icon='CoreArrowDown']"))
        )
        logging.info("Date range dropdown found and clickable.")
        
        date_range_dropdown.click()
        logging.info("Date range dropdown clicked.")

        # Now that the dropdown is open, click the 'Max' option
        max_option = WebDriverWait(driver, 20).until(
            EC.element_to_be_clickable((By.XPATH, "//button[@data-value='MAX']"))
        )
        logging.info("Max option found and clickable.")
        
        # Here you can add code to verify if the correct option is being selected
        if max_option.get_attribute('data-value') == 'MAX':
            logging.info("Correct 'Max' option is present.")
        else:
            logging.warning("The 'Max' option does not have the expected 'data-value' attribute.")

        max_option.click()
        logging.info("Max option clicked.")

        # After clicking, you can also verify the current selected date range
        # This assumes that there's an element that reflects the selected date range.
        # You may need to update the selector as per the actual website structure.

    except TimeoutException as e:
        logging.error(f"Timed out waiting for time period option: {period}")
        logging.error(e)
    except NoSuchElementException as e:
        logging.error(f"Could not find the time period option: {period}")
        logging.error(e)
    except Exception as e:
        logging.error(f"An error occurred while selecting time period: {period}")
        logging.error(e)

def select_frequency(driver, frequency):
    try:
        logging.info(f"Selecting frequency: {frequency}")
        # Add additional logging if necessary
        # For example, log the current URL or take a screenshot
        current_url = driver.current_url
        logging.info(f"Current URL: {current_url}")

        # Your code to select frequency here...

        # After selecting, add an explicit wait for the page to load or for the element to be clickable
        WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable((By.ID, 'the-id-of-the-frequency-dropdown'))
        )

    except TimeoutException as e:
        logging.error(f"Timed out waiting for frequency option: {frequency}")
        logging.error(e)
    except NoSuchElementException as e:
        logging.error(f"Could not find the frequency option: {frequency}")
        logging.error(e)
    except Exception as e:
        logging.error(f"An error occurred while selecting frequency: {frequency}")
        logging.error(e)
        # Optional: Take a screenshot on error
        driver.get_screenshot_as_file("error_screenshot.png")


import urllib.request

def download_csv(download_url, filename, download_directory):
    # Ensure the download directory exists
    if not os.path.exists(download_directory):
        os.makedirs(download_directory)
    # Define the full file path
    filepath = os.path.join(download_directory, filename)
    # Use urllib to download the file
    urllib.request.urlretrieve(download_url, filepath)
    logging.info(f"File downloaded: {filepath}")

def read_tickers_from_csv(csv_file, column_name):
    tickers = []
    with open(csv_file, 'r') as file:
        reader = csv.DictReader(file)
        for row in reader:
            if column_name in row:
                tickers.append(row[column_name])
    return tickers

# Function to check if the file was downloaded more than 24 hours ago
def is_file_old(filepath, hours=24):
    if not os.path.exists(filepath):
        return True
    now = datetime.now()
    modified_time = datetime.fromtimestamp(os.path.getmtime(filepath))
    return (now - modified_time) > timedelta(hours=hours)

def main():
    # Constants
    CSV_FILE = "./sp500_companies.csv"
    COLUMN_NAME = "Ticker"
    DOWNLOAD_DIRECTORY = './yahoo_/yahoo_sp500/'

    # Read tickers from CSV
    tickers = read_tickers_from_csv(CSV_FILE, COLUMN_NAME)

    # Iterate over tickers
    for ticker in tickers:
        try:
            # Prepare the CSV download URL and filename to include the current date for comparison
            today_str = datetime.now().strftime('%Y%m%d')
            csv_download_url = f"https://query1.finance.yahoo.com/v7/finance/download/{ticker}?period1=-252374400&period2=1699142400&interval=1d&events=history&includeAdjustedClose=true"
            filename = f"{ticker}_{today_str}.csv"
            filepath = os.path.join(DOWNLOAD_DIRECTORY, filename)

            # Check if the file is outdated or does not exist
            if is_file_outdated(filepath):
                logging.info(f"Data for {ticker} is outdated or missing. Downloading the latest data.")
                download_csv(csv_download_url, filename, DOWNLOAD_DIRECTORY)
                logging.info(f"File for {ticker} downloaded successfully.")
            else:
                logging.info(f"Data for {ticker} is up-to-date. No download needed.")
            
            # Wait before making the next request
            time.sleep(2)
        except Exception as e:
            logging.error(f"An error occurred while processing {ticker}: {str(e)}")
            # Optionally, add ticker to a list or file for failed attempts

if __name__ == "__main__":
    main()


2024-05-12 15:48:02,347 - INFO - Data for  is outdated or missing. Downloading the latest data.
2024-05-12 15:48:02,503 - ERROR - An error occurred while processing : HTTP Error 404: Not Found
2024-05-12 15:48:02,507 - INFO - Data for MMM is up-to-date. No download needed.
2024-05-12 15:48:04,510 - INFO - Data for AOS is up-to-date. No download needed.
2024-05-12 15:48:06,513 - INFO - Data for ABT is up-to-date. No download needed.
2024-05-12 15:48:08,520 - INFO - Data for ABBV is up-to-date. No download needed.
2024-05-12 15:48:10,527 - INFO - Data for ACN is up-to-date. No download needed.
2024-05-12 15:48:12,531 - INFO - Data for ADBE is up-to-date. No download needed.
2024-05-12 15:48:14,537 - INFO - Data for AMD is up-to-date. No download needed.
2024-05-12 15:48:16,544 - INFO - Data for AES is up-to-date. No download needed.
2024-05-12 15:48:18,550 - INFO - Data for AFL is up-to-date. No download needed.
2024-05-12 15:48:20,552 - INFO - Data for A is up-to-date. No download neede

2024-05-12 15:51:11,325 - INFO - Data for COR is up-to-date. No download needed.
2024-05-12 15:51:13,330 - INFO - Data for CNC is up-to-date. No download needed.
2024-05-12 15:51:15,335 - INFO - Data for CNP is up-to-date. No download needed.
2024-05-12 15:51:17,342 - INFO - Data for CF is up-to-date. No download needed.
2024-05-12 15:51:19,348 - INFO - Data for CHRW is up-to-date. No download needed.
2024-05-12 15:51:21,355 - INFO - Data for CRL is up-to-date. No download needed.
2024-05-12 15:51:23,358 - INFO - Data for SCHW is up-to-date. No download needed.
2024-05-12 15:51:25,365 - INFO - Data for CHTR is up-to-date. No download needed.
2024-05-12 15:51:27,370 - INFO - Data for CVX is up-to-date. No download needed.
2024-05-12 15:51:29,374 - INFO - Data for CMG is up-to-date. No download needed.
2024-05-12 15:51:31,377 - INFO - Data for CB is up-to-date. No download needed.
2024-05-12 15:51:33,386 - INFO - Data for CHD is up-to-date. No download needed.
2024-05-12 15:51:35,394 - I

2024-05-12 15:54:33,862 - INFO - Data for FITB is up-to-date. No download needed.
2024-05-12 15:54:35,867 - INFO - Data for FSLR is up-to-date. No download needed.
2024-05-12 15:54:37,868 - INFO - Data for FE is up-to-date. No download needed.
2024-05-12 15:54:39,871 - INFO - Data for FI is up-to-date. No download needed.
2024-05-12 15:54:41,873 - INFO - Data for FMC is up-to-date. No download needed.
2024-05-12 15:54:43,879 - INFO - Data for F is up-to-date. No download needed.
2024-05-12 15:54:45,883 - INFO - Data for FTNT is up-to-date. No download needed.
2024-05-12 15:54:47,889 - INFO - Data for FTV is up-to-date. No download needed.
2024-05-12 15:54:49,893 - INFO - Data for FOXA is up-to-date. No download needed.
2024-05-12 15:54:51,896 - INFO - Data for FOX is up-to-date. No download needed.
2024-05-12 15:54:53,899 - INFO - Data for BEN is up-to-date. No download needed.
2024-05-12 15:54:55,905 - INFO - Data for FCX is up-to-date. No download needed.
2024-05-12 15:54:57,911 - IN

2024-05-12 15:57:52,503 - INFO - Data for LYB is up-to-date. No download needed.
2024-05-12 15:57:54,509 - INFO - Data for MTB is up-to-date. No download needed.
2024-05-12 15:57:56,511 - INFO - Data for MRO is up-to-date. No download needed.
2024-05-12 15:57:58,514 - INFO - Data for MPC is up-to-date. No download needed.
2024-05-12 15:58:00,520 - INFO - Data for MKTX is up-to-date. No download needed.
2024-05-12 15:58:02,526 - INFO - Data for MAR is up-to-date. No download needed.
2024-05-12 15:58:04,533 - INFO - Data for MMC is up-to-date. No download needed.
2024-05-12 15:58:06,538 - INFO - Data for MLM is up-to-date. No download needed.
2024-05-12 15:58:08,544 - INFO - Data for MAS is up-to-date. No download needed.
2024-05-12 15:58:10,546 - INFO - Data for MA is up-to-date. No download needed.
2024-05-12 15:58:12,554 - INFO - Data for MTCH is up-to-date. No download needed.
2024-05-12 15:58:14,562 - INFO - Data for MKC is up-to-date. No download needed.
2024-05-12 15:58:16,568 - I

2024-05-12 16:01:15,226 - INFO - Data for RF is up-to-date. No download needed.
2024-05-12 16:01:17,229 - INFO - Data for RSG is up-to-date. No download needed.
2024-05-12 16:01:19,231 - INFO - Data for RMD is up-to-date. No download needed.
2024-05-12 16:01:21,235 - INFO - Data for RVTY is up-to-date. No download needed.
2024-05-12 16:01:23,244 - INFO - Data for RHI is up-to-date. No download needed.
2024-05-12 16:01:25,248 - INFO - Data for ROK is up-to-date. No download needed.
2024-05-12 16:01:27,255 - INFO - Data for ROL is up-to-date. No download needed.
2024-05-12 16:01:29,259 - INFO - Data for ROP is up-to-date. No download needed.
2024-05-12 16:01:31,266 - INFO - Data for ROST is up-to-date. No download needed.
2024-05-12 16:01:33,269 - INFO - Data for RCL is up-to-date. No download needed.
2024-05-12 16:01:35,273 - INFO - Data for SPGI is up-to-date. No download needed.
2024-05-12 16:01:37,276 - INFO - Data for CRM is up-to-date. No download needed.
2024-05-12 16:01:39,282 - 

2024-05-12 16:04:33,922 - INFO - Data for YUM is up-to-date. No download needed.
2024-05-12 16:04:35,927 - INFO - Data for ZBRA is up-to-date. No download needed.
2024-05-12 16:04:37,935 - INFO - Data for ZBH is up-to-date. No download needed.
2024-05-12 16:04:39,940 - INFO - Data for ZTS is up-to-date. No download needed.


In [19]:
import time
import csv
import os
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.firefox.options import Options as FirefoxOptions
from selenium.webdriver.common.action_chains import ActionChains
from selenium.common.exceptions import StaleElementReferenceException
from datetime import datetime, timedelta
import logging
from selenium.common.exceptions import TimeoutException

import time
from selenium.common.exceptions import NoSuchElementException, StaleElementReferenceException

OUTPUT_FILE_TEMPLATE = "aapl_historical_data_{time_period}_{show}_{frequency}.csv"

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Constants
BASE_URL_TEMPLATE = "https://finance.yahoo.com/quote/{}/div"  # URL template to be formatted with ticker symbol
OUTPUT_FILE = "aapl_historical_data.csv"
DATE_RANGE_ID = 'date-range-selector'  # Update this with the actual ID or selector for the date range element
FREQUENCY_SELECTOR = 'frequency-selector'  # Update this with the actual ID or selector for the frequency dropdown

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Constants
CSV_FILE_PATH = "./sp500_companies.csv"
DOWNLOAD_DIRECTORY = './yahoo_/yahoo_sp500_div/'

def setup_driver():
    options = FirefoxOptions()
    driver = webdriver.Firefox(options=options)
    return driver

def check_and_download_csv(file_path, max_age_hours=24):
    if os.path.exists(file_path):
        file_mod_time = datetime.fromtimestamp(os.path.getmtime(file_path))
        if datetime.now() - file_mod_time > timedelta(hours=max_age_hours):
            logging.info(f"File {file_path} is older than {max_age_hours} hours, re-downloading.")
            download_file(file_path)
        else:
            logging.info(f"File {file_path} is up-to-date, no need to re-download.")
    else:
        logging.info(f"File {file_path} does not exist, downloading now.")
        download_file(file_path)

def download_file(file_path):
    download_url = "http://example.com/download.csv"  # This URL should be dynamically constructed if needed
    urllib.request.urlretrieve(download_url, file_path)
    logging.info(f"Downloaded {file_path}")

def read_tickers(file_path):
    with open(file_path, mode='r', newline='') as file:
        reader = csv.DictReader(file)
        return [row['Ticker'] for row in reader]  # Ensure column name matches your CSV

# Use logging in your functions
def navigate_to_page(driver, url):
    try:
        logging.info(f"Navigating to {url}")
        driver.get(url)
    except Exception as e:
        logging.error("An error occurred while trying to navigate to the page.")
        logging.error(e)

def select_time_period(driver, period):
    try:
        logging.info(f"Selecting time period: {period}")
        
        # Click the date range dropdown to reveal the options
        date_range_dropdown = WebDriverWait(driver, 20).until(
            EC.element_to_be_clickable((By.CSS_SELECTOR, "svg[data-icon='CoreArrowDown']"))
        )
        logging.info("Date range dropdown found and clickable.")
        
        date_range_dropdown.click()
        logging.info("Date range dropdown clicked.")

        # Now that the dropdown is open, click the 'Max' option
        max_option = WebDriverWait(driver, 20).until(
            EC.element_to_be_clickable((By.XPATH, "//button[@data-value='MAX']"))
        )
        logging.info("Max option found and clickable.")
        
        # Here you can add code to verify if the correct option is being selected
        if max_option.get_attribute('data-value') == 'MAX':
            logging.info("Correct 'Max' option is present.")
        else:
            logging.warning("The 'Max' option does not have the expected 'data-value' attribute.")

        max_option.click()
        logging.info("Max option clicked.")

        # After clicking, you can also verify the current selected date range
        # This assumes that there's an element that reflects the selected date range.
        # You may need to update the selector as per the actual website structure.

    except TimeoutException as e:
        logging.error(f"Timed out waiting for time period option: {period}")
        logging.error(e)
    except NoSuchElementException as e:
        logging.error(f"Could not find the time period option: {period}")
        logging.error(e)
    except Exception as e:
        logging.error(f"An error occurred while selecting time period: {period}")
        logging.error(e)

def select_frequency(driver, frequency):
    try:
        logging.info(f"Selecting frequency: {frequency}")
        # Add additional logging if necessary
        # For example, log the current URL or take a screenshot
        current_url = driver.current_url
        logging.info(f"Current URL: {current_url}")

        # Your code to select frequency here...

        # After selecting, add an explicit wait for the page to load or for the element to be clickable
        WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable((By.ID, 'the-id-of-the-frequency-dropdown'))
        )

    except TimeoutException as e:
        logging.error(f"Timed out waiting for frequency option: {frequency}")
        logging.error(e)
    except NoSuchElementException as e:
        logging.error(f"Could not find the frequency option: {frequency}")
        logging.error(e)
    except Exception as e:
        logging.error(f"An error occurred while selecting frequency: {frequency}")
        logging.error(e)
        # Optional: Take a screenshot on error
        driver.get_screenshot_as_file("error_screenshot.png")


import urllib.request

def download_csv(download_url, filename, download_directory='./yahoo_/yahoo_sp500_div/'):
    # Ensure the download directory exists
    if not os.path.exists(download_directory):
        os.makedirs(download_directory)
    # Define the full file path
    filepath = os.path.join(download_directory, filename)
    # Use urllib to download the file
    urllib.request.urlretrieve(download_url, filepath)
    logging.info(f"File downloaded: {filepath}")

def read_tickers_from_csv(csv_file, column_name):
    tickers = []
    with open(csv_file, 'r') as file:
        reader = csv.DictReader(file)
        for row in reader:
            if column_name in row:
                tickers.append(row[column_name])
    return tickers

# Function to check if the file was downloaded more than 24 hours ago
def is_file_old(filepath, hours=24):
    if not os.path.exists(filepath):
        return True
    now = datetime.now()
    modified_time = datetime.fromtimestamp(os.path.getmtime(filepath))
    return (now - modified_time) > timedelta(hours=hours)

def main():
    # Constants
    CSV_FILE = "./sp500_companies.csv"
    COLUMN_NAME = "Ticker"
    DOWNLOAD_DIRECTORY = './yahoo_/yahoo_sp500_div/'

    # Read tickers from CSV
    tickers = read_tickers_from_csv(CSV_FILE, COLUMN_NAME)

    # Iterate over tickers
    for ticker in tickers:
        try:
            # Prepare the CSV download URL
            csv_download_url = f"https://query1.finance.yahoo.com/v7/finance/download/{ticker}?period1=-252374400&period2=1699142400&interval=1d&events=div&includeAdjustedClose=true"
            filename = f"{ticker}.csv"
            filepath = os.path.join(DOWNLOAD_DIRECTORY, filename)

            # Check if file needs to be downloaded
            if is_file_old(filepath):
                # Download CSV file
                download_csv(csv_download_url, filename, DOWNLOAD_DIRECTORY)
            
                # Wait before making the next request
                time.sleep(2)
        except Exception as e:
            logging.error(f"An error occurred while processing {ticker}.")
            logging.error(e)
            # Optionally, add ticker to a list or file for failed attempts

if __name__ == "__main__":
    main()

2024-05-12 17:32:34,151 - ERROR - An error occurred while processing .
2024-05-12 17:32:34,152 - ERROR - HTTP Error 404: Not Found
2024-05-12 17:32:34,291 - ERROR - An error occurred while processing BRK.B.
2024-05-12 17:32:34,292 - ERROR - HTTP Error 404: Not Found
2024-05-12 17:32:34,422 - ERROR - An error occurred while processing GEV.
2024-05-12 17:32:34,424 - ERROR - HTTP Error 400: Bad Request
2024-05-12 17:32:34,555 - ERROR - An error occurred while processing SOLV.
2024-05-12 17:32:34,556 - ERROR - HTTP Error 400: Bad Request


In [None]:
Ganpati@27

In [None]:
StockX@27

In [None]:
passtime1995@gmail.com